From 4448cf1f2d777c82d913e5ee0aeabd0e2785fad3 Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:30:24 +0530 Subject: [PATCH 01/48] fix(ui/ingestion): add debounce on search on ingestion listing page (#9516) --- .../entity/shared/tabs/Dataset/Queries/utils/constants.ts | 1 + datahub-web-react/src/app/ingest/secret/SecretsList.tsx | 8 +++++++- .../src/app/ingest/source/IngestionSourceList.tsx | 8 +++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts index 5176c1207874c..025705abc580e 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Queries/utils/constants.ts @@ -16,5 +16,6 @@ export const DEFAULT_MAX_RECENT_QUERIES = 9; */ export const MAX_ROWS_BEFORE_DEBOUNCE = 50; export const HALF_SECOND_IN_MS = 500; +export const ONE_SECOND_IN_MS = 1000; export const ADD_UNAUTHORIZED_MESSAGE = 'You are not authorized to add Queries to this entity.'; diff --git a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx index 2728fff0ccba3..1a960997e6bee 100644 --- a/datahub-web-react/src/app/ingest/secret/SecretsList.tsx +++ b/datahub-web-react/src/app/ingest/secret/SecretsList.tsx @@ -1,5 +1,6 @@ import React, { useEffect, useState } from 'react'; import { Button, Empty, message, Modal, Pagination, Typography } from 'antd'; +import { debounce } from 'lodash'; import { DeleteOutlined, PlusOutlined } from '@ant-design/icons'; import * as QueryString from 'query-string'; import { useLocation } from 'react-router'; @@ -18,6 +19,7 @@ import { SearchBar } from '../../search/SearchBar'; import { useEntityRegistry } from '../../useEntityRegistry'; import { scrollToTop } from '../../shared/searchUtils'; import { addSecretToListSecretsCache, removeSecretFromListSecretsCache } from './cacheUtils'; +import { ONE_SECOND_IN_MS } from '../../entity/shared/tabs/Dataset/Queries/utils/constants'; const DeleteButtonContainer = styled.div` display: flex; @@ -84,6 +86,10 @@ export const SecretsList = () => { setPage(newPage); }; + const debouncedSetQuery = debounce((newQuery: string | undefined) => { + setQuery(newQuery); + }, ONE_SECOND_IN_MS); + const onSubmit = (state: SecretBuilderState, resetBuilderState: () => void) => { createSecretMutation({ variables: { @@ -199,7 +205,7 @@ export const SecretsList = () => { onSearch={() => null} onQueryChange={(q) => { setPage(1); - setQuery(q); + debouncedSetQuery(q); }} entityRegistry={entityRegistry} hideRecommendations diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index 6188845694f9e..e6db6bfcc9a61 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -1,5 +1,6 @@ import { PlusOutlined, RedoOutlined } from '@ant-design/icons'; import React, { useCallback, useEffect, useState } from 'react'; +import { debounce } from 'lodash'; import * as QueryString from 'query-string'; import { useLocation } from 'react-router'; import { Button, message, Modal, Pagination, Select } from 'antd'; @@ -30,6 +31,7 @@ import { INGESTION_CREATE_SOURCE_ID, INGESTION_REFRESH_SOURCES_ID, } from 
'../../onboarding/config/IngestionOnboardingConfig'; +import { ONE_SECOND_IN_MS } from '../../entity/shared/tabs/Dataset/Queries/utils/constants'; const PLACEHOLDER_URN = 'placeholder-urn'; @@ -133,6 +135,10 @@ export const IngestionSourceList = () => { setLastRefresh(new Date().getTime()); }, [refetch]); + const debouncedSetQuery = debounce((newQuery: string | undefined) => { + setQuery(newQuery); + }, ONE_SECOND_IN_MS); + function hasActiveExecution() { return !!filteredSources.find((source) => source.executions?.executionRequests.find((request) => isExecutionRequestActive(request)), @@ -401,7 +407,7 @@ export const IngestionSourceList = () => { onSearch={() => null} onQueryChange={(q) => { setPage(1); - setQuery(q); + debouncedSetQuery(q); }} entityRegistry={entityRegistry} hideRecommendations From d399a530576974da9beb1af24d7ea5f98922b6d3 Mon Sep 17 00:00:00 2001 From: kushagra-apptware <81357546+kushagra-apptware@users.noreply.github.com> Date: Tue, 26 Dec 2023 18:26:40 +0530 Subject: [PATCH 02/48] fix(ui): correct the color of edit links (#9517) --- .../entity/shared/tabs/Documentation/components/LinkList.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx index 1b5c3d54009da..9f94a830ac1cf 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx @@ -159,7 +159,7 @@ export const LinkList = ({ refetch }: LinkListProps) => { - + + } + trigger={['click']} + > + + ), }, From b7a0bbcb3d6000d3d9827ab19f13c3118d0bfc19 Mon Sep 17 00:00:00 2001 From: Fernando Marino` Date: Thu, 28 Dec 2023 01:24:25 +0100 Subject: [PATCH 08/48] feat(ingest/openapi): support proxies and alternate auth schemes (#9492) Co-authored-by: Fernando Marino Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/openapi.py | 41 +++++++++++++++---- .../ingestion/source/openapi_parser.py | 26 ++++++++---- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py index 3925ba51c16dd..ad62ef7362aeb 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py @@ -52,6 +52,13 @@ class OpenApiConfig(ConfigModel): ignore_endpoints: list = Field(default=[], description="") username: str = Field(default="", description="") password: str = Field(default="", description="") + proxies: Optional[dict] = Field( + default=None, + description="Eg. " + "`{'http': 'http://10.10.1.10:3128', 'https': 'http://10.10.1.10:1080'}`." + "If authentication is required, add it to the proxy url directly e.g. 
" + "`http://user:pass@10.10.1.10:3128/`.", + ) forced_examples: dict = Field(default={}, description="") token: Optional[str] = Field(default=None, description="") get_token: dict = Field(default={}, description="") @@ -87,9 +94,13 @@ def get_swagger(self) -> Dict: password=self.password, tok_url=url4req, method=self.get_token["request_type"], + proxies=self.proxies, ) sw_dict = get_swag_json( - self.url, token=self.token, swagger_file=self.swagger_file + self.url, + token=self.token, + swagger_file=self.swagger_file, + proxies=self.proxies, ) # load the swagger file else: # using basic auth for accessing endpoints @@ -98,6 +109,7 @@ def get_swagger(self) -> Dict: username=self.username, password=self.password, swagger_file=self.swagger_file, + proxies=self.proxies, ) return sw_dict @@ -258,10 +270,15 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901 tot_url = clean_url(config.url + self.url_basepath + endpoint_k) if config.token: - response = request_call(tot_url, token=config.token) + response = request_call( + tot_url, token=config.token, proxies=config.proxies + ) else: response = request_call( - tot_url, username=config.username, password=config.password + tot_url, + username=config.username, + password=config.password, + proxies=config.proxies, ) if response.status_code == 200: fields2add, root_dataset_samples[dataset_name] = extract_fields( @@ -281,10 +298,15 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901 url_guess = try_guessing(endpoint_k, root_dataset_samples) tot_url = clean_url(config.url + self.url_basepath + url_guess) if config.token: - response = request_call(tot_url, token=config.token) + response = request_call( + tot_url, token=config.token, proxies=config.proxies + ) else: response = request_call( - tot_url, username=config.username, password=config.password + tot_url, + username=config.username, + password=config.password, + proxies=config.proxies, ) if response.status_code == 200: fields2add, _ = extract_fields(response, dataset_name) @@ -304,10 +326,15 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901 ) tot_url = clean_url(config.url + self.url_basepath + composed_url) if config.token: - response = request_call(tot_url, token=config.token) + response = request_call( + tot_url, token=config.token, proxies=config.proxies + ) else: response = request_call( - tot_url, username=config.username, password=config.password + tot_url, + username=config.username, + password=config.password, + proxies=config.proxies, ) if response.status_code == 200: fields2add, _ = extract_fields(response, dataset_name) diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py index 1ab40bc8be73d..84bb3ad452611 100755 --- a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py @@ -51,6 +51,7 @@ def request_call( token: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None, + proxies: Optional[dict] = None, ) -> requests.Response: headers = {"accept": "application/json"} @@ -60,8 +61,8 @@ def request_call( ) elif token is not None: - headers["Authorization"] = f"Bearer {token}" - return requests.get(url, headers=headers) + headers["Authorization"] = f"{token}" + return requests.get(url, proxies=proxies, headers=headers) else: return requests.get(url, headers=headers) @@ -72,12 +73,15 @@ def get_swag_json( username: 
Optional[str] = None, password: Optional[str] = None, swagger_file: str = "", + proxies: Optional[dict] = None, ) -> Dict: tot_url = url + swagger_file if token is not None: - response = request_call(url=tot_url, token=token) + response = request_call(url=tot_url, token=token, proxies=proxies) else: - response = request_call(url=tot_url, username=username, password=password) + response = request_call( + url=tot_url, username=username, password=password, proxies=proxies + ) if response.status_code != 200: raise Exception(f"Unable to retrieve {tot_url}, error {response.status_code}") @@ -251,7 +255,7 @@ def compose_url_attr(raw_url: str, attr_list: list) -> str: attr_list=["2",]) asd2 == "http://asd.com/2" """ - splitted = re.split(r"\{[^}]+\}", raw_url) + splitted = re.split(r"\{[^}]+}", raw_url) if splitted[-1] == "": # it can happen that the last element is empty splitted = splitted[:-1] composed_url = "" @@ -265,7 +269,7 @@ def compose_url_attr(raw_url: str, attr_list: list) -> str: def maybe_theres_simple_id(url: str) -> str: - dets = re.findall(r"(\{[^}]+\})", url) # searching the fields between parenthesis + dets = re.findall(r"(\{[^}]+})", url) # searching the fields between parenthesis if len(dets) == 0: return url dets_w_id = [det for det in dets if "id" in det] # the fields containing "id" @@ -349,6 +353,7 @@ def get_tok( password: str = "", tok_url: str = "", method: str = "post", + proxies: Optional[dict] = None, ) -> str: """ Trying to post username/password to get auth. @@ -357,12 +362,15 @@ def get_tok( url4req = url + tok_url if method == "post": # this will make a POST call with username and password - data = {"username": username, "password": password} + data = {"username": username, "password": password, "maxDuration": True} # url2post = url + "api/authenticate/" - response = requests.post(url4req, data=data) + response = requests.post(url4req, proxies=proxies, json=data) if response.status_code == 200: cont = json.loads(response.content) - token = cont["tokens"]["access"] + if "token" in cont: # other authentication scheme + token = cont["token"] + else: # works only for bearer authentication scheme + token = f"Bearer {cont['tokens']['access']}" elif method == "get": # this will make a GET call with username and password response = requests.get(url4req) From 754d8814477d050e907aeca6c561d98372b60dc5 Mon Sep 17 00:00:00 2001 From: cburroughs Date: Wed, 27 Dec 2023 19:33:41 -0500 Subject: [PATCH 09/48] build(ingest/feast): upgrade to latest feast version (#9439) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 4632c20cd3b96..32d49ffc73fa3 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -316,7 +316,7 @@ # https://github.com/elastic/elasticsearch-py/issues/1639#issuecomment-883587433 "elasticsearch": {"elasticsearch==7.13.4"}, "feast": { - "feast~=0.31.1", + "feast~=0.34.1", "flask-openid>=1.3.0", # typeguard 3.x, released on 2023-03-14, seems to cause issues with Feast. 
"typeguard<3", From 9f79f44dd69a5a86864ccc31473305bdf1c2f4bb Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 27 Dec 2023 20:05:17 -0500 Subject: [PATCH 10/48] build: enable gradle caching (#9525) --- .github/workflows/airflow-plugin.yml | 1 + .github/workflows/build-and-test.yml | 1 + .github/workflows/check-datahub-jars.yml | 1 + .github/workflows/docker-unified.yml | 27 ++++++++++++--------- .github/workflows/documentation.yml | 1 + .github/workflows/metadata-ingestion.yml | 1 + .github/workflows/metadata-io.yml | 2 ++ .github/workflows/metadata-model.yml | 2 ++ .github/workflows/publish-datahub-jars.yml | 2 ++ .github/workflows/spark-smoke-test.yml | 2 ++ gradle.properties | 2 +- gradle/wrapper/gradle-wrapper.jar | Bin 61624 -> 61608 bytes gradlew | 4 +-- 13 files changed, 32 insertions(+), 14 deletions(-) diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index 70816e5f093d1..97a0da8546ed1 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -55,6 +55,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index dab64cf2dca5e..6daf1904ba3ae 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -42,6 +42,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml index 46d97ffec8861..556cd87f12df0 100644 --- a/.github/workflows/check-datahub-jars.yml +++ b/.github/workflows/check-datahub-jars.yml @@ -33,6 +33,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 7cef38b1cd47c..454e766140245 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -84,6 +84,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - name: Check out the repo uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image @@ -145,6 +146,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - name: Check out the repo uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image @@ -206,6 +208,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - name: Check out the repo uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image @@ -267,6 +270,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - name: Check out the repo uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image @@ -328,6 +332,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - name: Check out the repo uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image @@ -567,6 +572,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - name: Check out the repo uses: hsheth2/sane-checkout-action@v1 - uses: dorny/paths-filter@v2 @@ -653,6 +659,7 @@ jobs: 
with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - name: Check out the repo uses: hsheth2/sane-checkout-action@v1 - uses: dorny/paths-filter@v2 @@ -731,12 +738,13 @@ jobs: strategy: fail-fast: false matrix: - test_strategy: [ - "no_cypress_suite0", - "no_cypress_suite1", - "cypress_suite1", - "cypress_rest" - ] + test_strategy: + [ + "no_cypress_suite0", + "no_cypress_suite1", + "cypress_suite1", + "cypress_rest", + ] needs: [ setup, @@ -760,6 +768,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/setup-python@v4 with: python-version: "3.10" @@ -904,11 +913,7 @@ jobs: deploy_datahub_head: name: Deploy to Datahub HEAD runs-on: ubuntu-latest - needs: - [ - setup, - smoke_test - ] + needs: [setup, smoke_test] steps: - uses: aws-actions/configure-aws-credentials@v1 if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }} diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 29953b8b70d91..e1671cc021919 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -32,6 +32,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 4e04fef3b3980..af73db483f9ae 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -49,6 +49,7 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index 2188fcb07c77a..96229642244b6 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -34,9 +34,11 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/setup-python@v4 with: python-version: "3.10" + cache: "pip" - name: Gradle build (and test) # there is some race condition in gradle build, which makes gradle never terminate in ~30% of the runs # running build first without datahub-web-react:yarnBuild and then with it is 100% stable diff --git a/.github/workflows/metadata-model.yml b/.github/workflows/metadata-model.yml index d0112f1b14e7a..265a66aa236ae 100644 --- a/.github/workflows/metadata-model.yml +++ b/.github/workflows/metadata-model.yml @@ -34,10 +34,12 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: "3.10" + cache: "pip" - name: Install dependencies run: ./metadata-ingestion/scripts/install_deps.sh - name: Run model generation diff --git a/.github/workflows/publish-datahub-jars.yml b/.github/workflows/publish-datahub-jars.yml index 24d1c5436b315..0a311be33cd30 100644 --- a/.github/workflows/publish-datahub-jars.yml +++ b/.github/workflows/publish-datahub-jars.yml @@ -54,9 +54,11 @@ jobs: with: distribution: "zulu" java-version: 17 + - uses: gradle/gradle-build-action@v2 - uses: actions/setup-python@v4 with: python-version: "3.10" + cache: "pip" - name: checkout upstream repo run: | git remote add upstream https://github.com/datahub-project/datahub.git diff --git a/.github/workflows/spark-smoke-test.yml 
b/.github/workflows/spark-smoke-test.yml
index 60e183cce5179..94692bd3c2336 100644
--- a/.github/workflows/spark-smoke-test.yml
+++ b/.github/workflows/spark-smoke-test.yml
@@ -35,9 +35,11 @@ jobs:
         with:
           distribution: "zulu"
           java-version: 17
+      - uses: gradle/gradle-build-action@v2
       - uses: actions/setup-python@v4
         with:
           python-version: "3.10"
+          cache: "pip"
       - name: Install dependencies
         run: ./metadata-ingestion/scripts/install_deps.sh
       - name: Remove images
diff --git a/gradle.properties b/gradle.properties
index 1cd349344b432..f410ff01bf397 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,7 +1,7 @@
 org.gradle.daemon=false
 org.gradle.configureondemand=true
 org.gradle.parallel=true
-org.gradle.caching=false
+org.gradle.caching=true
 
 # Increase gradle JVM memory to 3GB to allow tests to run locally
 org.gradle.jvmargs=-Xmx3000m
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
index afba109285af78dbd2a1d187e33ac4f87c76e392..ccebba7710deaf9f98673a68957ea02138b60d0a 100644
GIT binary patch
delta 11632
[binary delta data omitted]

delta 11565
[binary delta data omitted]

diff --git a/gradlew b/gradlew
[hunk omitted]

Date: Thu, 28 Dec 2023 04:06:41 -0500
Subject: [PATCH 11/48] chore(build): update base-requirements + add script
 for regeneration (#9524)

---
 .../base-requirements.txt                     | 317 +++++++++---------
 .../regenerate-base-requirements.sh           |  37 ++
 2 files changed, 195 insertions(+), 159 deletions(-)
 create mode 100755 docker/datahub-ingestion-base/regenerate-base-requirements.sh

diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt
index 141382466ab9f..9092875902794 100644
--- a/docker/datahub-ingestion-base/base-requirements.txt
+++ b/docker/datahub-ingestion-base/base-requirements.txt
@@ -1,149 +1,147 @@
-# Excluded for slim
-# pyspark==3.0.3
-# pydeequ==1.0.1
-
+# Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate.
acryl-datahub-classify==0.0.8 -acryl-PyHive==0.6.14 -acryl-sqlglot==18.5.2.dev45 +acryl-PyHive==0.6.16 +acryl-sqlglot==20.4.1.dev14 aenum==3.1.15 -aiohttp==3.8.6 +aiohttp==3.9.1 aiosignal==1.3.1 -alembic==1.12.0 +alembic==1.13.1 altair==4.2.0 +annotated-types==0.6.0 anyio==3.7.1 -apache-airflow==2.7.2 -apache-airflow-providers-common-sql==1.7.2 -apache-airflow-providers-ftp==3.5.2 -apache-airflow-providers-http==4.5.2 -apache-airflow-providers-imap==3.3.2 -apache-airflow-providers-sqlite==3.4.3 -apispec==6.3.0 +apache-airflow==2.7.3 +apache-airflow-providers-common-sql==1.9.0 +apache-airflow-providers-ftp==3.7.0 +apache-airflow-providers-http==4.8.0 +apache-airflow-providers-imap==3.5.0 +apache-airflow-providers-sqlite==3.6.0 +apispec==6.3.1 appdirs==1.4.4 appnope==0.1.3 -argcomplete==3.1.2 +argcomplete==3.2.1 argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 asgiref==3.7.2 asn1crypto==1.5.1 -asttokens==2.4.0 +asttokens==2.4.1 async-timeout==4.0.3 -asynch==0.2.2 +asynch==0.2.3 attrs==23.1.0 -avro==1.10.2 +avro==1.11.3 avro-gen3==0.7.11 -Babel==2.13.0 -backcall==0.2.0 +Babel==2.14.0 backoff==2.2.1 beautifulsoup4==4.12.2 bleach==6.1.0 -blinker==1.6.3 +blinker==1.7.0 blis==0.7.11 -boto3==1.28.62 -botocore==1.31.62 +boto3==1.34.8 +botocore==1.34.8 bowler==0.9.0 bracex==2.4 cached-property==1.5.2 cachelib==0.9.0 -cachetools==5.3.1 +cachetools==5.3.2 catalogue==2.0.10 -cattrs==23.1.2 -certifi==2023.7.22 +cattrs==23.2.3 +certifi==2023.11.17 cffi==1.16.0 chardet==5.2.0 -charset-normalizer==3.3.0 -ciso8601==2.3.0 +charset-normalizer==3.3.2 +ciso8601==2.3.1 click==8.1.7 click-default-group==1.2.4 click-spinner==0.1.10 clickclick==20.10.2 -clickhouse-cityhash==1.0.2.4 clickhouse-driver==0.2.6 clickhouse-sqlalchemy==0.2.4 -cloudpickle==2.2.1 +cloudpickle==3.0.0 colorama==0.4.6 colorlog==4.8.0 -comm==0.1.4 -confection==0.1.3 -ConfigUpdater==3.1.1 +comm==0.2.0 +confection==0.1.4 +ConfigUpdater==3.2 confluent-kafka==2.3.0 connexion==2.14.2 cron-descriptor==1.4.0 croniter==2.0.1 -cryptography==41.0.4 +cryptography==41.0.7 cx-Oracle==8.3.0 cymem==2.0.8 -dask==2023.9.3 +dask==2023.12.1 databricks-cli==0.18.0 databricks-dbapi==0.6.0 -databricks-sdk==0.10.0 +databricks-sdk==0.15.0 +databricks-sql-connector==2.9.3 debugpy==1.8.0 decorator==5.1.1 defusedxml==0.7.1 -deltalake==0.11.0 +deltalake==0.14.0 Deprecated==1.2.14 dill==0.3.7 dnspython==2.4.2 -docker==6.1.3 +docker==7.0.0 docutils==0.20.1 ecdsa==0.18.0 elasticsearch==7.13.4 email-validator==1.3.1 entrypoints==0.4 et-xmlfile==1.1.0 -exceptiongroup==1.1.3 -executing==2.0.0 -expandvars==0.11.0 -fastapi==0.103.2 -fastavro==1.8.4 -fastjsonschema==2.18.1 +exceptiongroup==1.2.0 +executing==2.0.1 +expandvars==0.12.0 +fastapi==0.108.0 +fastavro==1.9.2 +fastjsonschema==2.19.0 feast==0.31.1 -filelock==3.12.4 +filelock==3.13.1 fissix==21.11.13 Flask==2.2.5 flatdict==4.0.1 -frozenlist==1.4.0 -fsspec==2023.9.2 +frozenlist==1.4.1 +fsspec==2023.12.2 future==0.18.3 -GeoAlchemy2==0.14.1 -gitdb==4.0.10 -GitPython==3.1.37 -google-api-core==2.12.0 -google-auth==2.23.3 -google-cloud-appengine-logging==1.3.2 +GeoAlchemy2==0.14.3 +gitdb==4.0.11 +GitPython==3.1.40 +google-api-core==2.15.0 +google-auth==2.25.2 +google-cloud-appengine-logging==1.4.0 google-cloud-audit-log==0.2.5 -google-cloud-bigquery==3.12.0 -google-cloud-core==2.3.3 +google-cloud-bigquery==3.14.1 +google-cloud-core==2.4.1 google-cloud-datacatalog-lineage==0.2.2 google-cloud-logging==3.5.0 google-crc32c==1.5.0 google-re2==1.1 -google-resumable-media==2.6.0 -googleapis-common-protos==1.60.0 
+google-resumable-media==2.7.0 +googleapis-common-protos==1.62.0 gql==3.4.1 graphql-core==3.2.3 graphviz==0.20.1 great-expectations==0.15.50 -greenlet==3.0.0 -grpc-google-iam-v1==0.12.6 -grpcio==1.59.0 -grpcio-reflection==1.59.0 -grpcio-status==1.59.0 -grpcio-tools==1.59.0 +greenlet==3.0.3 +grpc-google-iam-v1==0.13.0 +grpcio==1.60.0 +grpcio-reflection==1.60.0 +grpcio-status==1.60.0 +grpcio-tools==1.60.0 gssapi==1.8.3 gunicorn==21.2.0 h11==0.14.0 -httpcore==0.18.0 -httptools==0.6.0 -httpx==0.25.0 +hdbcli==2.19.20 +httpcore==1.0.2 +httptools==0.6.1 +httpx==0.26.0 humanfriendly==10.0 -idna==3.4 +idna==3.6 ijson==3.2.3 -importlib-metadata==6.8.0 -importlib-resources==6.1.0 +importlib-metadata==6.11.0 +importlib-resources==6.1.1 inflection==0.5.1 ipaddress==1.0.23 ipykernel==6.17.1 -ipython==8.16.1 +ipython==8.19.0 ipython-genutils==0.2.0 ipywidgets==8.1.1 iso3166==2.1.1 @@ -152,34 +150,34 @@ itsdangerous==2.1.2 jedi==0.19.1 Jinja2==3.1.2 jmespath==1.0.1 -JPype1==1.4.1 +JPype1==1.5.0 jsonlines==4.0.0 jsonpatch==1.33 jsonpointer==2.4 jsonref==1.1.0 -jsonschema==4.19.1 -jsonschema-specifications==2023.7.1 +jsonschema==4.20.0 +jsonschema-specifications==2023.12.1 jupyter-server==1.24.0 jupyter_client==7.4.9 jupyter_core==4.12.0 -jupyterlab-pygments==0.2.2 jupyterlab-widgets==3.0.9 +jupyterlab_pygments==0.3.0 langcodes==3.3.0 lark==1.1.4 -lazy-object-proxy==1.9.0 +lazy-object-proxy==1.10.0 leb128==1.0.5 -limits==3.6.0 +limits==3.7.0 linear-tsv==1.1.0 linkify-it-py==2.0.2 -lkml==1.3.1 +lkml==1.3.3 locket==1.0.0 lockfile==0.12.2 looker-sdk==23.0.0 -lxml==4.9.3 +lxml==4.9.4 lz4==4.3.2 -makefun==1.15.1 -Mako==1.2.4 -Markdown==3.5 +makefun==1.15.2 +Mako==1.3.0 +Markdown==3.5.1 markdown-it-py==3.0.0 MarkupSafe==2.1.3 marshmallow==3.20.1 @@ -190,26 +188,26 @@ mdit-py-plugins==0.4.0 mdurl==0.1.2 mistune==3.0.2 mixpanel==4.10.0 -mlflow-skinny==2.7.1 +mlflow-skinny==2.9.2 mmh3==4.0.1 mmhash3==3.0.1 more-itertools==10.1.0 moreorless==0.4.0 -moto==4.2.5 +moto==4.2.12 msal==1.22.0 multidict==6.0.4 murmurhash==1.0.10 -mypy==1.6.0 +mypy==1.8.0 mypy-extensions==1.0.0 nbclassic==1.0.0 nbclient==0.6.3 -nbconvert==7.9.2 +nbconvert==7.13.1 nbformat==5.9.1 nest-asyncio==1.5.8 -networkx==3.1 +networkx==3.2.1 notebook==6.5.6 notebook_shim==0.2.3 -numpy==1.26.0 +numpy==1.26.2 oauthlib==3.2.2 okta==1.7.0 openlineage-airflow==1.2.0 @@ -217,110 +215,107 @@ openlineage-integration-common==1.2.0 openlineage-python==1.2.0 openlineage_sql==1.2.0 openpyxl==3.1.2 -opentelemetry-api==1.20.0 -opentelemetry-exporter-otlp==1.20.0 -opentelemetry-exporter-otlp-proto-common==1.20.0 -opentelemetry-exporter-otlp-proto-grpc==1.20.0 -opentelemetry-exporter-otlp-proto-http==1.20.0 -opentelemetry-proto==1.20.0 -opentelemetry-sdk==1.20.0 -opentelemetry-semantic-conventions==0.41b0 +opentelemetry-api==1.22.0 +opentelemetry-exporter-otlp==1.22.0 +opentelemetry-exporter-otlp-proto-common==1.22.0 +opentelemetry-exporter-otlp-proto-grpc==1.22.0 +opentelemetry-exporter-otlp-proto-http==1.22.0 +opentelemetry-proto==1.22.0 +opentelemetry-sdk==1.22.0 +opentelemetry-semantic-conventions==0.43b0 ordered-set==4.1.0 -oscrypto==1.3.0 packaging==23.2 pandas==1.5.3 pandavro==1.5.2 pandocfilters==1.5.0 -parse==1.19.1 +parse==1.20.0 parso==0.8.3 partd==1.4.1 -pathspec==0.11.2 -pathy==0.10.2 +pathspec==0.12.1 +pathy==0.10.3 pendulum==2.1.2 -pexpect==4.8.0 +pexpect==4.9.0 phonenumbers==8.13.0 -pickleshare==0.7.5 platformdirs==3.11.0 pluggy==1.3.0 preshed==3.0.9 prison==0.2.1 -progressbar2==4.2.0 -prometheus-client==0.17.1 -prompt-toolkit==3.0.39 
-proto-plus==1.22.3 -protobuf==4.24.4 -psutil==5.9.5 +progressbar2==4.3.2 +prometheus-client==0.19.0 +prompt-toolkit==3.0.43 +proto-plus==1.23.0 +protobuf==4.25.1 +psutil==5.9.7 psycopg2-binary==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pure-sasl==0.6.2 -py-partiql-parser==0.3.7 +py-partiql-parser==0.5.0 pyarrow==11.0.0 -pyasn1==0.5.0 +pyasn1==0.5.1 pyasn1-modules==0.3.0 -pyathena==2.4.1 -pycountry==22.3.5 +pyathena==2.25.2 +pycountry==23.12.11 pycparser==2.21 pycryptodome==3.19.0 -pycryptodomex==3.19.0 pydantic==1.10.13 +pydantic_core==2.14.6 pydash==7.0.6 -pydruid==0.6.5 -Pygments==2.16.1 +pydruid==0.6.6 +Pygments==2.17.2 pyiceberg==0.4.0 -pymongo==4.5.0 +pymongo==4.6.1 PyMySQL==1.1.0 -pyOpenSSL==23.2.0 +pyOpenSSL==23.3.0 pyparsing==3.0.9 pyspnego==0.10.2 python-daemon==3.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 python-jose==3.3.0 -python-ldap==3.4.3 +python-ldap==3.4.4 python-nvd3==0.15.0 python-slugify==8.0.1 python-stdnum==1.19 -python-tds==1.13.0 +python-tds==1.14.0 python-utils==3.8.1 python3-openid==3.2.0 pytz==2023.3.post1 pytzdata==2020.1 PyYAML==6.0.1 pyzmq==24.0.1 -ratelimiter==1.2.0.post0 redash-toolbelt==0.1.9 -redshift-connector==2.0.914 -referencing==0.30.2 -regex==2023.10.3 +redshift-connector==2.0.918 +referencing==0.32.0 +regex==2023.12.25 requests==2.31.0 requests-file==1.5.1 requests-gssapi==1.2.3 requests-ntlm==1.2.0 requests-toolbelt==0.10.1 -responses==0.23.3 +responses==0.24.1 rfc3339-validator==0.1.4 rfc3986==2.0.0 -rich==13.6.0 -rich-argparse==1.3.0 -rpds-py==0.10.6 +rich==13.7.0 +rich-argparse==1.4.0 +rpds-py==0.15.2 rsa==4.9 ruamel.yaml==0.17.17 ruamel.yaml.clib==0.2.8 -s3transfer==0.7.0 -schwifty==2023.9.0 -scipy==1.11.3 +s3transfer==0.10.0 +schwifty==2023.11.2 +scipy==1.11.4 scramp==1.4.4 Send2Trash==1.8.2 -sentry-sdk==1.32.0 +sentry-sdk==1.39.1 setproctitle==1.3.3 simple-salesforce==1.12.5 six==1.16.0 smart-open==6.4.0 smmap==5.0.1 sniffio==1.3.0 -snowflake-connector-python==3.2.1 -snowflake-sqlalchemy==1.5.0 +snowflake-connector-python==3.6.0 +snowflake-sqlalchemy==1.5.1 sortedcontainers==2.4.0 soupsieve==2.5 spacy==3.4.3 @@ -328,67 +323,71 @@ spacy-legacy==3.0.12 spacy-loggers==1.0.5 sql-metadata==2.2.2 SQLAlchemy==1.4.44 -sqlalchemy-bigquery==1.8.0 -SQLAlchemy-JSONField==1.0.1.post0 +sqlalchemy-bigquery==1.9.0 +sqlalchemy-hana==1.1.1 +SQLAlchemy-JSONField==1.0.2 sqlalchemy-pytds==0.3.5 sqlalchemy-redshift==0.8.14 SQLAlchemy-Utils==0.41.1 -sqlalchemy2-stubs==0.0.2a35 +sqlalchemy2-stubs==0.0.2a37 sqllineage==1.3.8 sqlparse==0.4.4 srsly==2.4.8 stack-data==0.6.3 -starlette==0.27.0 +starlette==0.32.0.post1 strictyaml==1.7.3 tableauserverclient==0.25 tableschema==1.20.2 tabulate==0.9.0 tabulator==1.53.5 tenacity==8.2.3 -termcolor==2.3.0 -terminado==0.17.1 +teradatasql==20.0.0.2 +teradatasqlalchemy==17.20.0.0 +termcolor==2.4.0 +terminado==0.18.0 text-unidecode==1.3 thinc==8.1.12 -thrift==0.13.0 +thrift==0.16.0 thrift-sasl==0.4.3 tinycss2==1.2.1 toml==0.10.2 tomli==2.0.1 -tomlkit==0.12.1 +tomlkit==0.12.3 toolz==0.12.0 -tornado==6.3.3 +tornado==6.4 tqdm==4.66.1 traitlets==5.2.1.post0 trino==0.327.0 typeguard==2.13.3 typer==0.7.0 -types-PyYAML==6.0.12.12 typing-inspect==0.9.0 -typing_extensions==4.8.0 -tzlocal==5.1 +typing_extensions==4.9.0 +tzlocal==5.2 uc-micro-py==1.0.2 -ujson==5.8.0 +ujson==5.9.0 unicodecsv==0.14.1 -urllib3==1.26.17 -uvicorn==0.23.2 -uvloop==0.17.0 -vertica-python==1.3.5 -vertica-sqlalchemy-dialect==0.0.8 +universal-pathlib==0.1.4 +urllib3==1.26.18 +uvicorn==0.25.0 +uvloop==0.19.0 +vertica-python==1.3.8 
+vertica-sqlalchemy-dialect==0.0.8.1 vininfo==1.7.0 volatile==2.1.0 wasabi==0.10.1 -watchfiles==0.20.0 +watchfiles==0.21.0 wcmatch==8.5 -wcwidth==0.2.8 +wcwidth==0.2.12 webencodings==0.5.1 -websocket-client==1.6.4 -websockets==11.0.3 +websocket-client==1.7.0 +websockets==12.0 Werkzeug==2.2.3 widgetsnbextension==4.0.9 -wrapt==1.15.0 -WTForms==3.1.0 +wrapt==1.16.0 +WTForms==3.0.1 xlrd==2.0.1 xmltodict==0.13.0 -yarl==1.9.2 +yarl==1.9.4 zeep==4.2.1 -zstd==1.5.5.1 \ No newline at end of file +zipp==3.17.0 +zstd==1.5.5.1 diff --git a/docker/datahub-ingestion-base/regenerate-base-requirements.sh b/docker/datahub-ingestion-base/regenerate-base-requirements.sh new file mode 100755 index 0000000000000..6fb331afa484a --- /dev/null +++ b/docker/datahub-ingestion-base/regenerate-base-requirements.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# This script is used to regenerate the base-requirements.txt file + +set -euxo pipefail +cd "$( dirname "${BASH_SOURCE[0]}" )" + +SCRIPT_NAME=$(basename "$0") +DATAHUB_DIR=$(pwd)/../.. + +# Create a virtualenv. +VENV_DIR=$(mktemp -d) +python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9 or higher is required.'" +python -m venv $VENV_DIR +source $VENV_DIR/bin/activate +pip install --upgrade pip setuptools wheel +echo "Using virtualenv at $VENV_DIR" + +# Install stuff. +pushd $DATAHUB_DIR/metadata-ingestion +pip install -e . +pip install -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]' +pip install -e '.[all]' +popd + +# Generate the requirements file. +# Removing Flask deps due as per https://github.com/datahub-project/datahub/pull/6867/files +# Removing py4j and PyJWT due to https://github.com/datahub-project/datahub/pull/6868/files +# Removing pyspark and pydeequ because we don't want them in the slim image, so they can be added separately. +# TODO: It's unclear if these removals are still actually needed. +echo "# Generated requirements file. Run ./$SCRIPT_NAME to regenerate." > base-requirements.txt +pip freeze \ + | grep -v -E "^-e" \ + | grep -v "Flask-" \ + | grep -v -E "(py4j|PyJWT)==" \ + | grep -v -E "(pyspark|pydeequ)==" \ + >> base-requirements.txt From 4efa46f8c91dfdedc21b7081143d196c7a0be0da Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Thu, 28 Dec 2023 15:05:14 +0530 Subject: [PATCH 12/48] test(cypress/users): add automatic reset password test (#9515) --- .../src/app/identity/user/UserListItem.tsx | 17 ++- .../app/identity/user/ViewResetTokenModal.tsx | 7 +- .../cypress/e2e/mutations/add_users.js | 135 +++++++++++++----- 3 files changed, 114 insertions(+), 45 deletions(-) diff --git a/datahub-web-react/src/app/identity/user/UserListItem.tsx b/datahub-web-react/src/app/identity/user/UserListItem.tsx index 69b8a6c2d1355..8ad3d7d93d657 100644 --- a/datahub-web-react/src/app/identity/user/UserListItem.tsx +++ b/datahub-web-react/src/app/identity/user/UserListItem.tsx @@ -98,8 +98,8 @@ export default function UserListItem({ user, canManageUserCredentials, selectRol
{displayName}
-
- {user.username} +
+ {user.username}
{userStatus && ( @@ -121,8 +121,12 @@ export default function UserListItem({ user, canManageUserCredentials, selectRol trigger={['click']} overlay={ - setIsViewingResetToken(true)}> -   Reset user password + setIsViewingResetToken(true)} + data-testid="reset-menu-item" + > +   Reset user password  Delete @@ -130,7 +134,10 @@ export default function UserListItem({ user, canManageUserCredentials, selectRol } > - + Generate a new reset link! Note, any old links will cease to be active. - + diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js b/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js index e19c6065d4274..ba225ba37884b 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/add_users.js @@ -1,47 +1,104 @@ const tryToSignUp = () => { - let number = Math.floor(Math.random() * 100000); - let name = `Example Name ${number}`; - cy.enterTextInTestId("email", `example${number}@example.com`); - cy.enterTextInTestId("name", name); - cy.enterTextInTestId("password", "Example password"); - cy.enterTextInTestId("confirmPassword", "Example password"); - - cy.mouseover("#title").click(); - cy.waitTextVisible("Other").click(); - - cy.get("[type=submit]").click(); - return name; + let number = Math.floor(Math.random() * 100000); + let name = `Example Name ${number}`; + let email = `example${number}@example.com`; + cy.enterTextInTestId("email", email); + cy.enterTextInTestId("name", name); + cy.enterTextInTestId("password", "Example password"); + cy.enterTextInTestId("confirmPassword", "Example password"); + + cy.mouseover("#title").click(); + cy.waitTextVisible("Other").click(); + + cy.get("[type=submit]").click(); + return { name, email }; }; describe("add_user", () => { - it("go to user link and invite a user", () => { - cy.login(); + let registeredEmail = ""; + it("go to user link and invite a user", () => { + cy.login(); + + cy.visit("/settings/identities/users"); + cy.waitTextVisible("Invite Users"); + cy.clickOptionWithText("Invite Users"); + + cy.waitTextVisible(/signup\?invite_token=\w{32}/) + .then(($elem) => { + const inviteLink = $elem.text(); + cy.log(inviteLink); cy.visit("/settings/identities/users"); - cy.waitTextVisible("Invite Users"); - - cy.clickOptionWithText("Invite Users"); - - cy.waitTextVisible(/signup\?invite_token=\w{32}/).then(($elem) => { - const inviteLink = $elem.text(); - cy.log(inviteLink); - cy.visit("/settings/identities/users"); - cy.logout(); - cy.visit(inviteLink); - let name = tryToSignUp(); - cy.waitTextVisible("Welcome to DataHub"); - cy.hideOnboardingTour(); - cy.waitTextVisible(name); - }).then(() => { - cy.logout(); - cy.visit("/signup?invite_token=bad_token"); - tryToSignUp(); - cy.waitTextVisible("Failed to log in! An unexpected error occurred."); - }); + cy.logout(); + cy.visit(inviteLink); + const { name, email } = tryToSignUp(); + registeredEmail = email; + cy.waitTextVisible("Welcome to DataHub"); + cy.hideOnboardingTour(); + cy.waitTextVisible(name); + }) + .then(() => { + cy.logout(); + cy.visit("/signup?invite_token=bad_token"); + tryToSignUp(); + cy.waitTextVisible("Failed to log in! 
An unexpected error occurred."); + }); + }); + + it("Verify you can’t generate a reset password link for a non-native user", () => { + cy.login(); + cy.visit("/settings/identities/users"); + cy.waitTextVisible("Invite Users"); + cy.get("[data-testid=userItem-non-native]").first().click(); + cy.get('[data-testid="reset-menu-item"]').should( + "have.attr", + "aria-disabled", + "true" + ); + }); + + it("Generate a reset password link for a native user", () => { + cy.login(); + cy.visit("/settings/identities/users"); + cy.waitTextVisible("Invite Users"); + cy.get(`[data-testid="email-native"]`) + .contains(registeredEmail) + .should("exist") + .parents(".ant-list-item") + .find('[data-testid="userItem-native"]') + .should("be.visible") + .click(); + + cy.get("[data-testid=resetButton]").first().click(); + cy.get("[data-testid=refreshButton]").click(); + cy.waitTextVisible("Generated new link to reset credentials"); + + cy.window().then((win) => { + cy.stub(win, "prompt"); }); -}); + cy.get(".ant-typography-copy").should("be.visible").click(); + cy.get(".ant-modal-close").should("be.visible").click(); -// Verify you can’t generate a reset password link for a non-native user (root, for example) -// Generate a reset password link for a native user -// Log out, then verify that using a bad reset token in the URL doesn’t allow you to reset password -// Use the correct reset link to reset native user credentials \ No newline at end of file + cy.waitTextVisible(/reset\?reset_token=\w{32}/) + .then(($elem) => { + const inviteLink = $elem.text(); + cy.logout(); + cy.visit(inviteLink); + cy.enterTextInTestId("email", registeredEmail); + cy.enterTextInTestId("password", "Example Reset Password"); + cy.enterTextInTestId("confirmPassword", "Example Reset Password"); + cy.get("[type=submit]").click(); + cy.waitTextVisible("Welcome back"); + cy.hideOnboardingTour(); + }) + .then(() => { + cy.logout(); + cy.visit("/reset?reset_token=bad_token"); + cy.enterTextInTestId("email", registeredEmail); + cy.enterTextInTestId("password", "Example Reset Password"); + cy.enterTextInTestId("confirmPassword", "Example Reset Password"); + cy.get("[type=submit]").click(); + cy.waitTextVisible("Failed to log in!"); + }); + }); +}); From 3635c1c2213cfb8421d89b7cc106ab236d72c7ec Mon Sep 17 00:00:00 2001 From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com> Date: Thu, 28 Dec 2023 15:24:26 +0530 Subject: [PATCH 13/48] feat(ingestion/bigquery): Use sqlglot_lineage for usage and add more perf timers (#9247) Co-authored-by: Andrew Sikowitz --- metadata-ingestion/setup.py | 2 - .../ingestion/source/bigquery_v2/bigquery.py | 22 +- .../source/bigquery_v2/bigquery_audit.py | 16 +- .../source/bigquery_v2/bigquery_config.py | 5 + .../source/bigquery_v2/bigquery_report.py | 12 +- .../ingestion/source/bigquery_v2/usage.py | 86 ++--- .../datahub/utilities/bigquery_sql_parser.py | 92 ----- .../src/datahub/utilities/sqlglot_lineage.py | 8 +- .../bigquery/test_bigquery_usage.py | 8 +- .../tests/unit/test_bigquery_sql_lineage.py | 66 +++- .../tests/unit/test_bigquery_sql_parser.py | 327 ------------------ .../tests/unit/test_bigquery_usage.py | 14 +- .../unit/test_bigqueryv2_usage_source.py | 6 +- 13 files changed, 159 insertions(+), 505 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py delete mode 100644 metadata-ingestion/tests/unit/test_bigquery_sql_parser.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 32d49ffc73fa3..8e4791e253c7c 100644 --- 
a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -295,8 +295,6 @@ "bigquery": sql_common | bigquery_common | { - # TODO: I doubt we need all three sql parsing libraries. - *sqllineage_lib, *sqlglot_lib, "sqlalchemy-bigquery>=1.4.1", "google-cloud-datacatalog-lineage==0.2.2", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 9813945683289..3704eae96aece 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -221,6 +221,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.bigquery_data_dictionary = BigQuerySchemaApi( self.report.schema_api_perf, self.config.get_bigquery_client() ) + self.sql_parser_schema_resolver = self._init_schema_resolver() redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler @@ -253,6 +254,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.usage_extractor = BigQueryUsageExtractor( config, self.report, + schema_resolver=self.sql_parser_schema_resolver, dataset_urn_builder=self.gen_dataset_urn_from_ref, redundant_run_skip_handler=redundant_usage_run_skip_handler, ) @@ -283,8 +285,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): # Maps view ref -> actual sql self.view_definitions: FileBackedDict[str] = FileBackedDict() - self.sql_parser_schema_resolver = self._init_schema_resolver() - self.add_config_to_report() atexit.register(cleanup, config) @@ -371,7 +371,10 @@ def usage_capability_test( report: BigQueryV2Report, ) -> CapabilityReport: usage_extractor = BigQueryUsageExtractor( - connection_conf, report, lambda ref: "" + connection_conf, + report, + schema_resolver=SchemaResolver(platform="bigquery"), + dataset_urn_builder=lambda ref: "", ) for project_id in project_ids: try: @@ -447,7 +450,9 @@ def _init_schema_resolver(self) -> SchemaResolver: self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser ) schema_ingestion_enabled = ( - self.config.include_views and self.config.include_tables + self.config.include_schema_metadata + and self.config.include_tables + and self.config.include_views ) if schema_resolution_required and not schema_ingestion_enabled: @@ -545,10 +550,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if not projects: return - for project_id in projects: - self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION) - logger.info(f"Processing project: {project_id.id}") - yield from self._process_project(project_id) + if self.config.include_schema_metadata: + for project_id in projects: + self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION) + logger.info(f"Processing project: {project_id.id}") + yield from self._process_project(project_id) if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index 55366d6c57cf8..8cef10ca23448 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -12,6 +12,7 @@ get_first_missing_key, get_first_missing_key_any, ) +from datahub.utilities.urns.dataset_urn import DatasetUrn AuditLogEntry = Any @@ 
-178,6 +179,17 @@ def from_string_name(cls, ref: str) -> "BigQueryTableRef": raise ValueError(f"invalid BigQuery table reference: {ref}") return cls(BigqueryTableIdentifier(parts[1], parts[3], parts[5])) + @classmethod + def from_urn(cls, urn: str) -> "BigQueryTableRef": + """Raises: ValueError if urn is not a valid BigQuery table URN.""" + dataset_urn = DatasetUrn.create_from_string(urn) + split = dataset_urn.get_dataset_name().rsplit(".", 3) + if len(split) == 3: + project, dataset, table = split + else: + _, project, dataset, table = split + return cls(BigqueryTableIdentifier(project, dataset, table)) + def is_temporary_table(self, prefixes: List[str]) -> bool: for prefix in prefixes: if self.table_identifier.dataset.startswith(prefix): @@ -566,7 +578,7 @@ def from_query_event( query_event: QueryEvent, debug_include_full_payloads: bool = False, ) -> "ReadEvent": - readEvent = ReadEvent( + return ReadEvent( actor_email=query_event.actor_email, timestamp=query_event.timestamp, resource=read_resource, @@ -577,8 +589,6 @@ def from_query_event( from_query=True, ) - return readEvent - @classmethod def from_exported_bigquery_audit_metadata( cls, row: BigQueryAuditMetadata, debug_include_full_payloads: bool = False diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index c13b08a6d9656..58f2a600c2ff7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -94,6 +94,11 @@ class BigQueryV2Config( description="Regex patterns for project_id to filter in ingestion.", ) + include_schema_metadata: bool = Field( + default=True, + description="Whether to ingest the BigQuery schema, i.e. 
projects, schemas, tables, and views.", + ) + usage: BigQueryUsageConfig = Field( default=BigQueryUsageConfig(), description="Usage related configs" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 9d92b011ee285..69913b383af87 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -33,6 +33,13 @@ class BigQueryAuditLogApiPerfReport(Report): list_log_entries: PerfTimer = field(default_factory=PerfTimer) +@dataclass +class BigQueryProcessingPerfReport(Report): + sql_parsing_sec: PerfTimer = field(default_factory=PerfTimer) + store_usage_event_sec: PerfTimer = field(default_factory=PerfTimer) + usage_state_size: Optional[str] = None + + @dataclass class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): num_total_lineage_entries: TopKDict[str, int] = field(default_factory=TopKDict) @@ -120,8 +127,6 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR read_reasons_stat: Counter[str] = field(default_factory=collections.Counter) operation_types_stat: Counter[str] = field(default_factory=collections.Counter) - usage_state_size: Optional[str] = None - exclude_empty_projects: Optional[bool] = None schema_api_perf: BigQuerySchemaApiPerfReport = field( @@ -130,6 +135,9 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR audit_log_api_perf: BigQueryAuditLogApiPerfReport = field( default_factory=BigQueryAuditLogApiPerfReport ) + processing_perf: BigQueryProcessingPerfReport = field( + default_factory=BigQueryProcessingPerfReport + ) lineage_start_time: Optional[datetime] = None lineage_end_time: Optional[datetime] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 65b559550ffc5..ccc64184f3346 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -35,7 +35,6 @@ AuditEvent, AuditLogEntry, BigQueryAuditMetadata, - BigqueryTableIdentifier, BigQueryTableRef, QueryEvent, ReadEvent, @@ -60,9 +59,9 @@ USAGE_EXTRACTION_USAGE_AGGREGATION, ) from datahub.metadata.schema_classes import OperationClass, OperationTypeClass -from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage logger: logging.Logger = logging.getLogger(__name__) @@ -284,7 +283,7 @@ def delete_original_read_events_for_view_query_events(self) -> None: ) def report_disk_usage(self, report: BigQueryV2Report) -> None: - report.usage_state_size = str( + report.processing_perf.usage_state_size = str( { "main": humanfriendly.format_size(os.path.getsize(self.conn.filename)), "queries": humanfriendly.format_size( @@ -310,11 +309,14 @@ def __init__( self, config: BigQueryV2Config, report: BigQueryV2Report, + *, + schema_resolver: SchemaResolver, dataset_urn_builder: Callable[[BigQueryTableRef], str], redundant_run_skip_handler: Optional[RedundantUsageRunSkipHandler] = None, ): self.config: BigQueryV2Config = config self.report: BigQueryV2Report = report + 
self.schema_resolver = schema_resolver self.dataset_urn_builder = dataset_urn_builder # Replace hash of query with uuid if there are hash conflicts self.uuid_to_query: Dict[str, str] = {} @@ -415,10 +417,11 @@ def generate_read_events_from_query( ) -> Iterable[AuditEvent]: try: tables = self.get_tables_from_query( - query_event_on_view.project_id, query_event_on_view.query, + default_project=query_event_on_view.project_id, + default_dataset=query_event_on_view.default_dataset, ) - assert tables is not None and len(tables) != 0 + assert len(tables) != 0 for table in tables: yield AuditEvent.create( ReadEvent.from_query_event(table, query_event_on_view) @@ -462,12 +465,15 @@ def _ingest_events( self.report.num_view_query_events += 1 for new_event in self.generate_read_events_from_query(query_event): - num_generated += self._store_usage_event( - new_event, usage_state, table_refs - ) - num_aggregated += self._store_usage_event( - audit_event, usage_state, table_refs - ) + with self.report.processing_perf.store_usage_event_sec: + num_generated += self._store_usage_event( + new_event, usage_state, table_refs + ) + with self.report.processing_perf.store_usage_event_sec: + num_aggregated += self._store_usage_event( + audit_event, usage_state, table_refs + ) + except Exception as e: logger.warning( f"Unable to store usage event {audit_event}", exc_info=True @@ -905,54 +911,38 @@ def _generate_filter(self, corrected_start_time, corrected_end_time): ) def get_tables_from_query( - self, default_project: str, query: str - ) -> Optional[List[BigQueryTableRef]]: + self, query: str, default_project: str, default_dataset: Optional[str] = None + ) -> List[BigQueryTableRef]: """ This method attempts to parse bigquery objects read in the query """ if not query: - return None + return [] - parsed_tables = set() try: - parser = BigQuerySQLParser( - query, - self.config.sql_parser_use_external_process, - use_raw_names=self.config.lineage_sql_parser_use_raw_names, - ) - tables = parser.get_tables() - except Exception as ex: + with self.report.processing_perf.sql_parsing_sec: + result = sqlglot_lineage( + query, + self.schema_resolver, + default_db=default_project, + default_schema=default_dataset, + ) + except Exception: logger.debug( - f"Sql parsing failed on this query on view: {query}. " - f"Usage won't be added. The error was {ex}." + f"Sql parsing failed on this query on view: {query}. Usage won't be added." 
) - return None + logger.debug(result.debug_info) + return [] - for table in tables: - parts = table.split(".") - if len(parts) == 2: - parsed_tables.add( - BigQueryTableRef( - BigqueryTableIdentifier( - project_id=default_project, dataset=parts[0], table=parts[1] - ) - ).get_sanitized_table_ref() - ) - elif len(parts) == 3: - parsed_tables.add( - BigQueryTableRef( - BigqueryTableIdentifier( - project_id=parts[0], dataset=parts[1], table=parts[2] - ) - ).get_sanitized_table_ref() - ) - else: - logger.debug( - f"Invalid table identifier {table} when parsing query on view {query}" - ) + parsed_table_refs = [] + for urn in result.in_tables: + try: + parsed_table_refs.append(BigQueryTableRef.from_urn(urn)) + except ValueError: + logger.debug(f"Invalid urn {urn} when parsing query on view {query}") self.report.num_view_query_events_failed_table_identification += 1 - return list(parsed_tables) + return parsed_table_refs def _report_error( self, label: str, e: Exception, group: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py b/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py deleted file mode 100644 index 4ad41f1fe23c9..0000000000000 --- a/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py +++ /dev/null @@ -1,92 +0,0 @@ -import re -from typing import List - -import sqlparse - -from datahub.utilities.sql_parser import SqlLineageSQLParser, SQLParser - - -class BigQuerySQLParser(SQLParser): - parser: SQLParser - - def __init__( - self, - sql_query: str, - use_external_process: bool = False, - use_raw_names: bool = False, - ) -> None: - super().__init__(sql_query) - - self._parsed_sql_query = self.parse_sql_query(sql_query) - self.parser = SqlLineageSQLParser( - self._parsed_sql_query, use_external_process, use_raw_names - ) - - def parse_sql_query(self, sql_query: str) -> str: - sql_query = BigQuerySQLParser._parse_bigquery_comment_sign(sql_query) - sql_query = BigQuerySQLParser._escape_keyword_from_as_field_name(sql_query) - sql_query = BigQuerySQLParser._escape_cte_name_after_keyword_with(sql_query) - - sql_query = sqlparse.format( - sql_query.strip(), - reindent_aligned=True, - strip_comments=True, - ) - - sql_query = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( - sql_query - ) - sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from(sql_query) - sql_query = BigQuerySQLParser._remove_comma_before_from(sql_query) - - return sql_query - - @staticmethod - def _parse_bigquery_comment_sign(sql_query: str) -> str: - return re.sub(r"#(.*)", r"-- \1", sql_query, flags=re.IGNORECASE) - - @staticmethod - def _escape_keyword_from_as_field_name(sql_query: str) -> str: - return re.sub(r"(\w*\.from)", r"`\1`", sql_query, flags=re.IGNORECASE) - - @staticmethod - def _escape_cte_name_after_keyword_with(sql_query: str) -> str: - """ - Escape the first cte name in case it is one of reserved words - """ - return re.sub(r"(with\s)([^`\s()]+)", r"\1`\2`", sql_query, flags=re.IGNORECASE) - - @staticmethod - def _escape_table_or_view_name_at_create_statement(sql_query: str) -> str: - """ - Reason: in case table name contains hyphens which breaks sqllineage later on - """ - return re.sub( - r"(create.*\s)(table\s|view\s)([^`\s()]+)(?=\sas)", - r"\1\2`\3`", - sql_query, - flags=re.IGNORECASE, - ) - - @staticmethod - def _remove_comma_before_from(sql_query: str) -> str: - return re.sub(r",(\s*?)(?=from)", r" ", sql_query, flags=re.IGNORECASE) - - @staticmethod - def 
_escape_object_name_after_keyword_from(sql_query: str) -> str: - """ - Reason: in case table name contains hyphens which breaks sqllineage later on - Note: ignore cases of having keyword FROM as part of datetime function EXTRACT - """ - return re.sub( - r"(? List[str]: - return self.parser.get_tables() - - def get_columns(self) -> List[str]: - return self.parser.get_columns() diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index b43c8de4c8f3d..0f84871d6c96a 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -333,6 +333,9 @@ def _table_level_lineage( return tables, modified +TABLE_CASE_SENSITIVE_PLATFORMS = {"bigquery"} + + class SchemaResolver(Closeable): def __init__( self, @@ -402,7 +405,10 @@ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: if schema_info: return urn_lower, schema_info - return urn_lower, None + if self.platform in TABLE_CASE_SENSITIVE_PLATFORMS: + return urn, None + else: + return urn_lower, None def _resolve_schema_info(self, urn: str) -> Optional[SchemaInfo]: if urn in self._schema_cache: diff --git a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py index bbc3378450bff..9bbe9c45887a8 100644 --- a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py +++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py @@ -14,6 +14,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.utilities.perf_timer import PerfTimer +from datahub.utilities.sqlglot_lineage import SchemaResolver from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table from tests.performance.data_generation import ( NormalDistribution, @@ -47,7 +48,10 @@ def run_test(): usage_extractor = BigQueryUsageExtractor( config, report, - lambda ref: make_dataset_urn("bigquery", str(ref.table_identifier)), + schema_resolver=SchemaResolver(platform="bigquery"), + dataset_urn_builder=lambda ref: make_dataset_urn( + "bigquery", str(ref.table_identifier) + ), ) report.set_ingestion_stage("All", "Event Generation") @@ -83,7 +87,7 @@ def run_test(): print( f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" ) - print(f"Disk Used: {report.usage_state_size}") + print(f"Disk Used: {report.processing_perf.usage_state_size}") print(f"Hash collisions: {report.num_usage_query_hash_collisions}") diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py index f807be747a193..755e9081dda39 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py @@ -1,4 +1,35 @@ -from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser +from typing import List + +from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigQueryTableRef +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage + + +class BigQuerySQLParser: + def __init__(self, sql_query: str, schema_resolver: SchemaResolver) -> None: + self.result = sqlglot_lineage(sql_query, schema_resolver) + + def get_tables(self) -> List[str]: + ans = [] + for urn in self.result.in_tables: + table_ref 
= BigQueryTableRef.from_urn(urn) + ans.append(str(table_ref.table_identifier)) + return ans + + def get_columns(self) -> List[str]: + ans = [] + for col_info in self.result.column_lineage or []: + for col_ref in col_info.upstreams: + ans.append(col_ref.column) + return ans + + +def test_bigquery_sql_lineage_basic(): + parser = BigQuerySQLParser( + sql_query="""SELECT * FROM project_1.database_1.view_1""", + schema_resolver=SchemaResolver(platform="bigquery"), + ) + + assert parser.get_tables() == ["project_1.database_1.view_1"] def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted(): @@ -14,7 +45,8 @@ def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted(): -- this comment will not break sqllineage either # this comment will not break sqllineage either FROM `project.dataset.src_tbl` - """ + """, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.dataset.src_tbl"] @@ -39,7 +71,7 @@ def test_bigquery_sql_lineage_camel_case_table(): # this comment will not break sqllineage either FROM `project.dataset.CamelCaseTable` """, - use_raw_names=True, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.dataset.CamelCaseTable"] @@ -64,7 +96,7 @@ def test_bigquery_sql_lineage_camel_case_dataset(): # this comment will not break sqllineage either FROM `project.DataSet.table` """, - use_raw_names=True, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.DataSet.table"] @@ -89,7 +121,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset(): # this comment will not break sqllineage either FROM `project.DataSet.CamelTable` """, - use_raw_names=True, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.DataSet.CamelTable"] @@ -117,7 +149,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset_subquery(): SELECT * FROM `project.DataSet.CamelTable` ) """, - use_raw_names=True, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.DataSet.CamelTable"] @@ -146,7 +178,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins(): LEFT JOIN `project.DataSet3.CamelTable3` on c.id = b.id """, - use_raw_names=True, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == [ @@ -179,7 +211,7 @@ def test_bigquery_sql_lineage_camel_case_table_and_dataset_joins_and_subquery(): LEFT JOIN (SELECT * FROM `project.DataSet3.CamelTable3`) c ON c.id = b.id """, - use_raw_names=True, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == [ @@ -199,7 +231,8 @@ def test_bigquery_sql_lineage_keyword_data_is_accepted(): FROM `project.example_dataset.example_table` ) SELECT * FROM data - """ + """, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.example_dataset.example_table"] @@ -213,7 +246,8 @@ def test_bigquery_sql_lineage_keyword_admin_is_accepted(): FROM `project.example_dataset.example_table` ) SELECT * FROM admin - """ + """, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.example_dataset.example_table"] @@ -238,7 +272,8 @@ def test_bigquery_sql_lineage_cte_alias_as_keyword_is_accepted(): ) SELECT * FROM map - """ + """, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == [ @@ -255,7 +290,8 @@ def test_bigquery_sql_lineage_create_or_replace_view_name_with_hyphens_is_accept 
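# A minimal, illustrative sketch of the sqlglot-based resolution these tests
# now exercise. It uses only names introduced in this diff; the query string
# below is a made-up example, not one of the test fixtures.
from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage

result = sqlglot_lineage(
    "SELECT * FROM `project.dataset.src_table`",
    SchemaResolver(platform="bigquery"),
)
# result.in_tables holds dataset URNs; BigQueryTableRef.from_urn converts each
# URN back into a table identifier such as "project.dataset.src_table".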
FROM project.dataset.src_table_a UNION SELECT * FROM `project.dataset.src_table_b` - """ + """, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == [ @@ -270,7 +306,8 @@ def test_bigquery_sql_lineage_source_table_name_with_hyphens_is_accepted(): CREATE OR REPLACE VIEW `project.dataset.test_view` AS SELECT * FROM test-project.dataset.src_table - """ + """, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["test-project.dataset.src_table"] @@ -282,7 +319,8 @@ def test_bigquery_sql_lineage_from_as_column_name_is_accepted(): CREATE OR REPLACE VIEW `project.dataset.test_view` AS SELECT x.from AS col FROM project.dataset.src_table AS x - """ + """, + schema_resolver=SchemaResolver(platform="bigquery"), ) assert parser.get_tables() == ["project.dataset.src_table"] diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py b/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py deleted file mode 100644 index 2a73bfc5e8b68..0000000000000 --- a/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py +++ /dev/null @@ -1,327 +0,0 @@ -import pytest - -from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser - - -def test_bigquery_sql_parser_comments_are_removed(): - parser = BigQuerySQLParser( - sql_query=""" -/* -HERE IS A STANDARD COMMENT BLOCK -THIS WILL NOT BREAK sqllineage -*/ -CREATE OR REPLACE TABLE `project.dataset.test_view` AS -#This, comment will not break sqllineage -SELECT foo --- this comment will not break sqllineage either -# this comment will not break sqllineage either - FROM `project.dataset.src_table` -""" - ) - - assert ( - parser._parsed_sql_query - == """CREATE OR REPLACE TABLE `project.dataset.test_view` AS SELECT foo - FROM `project.dataset.src_table`""" - ) - - assert parser.get_tables() == ["project.dataset.src_table"] - - -def test_bigquery_sql_parser_formats_input_sql(): - parser = BigQuerySQLParser( - sql_query=""" -CREATE OR REPLACE TABLE `project.dataset.test_view` AS -SELECT foo FROM `project.dataset.src_table_a` AS a -INNER JOIN `project.dataset.src_table_b` AS b ON a.key_field = b.key_field -""" - ) - - assert ( - parser._parsed_sql_query - == """CREATE OR REPLACE TABLE `project.dataset.test_view` AS SELECT foo - FROM `project.dataset.src_table_a` AS a - INNER JOIN `project.dataset.src_table_b` AS b - ON a.key_field = b.key_field""" - ) - - assert parser.get_tables() == [ - "project.dataset.src_table_a", - "project.dataset.src_table_b", - ] - - -def test_remove_comma_before_from(): - assert ( - BigQuerySQLParser._remove_comma_before_from( - """ -select a, b,from `project.dataset.table_name_1` -""" - ) - == """ -select a, b from `project.dataset.table_name_1` -""" - ) - - assert ( - BigQuerySQLParser._remove_comma_before_from( - """ -select a, b from `project.dataset.table_name_1` -""" - ) - == """ -select a, b from `project.dataset.table_name_1` -""" - ) - - assert ( - BigQuerySQLParser._remove_comma_before_from( - """ -select - a, - b, -from `project.dataset.table_name_1` -""" - ) - == """ -select - a, - b from `project.dataset.table_name_1` -""" - ) - - -def test_bigquery_sql_parser_subquery(): - parser = BigQuerySQLParser( - sql_query=""" - create or replace table smoke_test_db.table_from_view_and_table - as (select b.date_utc, v.revenue from smoke_test_db.base_table b, smoke_test_db.view_from_table v - """ - ) - assert parser.get_tables() == [ - "smoke_test_db.base_table", - "smoke_test_db.view_from_table", - ] - - -def 
test_bigquery_sql_parser_comment_sign_switched_correctly(): - sql_query = BigQuerySQLParser._parse_bigquery_comment_sign( - """ -#upper comment -SELECT * FROM hello -# lower comment -""" - ) - - assert ( - sql_query - == """ --- upper comment -SELECT * FROM hello --- lower comment -""" - ) - - -def test_bigquery_sql_parser_keyword_from_is_escaped_if_used_as_fieldname(): - sql_query = BigQuerySQLParser._escape_keyword_from_as_field_name( - """ -SELECT hello.from AS col FROM hello -""" - ) - - assert ( - sql_query - == """ -SELECT `hello.from` AS col FROM hello -""" - ) - - -def test_bigquery_sql_parser_first_cte_name_is_escaped(): - sql_query = BigQuerySQLParser._escape_cte_name_after_keyword_with( - """ -CREATE OR REPLACE VIEW `test_view` AS -WITH cte_1 AS ( - SELECT * FROM foo -), -cte_2 AS ( - SELECT * FROM bar -) -SELECT * FROM cte_1 UNION ALL -SELECT * FROM cte_2 -""" - ) - - assert ( - sql_query - == """ -CREATE OR REPLACE VIEW `test_view` AS -WITH `cte_1` AS ( - SELECT * FROM foo -), -cte_2 AS ( - SELECT * FROM bar -) -SELECT * FROM cte_1 UNION ALL -SELECT * FROM cte_2 -""" - ) - - -def test_bigquery_sql_parser_table_name_is_escaped_at_create_statement(): - sql_query_create = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( - """ -CREATE TABLE project.dataset.test_table AS -col_1 STRING, -col_2 STRING -""" - ) - - sql_query_create_or_replace = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( - """ -CREATE OR REPLACE TABLE project.dataset.test_table AS -col_1 STRING, -col_2 STRING -""" - ) - - assert ( - sql_query_create - == """ -CREATE TABLE `project.dataset.test_table` AS -col_1 STRING, -col_2 STRING -""" - ) - assert ( - sql_query_create_or_replace - == """ -CREATE OR REPLACE TABLE `project.dataset.test_table` AS -col_1 STRING, -col_2 STRING -""" - ) - - -def test_bigquery_sql_parser_view_name_is_escaped_at_create_statement(): - sql_query_create = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( - """ -CREATE VIEW project.dataset.test_view AS -SELECT * FROM project.dataset.src_table -""" - ) - - sql_query_create_or_replace = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( - """ -CREATE OR REPLACE VIEW project.dataset.test_view AS -SELECT * FROM project.dataset.src_table -""" - ) - - assert ( - sql_query_create - == """ -CREATE VIEW `project.dataset.test_view` AS -SELECT * FROM project.dataset.src_table -""" - ) - assert ( - sql_query_create_or_replace - == """ -CREATE OR REPLACE VIEW `project.dataset.test_view` AS -SELECT * FROM project.dataset.src_table -""" - ) - - -def test_bigquery_sql_parser_object_name_is_escaped_after_keyword_from(): - sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from( - """ -CREATE OR REPLACE VIEW `project.dataset.test_view` AS -SELECT * FROM src-project.dataset.src_table_a UNION ALL -SELECT * FROM project.dataset.src_table_b -""" - ) - - assert ( - sql_query - == """ -CREATE OR REPLACE VIEW `project.dataset.test_view` AS -SELECT * FROM `src-project.dataset.src_table_a` UNION ALL -SELECT * FROM `project.dataset.src_table_b` -""" - ) - - -def test_bigquery_sql_parser_field_name_is_not_escaped_after_keyword_from_in_datetime_functions(): - sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from( - """ -CREATE OR REPLACE VIEW `project.dataset.test_view` AS -SELECT -EXTRACT(MICROSECOND FROM time_field) AS col_1, -EXTRACT(MILLISECOND FROM time_field) AS col_2, -EXTRACT(SECOND FROM time_field) AS col_3, -EXTRACT(MINUTE FROM time_field) AS col_4, -EXTRACT(HOUR FROM 
time_field) AS col_5, -EXTRACT(DAYOFWEEK FROM time_field) AS col_6, -EXTRACT(DAY FROM time_field) AS col_7, -EXTRACT(DAYOFYEAR FROM time_field) AS col_8, -EXTRACT(WEEK FROM time_field) AS col_9, -EXTRACT(WEEK FROM time_field) AS col_10, -EXTRACT(ISOWEEK FROM time_field) AS col_11, -EXTRACT(MONTH FROM time_field) AS col_12, -EXTRACT(QUARTER FROM time_field) AS col_13, -EXTRACT(YEAR FROM time_field) AS col_14, -EXTRACT(ISOYEAR FROM time_field) AS col_15, -EXTRACT(DATE FROM time_field) AS col_16, -EXTRACT(TIME FROM time_field) AS col_17 -FROM src-project.dataset.src_table_a -""" - ) - - assert ( - sql_query - == """ -CREATE OR REPLACE VIEW `project.dataset.test_view` AS -SELECT -EXTRACT(MICROSECOND FROM time_field) AS col_1, -EXTRACT(MILLISECOND FROM time_field) AS col_2, -EXTRACT(SECOND FROM time_field) AS col_3, -EXTRACT(MINUTE FROM time_field) AS col_4, -EXTRACT(HOUR FROM time_field) AS col_5, -EXTRACT(DAYOFWEEK FROM time_field) AS col_6, -EXTRACT(DAY FROM time_field) AS col_7, -EXTRACT(DAYOFYEAR FROM time_field) AS col_8, -EXTRACT(WEEK FROM time_field) AS col_9, -EXTRACT(WEEK FROM time_field) AS col_10, -EXTRACT(ISOWEEK FROM time_field) AS col_11, -EXTRACT(MONTH FROM time_field) AS col_12, -EXTRACT(QUARTER FROM time_field) AS col_13, -EXTRACT(YEAR FROM time_field) AS col_14, -EXTRACT(ISOYEAR FROM time_field) AS col_15, -EXTRACT(DATE FROM time_field) AS col_16, -EXTRACT(TIME FROM time_field) AS col_17 -FROM `src-project.dataset.src_table_a` -""" - ) - - -def test_bigquery_sql_parser_with_semicolon_in_from(): - sql_query = """CREATE VIEW `acryl-staging.smoke_test_db.view_from_table`\nAS select * from smoke_test_db.base_table;""" - - table_list = BigQuerySQLParser(sql_query).get_tables() - table_list.sort() - assert table_list == ["smoke_test_db.base_table"] - - -@pytest.mark.xfail -def test_bigquery_sql_parser_with_parenthesis_in_from(): - sql_query = """ - CREATE VIEW `acryl-staging.smoke_test_db.view_from_table` AS - select * from smoke_test_db.base_table LEFT JOIN UNNEST(my_array) ON day1 = day2; - """ - - table_list = BigQuerySQLParser(sql_query).get_tables() - table_list.sort() - assert table_list == ["smoke_test_db.base_table"] diff --git a/metadata-ingestion/tests/unit/test_bigquery_usage.py b/metadata-ingestion/tests/unit/test_bigquery_usage.py index c0055763bc15b..664d3112810ff 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_usage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_usage.py @@ -35,6 +35,7 @@ TimeWindowSizeClass, ) from datahub.testing.compare_metadata_json import diff_metadata_json +from datahub.utilities.sqlglot_lineage import SchemaResolver from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table from tests.performance.data_generation import generate_data, generate_queries from tests.performance.data_model import Container, FieldAccess, Query, Table, View @@ -202,7 +203,10 @@ def usage_extractor(config: BigQueryV2Config) -> BigQueryUsageExtractor: return BigQueryUsageExtractor( config, report, - lambda ref: make_dataset_urn("bigquery", str(ref.table_identifier)), + schema_resolver=SchemaResolver(platform="bigquery"), + dataset_urn_builder=lambda ref: make_dataset_urn( + "bigquery", str(ref.table_identifier) + ), ) @@ -961,21 +965,21 @@ def test_operational_stats( def test_get_tables_from_query(usage_extractor): assert usage_extractor.get_tables_from_query( - PROJECT_1, "SELECT * FROM project-1.database_1.view_1" + "SELECT * FROM project-1.database_1.view_1", default_project=PROJECT_1 ) == [ 
BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "view_1")) ] assert usage_extractor.get_tables_from_query( - PROJECT_1, "SELECT * FROM database_1.view_1" + "SELECT * FROM database_1.view_1", default_project=PROJECT_1 ) == [ BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "view_1")) ] assert sorted( usage_extractor.get_tables_from_query( - PROJECT_1, "SELECT v.id, v.name, v.total, t.name as name1 FROM database_1.view_1 as v inner join database_1.table_1 as t on v.id=t.id", + default_project=PROJECT_1, ) ) == [ BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "table_1")), @@ -984,8 +988,8 @@ def test_get_tables_from_query(usage_extractor): assert sorted( usage_extractor.get_tables_from_query( - PROJECT_1, "CREATE TABLE database_1.new_table AS SELECT v.id, v.name, v.total, t.name as name1 FROM database_1.view_1 as v inner join database_1.table_1 as t on v.id=t.id", + default_project=PROJECT_1, ) ) == [ BigQueryTableRef(BigqueryTableIdentifier("project-1", "database_1", "table_1")), diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 44fd840f28d59..25e849a509293 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -10,6 +10,7 @@ from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor +from datahub.utilities.sqlglot_lineage import SchemaResolver FROZEN_TIME = "2021-07-20 00:00:00" @@ -114,7 +115,10 @@ def test_bigqueryv2_filters(): corrected_start_time = config.start_time - config.max_query_duration corrected_end_time = config.end_time + config.max_query_duration filter: str = BigQueryUsageExtractor( - config, BigQueryV2Report(), lambda x: "" + config, + BigQueryV2Report(), + schema_resolver=SchemaResolver(platform="bigquery"), + dataset_urn_builder=lambda x: "", )._generate_filter(corrected_start_time, corrected_end_time) assert filter == expected_filter From 60347d6735ea2136d721bbf6644ae82df6519d9c Mon Sep 17 00:00:00 2001 From: Diego Reiriz Cores Date: Thu, 28 Dec 2023 12:09:10 +0100 Subject: [PATCH 14/48] fix(ingest/mongodb): support disabling schemaSamplingSize (#9295) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/mongodb.py | 8 +++++--- .../tests/integration/mongodb/test_mongodb.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index 2aa8b1d37d477..283ab652f23c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -102,7 +102,7 @@ class MongoDBConfig( ) schemaSamplingSize: Optional[PositiveInt] = Field( default=1000, - description="Number of documents to use when inferring schema size. If set to `0`, all documents will be scanned.", + description="Number of documents to use when inferring schema size. 
If set to `null`, all documents will be scanned.", ) useRandomSampling: bool = Field( default=True, @@ -225,13 +225,15 @@ def construct_schema_pymongo( ] if use_random_sampling: # get sample documents in collection - aggregations.append({"$sample": {"size": sample_size}}) + if sample_size: + aggregations.append({"$sample": {"size": sample_size}}) documents = collection.aggregate( aggregations, allowDiskUse=True, ) else: - aggregations.append({"$limit": sample_size}) + if sample_size: + aggregations.append({"$limit": sample_size}) documents = collection.aggregate(aggregations, allowDiskUse=True) return construct_schema(list(documents), delimiter) diff --git a/metadata-ingestion/tests/integration/mongodb/test_mongodb.py b/metadata-ingestion/tests/integration/mongodb/test_mongodb.py index 56fb471d4c9f1..0a0ba55ff5b80 100644 --- a/metadata-ingestion/tests/integration/mongodb/test_mongodb.py +++ b/metadata-ingestion/tests/integration/mongodb/test_mongodb.py @@ -26,6 +26,7 @@ def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time "password": "examplepass", "maxDocumentSize": 25000, "platform_instance": "instance", + "schemaSamplingSize": None, }, }, "sink": { From 2cd38a469d5ac607bd510a0ca045d151b4657afd Mon Sep 17 00:00:00 2001 From: Tony Ouyang Date: Thu, 28 Dec 2023 03:09:30 -0800 Subject: [PATCH 15/48] fix(ingest): Fix mongodb ingestion when platform_instance is missing from recipe (#9486) Co-authored-by: Harshal Sheth --- metadata-ingestion/src/datahub/ingestion/source/mongodb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index 283ab652f23c6..577da91ee82da 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -379,6 +379,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: platform_instance=self.config.platform_instance, ) + # Initialize data_platform_instance with a default value + data_platform_instance = None if self.config.platform_instance: data_platform_instance = DataPlatformInstanceClass( platform=make_data_platform_urn(platform), From e343b69ce4881ceefdf4af0cafea29188092de52 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Thu, 28 Dec 2023 16:50:13 +0530 Subject: [PATCH 16/48] fix(ingest/snowflake): explicitly set schema if public schema is absent (#9526) --- .../source/snowflake/snowflake_profiler.py | 14 ++++++++++++++ .../ingestion/source/snowflake/snowflake_query.py | 4 ++++ .../source/state/stateful_ingestion_base.py | 2 +- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 4bda7da422e9d..9a37f779bbcd5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -24,6 +24,8 @@ logger = logging.getLogger(__name__) +PUBLIC_SCHEMA = "PUBLIC" + class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin): def __init__( @@ -36,6 +38,7 @@ def __init__( self.config: SnowflakeV2Config = config self.report: SnowflakeV2Report = report self.logger = logger + self.database_default_schema: Dict[str, str] = dict() def get_workunits( self, database: SnowflakeDatabase, db_tables: Dict[str, 
List[SnowflakeTable]] @@ -47,6 +50,10 @@ def get_workunits( "max_overflow", self.config.profiling.max_workers ) + if PUBLIC_SCHEMA not in db_tables: + # If PUBLIC schema is absent, we use any one of schemas as default schema + self.database_default_schema[database.name] = list(db_tables.keys())[0] + profile_requests = [] for schema in database.schemas: for table in db_tables[schema.name]: @@ -136,9 +143,16 @@ def get_profiler_instance( ) def callable_for_db_connection(self, db_name: str) -> Callable: + schema_name = self.database_default_schema.get(db_name) + def get_db_connection(): conn = self.config.get_connection() conn.cursor().execute(SnowflakeQuery.use_database(db_name)) + + # As mentioned here - https://docs.snowflake.com/en/sql-reference/sql/use-database#usage-notes + # no schema is selected if PUBLIC schema is absent. We need to explicitly call `USE SCHEMA ` + if schema_name: + conn.cursor().execute(SnowflakeQuery.use_schema(schema_name)) return conn return get_db_connection diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 267f7cf074909..724e4392f1d61 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -80,6 +80,10 @@ def show_tags() -> str: def use_database(db_name: str) -> str: return f'use database "{db_name}"' + @staticmethod + def use_schema(schema_name: str) -> str: + return f'use schema "{schema_name}"' + @staticmethod def get_databases(db_name: Optional[str]) -> str: db_clause = f'"{db_name}".' if db_name is not None else "" diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index 8a448f40e95b4..61d39b18f523d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -98,7 +98,7 @@ class StatefulIngestionConfigBase(GenericModel, Generic[CustomConfig]): ) -class StatefulLineageConfigMixin: +class StatefulLineageConfigMixin(ConfigModel): enable_stateful_lineage_ingestion: bool = Field( default=True, description="Enable stateful lineage ingestion." From 4de2c24249697fa68831f880fda216ddb46fba3d Mon Sep 17 00:00:00 2001 From: Sumit Patil <91715217+sumitappt@users.noreply.github.com> Date: Thu, 28 Dec 2023 21:37:57 +0530 Subject: [PATCH 17/48] style(search): Border is too thick for sidebar (#9528) --- .../src/app/search/sidebar/BrowseSidebar.tsx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx index c16bcdcaf6c72..1731727c14cfc 100644 --- a/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx +++ b/datahub-web-react/src/app/search/sidebar/BrowseSidebar.tsx @@ -9,7 +9,6 @@ import useSidebarEntities from './useSidebarEntities'; import { ANTD_GRAY_V2 } from '../../entity/shared/constants'; import { ProfileSidebarResizer } from '../../entity/shared/containers/profile/sidebar/ProfileSidebarResizer'; - export const MAX_BROWSER_WIDTH = 500; export const MIN_BROWSWER_WIDTH = 200; @@ -18,7 +17,6 @@ export const SidebarWrapper = styled.div<{ visible: boolean; width: number }>` width: ${(props) => (props.visible ? 
`${props.width}px` : '0')}; min-width: ${(props) => (props.visible ? `${props.width}px` : '0')}; transition: width 250ms ease-in-out; - border-right: 1px solid ${(props) => props.theme.styles['border-color-base']}; background-color: ${ANTD_GRAY_V2[1]}; background: white; `; @@ -53,7 +51,12 @@ const BrowseSidebar = ({ visible }: Props) => { return ( <> - + Navigate From 5321352852a511bf92685290fc8a4371faaed876 Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Fri, 29 Dec 2023 12:53:58 +0530 Subject: [PATCH 18/48] style(ui): humanise duration shown on ingestion page (#9530) --- .../executions/IngestionExecutionTable.tsx | 8 +++---- .../src/app/shared/formatDuration.ts | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 datahub-web-react/src/app/shared/formatDuration.ts diff --git a/datahub-web-react/src/app/ingest/source/executions/IngestionExecutionTable.tsx b/datahub-web-react/src/app/ingest/source/executions/IngestionExecutionTable.tsx index 8c81cc36ae3f9..a9d9283ef1377 100644 --- a/datahub-web-react/src/app/ingest/source/executions/IngestionExecutionTable.tsx +++ b/datahub-web-react/src/app/ingest/source/executions/IngestionExecutionTable.tsx @@ -4,6 +4,7 @@ import { StyledTable } from '../../../entity/shared/components/styled/StyledTabl import { ExecutionRequest } from '../../../../types.generated'; import { ButtonsColumn, SourceColumn, StatusColumn, TimeColumn } from './IngestionExecutionTableColumns'; import { SUCCESS } from '../utils'; +import { formatDuration } from '../../../shared/formatDuration'; interface Props { executionRequests: ExecutionRequest[]; @@ -34,13 +35,10 @@ export default function IngestionExecutionTable({ render: TimeColumn, }, { - title: 'Duration (s)', + title: 'Duration', dataIndex: 'duration', key: 'duration', - render: (durationMs: number) => { - const seconds = (durationMs && `${durationMs / 1000}s`) || 'None'; - return seconds; - }, + render: (durationMs: number) => formatDuration(durationMs), }, { title: 'Status', diff --git a/datahub-web-react/src/app/shared/formatDuration.ts b/datahub-web-react/src/app/shared/formatDuration.ts new file mode 100644 index 0000000000000..1028b46f70b31 --- /dev/null +++ b/datahub-web-react/src/app/shared/formatDuration.ts @@ -0,0 +1,21 @@ +export const formatDuration = (durationMs: number): string => { + if (!durationMs) return 'None'; + + const seconds = durationMs / 1000; + + if (seconds < 60) { + return `${seconds.toFixed(1)} s`; + } + + const minutes = Math.floor(seconds / 60); + const remainingSeconds = Math.round(seconds % 60); + + if (minutes < 60) { + return `${minutes} min ${remainingSeconds} s`; + } + + const hours = Math.floor(minutes / 60); + const remainingMinutes = Math.round(minutes % 60); + + return `${hours} hr ${remainingMinutes} min`; +}; From 06bd9b988d3006d57350476ccec18b2a5e7aac37 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Fri, 29 Dec 2023 21:34:06 +0530 Subject: [PATCH 19/48] fix(cli): upsert for data product external url (#9534) --- metadata-ingestion/src/datahub/specific/dataproduct.py | 2 +- .../entities/dataproducts/golden_dataproduct_out_upsert.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/specific/dataproduct.py b/metadata-ingestion/src/datahub/specific/dataproduct.py index 301a0ff63f2f0..bb49ac47b3ef8 100644 --- a/metadata-ingestion/src/datahub/specific/dataproduct.py +++ b/metadata-ingestion/src/datahub/specific/dataproduct.py @@ -152,7 +152,7 
@@ def set_external_url(self, external_url: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, "replace", - path="/external_url", + path="/externalUrl", value=external_url, ) return self diff --git a/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json b/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json index ca4aafe848f60..97c2330f58bc7 100644 --- a/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json +++ b/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json @@ -5,7 +5,7 @@ "changeType": "PATCH", "aspectName": "dataProductProperties", "aspect": { - "value": "[{\"op\": \"replace\", \"path\": \"/name\", \"value\": \"Pet of the Week Campaign\"}, {\"op\": \"replace\", \"path\": \"/assets\", \"value\": [{\"destinationUrn\": \"urn:li:container:DATABASE\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": \"urn:li:container:SCHEMA\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": \"urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}]}, {\"op\": \"replace\", \"path\": \"/customProperties\", \"value\": {\"version\": \"2.0\", \"classification\": \"pii\"}}, {\"op\": \"replace\", \"path\": \"/external_url\", \"value\": \"https://github.com/datahub-project/datahub\"}]", + "value": "[{\"op\": \"replace\", \"path\": \"/name\", \"value\": \"Pet of the Week Campaign\"}, {\"op\": \"replace\", \"path\": \"/assets\", \"value\": [{\"destinationUrn\": \"urn:li:container:DATABASE\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": \"urn:li:container:SCHEMA\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": \"urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}]}, {\"op\": \"replace\", \"path\": \"/customProperties\", \"value\": {\"version\": \"2.0\", \"classification\": \"pii\"}}, {\"op\": \"replace\", \"path\": \"/externalUrl\", \"value\": \"https://github.com/datahub-project/datahub\"}]", "contentType": "application/json-patch+json" } }, From 31f9c796763677a4d452066d9b49b4088e65da19 Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Tue, 2 Jan 2024 13:22:22 +0530 Subject: [PATCH 20/48] fix posts are failing to be created as Admin user (#9533) --- datahub-web-react/src/app/settings/posts/utils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/settings/posts/utils.ts b/datahub-web-react/src/app/settings/posts/utils.ts index ce48c7400738c..9958a0e8d9f0e 100644 --- a/datahub-web-react/src/app/settings/posts/utils.ts +++ b/datahub-web-react/src/app/settings/posts/utils.ts @@ -16,7 +16,7 @@ export const addToListPostCache = (client, newPost, pageSize) => { }); // Add our new post into the existing list. 
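// Note on the one-line fix below: the cache update previously prepended the
// locally constructed `newPost` object to the cached list; writing the list
// back without it avoids rendering that partial object, which is the likely
// cause of post creation appearing to fail for the Admin user.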
- const newPosts = [newPost, ...(currData?.listPosts?.posts || [])]; + const newPosts = [...(currData?.listPosts?.posts || [])]; // Write our data back to the cache. client.writeQuery({ From 0bb838b904807c8fdc8266b6395023079b4dce4f Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Tue, 2 Jan 2024 21:45:55 +0530 Subject: [PATCH 21/48] fix(ui): while creating secrets via UI validate characters (#9548) --- datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx b/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx index 30f04d61b8fc9..c099d9a580efa 100644 --- a/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx +++ b/datahub-web-react/src/app/ingest/secret/SecretBuilderModal.tsx @@ -81,7 +81,7 @@ export const SecretBuilderModal = ({ initialState, visible, onSubmit, onCancel } }, { whitespace: false }, { min: 1, max: 50 }, - { pattern: /^[^\s\t${}\\,'"]+$/, message: 'This secret name is not allowed.' }, + { pattern: /^[a-zA-Z_]+[a-zA-Z0-9_]*$/, message: 'Please start the secret name with a letter, followed by letters, digits, or underscores only.' }, ]} hasFeedback > From 6d72640e9149343363885ec275d89fb48d9a9626 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Wed, 3 Jan 2024 09:47:58 +0530 Subject: [PATCH 22/48] feat(ui): add databricks logo (#9473) --- datahub-web-react/src/app/ingest/source/builder/constants.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index 08538729de40b..bd792d78856d5 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -103,6 +103,8 @@ export const CUSTOM = 'custom'; export const CUSTOM_URN = `urn:li:dataPlatform:${CUSTOM}`; export const UNITY_CATALOG = 'unity-catalog'; export const UNITY_CATALOG_URN = `urn:li:dataPlatform:${UNITY_CATALOG}`; +export const DATABRICKS = 'databricks'; +export const DATABRICKS_URN = `urn:li:dataPlatform:${DATABRICKS}`; export const DBT_CLOUD = 'dbt-cloud'; export const DBT_CLOUD_URN = `urn:li:dataPlatform:dbt`; export const VERTICA = 'vertica'; @@ -143,6 +145,7 @@ export const PLATFORM_URN_TO_LOGO = { [TRINO_URN]: trinoLogo, [SUPERSET_URN]: supersetLogo, [UNITY_CATALOG_URN]: databricksLogo, + [DATABRICKS_URN]: databricksLogo, [VERTICA_URN]: verticaLogo, [FIVETRAN_URN]: fivetranLogo, [CSV_URN]: csvLogo, From 29f2142a2c128f7f165f9011eff3bc647ae92185 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Wed, 3 Jan 2024 09:48:43 +0530 Subject: [PATCH 23/48] feat(databricks): add hive metastore analyze profiling (#9511) --- metadata-ingestion/setup.py | 4 +- .../ingestion/source/sql/sql_config.py | 8 +- .../datahub/ingestion/source/unity/config.py | 16 +- .../source/unity/hive_metastore_proxy.py | 109 ++++++++- .../datahub/ingestion/source/unity/proxy.py | 125 +++++++--- .../ingestion/source/unity/proxy_profiling.py | 50 ++-- .../ingestion/source/unity/proxy_types.py | 24 +- .../datahub/ingestion/source/unity/report.py | 6 + .../datahub/ingestion/source/unity/source.py | 27 ++- .../datahub/ingestion/source/unity/usage.py | 5 +- .../unity/test_unity_catalog_ingest.py | 104 +++++++- .../unity/unity_catalog_mces_golden.json 
| 228 ++++++++++++++++-- 12 files changed, 600 insertions(+), 106 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 8e4791e253c7c..10db019b51381 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -251,9 +251,7 @@ databricks = { # 0.1.11 appears to have authentication issues with azure databricks - # 0.16.0 added py.typed support which caused mypy to fail. The databricks sdk is pinned until we resolve mypy issues. - # https://github.com/databricks/databricks-sdk-py/pull/483 - "databricks-sdk>=0.9.0,<0.16.0", + "databricks-sdk>=0.9.0", "pyspark~=3.3.0", "requests", # Version 2.4.0 includes sqlalchemy dialect, 2.8.0 includes some bug fixes diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 54edab6f3b84b..c0dc70301ba34 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -112,7 +112,13 @@ def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] ) -> Dict[str, Any]: profiling: Optional[GEProfilingConfig] = values.get("profiling") - if profiling is not None and profiling.enabled: + # Note: isinstance() check is required here as unity-catalog source reuses + # SQLCommonConfig with different profiling config than GEProfilingConfig + if ( + profiling is not None + and isinstance(profiling, GEProfilingConfig) + and profiling.enabled + ): profiling._allow_deny_patterns = values["profile_pattern"] return values diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 96971faeea69f..df36153af9d83 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -95,14 +95,6 @@ class UnityCatalogAnalyzeProfilerConfig(UnityCatalogProfilerConfig): description="Number of worker threads to use for profiling. 
Set to 1 to disable.", ) - @pydantic.root_validator(skip_on_failure=True) - def warehouse_id_required_for_profiling( - cls, values: Dict[str, Any] - ) -> Dict[str, Any]: - if values.get("enabled") and not values.get("warehouse_id"): - raise ValueError("warehouse_id must be set when profiling is enabled.") - return values - @property def include_columns(self): return not self.profile_table_level_only @@ -254,6 +246,7 @@ class UnityCatalogSourceConfig( description="Generate usage statistics.", ) + # TODO: Remove `type:ignore` by refactoring config profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field( # type: ignore default=UnityCatalogGEProfilerConfig(), description="Data profiling configuration", @@ -316,7 +309,9 @@ def include_metastore_warning(cls, v: bool) -> bool: @pydantic.root_validator(skip_on_failure=True) def set_warehouse_id_from_profiling(cls, values: Dict[str, Any]) -> Dict[str, Any]: - profiling: Optional[UnityCatalogProfilerConfig] = values.get("profiling") + profiling: Optional[ + Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] + ] = values.get("profiling") if not values.get("warehouse_id") and profiling and profiling.warehouse_id: values["warehouse_id"] = profiling.warehouse_id if ( @@ -337,6 +332,9 @@ def set_warehouse_id_from_profiling(cls, values: Dict[str, Any]) -> Dict[str, An if values.get("warehouse_id") and profiling and not profiling.warehouse_id: profiling.warehouse_id = values["warehouse_id"] + if profiling and profiling.enabled and not profiling.warehouse_id: + raise ValueError("warehouse_id must be set when profiling is enabled.") + return values @pydantic.validator("schema_pattern", always=True) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py index 99b2ff998662c..814d86a2f3234 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/hive_metastore_proxy.py @@ -12,11 +12,14 @@ from datahub.ingestion.source.unity.proxy_types import ( Catalog, Column, + ColumnProfile, CustomCatalogType, HiveTableType, Metastore, Schema, Table, + TableProfile, + TableReference, ) logger = logging.getLogger(__name__) @@ -38,6 +41,18 @@ "binary": ColumnTypeName.BINARY, } +NUM_NULLS = "num_nulls" +DISTINCT_COUNT = "distinct_count" +MIN = "min" +MAX = "max" +AVG_COL_LEN = "avg_col_len" +MAX_COL_LEN = "max_col_len" +VERSION = "version" + +ROWS = "rows" +BYTES = "bytes" +TABLE_STAT_LIST = {ROWS, BYTES} + class HiveMetastoreProxy(Closeable): # TODO: Support for view lineage using SQL parsing @@ -67,7 +82,7 @@ def get_inspector(sqlalchemy_url: str, options: dict) -> Inspector: def hive_metastore_catalog(self, metastore: Optional[Metastore]) -> Catalog: return Catalog( - id=HIVE_METASTORE, + id=f"{metastore.id}.{HIVE_METASTORE}" if metastore else HIVE_METASTORE, name=HIVE_METASTORE, comment=None, metastore=metastore, @@ -95,9 +110,14 @@ def hive_metastore_tables(self, schema: Schema) -> Iterable[Table]: continue yield self._get_table(schema, table_name, False) - def _get_table(self, schema: Schema, table_name: str, is_view: bool) -> Table: + def _get_table( + self, + schema: Schema, + table_name: str, + is_view: bool = False, + ) -> Table: columns = self._get_columns(schema, table_name) - detailed_info = self._get_table_info(schema, table_name) + detailed_info = self._get_table_info(schema.name, table_name) comment = 
detailed_info.pop("Comment", None) storage_location = detailed_info.pop("Location", None) @@ -129,6 +149,74 @@ def _get_table(self, schema: Schema, table_name: str, is_view: bool) -> Table: comment=comment, ) + def get_table_profile( + self, ref: TableReference, include_column_stats: bool = False + ) -> TableProfile: + columns = self._get_columns( + Schema( + id=ref.schema, + name=ref.schema, + # This is okay, as none of this is used in profiling + catalog=self.hive_metastore_catalog(None), + comment=None, + owner=None, + ), + ref.table, + ) + detailed_info = self._get_table_info(ref.schema, ref.table) + + table_stats = ( + self._get_cached_table_statistics(detailed_info["Statistics"]) + if detailed_info.get("Statistics") + else {} + ) + + return TableProfile( + num_rows=int(table_stats[ROWS]) + if table_stats.get(ROWS) is not None + else None, + total_size=int(table_stats[BYTES]) + if table_stats.get(BYTES) is not None + else None, + num_columns=len(columns), + column_profiles=[ + self._get_column_profile(column.name, ref) for column in columns + ] + if include_column_stats + else [], + ) + + def _get_column_profile(self, column: str, ref: TableReference) -> ColumnProfile: + + props = self._column_describe_extended(ref.schema, ref.table, column) + col_stats = {} + for prop in props: + col_stats[prop[0]] = prop[1] + return ColumnProfile( + name=column, + null_count=int(col_stats[NUM_NULLS]) + if col_stats.get(NUM_NULLS) is not None + else None, + distinct_count=int(col_stats[DISTINCT_COUNT]) + if col_stats.get(DISTINCT_COUNT) is not None + else None, + min=col_stats.get(MIN), + max=col_stats.get(MAX), + avg_len=col_stats.get(AVG_COL_LEN), + max_len=col_stats.get(MAX_COL_LEN), + version=col_stats.get(VERSION), + ) + + def _get_cached_table_statistics(self, statistics: str) -> dict: + # statistics is in format "xx bytes" OR "1382 bytes, 2 rows" + table_stats = dict() + for prop in statistics.split(","): + value_key_list = prop.strip().split(" ") # value_key_list -> [value, key] + if len(value_key_list) == 2 and value_key_list[1] in TABLE_STAT_LIST: + table_stats[value_key_list[1]] = value_key_list[0] + + return table_stats + def _get_created_at(self, created_at: Optional[str]) -> Optional[datetime]: return ( datetime.strptime(created_at, "%a %b %d %H:%M:%S %Z %Y") @@ -171,8 +259,8 @@ def _get_table_type(self, type: Optional[str]) -> HiveTableType: else: return HiveTableType.UNKNOWN - def _get_table_info(self, schema: Schema, table_name: str) -> dict: - rows = self._describe_extended(schema.name, table_name) + def _get_table_info(self, schema_name: str, table_name: str) -> dict: + rows = self._describe_extended(schema_name, table_name) index = rows.index(("# Detailed Table Information", "", "")) rows = rows[index + 1 :] @@ -235,6 +323,17 @@ def _describe_extended(self, schema_name: str, table_name: str) -> List[Row]: """ return self._execute_sql(f"DESCRIBE EXTENDED `{schema_name}`.`{table_name}`") + def _column_describe_extended( + self, schema_name: str, table_name: str, column_name: str + ) -> List[Row]: + """ + Rows are structured as shown in examples here + https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-describe-table.html#examples + """ + return self._execute_sql( + f"DESCRIBE EXTENDED `{schema_name}`.`{table_name}` {column_name}" + ) + def _execute_sql(self, sql: str) -> List[Row]: return self.inspector.bind.execute(sql).fetchall() diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py 
b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index 13baa8b57a639..b414f3f188c23 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -4,7 +4,7 @@ import dataclasses import logging from datetime import datetime, timezone -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Union, cast from unittest.mock import patch from databricks.sdk import WorkspaceClient @@ -49,16 +49,19 @@ logger: logging.Logger = logging.getLogger(__name__) +@dataclasses.dataclass class TableInfoWithGeneration(TableInfo): generation: Optional[int] = None - @classmethod def as_dict(self) -> dict: return {**super().as_dict(), "generation": self.generation} @classmethod def from_dict(cls, d: Dict[str, Any]) -> "TableInfoWithGeneration": - table_info = super().from_dict(d) + table_info: TableInfoWithGeneration = cast( + TableInfoWithGeneration, + super().from_dict(d), + ) table_info.generation = d.get("generation") return table_info @@ -72,7 +75,10 @@ def as_dict(self) -> dict: @classmethod def from_dict(cls, d: Dict[str, Any]) -> "QueryFilterWithStatementTypes": - v = super().from_dict(d) + v: QueryFilterWithStatementTypes = cast( + QueryFilterWithStatementTypes, + super().from_dict(d), + ) v.statement_types = d["statement_types"] return v @@ -104,7 +110,7 @@ def __init__( def check_basic_connectivity(self) -> bool: return bool(self._workspace_client.catalogs.list()) - def assigned_metastore(self) -> Metastore: + def assigned_metastore(self) -> Optional[Metastore]: response = self._workspace_client.metastores.summary() return self._create_metastore(response) @@ -117,7 +123,9 @@ def catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]: logger.info("Catalogs not found") return [] for catalog in response: - yield self._create_catalog(metastore, catalog) + optional_catalog = self._create_catalog(metastore, catalog) + if optional_catalog: + yield optional_catalog def catalog( self, catalog_name: str, metastore: Optional[Metastore] @@ -126,7 +134,11 @@ def catalog( if not response: logger.info(f"Catalog {catalog_name} not found") return None - return self._create_catalog(metastore, response) + optional_catalog = self._create_catalog(metastore, response) + if optional_catalog: + return optional_catalog + + return None def schemas(self, catalog: Catalog) -> Iterable[Schema]: if ( @@ -140,7 +152,9 @@ def schemas(self, catalog: Catalog) -> Iterable[Schema]: logger.info(f"Schemas not found for catalog {catalog.id}") return [] for schema in response: - yield self._create_schema(catalog, schema) + optional_schema = self._create_schema(catalog, schema) + if optional_schema: + yield optional_schema def tables(self, schema: Schema) -> Iterable[Table]: if ( @@ -158,28 +172,38 @@ def tables(self, schema: Schema) -> Iterable[Table]: return [] for table in response: try: - yield self._create_table(schema, table) + optional_table = self._create_table( + schema, cast(TableInfoWithGeneration, table) + ) + if optional_table: + yield optional_table except Exception as e: logger.warning(f"Error parsing table: {e}") self.report.report_warning("table-parse", str(e)) def service_principals(self) -> Iterable[ServicePrincipal]: for principal in self._workspace_client.service_principals.list(): - yield self._create_service_principal(principal) + optional_sp = self._create_service_principal(principal) + if optional_sp: + yield optional_sp def 
workspace_notebooks(self) -> Iterable[Notebook]: for obj in self._workspace_client.workspace.list("/", recursive=True): - if obj.object_type == ObjectType.NOTEBOOK: + if obj.object_type == ObjectType.NOTEBOOK and obj.object_id and obj.path: yield Notebook( id=obj.object_id, path=obj.path, language=obj.language, created_at=datetime.fromtimestamp( obj.created_at / 1000, tz=timezone.utc - ), + ) + if obj.created_at + else None, modified_at=datetime.fromtimestamp( obj.modified_at / 1000, tz=timezone.utc - ), + ) + if obj.modified_at + else None, ) def query_history( @@ -204,7 +228,9 @@ def query_history( ) for query_info in self._query_history(filter_by=filter_by): try: - yield self._create_query(query_info) + optional_query = self._create_query(query_info) + if optional_query: + yield optional_query except Exception as e: logger.warning(f"Error parsing query: {e}") self.report.report_warning("query-parse", str(e)) @@ -229,15 +255,16 @@ def _query_history( "max_results": max_results, # Max batch size } - response: dict = self._workspace_client.api_client.do( + response: dict = self._workspace_client.api_client.do( # type: ignore method, path, body={**body, "filter_by": filter_by.as_dict()} ) + # we use default raw=False in above request, therefore will always get dict while True: if "res" not in response or not response["res"]: return for v in response["res"]: yield QueryInfo.from_dict(v) - response = self._workspace_client.api_client.do( + response = self._workspace_client.api_client.do( # type: ignore method, path, body={**body, "page_token": response["next_page_token"]} ) @@ -245,7 +272,7 @@ def list_lineages_by_table( self, table_name: str, include_entity_lineage: bool ) -> dict: """List table lineage by table name.""" - return self._workspace_client.api_client.do( + return self._workspace_client.api_client.do( # type: ignore method="GET", path="/api/2.0/lineage-tracking/table-lineage", body={ @@ -256,7 +283,7 @@ def list_lineages_by_table( def list_lineages_by_column(self, table_name: str, column_name: str) -> dict: """List column lineage by table name and column name.""" - return self._workspace_client.api_client.do( + return self._workspace_client.api_client.do( # type: ignore "GET", "/api/2.0/lineage-tracking/column-lineage", body={"table_name": table_name, "column_name": column_name}, @@ -325,7 +352,9 @@ def _escape_sequence(value: str) -> str: @staticmethod def _create_metastore( obj: Union[GetMetastoreSummaryResponse, MetastoreInfo] - ) -> Metastore: + ) -> Optional[Metastore]: + if not obj.name: + return None return Metastore( name=obj.name, id=UnityCatalogApiProxy._escape_sequence(obj.name), @@ -339,7 +368,10 @@ def _create_metastore( def _create_catalog( self, metastore: Optional[Metastore], obj: CatalogInfo - ) -> Catalog: + ) -> Optional[Catalog]: + if not obj.name: + self.report.num_catalogs_missing_name += 1 + return None catalog_name = self._escape_sequence(obj.name) return Catalog( name=obj.name, @@ -350,7 +382,10 @@ def _create_catalog( type=obj.catalog_type, ) - def _create_schema(self, catalog: Catalog, obj: SchemaInfo) -> Schema: + def _create_schema(self, catalog: Catalog, obj: SchemaInfo) -> Optional[Schema]: + if not obj.name: + self.report.num_schemas_missing_name += 1 + return None return Schema( name=obj.name, id=f"{catalog.id}.{self._escape_sequence(obj.name)}", @@ -359,11 +394,14 @@ def _create_schema(self, catalog: Catalog, obj: SchemaInfo) -> Schema: owner=obj.owner, ) - def _create_column(self, table_id: str, obj: ColumnInfo) -> Column: + def 
_create_column(self, table_id: str, obj: ColumnInfo) -> Optional[Column]:
+        if not obj.name:
+            self.report.num_columns_missing_name += 1
+            return None
         return Column(
             name=obj.name,
             id=f"{table_id}.{self._escape_sequence(obj.name)}",
-            type_text=obj.type_text,
+            type_text=obj.type_text or "",
             type_name=obj.type_name,
             type_scale=obj.type_scale,
             type_precision=obj.type_precision,
@@ -372,7 +410,12 @@ def _create_column(self, table_id: str, obj: ColumnInfo) -> Column:
             comment=obj.comment,
         )
 
-    def _create_table(self, schema: Schema, obj: TableInfoWithGeneration) -> Table:
+    def _create_table(
+        self, schema: Schema, obj: TableInfoWithGeneration
+    ) -> Optional[Table]:
+        if not obj.name:
+            self.report.num_tables_missing_name += 1
+            return None
         table_id = f"{schema.id}.{self._escape_sequence(obj.name)}"
         return Table(
             name=obj.name,
@@ -381,26 +424,38 @@ def _create_table(self, schema: Schema, obj: TableInfoWithGeneration) -> Table:
             schema=schema,
             storage_location=obj.storage_location,
             data_source_format=obj.data_source_format,
-            columns=[
-                self._create_column(table_id, column) for column in obj.columns or []
-            ],
+            columns=list(self._extract_columns(obj.columns, table_id))
+            if obj.columns
+            else [],
             view_definition=obj.view_definition or None,
             properties=obj.properties or {},
             owner=obj.owner,
             generation=obj.generation,
-            created_at=datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc),
+            created_at=datetime.fromtimestamp(obj.created_at / 1000, tz=timezone.utc)
+            if obj.created_at
+            else None,
             created_by=obj.created_by,
             updated_at=datetime.fromtimestamp(obj.updated_at / 1000, tz=timezone.utc)
             if obj.updated_at
             else None,
             updated_by=obj.updated_by,
             table_id=obj.table_id,
             comment=obj.comment,
         )
 
+    def _extract_columns(
+        self, columns: List[ColumnInfo], table_id: str
+    ) -> Iterable[Column]:
+        for column in columns:
+            optional_column = self._create_column(table_id, column)
+            if optional_column:
+                yield optional_column
+
     def _create_service_principal(
         self, obj: DatabricksServicePrincipal
-    ) -> ServicePrincipal:
+    ) -> Optional[ServicePrincipal]:
+        if not obj.display_name or not obj.application_id:
+            return None
         return ServicePrincipal(
             id=f"{obj.id}.{self._escape_sequence(obj.display_name)}",
             display_name=obj.display_name,
@@ -408,8 +463,14 @@ def _create_service_principal(
             active=obj.active,
         )
 
-    @staticmethod
-    def _create_query(info: QueryInfo) -> Query:
+    def _create_query(self, info: QueryInfo) -> Optional[Query]:
+        if (
+            not info.query_text
+            or not info.query_start_time_ms
+            or not info.query_end_time_ms
+        ):
+            self.report.num_queries_missing_info += 1
+            return None
         return Query(
             query_id=info.query_id,
             query_text=info.query_text,
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_profiling.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_profiling.py
index ab38119d01a9b..5992f103ccac3 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_profiling.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_profiling.py
@@ -14,6 +14,10 @@
     StatementStatus,
 )
 
+from datahub.ingestion.source.unity.hive_metastore_proxy import (
+    HIVE_METASTORE,
+    HiveMetastoreProxy,
+)
 from datahub.ingestion.source.unity.proxy_types import (
     ColumnProfile,
     TableProfile,
@@ -30,6 +34,7 @@ class UnityCatalogProxyProfilingMixin:
    _workspace_client: WorkspaceClient
    report: UnityCatalogReport
    warehouse_id: str
+   hive_metastore_proxy: Optional[HiveMetastoreProxy]
 
    def check_profiling_connectivity(self):
self._workspace_client.warehouses.get(self.warehouse_id) @@ -136,6 +141,8 @@ def _analyze_table( def _check_analyze_table_statement_status( self, execute_response: ExecuteStatementResponse, max_wait_secs: int ) -> bool: + if not execute_response.statement_id or not execute_response.status: + return False statement_id: str = execute_response.statement_id status: StatementStatus = execute_response.status @@ -152,13 +159,15 @@ def _check_analyze_table_statement_status( statement_id ) self._raise_if_error(response, "get-statement") - status = response.status + status = response.status # type: ignore return status.state == StatementState.SUCCEEDED def _get_table_profile( self, ref: TableReference, include_columns: bool ) -> TableProfile: + if self.hive_metastore_proxy and ref.catalog == HIVE_METASTORE: + return self.hive_metastore_proxy.get_table_profile(ref, include_columns) table_info = self._workspace_client.tables.get(ref.qualified_table_name) return self._create_table_profile(table_info, include_columns=include_columns) @@ -166,7 +175,12 @@ def _create_table_profile( self, table_info: TableInfo, include_columns: bool ) -> TableProfile: # Warning: this implementation is brittle -- dependent on properties that can change - columns_names = [column.name for column in table_info.columns] + columns_names = ( + [column.name for column in table_info.columns if column.name] + if table_info.columns + else [] + ) + return TableProfile( num_rows=self._get_int(table_info, "spark.sql.statistics.numRows"), total_size=self._get_int(table_info, "spark.sql.statistics.totalSize"), @@ -182,6 +196,7 @@ def _create_table_profile( def _create_column_profile( self, column: str, table_info: TableInfo ) -> ColumnProfile: + tblproperties = table_info.properties or {} return ColumnProfile( name=column, null_count=self._get_int( @@ -190,25 +205,18 @@ def _create_column_profile( distinct_count=self._get_int( table_info, f"spark.sql.statistics.colStats.{column}.distinctCount" ), - min=table_info.properties.get( - f"spark.sql.statistics.colStats.{column}.min" - ), - max=table_info.properties.get( - f"spark.sql.statistics.colStats.{column}.max" - ), - avg_len=table_info.properties.get( - f"spark.sql.statistics.colStats.{column}.avgLen" - ), - max_len=table_info.properties.get( - f"spark.sql.statistics.colStats.{column}.maxLen" - ), - version=table_info.properties.get( + min=tblproperties.get(f"spark.sql.statistics.colStats.{column}.min"), + max=tblproperties.get(f"spark.sql.statistics.colStats.{column}.max"), + avg_len=tblproperties.get(f"spark.sql.statistics.colStats.{column}.avgLen"), + max_len=tblproperties.get(f"spark.sql.statistics.colStats.{column}.maxLen"), + version=tblproperties.get( f"spark.sql.statistics.colStats.{column}.version" ), ) def _get_int(self, table_info: TableInfo, field: str) -> Optional[int]: - value = table_info.properties.get(field) + tblproperties = table_info.properties or {} + value = tblproperties.get(field) if value is not None: try: return int(value) @@ -223,14 +231,18 @@ def _get_int(self, table_info: TableInfo, field: str) -> Optional[int]: def _raise_if_error( response: Union[ExecuteStatementResponse, GetStatementResponse], key: str ) -> None: - if response.status.state in [ + if response.status and response.status.state in [ StatementState.FAILED, StatementState.CANCELED, StatementState.CLOSED, ]: raise DatabricksError( - response.status.error.message, - error_code=response.status.error.error_code.value, + response.status.error.message + if response.status.error and 
response.status.error.message + else "Unknown Error", + error_code=response.status.error.error_code.value + if response.status.error and response.status.error.error_code + else "Unknown Error Code", status=response.status.state.value, context=key, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py index e5951cb0fa4ff..c66189d99f738 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py @@ -96,8 +96,8 @@ class CommonProperty: @dataclass class Metastore(CommonProperty): - global_metastore_id: str # Global across clouds and regions - metastore_id: str + global_metastore_id: Optional[str] # Global across clouds and regions + metastore_id: Optional[str] owner: Optional[str] cloud: Optional[str] region: Optional[str] @@ -107,7 +107,7 @@ class Metastore(CommonProperty): class Catalog(CommonProperty): metastore: Optional[Metastore] owner: Optional[str] - type: Union[CatalogType, CustomCatalogType] + type: Optional[Union[CatalogType, CustomCatalogType]] @dataclass @@ -224,14 +224,14 @@ class Table(CommonProperty): columns: List[Column] storage_location: Optional[str] data_source_format: Optional[DataSourceFormat] - table_type: Union[TableType, HiveTableType] + table_type: Optional[Union[TableType, HiveTableType]] owner: Optional[str] generation: Optional[int] created_at: Optional[datetime] created_by: Optional[str] updated_at: Optional[datetime] updated_by: Optional[str] - table_id: str + table_id: Optional[str] view_definition: Optional[str] properties: Dict[str, str] upstreams: Dict[TableReference, Dict[str, List[str]]] = field(default_factory=dict) @@ -252,16 +252,16 @@ def __post_init__(self): @dataclass class Query: - query_id: str + query_id: Optional[str] query_text: str - statement_type: QueryStatementType + statement_type: Optional[QueryStatementType] start_time: datetime end_time: datetime # User who ran the query - user_id: int + user_id: Optional[int] user_name: Optional[str] # Email or username # User whose credentials were used to run the query - executed_as_user_id: int + executed_as_user_id: Optional[int] executed_as_user_name: Optional[str] @@ -310,9 +310,9 @@ def __bool__(self): class Notebook: id: NotebookId path: str - language: Language - created_at: datetime - modified_at: datetime + language: Optional[Language] + created_at: Optional[datetime] + modified_at: Optional[datetime] upstreams: FrozenSet[TableReference] = field(default_factory=frozenset) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index 0770d9d27055c..02eedb67f4cc2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -39,3 +39,9 @@ class UnityCatalogReport(IngestionStageReport, ProfilingSqlReport): num_profile_missing_size_in_bytes: int = 0 num_profile_failed_unsupported_column_type: int = 0 num_profile_failed_int_casts: int = 0 + + num_catalogs_missing_name: int = 0 + num_schemas_missing_name: int = 0 + num_tables_missing_name: int = 0 + num_columns_missing_name: int = 0 + num_queries_missing_info: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 43c5e24439377..1bc47c6307849 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -304,22 +304,28 @@ def process_notebooks(self) -> Iterable[MetadataWorkUnit]: yield from self._gen_notebook_workunits(notebook) def _gen_notebook_workunits(self, notebook: Notebook) -> Iterable[MetadataWorkUnit]: + + properties = {"path": notebook.path} + if notebook.language: + properties["language"] = notebook.language.value + mcps = MetadataChangeProposalWrapper.construct_many( entityUrn=self.gen_notebook_urn(notebook), aspects=[ DatasetPropertiesClass( name=notebook.path.rsplit("/", 1)[-1], - customProperties={ - "path": notebook.path, - "language": notebook.language.value, - }, + customProperties=properties, externalUrl=urljoin( self.config.workspace_url, f"#notebook/{notebook.id}" ), - created=TimeStampClass(int(notebook.created_at.timestamp() * 1000)), + created=TimeStampClass(int(notebook.created_at.timestamp() * 1000)) + if notebook.created_at + else None, lastModified=TimeStampClass( int(notebook.modified_at.timestamp() * 1000) - ), + ) + if notebook.modified_at + else None, ), SubTypesClass(typeNames=[DatasetSubTypes.NOTEBOOK]), BrowsePathsClass(paths=notebook.path.split("/")), @@ -352,6 +358,9 @@ def process_metastores(self) -> Iterable[MetadataWorkUnit]: metastore: Optional[Metastore] = None if self.config.include_metastore: metastore = self.unity_catalog_api_proxy.assigned_metastore() + if not metastore: + self.report.report_failure("Metastore", "Not found") + return yield from self.gen_metastore_containers(metastore) yield from self.process_catalogs(metastore) if metastore and self.config.include_metastore: @@ -705,13 +714,15 @@ def _create_table_property_aspect(self, table: Table) -> DatasetPropertiesClass: if table.generation is not None: custom_properties["generation"] = str(table.generation) - custom_properties["table_type"] = table.table_type.value + if table.table_type: + custom_properties["table_type"] = table.table_type.value if table.created_by: custom_properties["created_by"] = table.created_by if table.properties: custom_properties.update({k: str(v) for k, v in table.properties.items()}) - custom_properties["table_id"] = table.table_id + if table.table_id: + custom_properties["table_id"] = table.table_id if table.owner: custom_properties["owner"] = table.owner if table.updated_by: diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index ab21c1a318659..f07e7a92d8762 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -117,7 +117,10 @@ def _get_workunits_internal( def _generate_operation_workunit( self, query: Query, table_info: QueryTableInfo ) -> Iterable[MetadataWorkUnit]: - if query.statement_type not in OPERATION_STATEMENT_TYPES: + if ( + not query.statement_type + or query.statement_type not in OPERATION_STATEMENT_TYPES + ): return None # Not sure about behavior when there are multiple target tables. This is a best attempt. 
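Note on the hive-metastore profiling above: table-level stats come from parsing the
"Statistics" row of DESCRIBE EXTENDED output, which the integration test below mocks
as "1024 bytes, 3 rows". A minimal standalone sketch of that parsing logic follows;
parse_hive_statistics is an illustrative name, not a function in the patch, and it
assumes only "rows" and "bytes" entries are recognized (as in
_get_cached_table_statistics above, whose string values get_table_profile casts to int):

    def parse_hive_statistics(statistics: str) -> dict:
        # The property is formatted like "1382 bytes" or "1382 bytes, 2 rows".
        table_stats = {}
        for prop in statistics.split(","):
            parts = prop.strip().split(" ")  # e.g. ["1382", "bytes"]
            if len(parts) == 2 and parts[1] in {"rows", "bytes"}:
                table_stats[parts[1]] = parts[0]
        return table_stats

    assert parse_hive_statistics("1024 bytes, 3 rows") == {"bytes": "1024", "rows": "3"}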
diff --git a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py index aab7630d57f46..05f1db0b932f8 100644 --- a/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py +++ b/metadata-ingestion/tests/integration/unity/test_unity_catalog_ingest.py @@ -186,6 +186,8 @@ def register_mock_data(workspace_client): "delta.lastUpdateVersion": "1", "delta.minReaderVersion": "1", "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", }, "generation": 2, "metastore_id": "2c983545-d403-4f87-9063-5b7e3b6d3736", @@ -200,6 +202,57 @@ def register_mock_data(workspace_client): ) ] + workspace_client.tables.get = lambda *args, **kwargs: databricks.sdk.service.catalog.TableInfo.from_dict( + { + "name": "quickstart_table", + "catalog_name": "quickstart_catalog", + "schema_name": "quickstart_schema", + "table_type": "MANAGED", + "data_source_format": "DELTA", + "columns": [ + { + "name": "columnA", + "type_text": "int", + "type_json": '{"name":"columnA","type":"integer","nullable":true,"metadata":{}}', + "type_name": "INT", + "type_precision": 0, + "type_scale": 0, + "position": 0, + "nullable": True, + }, + { + "name": "columnB", + "type_text": "string", + "type_json": '{"name":"columnB","type":"string","nullable":true,"metadata":{}}', + "type_name": "STRING", + "type_precision": 0, + "type_scale": 0, + "position": 1, + "nullable": True, + }, + ], + "storage_location": "s3://db-02eec1f70bfe4115445be9fdb1aac6ac-s3-root-bucket/metastore/2c983545-d403-4f87-9063-5b7e3b6d3736/tables/cff27aa1-1c6a-4d78-b713-562c660c2896", + "owner": "account users", + "properties": { + "delta.lastCommitTimestamp": "1666185711000", + "delta.lastUpdateVersion": "1", + "delta.minReaderVersion": "1", + "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", + }, + "generation": 2, + "metastore_id": "2c983545-d403-4f87-9063-5b7e3b6d3736", + "full_name": "quickstart_catalog.quickstart_schema.quickstart_table", + "data_access_configuration_id": "00000000-0000-0000-0000-000000000000", + "created_at": 1666185698688, + "created_by": "abc@acryl.io", + "updated_at": 1666186049633, + "updated_by": "abc@acryl.io", + "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", + } + ) + workspace_client.service_principals.list.return_value = [ ServicePrincipal.from_dict(d) for d in [ @@ -220,7 +273,50 @@ def register_mock_data(workspace_client): def mock_hive_sql(query): - if query == "DESCRIBE EXTENDED `bronze_kambi`.`bet`": + + if query == "DESCRIBE EXTENDED `bronze_kambi`.`bet` betStatusId": + return [ + ("col_name", "betStatusId"), + ("data_type", "bigint"), + ("comment", None), + ("min", None), + ("max", None), + ("num_nulls", 0), + ("distinct_count", 1), + ("avg_col_len", 8), + ("max_col_len", 8), + ("histogram", None), + ] + elif query == "DESCRIBE EXTENDED `bronze_kambi`.`bet` channelId": + return [ + ("col_name", "channelId"), + ("data_type", "bigint"), + ("comment", None), + ("min", None), + ("max", None), + ("num_nulls", 0), + ("distinct_count", 1), + ("avg_col_len", 8), + ("max_col_len", 8), + ("histogram", None), + ] + elif query == "DESCRIBE EXTENDED `bronze_kambi`.`bet` combination": + return [ + ("col_name", "combination"), + ( + "data_type", + 
"struct>,eventId:bigint,eventName:string,eventStartDate:string,live:boolean,odds:double,outcomeIds:array,outcomeLabel:string,sportId:string,status:string,voidReason:string>>,payout:double,rewardExtraPayout:double,stake:double>", + ), + ("comment", None), + ("min", None), + ("max", None), + ("num_nulls", None), + ("distinct_count", None), + ("avg_col_len", None), + ("max_col_len", None), + ("histogram", None), + ] + elif query == "DESCRIBE EXTENDED `bronze_kambi`.`bet`": return [ ("betStatusId", "bigint", None), ("channelId", "bigint", None), @@ -237,6 +333,7 @@ def mock_hive_sql(query): ("Created Time", "Wed Jun 22 05:14:56 UTC 2022", ""), ("Last Access", "UNKNOWN", ""), ("Created By", "Spark 3.2.1", ""), + ("Statistics", "1024 bytes, 3 rows", ""), ("Type", "MANAGED", ""), ("Location", "dbfs:/user/hive/warehouse/bronze_kambi.db/bet", ""), ("Provider", "delta", ""), @@ -312,6 +409,11 @@ def test_ingestion(pytestconfig, tmp_path, requests_mock): "include_ownership": True, "include_hive_metastore": True, "warehouse_id": "test", + "profiling": { + "enabled": True, + "method": "analyze", + "call_analyze": False, + }, }, }, "sink": { diff --git a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json index 98a6615dd2b52..383f94144ffdc 100644 --- a/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json +++ b/metadata-ingestion/tests/integration/unity/unity_catalog_mces_golden.json @@ -504,7 +504,7 @@ "Last Access": "UNKNOWN", "Created By": "Spark 3.2.1", "Owner": "root", - "table_id": "hive_metastore.bronze_kambi.view1", + "table_id": "acryl_metastore.hive_metastore.bronze_kambi.view1", "created_at": "2022-06-22 05:14:56" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi/view1", @@ -638,7 +638,7 @@ "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "hive_metastore.bronze_kambi.view1", + "schemaName": "acryl_metastore.hive_metastore.bronze_kambi.view1", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -1172,10 +1172,11 @@ "Table": "bet", "Last Access": "UNKNOWN", "Created By": "Spark 3.2.1", + "Statistics": "1024 bytes, 3 rows", "Owner": "root", "Is_managed_location": "true", "Table Properties": "[delta.autoOptimize.autoCompact=true,delta.autoOptimize.optimizeWrite=true,delta.minReaderVersion=1,delta.minWriterVersion=2]", - "table_id": "hive_metastore.bronze_kambi.bet", + "table_id": "acryl_metastore.hive_metastore.bronze_kambi.bet", "created_at": "2022-06-22 05:14:56" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/hive_metastore/bronze_kambi/bet", @@ -1275,7 +1276,7 @@ "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "hive_metastore.bronze_kambi.bet", + "schemaName": "acryl_metastore.hive_metastore.bronze_kambi.bet", "platform": "urn:li:dataPlatform:databricks", "version": 0, "created": { @@ -1731,15 +1732,17 @@ "generation": "2", "table_type": "MANAGED", "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", "delta.lastCommitTimestamp": "1666185711000", "delta.lastUpdateVersion": "1", "delta.minReaderVersion": "1", "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", "owner": "account users", "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" + "updated_at": "2022-10-19 
13:27:29.633000+00:00", + "created_at": "2022-10-19 13:21:38.688000+00:00" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/default/quickstart_table", "name": "quickstart_table", @@ -2061,15 +2064,17 @@ "generation": "2", "table_type": "MANAGED", "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", "delta.lastCommitTimestamp": "1666185711000", "delta.lastUpdateVersion": "1", "delta.minReaderVersion": "1", "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", "owner": "account users", "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" + "updated_at": "2022-10-19 13:27:29.633000+00:00", + "created_at": "2022-10-19 13:21:38.688000+00:00" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/main/quickstart_schema/quickstart_table", "name": "quickstart_table", @@ -2527,15 +2532,17 @@ "generation": "2", "table_type": "MANAGED", "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", "delta.lastCommitTimestamp": "1666185711000", "delta.lastUpdateVersion": "1", "delta.minReaderVersion": "1", "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", "owner": "account users", "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" + "updated_at": "2022-10-19 13:27:29.633000+00:00", + "created_at": "2022-10-19 13:21:38.688000+00:00" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/default/quickstart_table", "name": "quickstart_table", @@ -2857,15 +2864,17 @@ "generation": "2", "table_type": "MANAGED", "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", "delta.lastCommitTimestamp": "1666185711000", "delta.lastUpdateVersion": "1", "delta.minReaderVersion": "1", "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", "owner": "account users", "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" + "updated_at": "2022-10-19 13:27:29.633000+00:00", + "created_at": "2022-10-19 13:21:38.688000+00:00" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/quickstart_catalog/quickstart_schema/quickstart_table", "name": "quickstart_table", @@ -3323,15 +3332,17 @@ "generation": "2", "table_type": "MANAGED", "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", "delta.lastCommitTimestamp": "1666185711000", "delta.lastUpdateVersion": "1", "delta.minReaderVersion": "1", "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", "owner": "account users", "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" + "updated_at": "2022-10-19 13:27:29.633000+00:00", + "created_at": "2022-10-19 13:21:38.688000+00:00" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/default/quickstart_table", "name": "quickstart_table", @@ -3653,15 +3664,17 @@ "generation": "2", "table_type": "MANAGED", "created_by": "abc@acryl.io", - "created_at": "2022-10-19 13:21:38.688000+00:00", "delta.lastCommitTimestamp": "1666185711000", "delta.lastUpdateVersion": "1", "delta.minReaderVersion": 
"1", "delta.minWriterVersion": "2", + "spark.sql.statistics.numRows": "10", + "spark.sql.statistics.totalSize": "512", "table_id": "cff27aa1-1c6a-4d78-b713-562c660c2896", "owner": "account users", "updated_by": "abc@acryl.io", - "updated_at": "2022-10-19 13:27:29.633000+00:00" + "updated_at": "2022-10-19 13:27:29.633000+00:00", + "created_at": "2022-10-19 13:21:38.688000+00:00" }, "externalUrl": "https://dummy.cloud.databricks.com/explore/data/system/quickstart_schema/quickstart_table", "name": "quickstart_table", @@ -3813,6 +3826,69 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1703580920011, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 10, + "columnCount": 2, + "fieldProfiles": [], + "sizeInBytes": 512 + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.hive_metastore.bronze_kambi.bet,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1703581191932, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 3, + "columnCount": 3, + "fieldProfiles": [ + { + "fieldPath": "betStatusId", + "uniqueCount": 1, + "uniqueProportion": 0.3333333333333333, + "nullCount": 0, + "nullProportion": 0.0 + }, + { + "fieldPath": "channelId", + "uniqueCount": 1, + "uniqueProportion": 0.3333333333333333, + "nullCount": 0, + "nullProportion": 0.0 + } + ], + "sizeInBytes": 1024 + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", @@ -3829,6 +3905,30 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1703580406273, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 10, + "columnCount": 2, + "fieldProfiles": [], + "sizeInBytes": 512 + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", @@ -3845,6 +3945,78 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.quickstart_schema.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1703580920008, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 10, + "columnCount": 2, + "fieldProfiles": [], + "sizeInBytes": 512 + } + }, + 
"systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.quickstart_schema.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1703580920011, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 10, + "columnCount": 2, + "fieldProfiles": [], + "sizeInBytes": 512 + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.main.default.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1703580920012, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 10, + "columnCount": 2, + "fieldProfiles": [], + "sizeInBytes": 512 + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.quickstart_catalog.default.quickstart_table,PROD)", @@ -3877,6 +4049,30 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.quickstart_schema.quickstart_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1703580920010, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "rowCount": 10, + "columnCount": 2, + "fieldProfiles": [], + "sizeInBytes": 512 + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "unity-catalog-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:databricks,dummy.acryl_metastore.system.default.quickstart_table,PROD)", From 2d302fe754969a4ec64b678d6a4002558eee66b3 Mon Sep 17 00:00:00 2001 From: Kunal-kankriya <127090035+Kunal-kankriya@users.noreply.github.com> Date: Wed, 3 Jan 2024 13:59:20 +0530 Subject: [PATCH 24/48] fix(cypress): make setting manage policy test not flaky (#9547) --- .../cypress/e2e/settings/manage_policies.js | 247 ++++++++---------- 1 file changed, 104 insertions(+), 143 deletions(-) diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/manage_policies.js b/smoke-test/tests/cypress/cypress/e2e/settings/manage_policies.js index 6515d92285e2e..0e69a4e7f287a 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/manage_policies.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/manage_policies.js @@ -4,149 +4,110 @@ const platform_policy_edited = `Platform test policy ${test_id} EDITED`; const metadata_policy_name = `Metadata test policy ${test_id}`; const metadata_policy_edited = `Metadata test policy ${test_id} EDITED`; + + +function searchAndToggleMetadataPolicyStatus(metadataPolicyName, targetStatus) { + cy.get('[data-testid="search-input"]').should('be.visible'); + cy.get('[data-testid="search-input"]').eq(1).type(metadataPolicyName); + cy.contains('tr', metadataPolicyName).as('metadataPolicyRow'); + 
cy.contains(targetStatus).click();
+}
+
+function clickFocusAndType(Id, text) {
+  cy.clickOptionWithTestId(Id)
+    .focused().clear()
+    .type(text);
+}
+
+function updateAndSave(Id, groupName, text) {
+  cy.clickOptionWithTestId(Id).type(groupName);
+  cy.get(`[title='${text}']`).click();
+  cy.focused().blur();
+}
+
+function clickOnButton(buttonId) {
+  cy.get(`#${buttonId}`).click();
+}
+
+function createPolicy(description, policyName) {
+  clickFocusAndType("policy-description", description);
+  clickOnButton("nextButton");
+  updateAndSave("privileges", "All", "All Privileges");
+  clickOnButton("nextButton");
+  updateAndSave("users", "All", "All Users");
+  updateAndSave("groups", "All", "All Groups");
+  clickOnButton("saveButton");
+  cy.waitTextVisible("Successfully saved policy.");
+  cy.waitTextVisible(policyName);
+}
+
+function editPolicy(policyName, newPolicyName, description, policyEdited, visibleDescription) {
+  searchAndToggleMetadataPolicyStatus(policyName, 'EDIT');
+  cy.clickOptionWithTestId("policy-name");
+  cy.focused().clear().type(newPolicyName);
+  cy.clickOptionWithTestId("policy-description");
+  cy.focused().clear().type(description);
+  clickOnButton("nextButton");
+  clickOnButton("nextButton");
+  clickOnButton("saveButton");
+  cy.waitTextVisible("Successfully saved policy.");
+  cy.waitTextVisible(policyEdited);
+  cy.waitTextVisible(visibleDescription);
+}
+
+function deletePolicy(policyName, deleteModalTitle, removedPolicyName) {
+  searchAndToggleMetadataPolicyStatus(policyName, 'DEACTIVATE');
+  cy.waitTextVisible("Successfully deactivated policy.");
+  cy.contains('DEACTIVATE').should('not.exist');
+  cy.contains('ACTIVATE').click();
+  cy.waitTextVisible("Successfully activated policy.");
+  cy.get("[data-icon='delete']").click();
+  cy.waitTextVisible(deleteModalTitle);
+  cy.clickOptionWithText("Yes");
+  cy.waitTextVisible("Successfully removed policy.");
+  cy.ensureTextNotPresent(removedPolicyName);
+}
+
 describe("create and manage platform and metadata policies", () => {
+  beforeEach(() => {
+    cy.loginWithCredentials();
+    cy.visit("/settings/permissions/policies");
+  });
+
+  it("create platform policy", () => {
+    cy.waitTextVisible("Manage Permissions");
+    cy.clickOptionWithText("Create new policy");
+    clickFocusAndType("policy-name", platform_policy_name);
+    cy.get('[data-testid="policy-type"] [title="Metadata"]').click();
+    cy.clickOptionWithTestId("platform");
+    createPolicy(`Platform policy description ${test_id}`, platform_policy_name);
+  });
+
+  it("edit platform policy", () => {
+    editPolicy(`${platform_policy_name}`, platform_policy_edited,
+      `Platform policy description ${test_id} EDITED`,
+      platform_policy_edited, `Platform policy description ${test_id} EDITED`);
+  });
+
+  it("deactivate and activate platform policy", () => {
+    deletePolicy(`${platform_policy_edited}`, `Delete ${platform_policy_edited}`, `${platform_policy_edited}`);
+  });
+
+  it("create metadata policy", () => {
+    cy.clickOptionWithText("Create new policy");
+    clickFocusAndType("policy-name", metadata_policy_name);
+    cy.get('[data-testid="policy-type"]').should('have.text', 'Metadata');
+    createPolicy(`Metadata policy description ${test_id}`, metadata_policy_name);
+  });
+
+  it("edit metadata policy", () => {
+    editPolicy(`${metadata_policy_name}`, metadata_policy_edited,
+      `Metadata policy description ${test_id} EDITED`,
+      metadata_policy_edited, `Metadata policy description ${test_id} EDITED`);
+  });
+
+  it("deactivate and activate metadata policy", () => {
+    deletePolicy(`${metadata_policy_edited}`, `Delete ${metadata_policy_edited}`,
`${metadata_policy_edited}`) + }); - it("create platform policy", () => { - cy.loginWithCredentials(); - cy.visit("/settings/permissions/policies"); - cy.waitTextVisible("Manage Permissions"); - cy.clickOptionWithText("Create new policy"); - cy.clickOptionWithTestId("policy-name") - .focused() - .type(platform_policy_name); - cy.get('[data-testid="policy-type"] [title="Metadata"]').click(); - cy.clickOptionWithTestId("platform"); - cy.clickOptionWithTestId("policy-description") - .focused() - .type(`Platform policy description ${test_id}`); - cy.get("#nextButton").click(); - cy.get('[data-testid="privileges"]').type("All"); - cy.clickOptionWithText("All Privileges").focused().blur(); - cy.get("#nextButton").click(); - cy.get('[data-testid="users"]').type("All"); - cy.get("[title='All Users']").click(); - cy.focused().blur(); - cy.get('[data-testid="groups"]').type("All"); - cy.get("[title='All Groups']").click(); - cy.focused().blur(); - cy.get("#saveButton").click(); - cy.waitTextVisible("Successfully saved policy."); - cy.waitTextVisible(platform_policy_name); - }); - - it("edit platform policy", () => { - cy.loginWithCredentials(); - cy.visit("/settings/permissions/policies"); - cy.contains('tr', `${platform_policy_name}` ) - .contains('EDIT') - .click(); - cy.clickOptionWithTestId("policy-name"); - cy.focused().clear().type(platform_policy_edited); - cy.clickOptionWithTestId("policy-description"); - cy.focused().clear().type(`Platform policy description ${test_id} EDITED`); - cy.get("#nextButton").click(); - cy.get("#nextButton").click(); - cy.get("#saveButton").click(); - cy.waitTextVisible("Successfully saved policy."); - cy.waitTextVisible(platform_policy_edited); - cy.waitTextVisible(`Platform policy description ${test_id} EDITED`); - }); - - it("deactivate and activate platform policy", () => { - cy.loginWithCredentials(); - cy.visit("/settings/permissions/policies"); - cy.contains('tr', `${platform_policy_edited}` ) - .contains('DEACTIVATE') - .click(); - cy.waitTextVisible("Successfully deactivated policy.") - cy.contains('tr', `${platform_policy_edited}` ) - .contains('INACTIVE') - .should("be.visible"); - cy.contains('tr', `${platform_policy_edited}` ) - .contains('ACTIVATE') - .click(); - cy.waitTextVisible("Successfully activated policy.") - cy.contains('tr', `${platform_policy_edited}` ) - .contains('ACTIVE') - .should("be.visible"); - cy.contains('tr', `${platform_policy_edited}` ) - .find("[data-icon='delete']") - .click(); - cy.waitTextVisible(`Delete ${platform_policy_edited}`); - cy.clickOptionWithText("Yes"); - cy.waitTextVisible("Successfully removed policy."); - cy.ensureTextNotPresent(`${platform_policy_edited}`); - - }); - - it("create metadata policy", () => { - cy.loginWithCredentials(); - cy.visit("/settings/permissions/policies"); - cy.clickOptionWithText("Create new policy"); - cy.clickOptionWithTestId("policy-name") - .focused() - .type(metadata_policy_name); - cy.get('[data-testid="policy-type"]').should('have.text', 'Metadata'); - cy.clickOptionWithTestId("policy-description") - .focused() - .type(`Metadata policy description ${test_id}`); - cy.get("#nextButton").click(); - cy.get('[data-testid="privileges"]').type("All"); - cy.clickOptionWithText("All Privileges").focused().blur(); - cy.get("#nextButton").click(); - cy.get('[data-testid="users"]').type("All"); - cy.get("[title='All Users']").click(); - cy.focused().blur(); - cy.get('[data-testid="groups"]').type("All"); - cy.get("[title='All Groups']").click(); - cy.focused().blur(); - 
cy.get("#saveButton").click(); - cy.waitTextVisible("Successfully saved policy."); - cy.waitTextVisible(metadata_policy_name); - }); - - it("edit metadata policy", () => { - cy.loginWithCredentials(); - cy.visit("/settings/permissions/policies"); - cy.contains('tr', `${metadata_policy_name}` ) - .contains('EDIT') - .click(); - cy.clickOptionWithTestId("policy-name") - cy.focused().clear().type(metadata_policy_edited); - cy.clickOptionWithTestId("policy-description"); - cy.focused().clear().type(`Metadata policy description ${test_id} EDITED`); - cy.get("#nextButton").click(); - cy.get("#nextButton").click(); - cy.get("#saveButton").click(); - cy.waitTextVisible("Successfully saved policy."); - cy.waitTextVisible(metadata_policy_edited); - cy.waitTextVisible(`Metadata policy description ${test_id} EDITED`); - }); - - it("deactivate and activate metadata policy", () => { - cy.loginWithCredentials(); - cy.visit("/settings/permissions/policies"); - cy.contains('tr', `${metadata_policy_edited}` ) - .contains('DEACTIVATE') - .click(); - cy.waitTextVisible("Successfully deactivated policy.") - cy.contains('tr', `${metadata_policy_edited}` ) - .contains('INACTIVE') - .should("be.visible"); - cy.contains('tr', `${metadata_policy_edited}` ) - .contains('ACTIVATE') - .click(); - cy.waitTextVisible("Successfully activated policy.") - cy.contains('tr', `${metadata_policy_edited}` ) - .contains('ACTIVE') - .should("be.visible"); - cy.contains('tr', `${metadata_policy_edited}` ) - .find("[data-icon='delete']") - .click(); - cy.waitTextVisible(`Delete ${metadata_policy_edited}`); - cy.clickOptionWithText("Yes"); - cy.waitTextVisible("Successfully removed policy."); - cy.ensureTextNotPresent(`${metadata_policy_edited}`); - }); - }); \ No newline at end of file From c395d86139c773cd374fa6a52587614787580192 Mon Sep 17 00:00:00 2001 From: kushagra-apptware <81357546+kushagra-apptware@users.noreply.github.com> Date: Wed, 3 Jan 2024 14:00:28 +0530 Subject: [PATCH 25/48] fix(ui): search user incorrect role shown (#9532) --- datahub-web-react/src/app/identity/user/SelectRole.tsx | 6 +++++- datahub-web-react/src/app/identity/user/UserList.tsx | 9 ++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/datahub-web-react/src/app/identity/user/SelectRole.tsx b/datahub-web-react/src/app/identity/user/SelectRole.tsx index 011eae0fbd8b3..deaa85f14b088 100644 --- a/datahub-web-react/src/app/identity/user/SelectRole.tsx +++ b/datahub-web-react/src/app/identity/user/SelectRole.tsx @@ -1,4 +1,4 @@ -import React, { useState } from 'react'; +import React, { useEffect, useState } from 'react'; import { UserOutlined } from '@ant-design/icons'; import { Select } from 'antd'; import { useApolloClient } from '@apollo/client'; @@ -49,6 +49,10 @@ export default function SelectRole({ user, userRoleUrn, selectRoleOptions, refet const [currentRoleUrn, setCurrentRoleUrn] = useState(defaultRoleUrn); const [isViewingAssignRole, setIsViewingAssignRole] = useState(false); + useEffect(() => { + setCurrentRoleUrn(defaultRoleUrn); + }, [defaultRoleUrn]); + const onSelectRole = (roleUrn: string) => { setCurrentRoleUrn(roleUrn); setIsViewingAssignRole(true); diff --git a/datahub-web-react/src/app/identity/user/UserList.tsx b/datahub-web-react/src/app/identity/user/UserList.tsx index 8e2bc21f0693f..22b44e5f2d625 100644 --- a/datahub-web-react/src/app/identity/user/UserList.tsx +++ b/datahub-web-react/src/app/identity/user/UserList.tsx @@ -52,6 +52,7 @@ export const UserList = () => { const params = 
QueryString.parse(location.search, { arrayFormat: 'comma' }); const paramsQuery = (params?.query as string) || undefined; const [query, setQuery] = useState(undefined); + const [usersList, setUsersList] = useState>([]); useEffect(() => setQuery(paramsQuery), [paramsQuery]); const [page, setPage] = useState(1); @@ -81,8 +82,9 @@ export const UserList = () => { }); const totalUsers = usersData?.listUsers?.total || 0; - const users = usersData?.listUsers?.users || []; - + useEffect(()=> { + setUsersList(usersData?.listUsers?.users || []); + }, [usersData]); const onChangePage = (newPage: number) => { scrollToTop(); setPage(newPage); @@ -145,6 +147,7 @@ export const UserList = () => { onQueryChange={(q) => { setPage(1); setQuery(q); + setUsersList([]); }} entityRegistry={entityRegistry} hideRecommendations @@ -155,7 +158,7 @@ export const UserList = () => { locale={{ emptyText: , }} - dataSource={users} + dataSource={usersList} renderItem={(item: any) => ( handleDelete(item.urn as string)} From 21075e606707df42f25c4ab2d37ef6b2d97daf0d Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Wed, 3 Jan 2024 00:39:58 -0800 Subject: [PATCH 26/48] fix(ci): make test flexible to allow sha-based cli versions (#9551) --- smoke-test/tests/read_only/test_services_up.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/smoke-test/tests/read_only/test_services_up.py b/smoke-test/tests/read_only/test_services_up.py index cbe92625f4689..b1b3b1d6f4bd7 100644 --- a/smoke-test/tests/read_only/test_services_up.py +++ b/smoke-test/tests/read_only/test_services_up.py @@ -2,6 +2,7 @@ import pytest import requests +import re from tests.utils import get_gms_url, wait_for_healthcheck_util @@ -13,6 +14,8 @@ def test_services_up(): wait_for_healthcheck_util() +def looks_like_a_short_sha(sha: str) -> bool: + return len(sha) == 7 and re.match(r"[0-9a-f]{7}", sha) is not None @pytest.mark.read_only def test_gms_config_accessible(): @@ -30,4 +33,4 @@ def test_gms_config_accessible(): default_cli_version: str = gms_config["managedIngestion"]["defaultCliVersion"] print(f"Default CLI version: {default_cli_version}") assert not default_cli_version.startswith("@") - assert "." in default_cli_version + assert "." 
in default_cli_version or looks_like_a_short_sha(default_cli_version), "Default CLI version does not look like a version string"
From 2e3141e1db5be0b24c343812a885dc494168a7de Mon Sep 17 00:00:00 2001
From: Kunal-kankriya <127090035+Kunal-kankriya@users.noreply.github.com>
Date: Wed, 3 Jan 2024 18:59:16 +0530
Subject: [PATCH 27/48] tests(cypress): add navigation in search test (#9545)

---
 .../e2e/search/query_and_filter_search.js     | 156 ++++++++++++------
 1 file changed, 102 insertions(+), 54 deletions(-)

diff --git a/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js b/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js
index 4637310b86496..59105be587803 100644
--- a/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js
+++ b/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js
@@ -1,57 +1,105 @@
+const datasetNames = {
+  dashboardsType: "Baz Dashboard",
+  pipelinesType: "Users",
+  MlmoduleType: "cypress-model",
+  glossaryTermsType: "CypressColumnInfoType",
+  tags: "some-cypress-feature-1",
+  hivePlatform: "cypress_logging_events",
+  airflowPlatform: "User Creations",
+  awsPlatform: "project/root/events/logging_events_bckp",
+  hdfsPlatform: "SampleHdfsDataset"
+};
+
+const searchToExecute = (value) => {
+  cy.get("input[data-testid=search-input]").eq(0).type(`${value}{enter}`);
+  cy.waitTextPresent("Type");
+};
+
+const selectFilteredEntity = (textToClick, entity, url) => {
+  cy.get(`[data-testid=filter-dropdown-${textToClick}]`).click({ force: true });
+  cy.get(`[data-testid="filter-option-${entity}"]`).click({ force: true });
+  cy.get("[data-testid=update-filters]").click({ force: true });
+  cy.url().should("include", `${url}`);
+  cy.get("[data-testid=update-filters]").should("not.be.visible");
+  cy.get('.ant-pagination-next').scrollIntoView().should('be.visible');
+};
+
+const verifyFilteredEntity = (text) => {
+  cy.get('.ant-typography').contains(text).should('be.visible');
+};
+
 describe("auto-complete dropdown, filter plus query search test", () => {
+
+  beforeEach(() => {
+    cy.loginWithCredentials();
+    cy.visit('/');
+  });
+
+  it.skip("Verify the 'filter by type' section + query", () => {
+
+    //Dashboard
+    searchToExecute("*");
+    selectFilteredEntity("Type", "Dashboards", "filter__entityType");
+    cy.clickOptionWithText(datasetNames.dashboardsType);
+    verifyFilteredEntity('Dashboard');
+
+    //Ml Models
+    searchToExecute("*");
+    selectFilteredEntity("Type", "ML Models", "filter__entityType");
+    cy.clickOptionWithText(datasetNames.MlmoduleType);
+    verifyFilteredEntity('ML Model');
+
+    //Pipelines
+    searchToExecute("*");
+    selectFilteredEntity("Type", "Pipelines", "filter__entityType");
+    cy.clickOptionWithText(datasetNames.pipelinesType);
+    verifyFilteredEntity('Pipeline');
+
+  });
+
+  it("Verify the 'filter by Glossary term' section + query", () => {
+
+    //Glossary Term
+    searchToExecute("*");
+    selectFilteredEntity("Type", "Glossary Terms", "filter__entityType");
+    cy.clickOptionWithText(datasetNames.glossaryTermsType);
+    verifyFilteredEntity('Glossary Term');
+  });
+
+  it("Verify the 'filter by platform' section + query", () => {
+
+    //Hive
+    searchToExecute("*");
+    selectFilteredEntity("Platform", "Hive", "filter_platform");
+    cy.clickOptionWithText(datasetNames.hivePlatform);
+    verifyFilteredEntity('Hive');
+
+    //AWS S3
+    searchToExecute("*");
+    selectFilteredEntity("Platform", "AWS S3", "filter_platform");
+    cy.clickOptionWithText(datasetNames.awsPlatform);
+    verifyFilteredEntity('AWS S3');
+
+    //HDFS
+    searchToExecute("*");
searchToExecute("*"); + selectFilteredEntity("Platform", "HDFS", "filter_platform"); + cy.clickOptionWithText(datasetNames.hdfsPlatform); + verifyFilteredEntity('HDFS'); + + //Airflow + searchToExecute("*"); + selectFilteredEntity("Platform", "Airflow", "filter_platform"); + cy.clickOptionWithText(datasetNames.airflowPlatform); + verifyFilteredEntity('Airflow'); + }); - const platformQuerySearch = (query,test_id,active_filter) => { - cy.visit("/"); - cy.get("input[data-testid=search-input]").type(query); - cy.get(`[data-testid="quick-filter-urn:li:dataPlatform:${test_id}"]`).click(); - cy.focused().type("{enter}").wait(3000); - cy.url().should( - "include", - `?filter_platform___false___EQUAL___0=urn%3Ali%3AdataPlatform%3A${test_id}` - ); - cy.get('[data-testid="search-input"]').should("have.value", query); - cy.get(`[data-testid="active-filter-${active_filter}"]`).should("be.visible"); - cy.contains("of 0 results").should("not.exist"); - cy.contains(/of [0-9]+ results/); - } - - const entityQuerySearch = (query,test_id,active_filter) => { - cy.visit("/"); - cy.get("input[data-testid=search-input]").type(query); - cy.get(`[data-testid="quick-filter-${test_id}"]`).click(); - cy.focused().type("{enter}").wait(3000); - cy.url().should( - "include", - `?filter__entityType___false___EQUAL___0=${test_id}` - ); - cy.get('[data-testid="search-input"]').should("have.value", query); - cy.get(`[data-testid="active-filter-${active_filter}"]`).should("be.visible"); - cy.contains("of 0 results").should("not.exist"); - cy.contains(/of [0-9]+ results/); - } - - it("verify the 'filter by' section + query (result in search page with query applied + filter applied)", () => { - // Platform query plus filter test - cy.loginWithCredentials(); - // Airflow - platformQuerySearch ("cypress","airflow","Airflow"); - // BigQuery - platformQuerySearch ("cypress","bigquery","BigQuery"); - // dbt - platformQuerySearch ("cypress","dbt","dbt"); - // Hive - platformQuerySearch ("cypress","hive","Hive"); - - // Entity type query plus filter test - // Datasets - entityQuerySearch ("cypress","DATASET","Datasets"); - // Dashboards - entityQuerySearch ("cypress","DASHBOARD","Dashboards"); - // Pipelines - entityQuerySearch ("cypress","DATA_FLOW","Pipelines"); - // Domains - entityQuerySearch ("Marketing","DOMAIN","Domains"); - // Glossary Terms - entityQuerySearch ("cypress","GLOSSARY_TERM","Glossary Terms"); + it("Verify the 'filter by tag' section + query", () => { + + //CypressFeatureTag + searchToExecute("*"); + selectFilteredEntity("Tag", "CypressFeatureTag", "filter_tags"); + cy.clickOptionWithText(datasetNames.tags); + cy.mouseover('[data-testid="tag-CypressFeatureTag"]'); + verifyFilteredEntity('Feature'); }); -}); \ No newline at end of file +}); From ff78e3c172fee880cdbe1aa3333cf4a73926c910 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 3 Jan 2024 19:47:19 +0530 Subject: [PATCH 28/48] docs(acryl cloud): release notes for 0.2.14.1 (#9554) --- docs-website/sidebars.js | 1 + docs/managed-datahub/release-notes/v_0_2_14.md | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 docs/managed-datahub/release-notes/v_0_2_14.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 5d7c6b06adad4..2b8873c678778 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -177,6 +177,7 @@ module.exports = { }, { "Managed DataHub Release History": [ + "docs/managed-datahub/release-notes/v_0_2_14", "docs/managed-datahub/release-notes/v_0_2_13", 
"docs/managed-datahub/release-notes/v_0_2_12", "docs/managed-datahub/release-notes/v_0_2_11", diff --git a/docs/managed-datahub/release-notes/v_0_2_14.md b/docs/managed-datahub/release-notes/v_0_2_14.md new file mode 100644 index 0000000000000..8ad1f19503e06 --- /dev/null +++ b/docs/managed-datahub/release-notes/v_0_2_14.md @@ -0,0 +1,17 @@ +# v0.2.14.1 +--- + +Release Availability Date +--- +02-Jan-2023 + +Recommended CLI/SDK +--- +- `v0.12.1.3` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.12.1.3 + +If you are using an older CLI/SDK version then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SKD etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. + +## Release Changelog +--- +- Since `v0.2.13` these changes from OSS DataHub https://github.com/datahub-project/datahub/compare/d9de854d276c118afc55264ecc9e2712b91b4ab2...31f9c796763677a4d452066d9b49b4088e65da19 have been pulled in. + From c3c4bef1ad746a57a1a6cff821a732fe8114f695 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Wed, 3 Jan 2024 22:59:39 +0530 Subject: [PATCH 29/48] ci(doc): tweak build rule to avoid docker build for docs (#9555) --- .github/workflows/docker-unified.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 454e766140245..8afce059572c7 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -4,12 +4,14 @@ on: branches: - master paths-ignore: + - "docs-website/**" - "docs/**" - "**.md" pull_request: branches: - "**" paths-ignore: + - "docs-website/**" - "docs/**" - "**.md" release: From c9613043c86e169a888d5ac60f0efdcd1551a2b0 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jan 2024 14:28:22 -0500 Subject: [PATCH 30/48] fix(ingest): improve kafka-connect test stability (#9519) --- .../tests/integration/kafka/docker-compose.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/tests/integration/kafka/docker-compose.yml b/metadata-ingestion/tests/integration/kafka/docker-compose.yml index 43f30cbe1e665..0a4422e07515c 100644 --- a/metadata-ingestion/tests/integration/kafka/docker-compose.yml +++ b/metadata-ingestion/tests/integration/kafka/docker-compose.yml @@ -1,5 +1,5 @@ --- -version: '3.8' +version: "3.8" services: zookeeper: image: confluentinc/cp-zookeeper:7.2.2 @@ -9,7 +9,8 @@ services: ports: - "52181" volumes: - - test_zkdata:/var/opt/zookeeper + - test_zkdata:/var/lib/zookeeper/data + - test_zklogs:/var/lib/zookeeper/log broker: image: confluentinc/cp-kafka:7.2.2 @@ -34,3 +35,4 @@ services: volumes: test_zkdata: + test_zklogs: From 83b904e379b0e9a13d22659e483c6d3d4c9b29ba Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jan 2024 14:28:32 -0500 Subject: [PATCH 31/48] fix(ingest/looker): add user stats to report (#9505) --- .../ingestion/source/looker/looker_common.py | 5 +++++ .../ingestion/source/looker/looker_config.py | 5 ----- .../ingestion/source/looker/looker_source.py | 13 +++++-------- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 53533a8d27c9b..94a56bb9281cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1059,6 +1059,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): dashboards_scanned_for_usage: int = 0 charts_scanned_for_usage: int = 0 charts_with_activity: LossySet[str] = dataclasses_field(default_factory=LossySet) + accessed_dashboards: int = 0 dashboards_with_activity: LossySet[str] = dataclasses_field( default_factory=LossySet ) @@ -1066,6 +1067,10 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): _looker_explore_registry: Optional[LookerExploreRegistry] = None total_explores: int = 0 explores_scanned: int = 0 + + resolved_user_ids: int = 0 + email_ids_missing: int = 0 # resolved users with missing email addresses + _looker_api: Optional[LookerAPI] = None query_latency: Dict[str, datetime.timedelta] = dataclasses_field( default_factory=dict diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index 514f22b4f2158..52a21e8f12259 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -160,11 +160,6 @@ class LookerDashboardSourceConfig( description="When enabled, extracts ownership from Looker directly. When disabled, ownership is left empty " "for dashboards and charts.", ) - actor: Optional[str] = Field( - None, - description="This config is deprecated in favor of `extract_owners`. Previously, was the actor to use in " - "ownership properties of ingested metadata.", - ) strip_user_ids_from_email: bool = Field( False, description="When enabled, converts Looker user emails of the form name@domain.com to urn:li:corpuser:name " diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 7e8fbfde12042..0cce267bf5579 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -129,9 +129,6 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase): source_config: LookerDashboardSourceConfig reporter: LookerDashboardSourceReport user_registry: LookerUserRegistry - accessed_dashboards: int = 0 - resolved_user_ids: int = 0 - email_ids_missing: int = 0 # resolved users with missing email addresses reachable_look_registry: Set[ str ] # Keep track of look-id which are reachable from Dashboard @@ -866,7 +863,7 @@ def _get_folder_path(self, folder: FolderBase, client: LookerAPI) -> str: def _get_looker_dashboard( self, dashboard: Dashboard, client: LookerAPI ) -> LookerDashboard: - self.accessed_dashboards += 1 + self.reporter.accessed_dashboards += 1 if dashboard.folder is None: logger.debug(f"{dashboard.id} has no folder") dashboard_folder_path = None @@ -928,9 +925,9 @@ def _get_looker_user(self, user_id: Optional[str]) -> Optional[LookerUser]: if user is not None and self.source_config.extract_owners: # Keep track of how many user ids we were able to resolve - self.resolved_user_ids += 1 + self.reporter.resolved_user_ids += 1 if user.email is None: - self.email_ids_missing += 1 + self.reporter.email_ids_missing += 1 return user @@ -1313,8 +1310,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if ( self.source_config.extract_owners - and self.resolved_user_ids > 0 - and self.email_ids_missing == self.resolved_user_ids + and 
self.reporter.resolved_user_ids > 0 + and self.reporter.email_ids_missing == self.reporter.resolved_user_ids ): # Looks like we tried to extract owners and could not find their email addresses. This is likely a permissions issue self.reporter.report_warning( From 186b6f942d3fa7f0ce379add72cbcb57bccd4bb0 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Wed, 3 Jan 2024 12:21:06 -0800 Subject: [PATCH 32/48] perf(lineage): Rewrite lineage query for Elastic graph store (#9552) --- .../graph/elastic/ESGraphQueryDAO.java | 82 ++++--- .../graph/search/ESGraphQueryDAOTest.java | 94 ++++++- ...1.json => lineage_query_filters_full.json} | 98 ++++---- ...eage_query_filters_full_empty_filters.json | 60 +++++ ...e_query_filters_full_multiple_filters.json | 229 ++++++++++++++++++ .../lineage_query_filters_limited.json | 32 +++ 6 files changed, 508 insertions(+), 87 deletions(-) rename metadata-io/src/test/resources/elasticsearch/sample_filters/{lineage_query_filters_1.json => lineage_query_filters_full.json} (81%) create mode 100644 metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_empty_filters.json create mode 100644 metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_multiple_filters.json create mode 100644 metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_limited.json diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java index 92960bc9222ab..97cb186ce948c 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java @@ -336,17 +336,10 @@ private List getLineageRelationships( Collectors.toMap( Function.identity(), entityType -> lineageRegistry.getLineageRelationships(entityType, direction))); - BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); - // Get all relation types relevant to the set of urns to hop from - urnsPerEntityType.forEach( - (entityType, urns) -> - finalQuery.should( - getQueryForLineage( - urns, - edgesPerEntityType.getOrDefault(entityType, Collections.emptyList()), - graphFilters, - startTimeMillis, - endTimeMillis))); + + QueryBuilder finalQuery = + getLineageQuery( + urnsPerEntityType, edgesPerEntityType, graphFilters, startTimeMillis, endTimeMillis); SearchResponse response = executeSearchQuery(finalQuery, 0, graphQueryConfiguration.getMaxResult()); Set entityUrnSet = new HashSet<>(entityUrns); @@ -361,18 +354,53 @@ private List getLineageRelationships( entityUrnSet, response, validEdges, visitedEntities, numHops, existingPaths); } - // Get search query for given list of edges and source urns @VisibleForTesting - public static QueryBuilder getQueryForLineage( - @Nonnull List urns, - @Nonnull List lineageEdges, + public static QueryBuilder getLineageQuery( + @Nonnull Map> urnsPerEntityType, + @Nonnull Map> edgesPerEntityType, @Nonnull GraphFilters graphFilters, @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) { - BoolQueryBuilder query = QueryBuilders.boolQuery(); - if (lineageEdges.isEmpty()) { - return query; + BoolQueryBuilder entityTypeQueries = QueryBuilders.boolQuery(); + // Get all relation types relevant to the set of urns to hop from + urnsPerEntityType.forEach( + (entityType, urns) -> { + if (edgesPerEntityType.containsKey(entityType) + && !edgesPerEntityType.get(entityType).isEmpty()) { + 
entityTypeQueries.should( + getLineageQueryForEntityType( + urns, edgesPerEntityType.get(entityType), graphFilters)); + } + }); + + BoolQueryBuilder finalQuery = QueryBuilders.boolQuery(); + + finalQuery.filter(entityTypeQueries); + finalQuery.filter(buildEntityTypesFilter(graphFilters.getAllowedEntityTypes(), SOURCE)); + finalQuery.filter(buildEntityTypesFilter(graphFilters.getAllowedEntityTypes(), DESTINATION)); + + /* + * Optional - Add edge filtering based on time windows. + */ + if (startTimeMillis != null && endTimeMillis != null) { + finalQuery.filter(TimeFilterUtils.getEdgeTimeFilterQuery(startTimeMillis, endTimeMillis)); + } else { + log.debug( + String.format( + "Empty time filter range provided: start time %s, end time: %s. Skipping application of time filters", + startTimeMillis, endTimeMillis)); } + + return finalQuery; + } + + // Get search query for given list of edges and source urns + @VisibleForTesting + public static QueryBuilder getLineageQueryForEntityType( + @Nonnull List urns, + @Nonnull List lineageEdges, + @Nonnull GraphFilters graphFilters) { + BoolQueryBuilder query = QueryBuilders.boolQuery(); Map> edgesByDirection = lineageEdges.stream().collect(Collectors.groupingBy(EdgeInfo::getDirection)); @@ -388,18 +416,6 @@ public static QueryBuilder getQueryForLineage( query.should(getIncomingEdgeQuery(urns, incomingEdges, graphFilters)); } - /* - * Optional - Add edge filtering based on time windows. - */ - if (startTimeMillis != null && endTimeMillis != null) { - query.must(TimeFilterUtils.getEdgeTimeFilterQuery(startTimeMillis, endTimeMillis)); - } else { - log.debug( - String.format( - "Empty time filter range provided: start time %s, end time: %s. Skipping application of time filters", - startTimeMillis, endTimeMillis)); - } - return query; } @@ -601,9 +617,6 @@ private static BoolQueryBuilder getOutGoingEdgeQuery( BoolQueryBuilder outgoingEdgeQuery = QueryBuilders.boolQuery(); outgoingEdgeQuery.must(buildUrnFilters(urns, SOURCE)); outgoingEdgeQuery.must(buildEdgeFilters(outgoingEdges)); - outgoingEdgeQuery.must(buildEntityTypesFilter(graphFilters.getAllowedEntityTypes(), SOURCE)); - outgoingEdgeQuery.must( - buildEntityTypesFilter(graphFilters.getAllowedEntityTypes(), DESTINATION)); return outgoingEdgeQuery; } @@ -612,9 +625,6 @@ private static BoolQueryBuilder getIncomingEdgeQuery( BoolQueryBuilder incomingEdgeQuery = QueryBuilders.boolQuery(); incomingEdgeQuery.must(buildUrnFilters(urns, DESTINATION)); incomingEdgeQuery.must(buildEdgeFilters(incomingEdges)); - incomingEdgeQuery.must(buildEntityTypesFilter(graphFilters.getAllowedEntityTypes(), SOURCE)); - incomingEdgeQuery.must( - buildEntityTypesFilter(graphFilters.getAllowedEntityTypes(), DESTINATION)); return incomingEdgeQuery; } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java index 9fc9490bfd7ef..5b7f880e6d83a 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/ESGraphQueryDAOTest.java @@ -23,16 +23,40 @@ public class ESGraphQueryDAOTest { - private static final String TEST_QUERY_FILE = - "elasticsearch/sample_filters/lineage_query_filters_1.json"; + private static final String TEST_QUERY_FILE_LIMITED = + "elasticsearch/sample_filters/lineage_query_filters_limited.json"; + private static final String TEST_QUERY_FILE_FULL = + 
"elasticsearch/sample_filters/lineage_query_filters_full.json"; + private static final String TEST_QUERY_FILE_FULL_EMPTY_FILTERS = + "elasticsearch/sample_filters/lineage_query_filters_full_empty_filters.json"; + private static final String TEST_QUERY_FILE_FULL_MULTIPLE_FILTERS = + "elasticsearch/sample_filters/lineage_query_filters_full_multiple_filters.json"; @Test private static void testGetQueryForLineageFullArguments() throws Exception { - URL url = Resources.getResource(TEST_QUERY_FILE); - String expectedQuery = Resources.toString(url, StandardCharsets.UTF_8); - - List urns = new ArrayList<>(); + URL urlLimited = Resources.getResource(TEST_QUERY_FILE_LIMITED); + String expectedQueryLimited = Resources.toString(urlLimited, StandardCharsets.UTF_8); + URL urlFull = Resources.getResource(TEST_QUERY_FILE_FULL); + String expectedQueryFull = Resources.toString(urlFull, StandardCharsets.UTF_8); + URL urlFullEmptyFilters = Resources.getResource(TEST_QUERY_FILE_FULL_EMPTY_FILTERS); + String expectedQueryFullEmptyFilters = + Resources.toString(urlFullEmptyFilters, StandardCharsets.UTF_8); + URL urlFullMultipleFilters = Resources.getResource(TEST_QUERY_FILE_FULL_MULTIPLE_FILTERS); + String expectedQueryFullMultipleFilters = + Resources.toString(urlFullMultipleFilters, StandardCharsets.UTF_8); + + List urns = List.of(Urn.createFromString("urn:li:dataset:test-urn")); + List urnsMultiple1 = + ImmutableList.of( + UrnUtils.getUrn("urn:li:dataset:test-urn"), + UrnUtils.getUrn("urn:li:dataset:test-urn2"), + UrnUtils.getUrn("urn:li:dataset:test-urn3")); + List urnsMultiple2 = + ImmutableList.of( + UrnUtils.getUrn("urn:li:chart:test-urn"), + UrnUtils.getUrn("urn:li:chart:test-urn2"), + UrnUtils.getUrn("urn:li:chart:test-urn3")); List edgeInfos = new ArrayList<>( ImmutableList.of( @@ -40,14 +64,64 @@ private static void testGetQueryForLineageFullArguments() throws Exception { "DownstreamOf", RelationshipDirection.INCOMING, Constants.DATASET_ENTITY_NAME))); + List edgeInfosMultiple1 = + ImmutableList.of( + new LineageRegistry.EdgeInfo( + "DownstreamOf", RelationshipDirection.OUTGOING, Constants.DATASET_ENTITY_NAME), + new LineageRegistry.EdgeInfo( + "Consumes", RelationshipDirection.OUTGOING, Constants.DATASET_ENTITY_NAME)); + List edgeInfosMultiple2 = + ImmutableList.of( + new LineageRegistry.EdgeInfo( + "DownstreamOf", RelationshipDirection.OUTGOING, Constants.DATA_JOB_ENTITY_NAME), + new LineageRegistry.EdgeInfo( + "Consumes", RelationshipDirection.OUTGOING, Constants.DATA_JOB_ENTITY_NAME)); + String entityType = "testEntityType"; + Map> urnsPerEntityType = Map.of(entityType, urns); + Map> urnsPerEntityTypeMultiple = + Map.of( + Constants.DATASET_ENTITY_NAME, + urnsMultiple1, + Constants.CHART_ENTITY_NAME, + urnsMultiple2); + Map> edgesPerEntityType = Map.of(entityType, edgeInfos); + Map> edgesPerEntityTypeMultiple = + Map.of( + Constants.DATASET_ENTITY_NAME, edgeInfosMultiple1, + Constants.DATA_JOB_ENTITY_NAME, edgeInfosMultiple2); GraphFilters graphFilters = new GraphFilters(ImmutableList.of(Constants.DATASET_ENTITY_NAME)); + GraphFilters graphFiltersMultiple = + new GraphFilters( + ImmutableList.of( + Constants.DATASET_ENTITY_NAME, + Constants.DASHBOARD_ENTITY_NAME, + Constants.DATA_JOB_ENTITY_NAME)); Long startTime = 0L; Long endTime = 1L; - QueryBuilder builder = - ESGraphQueryDAO.getQueryForLineage(urns, edgeInfos, graphFilters, startTime, endTime); - - Assert.assertEquals(builder.toString(), expectedQuery); + QueryBuilder limitedBuilder = + ESGraphQueryDAO.getLineageQueryForEntityType(urns, 
edgeInfos, graphFilters); + + QueryBuilder fullBuilder = + ESGraphQueryDAO.getLineageQuery( + urnsPerEntityType, edgesPerEntityType, graphFilters, startTime, endTime); + + QueryBuilder fullBuilderEmptyFilters = + ESGraphQueryDAO.getLineageQuery( + urnsPerEntityType, edgesPerEntityType, GraphFilters.emptyGraphFilters, null, null); + + QueryBuilder fullBuilderMultipleFilters = + ESGraphQueryDAO.getLineageQuery( + urnsPerEntityTypeMultiple, + edgesPerEntityTypeMultiple, + graphFiltersMultiple, + startTime, + endTime); + + Assert.assertEquals(limitedBuilder.toString(), expectedQueryLimited); + Assert.assertEquals(fullBuilder.toString(), expectedQueryFull); + Assert.assertEquals(fullBuilderEmptyFilters.toString(), expectedQueryFullEmptyFilters); + Assert.assertEquals(fullBuilderMultipleFilters.toString(), expectedQueryFullMultipleFilters); } @Test diff --git a/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_1.json b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full.json similarity index 81% rename from metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_1.json rename to metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full.json index eb84638f0ccd0..0a1cee08414a9 100644 --- a/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_1.json +++ b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full.json @@ -1,6 +1,62 @@ { "bool" : { - "must" : [ + "filter" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "must" : [ + { + "terms" : { + "destination.urn" : [ + "urn:li:dataset:test-urn" + ], + "boost" : 1.0 + } + }, + { + "terms" : { + "relationshipType" : [ + "DownstreamOf" + ], + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "terms" : { + "source.entityType" : [ + "dataset" + ], + "boost" : 1.0 + } + }, + { + "terms" : { + "destination.entityType" : [ + "dataset" + ], + "boost" : 1.0 + } + }, { "bool" : { "should" : [ @@ -160,46 +216,6 @@ } } ], - "should" : [ - { - "bool" : { - "must" : [ - { - "terms" : { - "destination.urn" : [ ], - "boost" : 1.0 - } - }, - { - "terms" : { - "relationshipType" : [ - "DownstreamOf" - ], - "boost" : 1.0 - } - }, - { - "terms" : { - "source.entityType" : [ - "dataset" - ], - "boost" : 1.0 - } - }, - { - "terms" : { - "destination.entityType" : [ - "dataset" - ], - "boost" : 1.0 - } - } - ], - "adjust_pure_negative" : true, - "boost" : 1.0 - } - } - ], "adjust_pure_negative" : true, "boost" : 1.0 } diff --git a/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_empty_filters.json b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_empty_filters.json new file mode 100644 index 0000000000000..ab2841d6602d8 --- /dev/null +++ b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_empty_filters.json @@ -0,0 +1,60 @@ +{ + "bool" : { + "filter" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "must" : [ + { + "terms" : { + "destination.urn" : [ + "urn:li:dataset:test-urn" + ], + "boost" : 1.0 + } + }, + { + "terms" : { + "relationshipType" : [ + "DownstreamOf" + ], + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 
1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "terms" : { + "source.entityType" : [ ], + "boost" : 1.0 + } + }, + { + "terms" : { + "destination.entityType" : [ ], + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } +} \ No newline at end of file diff --git a/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_multiple_filters.json b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_multiple_filters.json new file mode 100644 index 0000000000000..39f595e0e8dd2 --- /dev/null +++ b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_full_multiple_filters.json @@ -0,0 +1,229 @@ +{ + "bool" : { + "filter" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "must" : [ + { + "terms" : { + "source.urn" : [ + "urn:li:dataset:test-urn", + "urn:li:dataset:test-urn2", + "urn:li:dataset:test-urn3" + ], + "boost" : 1.0 + } + }, + { + "terms" : { + "relationshipType" : [ + "DownstreamOf", + "Consumes" + ], + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "terms" : { + "source.entityType" : [ + "dataset", + "dashboard", + "dataJob" + ], + "boost" : 1.0 + } + }, + { + "terms" : { + "destination.entityType" : [ + "dataset", + "dashboard", + "dataJob" + ], + "boost" : 1.0 + } + }, + { + "bool" : { + "should" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "must" : [ + { + "exists" : { + "field" : "createdOn", + "boost" : 1.0 + } + }, + { + "range" : { + "createdOn" : { + "from" : 0, + "to" : 1, + "include_lower" : true, + "include_upper" : true, + "boost" : 1.0 + } + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "bool" : { + "must" : [ + { + "exists" : { + "field" : "updatedOn", + "boost" : 1.0 + } + }, + { + "range" : { + "updatedOn" : { + "from" : 0, + "to" : 1, + "include_lower" : true, + "include_upper" : true, + "boost" : 1.0 + } + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "bool" : { + "must" : [ + { + "bool" : { + "should" : [ + { + "bool" : { + "must_not" : [ + { + "exists" : { + "field" : "createdOn", + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "bool" : { + "must" : [ + { + "term" : { + "createdOn" : { + "value" : 0, + "boost" : 1.0 + } + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "bool" : { + "should" : [ + { + "bool" : { + "must_not" : [ + { + "exists" : { + "field" : "updatedOn", + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "bool" : { + "must" : [ + { + "term" : { + "updatedOn" : { + "value" : 0, + "boost" : 1.0 + } + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + }, + { + "term" : { + "properties.source" : { + "value" : "UI", + "boost" : 1.0 + } + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } +} \ No newline at end of file diff --git 
a/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_limited.json b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_limited.json new file mode 100644 index 0000000000000..95d468ec3dac8 --- /dev/null +++ b/metadata-io/src/test/resources/elasticsearch/sample_filters/lineage_query_filters_limited.json @@ -0,0 +1,32 @@ +{ + "bool" : { + "should" : [ + { + "bool" : { + "must" : [ + { + "terms" : { + "destination.urn" : [ + "urn:li:dataset:test-urn" + ], + "boost" : 1.0 + } + }, + { + "terms" : { + "relationshipType" : [ + "DownstreamOf" + ], + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } + } + ], + "adjust_pure_negative" : true, + "boost" : 1.0 + } +} \ No newline at end of file From f06b5c782099ace00116fd33dda73af5a48e4184 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jan 2024 15:30:11 -0500 Subject: [PATCH 33/48] feat(ingest): improve config loading helpers (#9477) --- .../datahub/configuration/config_loader.py | 48 ++++++++------ .../datahub/ingestion/run/pipeline_config.py | 3 +- .../src/datahub/secret/__init__.py | 0 .../datahub/secret/datahub_secret_store.py | 66 +++++++++++++++++++ .../datahub/secret/datahub_secrets_client.py | 45 +++++++++++++ .../src/datahub/secret/secret_common.py | 59 +++++++++++++++++ .../src/datahub/secret/secret_store.py | 43 ++++++++++++ 7 files changed, 244 insertions(+), 20 deletions(-) create mode 100644 metadata-ingestion/src/datahub/secret/__init__.py create mode 100644 metadata-ingestion/src/datahub/secret/datahub_secret_store.py create mode 100644 metadata-ingestion/src/datahub/secret/datahub_secrets_client.py create mode 100644 metadata-ingestion/src/datahub/secret/secret_common.py create mode 100644 metadata-ingestion/src/datahub/secret/secret_store.py diff --git a/metadata-ingestion/src/datahub/configuration/config_loader.py b/metadata-ingestion/src/datahub/configuration/config_loader.py index 2f41af6f7286e..4266bac0c79ab 100644 --- a/metadata-ingestion/src/datahub/configuration/config_loader.py +++ b/metadata-ingestion/src/datahub/configuration/config_loader.py @@ -1,56 +1,59 @@ import io +import os import pathlib import re import sys import tempfile import unittest.mock -from typing import Any, Dict, Set, Union +from typing import Any, Dict, Mapping, Optional, Set, Union from urllib import parse import requests -from expandvars import UnboundVariable, expandvars +from expandvars import UnboundVariable, expand from datahub.configuration.common import ConfigurationError, ConfigurationMechanism from datahub.configuration.json_loader import JsonConfigurationMechanism from datahub.configuration.toml import TomlConfigurationMechanism from datahub.configuration.yaml import YamlConfigurationMechanism +Environ = Mapping[str, str] -def _resolve_element(element: str) -> str: + +def _resolve_element(element: str, environ: Environ) -> str: if re.search(r"(\$\{).+(\})", element): - return expandvars(element, nounset=True) + return expand(element, nounset=True, environ=environ) elif element.startswith("$"): try: - return expandvars(element, nounset=True) + return expand(element, nounset=True, environ=environ) except UnboundVariable: return element else: return element -def _resolve_list(ele_list: list) -> list: +def _resolve_list(ele_list: list, environ: Environ) -> list: new_v: list = [] for ele in ele_list: if isinstance(ele, str): - new_v.append(_resolve_element(ele)) + new_v.append(_resolve_element(ele, environ=environ)) elif isinstance(ele, list): - 
new_v.append(_resolve_list(ele)) + new_v.append(_resolve_list(ele, environ=environ)) elif isinstance(ele, dict): - new_v.append(resolve_env_variables(ele)) + new_v.append(resolve_env_variables(ele, environ=environ)) else: new_v.append(ele) return new_v -def resolve_env_variables(config: dict) -> dict: +def resolve_env_variables(config: dict, environ: Environ) -> dict: new_dict: Dict[Any, Any] = {} for k, v in config.items(): if isinstance(v, dict): - new_dict[k] = resolve_env_variables(v) + new_dict[k] = resolve_env_variables(v, environ=environ) elif isinstance(v, list): - new_dict[k] = _resolve_list(v) + new_dict[k] = _resolve_list(v, environ=environ) elif isinstance(v, str): - new_dict[k] = _resolve_element(v) + new_dict[k] = _resolve_element(v, environ=environ) else: new_dict[k] = v return new_dict @@ -60,13 +63,20 @@ def list_referenced_env_variables(config: dict) -> Set[str]: # This is a bit of a hack, but expandvars does a bunch of escaping # and other logic that we don't want to duplicate here. - with unittest.mock.patch("expandvars.getenv") as mock_getenv: - mock_getenv.return_value = "mocked_value" + vars = set() + + def mock_get_env(key: str, default: Optional[str] = None) -> str: + vars.add(key) + if default is not None: + return default + return "mocked_value" + + mock = unittest.mock.MagicMock() + mock.get.side_effect = mock_get_env - resolve_env_variables(config) + resolve_env_variables(config, environ=mock) - calls = mock_getenv.mock_calls - return set([call[1][0] for call in calls]) + return vars WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_" @@ -147,7 +157,7 @@ def load_config_file( config = raw_config.copy() if resolve_env_vars: - config = resolve_env_variables(config) + config = resolve_env_variables(config, environ=os.environ) if process_directives: config = _process_directives(config) diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py index f22f94c9e9351..c0f6add6df006 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py @@ -1,5 +1,6 @@ import datetime import logging +import os import uuid from typing import Any, Dict, List, Optional @@ -112,7 +113,7 @@ def default_sink_is_datahub_rest(cls, values: Dict[str, Any]) -> Any: } # resolve env variables if present default_sink_config = config_loader.resolve_env_variables( - default_sink_config + default_sink_config, environ=os.environ ) values["sink"] = default_sink_config diff --git a/metadata-ingestion/src/datahub/secret/__init__.py b/metadata-ingestion/src/datahub/secret/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/secret/datahub_secret_store.py b/metadata-ingestion/src/datahub/secret/datahub_secret_store.py new file mode 100644 index 0000000000000..8301ff2d9dc1a --- /dev/null +++ b/metadata-ingestion/src/datahub/secret/datahub_secret_store.py @@ -0,0 +1,66 @@ +import logging +from typing import Any, Dict, List, Optional, Union + +from pydantic import BaseModel, validator + +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.secret.datahub_secrets_client import DataHubSecretsClient +from datahub.secret.secret_store import SecretStore + +logger = logging.getLogger(__name__) + + +class DataHubSecretStoreConfig(BaseModel): + graph_client: Optional[DataHubGraph] = None + graph_client_config: Optional[DatahubClientConfig] = None + + 
class Config: + arbitrary_types_allowed = True + + @validator("graph_client") + def check_graph_connection(cls, v: DataHubGraph) -> DataHubGraph: + if v is not None: + v.test_connection() + return v + + +# An implementation of SecretStore that fetches secrets from DataHub +class DataHubSecretStore(SecretStore): + # Client for fetching secrets from DataHub GraphQL API + client: DataHubSecretsClient + + def __init__(self, config: DataHubSecretStoreConfig): + # Attempt to establish an outbound connection to DataHub and create a client. + if config.graph_client is not None: + self.client = DataHubSecretsClient(graph=config.graph_client) + elif config.graph_client_config is not None: + graph = DataHubGraph(config.graph_client_config) + self.client = DataHubSecretsClient(graph) + else: + raise Exception( + "Invalid configuration provided: unable to construct DataHub Graph Client." + ) + + def get_secret_values(self, secret_names: List[str]) -> Dict[str, Union[str, None]]: + # Fetch the secret from DataHub, using the credentials provided in the configuration. + # Use the GraphQL API. + try: + return self.client.get_secret_values(secret_names) + except Exception: + # Failed to resolve secrets, return empty. + logger.exception( + f"Caught exception while attempting to fetch secrets from DataHub. Secret names: {secret_names}" + ) + return {} + + def get_secret_value(self, secret_name: str) -> Union[str, None]: + secret_value_dict = self.get_secret_values([secret_name]) + return secret_value_dict.get(secret_name) + + def get_id(self) -> str: + return "datahub" + + @classmethod + def create(cls, config: Any) -> "DataHubSecretStore": + config = DataHubSecretStoreConfig.parse_obj(config) + return cls(config) diff --git a/metadata-ingestion/src/datahub/secret/datahub_secrets_client.py b/metadata-ingestion/src/datahub/secret/datahub_secrets_client.py new file mode 100644 index 0000000000000..c60aeff5db2f3 --- /dev/null +++ b/metadata-ingestion/src/datahub/secret/datahub_secrets_client.py @@ -0,0 +1,45 @@ +from typing import Dict, List, Optional + +from datahub.ingestion.graph.client import DataHubGraph + + +class DataHubSecretsClient: + """Class used to fetch secrets from DataHub.""" + + graph: DataHubGraph + + def __init__(self, graph: DataHubGraph): + self.graph = graph + + def get_secret_values(self, secret_names: List[str]) -> Dict[str, Optional[str]]: + if len(secret_names) == 0: + return {} + + request_json = { + "query": """query getSecretValues($input: GetSecretValuesInput!) {\n + getSecretValues(input: $input) {\n + name\n + value\n + }\n + }""", + "variables": {"input": {"secrets": secret_names}}, + } + # TODO: Use graph.execute_graphql() instead. 
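+        # Illustrative response shape, inferred from the parsing below (values are made up):
+        #   {"data": {"getSecretValues": [{"name": "MY_SECRET", "value": "..."}]}}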
+
+        # Fetch secrets using the GraphQL API.
+        response = self.graph._session.post(
+            f"{self.graph.config.server}/api/graphql", json=request_json
+        )
+        response.raise_for_status()
+
+        # Verify response
+        res_data = response.json()
+        if "errors" in res_data:
+            raise Exception("Failed to retrieve secrets from DataHub.")
+
+        # Convert the list of name/value secret pairs into a dict and return it.
+        secret_value_list = res_data["data"]["getSecretValues"]
+        secret_value_dict = dict()
+        for secret_value in secret_value_list:
+            secret_value_dict[secret_value["name"]] = secret_value["value"]
+        return secret_value_dict
diff --git a/metadata-ingestion/src/datahub/secret/secret_common.py b/metadata-ingestion/src/datahub/secret/secret_common.py
new file mode 100644
index 0000000000000..2f7a584d87538
--- /dev/null
+++ b/metadata-ingestion/src/datahub/secret/secret_common.py
@@ -0,0 +1,59 @@
+import json
+import logging
+from typing import List
+
+from datahub.configuration.config_loader import (
+    list_referenced_env_variables,
+    resolve_env_variables,
+)
+from datahub.secret.secret_store import SecretStore
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_secrets(secret_names: List[str], secret_stores: List[SecretStore]) -> dict:
+    # Attempt to resolve each secret by checking every configured secret store.
+    final_secret_values = dict({})
+
+    for secret_store in secret_stores:
+        try:
+            # Retrieve secret values from the store.
+            secret_values_dict = secret_store.get_secret_values(secret_names)
+            # Overlay secret values from each store, if not None.
+            for secret_name, secret_value in secret_values_dict.items():
+                if secret_value is not None:
+                    # HACK: We previously, incorrectly replaced newline characters with
+                    # a r'\n' string. This was a lossy conversion, since we can no longer
+                    # distinguish between a newline character and the literal '\n' in
+                    # the secret value. For now, we assume that all r'\n' strings are
+                    # actually newline characters. This will break if a secret value
+                    # genuinely contains the string r'\n'.
+                    # Once this PR https://github.com/datahub-project/datahub/pull/9484
+                    # has baked for a while, we should be able to remove this hack.
+                    # TODO: This logic should live in the DataHub secret client/store,
+                    # not the general secret resolution logic.
+                    secret_value = secret_value.replace(r"\n", "\n")
+
+                    final_secret_values[secret_name] = secret_value
+        except Exception:
+            logger.exception(
+                f"Failed to fetch secret values from secret store with id {secret_store.get_id()}"
+            )
+    return final_secret_values
+
+
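A minimal sketch of the environ-aware expansion these helpers build on, using the `resolve_env_variables(config, environ=...)` signature introduced earlier in this commit; the recipe and secret value below are invented, and any mapping (such as the values a SecretStore resolves) can stand in for `os.environ`:

```python
from datahub.configuration.config_loader import resolve_env_variables

recipe = {"source": {"config": {"password": "${MY_SECRET}"}}}
# Resolve ${MY_SECRET} against an in-memory mapping instead of os.environ.
resolved = resolve_env_variables(recipe, environ={"MY_SECRET": "hunter2"})
assert resolved["source"]["config"]["password"] == "hunter2"
```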
+def resolve_recipe(recipe: str, secret_stores: List[SecretStore]) -> dict:
+    json_recipe_raw = json.loads(recipe)
+
+    # 1. Extract all secrets that need to be resolved.
+    secrets_to_resolve = list_referenced_env_variables(json_recipe_raw)
+
+    # 2. Resolve secret values.
+    secret_values_dict = resolve_secrets(list(secrets_to_resolve), secret_stores)
+
+    # 3. Substitute secrets into the recipe file.
+    json_recipe_resolved = resolve_env_variables(
+        json_recipe_raw, environ=secret_values_dict
+    )
+
+    return json_recipe_resolved
diff --git a/metadata-ingestion/src/datahub/secret/secret_store.py b/metadata-ingestion/src/datahub/secret/secret_store.py
new file mode 100644
index 0000000000000..d6d61d8c3c924
--- /dev/null
+++ b/metadata-ingestion/src/datahub/secret/secret_store.py
@@ -0,0 +1,43 @@
+from abc import abstractmethod
+from typing import Dict, List, Optional
+
+from datahub.configuration.common import ConfigModel
+
+
+class SecretStoreConfig(ConfigModel):
+    type: str
+    config: Dict
+
+
+class SecretStore:
+    """
+    Abstract base class for a Secret Store, i.e. a class that resolves "secret" values by name.
+    """
+
+    @classmethod
+    @abstractmethod
+    def create(cls, configs: dict) -> "SecretStore":
+        pass
+
+    @abstractmethod
+    def get_secret_values(self, secret_names: List[str]) -> Dict[str, Optional[str]]:
+        """
+        Attempt to fetch a group of secrets, returning a dictionary that maps each
+        secret name to its value, or to None if it cannot be resolved by the store.
+        """
+
+    def get_secret_value(self, secret_name: str) -> Optional[str]:
+        secret_value_dict = self.get_secret_values([secret_name])
+        return secret_value_dict.get(secret_name)
+
+    @abstractmethod
+    def get_id(self) -> str:
+        """
+        Get a unique name or id associated with the Secret Store.
+        """
+
+    @abstractmethod
+    def close(self) -> None:
+        """
+        Wraps up the store, releasing any resources it holds.
+        """

From 822d0eb014080fef030cdee84731878787c38c61 Mon Sep 17 00:00:00 2001
From: RyanHolstien
Date: Wed, 3 Jan 2024 15:11:07 -0600
Subject: [PATCH 34/48] feat(patch): add dashboardInfo and chartInfo support for patch (#9536)

---
 .../registry/SnapshotEntityRegistry.java      |   4 +
 .../template/AspectTemplateEngine.java        |   4 +-
 .../template/chart/ChartInfoTemplate.java     |  82 ++++
 .../dashboard/DashboardInfoTemplate.java      | 105 +++++
 .../datajob/DataJobInputOutputTemplate.java   |   2 -
 .../registry/patch/ChartInfoTemplateTest.java |  41 ++
 .../patch/DashboardInfoTemplateTest.java      |  41 ++
 .../UpstreamLineageTemplateTest.java          |   2 +-
 .../src/datahub/specific/chart.py             | 316 ++++++++++++++
 .../src/datahub/specific/dashboard.py         | 410 ++++++++++++++++++
 .../src/datahub/specific/datajob.py           |  12 +-
 .../src/datahub/specific/dataproduct.py       |  10 +-
 .../src/datahub/specific/dataset.py           |   8 +-
 .../src/datahub/specific/ownership.py         |   2 +-
 .../golden_dataproduct_out_upsert.json        |   2 +-
 .../unit/patch/complex_dataset_patch.json     |   2 +-
 .../tests/unit/patch/test_patch_builder.py    |  47 +-
 .../patch/chart/ChartInfoPatchBuilder.java    |  41 ++
 .../client/patch/common/PatchUtil.java        |  84 ++++
 .../dashboard/DashboardInfoPatchBuilder.java  | 103 +++++
 .../DataJobInputOutputPatchBuilder.java       |  73 +---
 .../java/datahub/client/patch/PatchTest.java  |  89 ++++
 22 files changed, 1385 insertions(+), 95 deletions(-)
 create mode 100644 entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/chart/ChartInfoTemplate.java
 create mode 100644 entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dashboard/DashboardInfoTemplate.java
 create mode 100644 entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/ChartInfoTemplateTest.java
 create mode 100644 entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/DashboardInfoTemplateTest.java
 rename entity-registry/src/test/java/com/linkedin/metadata/models/registry/{ => patch}/UpstreamLineageTemplateTest.java (99%)
 create mode 100644 metadata-ingestion/src/datahub/specific/chart.py
 create mode 
100644 metadata-ingestion/src/datahub/specific/dashboard.py create mode 100644 metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/chart/ChartInfoPatchBuilder.java create mode 100644 metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/common/PatchUtil.java create mode 100644 metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dashboard/DashboardInfoPatchBuilder.java diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/SnapshotEntityRegistry.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/SnapshotEntityRegistry.java index cfc2c0901ce0d..bb0113abc9ed6 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/SnapshotEntityRegistry.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/SnapshotEntityRegistry.java @@ -12,9 +12,11 @@ import com.linkedin.metadata.models.EventSpec; import com.linkedin.metadata.models.registry.template.AspectTemplateEngine; import com.linkedin.metadata.models.registry.template.Template; +import com.linkedin.metadata.models.registry.template.chart.ChartInfoTemplate; import com.linkedin.metadata.models.registry.template.common.GlobalTagsTemplate; import com.linkedin.metadata.models.registry.template.common.GlossaryTermsTemplate; import com.linkedin.metadata.models.registry.template.common.OwnershipTemplate; +import com.linkedin.metadata.models.registry.template.dashboard.DashboardInfoTemplate; import com.linkedin.metadata.models.registry.template.dataflow.DataFlowInfoTemplate; import com.linkedin.metadata.models.registry.template.datajob.DataJobInfoTemplate; import com.linkedin.metadata.models.registry.template.datajob.DataJobInputOutputTemplate; @@ -79,6 +81,8 @@ private AspectTemplateEngine populateTemplateEngine(Map aspe aspectSpecTemplateMap.put(DATA_JOB_INFO_ASPECT_NAME, new DataJobInfoTemplate()); aspectSpecTemplateMap.put( DATA_PRODUCT_PROPERTIES_ASPECT_NAME, new DataProductPropertiesTemplate()); + aspectSpecTemplateMap.put(CHART_INFO_ASPECT_NAME, new ChartInfoTemplate()); + aspectSpecTemplateMap.put(DASHBOARD_INFO_ASPECT_NAME, new DashboardInfoTemplate()); aspectSpecTemplateMap.put(DATA_JOB_INPUT_OUTPUT_ASPECT_NAME, new DataJobInputOutputTemplate()); return new AspectTemplateEngine(aspectSpecTemplateMap); } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java index 95849a94bae29..029eb688c5291 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java @@ -32,7 +32,9 @@ public class AspectTemplateEngine { DATA_FLOW_INFO_ASPECT_NAME, DATA_JOB_INFO_ASPECT_NAME, DATA_PRODUCT_PROPERTIES_ASPECT_NAME, - DATA_JOB_INPUT_OUTPUT_ASPECT_NAME) + DATA_JOB_INPUT_OUTPUT_ASPECT_NAME, + CHART_INFO_ASPECT_NAME, + DASHBOARD_INFO_ASPECT_NAME) .collect(Collectors.toSet()); private final Map> _aspectTemplateMap; diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/chart/ChartInfoTemplate.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/chart/ChartInfoTemplate.java new file mode 100644 index 0000000000000..654f923e7322d --- /dev/null +++ 
b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/chart/ChartInfoTemplate.java
@@ -0,0 +1,82 @@
+package com.linkedin.metadata.models.registry.template.chart;
+
+import static com.linkedin.metadata.Constants.*;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.linkedin.chart.ChartDataSourceTypeArray;
+import com.linkedin.chart.ChartInfo;
+import com.linkedin.common.AuditStamp;
+import com.linkedin.common.ChangeAuditStamps;
+import com.linkedin.common.EdgeArray;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.data.template.RecordTemplate;
+import com.linkedin.metadata.models.registry.template.ArrayMergingTemplate;
+import java.util.Collections;
+import javax.annotation.Nonnull;
+
+public class ChartInfoTemplate implements ArrayMergingTemplate<ChartInfo> {
+
+  private static final String INPUT_EDGES_FIELD_NAME = "inputEdges";
+  private static final String INPUTS_FIELD_NAME = "inputs";
+  private static final String DESTINATION_URN_FIELD_NAME = "destinationUrn";
+
+  @Override
+  public ChartInfo getSubtype(RecordTemplate recordTemplate) throws ClassCastException {
+    if (recordTemplate instanceof ChartInfo) {
+      return (ChartInfo) recordTemplate;
+    }
+    throw new ClassCastException("Unable to cast RecordTemplate to ChartInfo");
+  }
+
+  @Override
+  public Class<ChartInfo> getTemplateType() {
+    return ChartInfo.class;
+  }
+
+  @Nonnull
+  @Override
+  public ChartInfo getDefault() {
+    ChartInfo chartInfo = new ChartInfo();
+    chartInfo.setDescription("");
+    chartInfo.setTitle("");
+    ChangeAuditStamps changeAuditStamps = new ChangeAuditStamps();
+    AuditStamp auditStamp =
+        new AuditStamp()
+            .setActor(UrnUtils.getUrn(SYSTEM_ACTOR))
+            .setTime(System.currentTimeMillis());
+    changeAuditStamps.setCreated(auditStamp).setLastModified(auditStamp);
+    chartInfo.setLastModified(changeAuditStamps);
+    chartInfo.setInputEdges(new EdgeArray());
+
+    // Deprecated fields
+    chartInfo.setInputs(new ChartDataSourceTypeArray());
+
+    return chartInfo;
+  }
+
+  @Nonnull
+  @Override
+  public JsonNode transformFields(JsonNode baseNode) {
+    JsonNode transformedNode =
+        arrayFieldToMap(
+            baseNode,
+            INPUT_EDGES_FIELD_NAME,
+            Collections.singletonList(DESTINATION_URN_FIELD_NAME));
+
+    transformedNode = arrayFieldToMap(transformedNode, INPUTS_FIELD_NAME, Collections.emptyList());
+
+    return transformedNode;
+  }
+
+  @Nonnull
+  @Override
+  public JsonNode rebaseFields(JsonNode patched) {
+    JsonNode rebasedNode =
+        transformedMapToArray(
+            patched, INPUT_EDGES_FIELD_NAME, Collections.singletonList(DESTINATION_URN_FIELD_NAME));
+
+    rebasedNode = transformedMapToArray(rebasedNode, INPUTS_FIELD_NAME, Collections.emptyList());
+
+    return rebasedNode;
+  }
+}
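To see how this template pairs with the client side, here is a minimal sketch of driving the new Python `ChartPatchBuilder` added later in this same commit; the chart and dataset urns are invented, and it assumes the `build()` iterator that the existing patch builders expose:

```python
from datahub.emitter.mce_builder import make_chart_urn, make_dataset_urn
from datahub.specific.chart import ChartPatchBuilder

builder = ChartPatchBuilder(make_chart_urn("looker", "my-chart"))
# Emits a JSON Patch "add" op against /inputEdges/<encoded urn>, the keyed
# path that ChartInfoTemplate's arrayFieldToMap/transformedMapToArray handle.
builder.add_input_edge(make_dataset_urn("hive", "fct_users_created"))
for mcp in builder.build():
    print(mcp.aspectName, mcp.changeType)
```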
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dashboard/DashboardInfoTemplate.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dashboard/DashboardInfoTemplate.java
new file mode 100644
index 0000000000000..eae04b5285adf
--- /dev/null
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/dashboard/DashboardInfoTemplate.java
@@ -0,0 +1,105 @@
+package com.linkedin.metadata.models.registry.template.dashboard;
+
+import static com.linkedin.metadata.Constants.*;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.linkedin.common.AuditStamp;
+import com.linkedin.common.ChangeAuditStamps;
+import com.linkedin.common.ChartUrnArray;
+import com.linkedin.common.EdgeArray;
+import com.linkedin.common.UrnArray;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.dashboard.DashboardInfo;
+import com.linkedin.data.template.RecordTemplate;
+import com.linkedin.metadata.models.registry.template.ArrayMergingTemplate;
+import java.util.Collections;
+import javax.annotation.Nonnull;
+
+public class DashboardInfoTemplate implements ArrayMergingTemplate<DashboardInfo> {
+
+  private static final String CHART_EDGES_FIELD_NAME = "chartEdges";
+  private static final String DATASET_EDGES_FIELD_NAME = "datasetEdges";
+  private static final String DATASETS_FIELD_NAME = "datasets";
+  private static final String CHARTS_FIELD_NAME = "charts";
+  private static final String DESTINATION_URN_FIELD_NAME = "destinationUrn";
+
+  @Override
+  public DashboardInfo getSubtype(RecordTemplate recordTemplate) throws ClassCastException {
+    if (recordTemplate instanceof DashboardInfo) {
+      return (DashboardInfo) recordTemplate;
+    }
+    throw new ClassCastException("Unable to cast RecordTemplate to DashboardInfo");
+  }
+
+  @Override
+  public Class<DashboardInfo> getTemplateType() {
+    return DashboardInfo.class;
+  }
+
+  @Nonnull
+  @Override
+  public DashboardInfo getDefault() {
+    DashboardInfo dashboardInfo = new DashboardInfo();
+    dashboardInfo.setTitle("");
+    dashboardInfo.setDescription("");
+    ChangeAuditStamps changeAuditStamps = new ChangeAuditStamps();
+    AuditStamp auditStamp =
+        new AuditStamp()
+            .setActor(UrnUtils.getUrn(SYSTEM_ACTOR))
+            .setTime(System.currentTimeMillis());
+    changeAuditStamps.setCreated(auditStamp).setLastModified(auditStamp);
+    dashboardInfo.setLastModified(changeAuditStamps);
+    dashboardInfo.setChartEdges(new EdgeArray());
+    dashboardInfo.setDatasetEdges(new EdgeArray());
+
+    // Deprecated fields
+    dashboardInfo.setDatasets(new UrnArray());
+    dashboardInfo.setCharts(new ChartUrnArray());
+
+    return dashboardInfo;
+  }
+
+  @Nonnull
+  @Override
+  public JsonNode transformFields(JsonNode baseNode) {
+    JsonNode transformedNode =
+        arrayFieldToMap(
+            baseNode,
+            CHART_EDGES_FIELD_NAME,
+            Collections.singletonList(DESTINATION_URN_FIELD_NAME));
+
+    transformedNode =
+        arrayFieldToMap(
+            transformedNode,
+            DATASET_EDGES_FIELD_NAME,
+            Collections.singletonList(DESTINATION_URN_FIELD_NAME));
+
+    transformedNode =
+        arrayFieldToMap(transformedNode, DATASETS_FIELD_NAME, Collections.emptyList());
+
+    transformedNode = arrayFieldToMap(transformedNode, CHARTS_FIELD_NAME, Collections.emptyList());
+
+    return transformedNode;
+  }
+
+  @Nonnull
+  @Override
+  public JsonNode rebaseFields(JsonNode patched) {
+    JsonNode rebasedNode =
+        transformedMapToArray(
+            patched,
+            DATASET_EDGES_FIELD_NAME,
+            Collections.singletonList(DESTINATION_URN_FIELD_NAME));
+
+    rebasedNode =
+        transformedMapToArray(
+            rebasedNode,
+            CHART_EDGES_FIELD_NAME,
+            Collections.singletonList(DESTINATION_URN_FIELD_NAME));
+
+    rebasedNode = transformedMapToArray(rebasedNode, DATASETS_FIELD_NAME, Collections.emptyList());
+    rebasedNode = transformedMapToArray(rebasedNode, CHARTS_FIELD_NAME, Collections.emptyList());
+
+    return rebasedNode;
+  }
+}
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/datajob/DataJobInputOutputTemplate.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/datajob/DataJobInputOutputTemplate.java
index 889297734e977..6761892b1b31b 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/datajob/DataJobInputOutputTemplate.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/datajob/DataJobInputOutputTemplate.java
@@ -23,8
+23,6 @@ public class DataJobInputOutputTemplate implements ArrayMergingTemplate patchOperations = new ArrayList<>(); + ObjectNode edgeNode = instance.objectNode(); + edgeNode.put( + "destinationUrn", "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"); + JsonPatchOperation operation = + new AddOperation( + new JsonPointer( + "/inputEdges/urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"), + edgeNode); + patchOperations.add(operation); + JsonPatch patch = new JsonPatch(patchOperations); + ChartInfo result = chartInfoTemplate.applyPatch(dashboardInfo, patch); + + Assert.assertEquals( + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"), + result.getInputEdges().get(0).getDestinationUrn()); + } +} diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/DashboardInfoTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/DashboardInfoTemplateTest.java new file mode 100644 index 0000000000000..962ff1d40d873 --- /dev/null +++ b/entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/DashboardInfoTemplateTest.java @@ -0,0 +1,41 @@ +package com.linkedin.metadata.models.registry.patch; + +import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; + +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.github.fge.jackson.jsonpointer.JsonPointer; +import com.github.fge.jsonpatch.AddOperation; +import com.github.fge.jsonpatch.JsonPatch; +import com.github.fge.jsonpatch.JsonPatchOperation; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.dashboard.DashboardInfo; +import com.linkedin.metadata.models.registry.template.dashboard.DashboardInfoTemplate; +import java.util.ArrayList; +import java.util.List; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class DashboardInfoTemplateTest { + + @Test + public void testDashboardInfoTemplate() throws Exception { + DashboardInfoTemplate dashboardInfoTemplate = new DashboardInfoTemplate(); + DashboardInfo dashboardInfo = dashboardInfoTemplate.getDefault(); + List patchOperations = new ArrayList<>(); + ObjectNode edgeNode = instance.objectNode(); + edgeNode.put( + "destinationUrn", "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"); + JsonPatchOperation operation = + new AddOperation( + new JsonPointer( + "/datasetEdges/urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"), + edgeNode); + patchOperations.add(operation); + JsonPatch patch = new JsonPatch(patchOperations); + DashboardInfo result = dashboardInfoTemplate.applyPatch(dashboardInfo, patch); + + Assert.assertEquals( + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"), + result.getDatasetEdges().get(0).getDestinationUrn()); + } +} diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/registry/UpstreamLineageTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/UpstreamLineageTemplateTest.java similarity index 99% rename from entity-registry/src/test/java/com/linkedin/metadata/models/registry/UpstreamLineageTemplateTest.java rename to entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/UpstreamLineageTemplateTest.java index 07982a87be56c..8f410ae8da085 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/models/registry/UpstreamLineageTemplateTest.java +++ 
b/entity-registry/src/test/java/com/linkedin/metadata/models/registry/patch/UpstreamLineageTemplateTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.models.registry; +package com.linkedin.metadata.models.registry.patch; import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; diff --git a/metadata-ingestion/src/datahub/specific/chart.py b/metadata-ingestion/src/datahub/specific/chart.py new file mode 100644 index 0000000000000..5dc394e8ebe0f --- /dev/null +++ b/metadata-ingestion/src/datahub/specific/chart.py @@ -0,0 +1,316 @@ +import time +from typing import Dict, List, Optional, TypeVar, Union +from urllib.parse import quote + +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.metadata.schema_classes import ( + AuditStampClass, + ChartInfoClass as ChartInfo, + EdgeClass as Edge, + GlobalTagsClass as GlobalTags, + GlossaryTermAssociationClass as Term, + GlossaryTermsClass as GlossaryTerms, + KafkaAuditHeaderClass, + OwnerClass as Owner, + OwnershipTypeClass, + SystemMetadataClass, + TagAssociationClass as Tag, +) +from datahub.specific.custom_properties import CustomPropertiesPatchHelper +from datahub.specific.ownership import OwnershipPatchHelper +from datahub.utilities.urns.tag_urn import TagUrn +from datahub.utilities.urns.urn import Urn + +T = TypeVar("T", bound=MetadataPatchProposal) + + +class ChartPatchBuilder(MetadataPatchProposal): + def __init__( + self, + urn: str, + system_metadata: Optional[SystemMetadataClass] = None, + audit_header: Optional[KafkaAuditHeaderClass] = None, + ) -> None: + """ + Initializes a ChartPatchBuilder instance. + + Args: + urn: The URN of the chart + system_metadata: The system metadata of the chart (optional). + audit_header: The Kafka audit header of the chart (optional). + """ + super().__init__( + urn, "chart", system_metadata=system_metadata, audit_header=audit_header + ) + self.custom_properties_patch_helper = CustomPropertiesPatchHelper( + self, ChartInfo.ASPECT_NAME + ) + self.ownership_patch_helper = OwnershipPatchHelper(self) + + def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass: + """ + Creates an AuditStampClass instance with the current timestamp and other default values. + + Args: + message: The message associated with the audit stamp (optional). + + Returns: + An instance of AuditStampClass. + """ + return AuditStampClass( + time=int(time.time() * 1000.0), + actor="urn:li:corpuser:datahub", + message=message, + ) + + def _ensure_urn_type( + self, entity_type: str, edges: List[Edge], context: str + ) -> None: + """ + Ensures that the destination URNs in the given edges have the specified entity type. + + Args: + entity_type: The entity type to check against. + edges: A list of Edge objects. + context: The context or description of the operation. + + Raises: + ValueError: If any of the destination URNs is not of the specified entity type. + """ + for e in edges: + urn = Urn.create_from_string(e.destinationUrn) + if not urn.get_type() == entity_type: + raise ValueError( + f"{context}: {e.destinationUrn} is not of type {entity_type}" + ) + + def add_owner(self, owner: Owner) -> "ChartPatchBuilder": + """ + Adds an owner to the ChartPatchBuilder. + + Args: + owner: The Owner object to add. + + Returns: + The ChartPatchBuilder instance. 
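An aside on the entity-registry templates above: `transformFields` re-keys each edge array by its `destinationUrn` so a JSON Patch can address a single edge, and `rebaseFields` restores the original array shape afterwards. A rough Python sketch of that round-trip (function and variable names here are illustrative; the real implementation is the Java `ArrayMergingTemplate`):

```python
# Illustrative sketch of the array<->map transform used by the templates above.
from urllib.parse import quote

def array_field_to_map(aspect: dict, field: str, key: str = "destinationUrn") -> dict:
    """Re-key an edge array by destinationUrn so a patch can target one edge."""
    out = dict(aspect)
    out[field] = {edge[key]: edge for edge in aspect.get(field, [])}
    return out

def transformed_map_to_array(aspect: dict, field: str) -> dict:
    """Inverse transform: restore the aspect's original array shape."""
    out = dict(aspect)
    out[field] = list(aspect.get(field, {}).values())
    return out

urn = "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)"
info = {"datasetEdges": [{"destinationUrn": urn}]}
keyed = array_field_to_map(info, "datasetEdges")
assert urn in keyed["datasetEdges"]
# A patch path can now name the edge directly:
path = "/datasetEdges/" + quote(urn, safe="")
assert transformed_map_to_array(keyed, "datasetEdges") == info
```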
+ """ + self.ownership_patch_helper.add_owner(owner) + return self + + def remove_owner( + self, owner: str, owner_type: Optional[OwnershipTypeClass] = None + ) -> "ChartPatchBuilder": + """ + Removes an owner from the ChartPatchBuilder. + + Args: + owner: The owner to remove. + owner_type: The ownership type of the owner (optional). + + Returns: + The ChartPatchBuilder instance. + + Notes: + `owner_type` is optional. + """ + self.ownership_patch_helper.remove_owner(owner, owner_type) + return self + + def set_owners(self, owners: List[Owner]) -> "ChartPatchBuilder": + """ + Sets the owners of the ChartPatchBuilder. + + Args: + owners: A list of Owner objects. + + Returns: + The ChartPatchBuilder instance. + """ + self.ownership_patch_helper.set_owners(owners) + return self + + def add_input_edge(self, input: Union[Edge, Urn, str]) -> "ChartPatchBuilder": + """ + Adds an input to the ChartPatchBuilder. + + Args: + input: The input, which can be an Edge object, Urn object, or a string. + + Returns: + The ChartPatchBuilder instance. + + Notes: + If `input` is an Edge object, it is used directly. If `input` is a Urn object or string, + it is converted to an Edge object and added with default audit stamps. + """ + if isinstance(input, Edge): + input_urn: str = input.destinationUrn + input_edge: Edge = input + elif isinstance(input, (Urn, str)): + input_urn = str(input) + + input_edge = Edge( + destinationUrn=input_urn, + created=self._mint_auditstamp(), + lastModified=self._mint_auditstamp(), + ) + + self._ensure_urn_type("dataset", [input_edge], "add_dataset") + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path=f"/inputEdges/{quote(input_urn, safe='')}", + value=input_urn, + ) + return self + + def remove_input_edge(self, input: Union[str, Urn]) -> "ChartPatchBuilder": + """ + Removes an input from the ChartPatchBuilder. + + Args: + input: The input to remove, specified as a string or Urn object. + + Returns: + The ChartPatchBuilder instance. + """ + self._add_patch( + ChartInfo.ASPECT_NAME, + "remove", + path=f"/inputEdges/{input}", + value={}, + ) + return self + + def set_input_edges(self, inputs: List[Edge]) -> "ChartPatchBuilder": + """ + Sets the input edges for the ChartPatchBuilder. + + Args: + inputs: A list of Edge objects representing the input edges. + + Returns: + The ChartPatchBuilder instance. + + Notes: + This method replaces all existing inputs with the given inputs. + """ + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/inputEdges", + value=inputs, + ) + return self + + def add_tag(self, tag: Tag) -> "ChartPatchBuilder": + """ + Adds a tag to the ChartPatchBuilder. + + Args: + tag: The Tag object representing the tag to be added. + + Returns: + The ChartPatchBuilder instance. + """ + self._add_patch( + GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag + ) + return self + + def remove_tag(self, tag: Union[str, Urn]) -> "ChartPatchBuilder": + """ + Removes a tag from the ChartPatchBuilder. + + Args: + tag: The tag to remove, specified as a string or Urn object. + + Returns: + The ChartPatchBuilder instance. + """ + if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): + tag = TagUrn.create_from_id(tag) + self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={}) + return self + + def add_term(self, term: Term) -> "ChartPatchBuilder": + """ + Adds a glossary term to the ChartPatchBuilder. + + Args: + term: The Term object representing the glossary term to be added. + + Returns: + The ChartPatchBuilder instance. 
+ """ + self._add_patch( + GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term + ) + return self + + def remove_term(self, term: Union[str, Urn]) -> "ChartPatchBuilder": + """ + Removes a glossary term from the ChartPatchBuilder. + + Args: + term: The term to remove, specified as a string or Urn object. + + Returns: + The ChartPatchBuilder instance. + """ + if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): + term = "urn:li:glossaryTerm:" + term + self._add_patch( + GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={} + ) + return self + + def set_custom_properties( + self, custom_properties: Dict[str, str] + ) -> "ChartPatchBuilder": + """ + Sets the custom properties for the ChartPatchBuilder. + + Args: + custom_properties: A dictionary containing the custom properties to be set. + + Returns: + The ChartPatchBuilder instance. + + Notes: + This method replaces all existing custom properties with the given dictionary. + """ + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/customProperties", + value=custom_properties, + ) + return self + + def add_custom_property(self, key: str, value: str) -> "ChartPatchBuilder": + """ + Adds a custom property to the ChartPatchBuilder. + + Args: + key: The key of the custom property. + value: The value of the custom property. + + Returns: + The ChartPatchBuilder instance. + """ + self.custom_properties_patch_helper.add_property(key, value) + return self + + def remove_custom_property(self, key: str) -> "ChartPatchBuilder": + """ + Removes a custom property from the ChartPatchBuilder. + + Args: + key: The key of the custom property to remove. + + Returns: + The ChartPatchBuilder instance. + """ + self.custom_properties_patch_helper.remove_property(key) + return self diff --git a/metadata-ingestion/src/datahub/specific/dashboard.py b/metadata-ingestion/src/datahub/specific/dashboard.py new file mode 100644 index 0000000000000..855dcc5685cea --- /dev/null +++ b/metadata-ingestion/src/datahub/specific/dashboard.py @@ -0,0 +1,410 @@ +import time +from typing import Dict, List, Optional, TypeVar, Union +from urllib.parse import quote + +from datahub.emitter.mcp_patch_builder import MetadataPatchProposal +from datahub.metadata.schema_classes import ( + AuditStampClass, + DashboardInfoClass as DashboardInfo, + EdgeClass as Edge, + GlobalTagsClass as GlobalTags, + GlossaryTermAssociationClass as Term, + GlossaryTermsClass as GlossaryTerms, + KafkaAuditHeaderClass, + OwnerClass as Owner, + OwnershipTypeClass, + SystemMetadataClass, + TagAssociationClass as Tag, +) +from datahub.specific.custom_properties import CustomPropertiesPatchHelper +from datahub.specific.ownership import OwnershipPatchHelper +from datahub.utilities.urns.tag_urn import TagUrn +from datahub.utilities.urns.urn import Urn + +T = TypeVar("T", bound=MetadataPatchProposal) + + +class DashboardPatchBuilder(MetadataPatchProposal): + def __init__( + self, + urn: str, + system_metadata: Optional[SystemMetadataClass] = None, + audit_header: Optional[KafkaAuditHeaderClass] = None, + ) -> None: + """ + Initializes a DashboardPatchBuilder instance. + + Args: + urn: The URN of the dashboard + system_metadata: The system metadata of the dashboard (optional). + audit_header: The Kafka audit header of the dashboard (optional). 
+ """ + super().__init__( + urn, "dashboard", system_metadata=system_metadata, audit_header=audit_header + ) + self.custom_properties_patch_helper = CustomPropertiesPatchHelper( + self, DashboardInfo.ASPECT_NAME + ) + self.ownership_patch_helper = OwnershipPatchHelper(self) + + def _mint_auditstamp(self, message: Optional[str] = None) -> AuditStampClass: + """ + Creates an AuditStampClass instance with the current timestamp and other default values. + + Args: + message: The message associated with the audit stamp (optional). + + Returns: + An instance of AuditStampClass. + """ + return AuditStampClass( + time=int(time.time() * 1000.0), + actor="urn:li:corpuser:datahub", + message=message, + ) + + def _ensure_urn_type( + self, entity_type: str, edges: List[Edge], context: str + ) -> None: + """ + Ensures that the destination URNs in the given edges have the specified entity type. + + Args: + entity_type: The entity type to check against. + edges: A list of Edge objects. + context: The context or description of the operation. + + Raises: + ValueError: If any of the destination URNs is not of the specified entity type. + """ + for e in edges: + urn = Urn.create_from_string(e.destinationUrn) + if not urn.get_type() == entity_type: + raise ValueError( + f"{context}: {e.destinationUrn} is not of type {entity_type}" + ) + + def add_owner(self, owner: Owner) -> "DashboardPatchBuilder": + """ + Adds an owner to the DashboardPatchBuilder. + + Args: + owner: The Owner object to add. + + Returns: + The DashboardPatchBuilder instance. + """ + self.ownership_patch_helper.add_owner(owner) + return self + + def remove_owner( + self, owner: str, owner_type: Optional[OwnershipTypeClass] = None + ) -> "DashboardPatchBuilder": + """ + Removes an owner from the DashboardPatchBuilder. + + Args: + owner: The owner to remove. + owner_type: The ownership type of the owner (optional). + + Returns: + The DashboardPatchBuilder instance. + + Notes: + `owner_type` is optional. + """ + self.ownership_patch_helper.remove_owner(owner, owner_type) + return self + + def set_owners(self, owners: List[Owner]) -> "DashboardPatchBuilder": + """ + Sets the owners of the DashboardPatchBuilder. + + Args: + owners: A list of Owner objects. + + Returns: + The DashboardPatchBuilder instance. + """ + self.ownership_patch_helper.set_owners(owners) + return self + + def add_dataset_edge( + self, dataset: Union[Edge, Urn, str] + ) -> "DashboardPatchBuilder": + """ + Adds an dataset to the DashboardPatchBuilder. + + Args: + dataset: The dataset, which can be an Edge object, Urn object, or a string. + + Returns: + The DashboardPatchBuilder instance. + + Raises: + ValueError: If the dataset is not a Dataset urn. + + Notes: + If `dataset` is an Edge object, it is used directly. If `dataset` is a Urn object or string, + it is converted to an Edge object and added with default audit stamps. 
+ """ + if isinstance(dataset, Edge): + dataset_urn: str = dataset.destinationUrn + dataset_edge: Edge = dataset + elif isinstance(dataset, (Urn, str)): + dataset_urn = str(dataset) + if not dataset_urn.startswith("urn:li:dataset:"): + raise ValueError(f"Input {dataset} is not a Dataset urn") + + dataset_edge = Edge( + destinationUrn=dataset_urn, + created=self._mint_auditstamp(), + lastModified=self._mint_auditstamp(), + ) + + self._ensure_urn_type("dataset", [dataset_edge], "add_dataset") + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path=f"/datasetEdges/{quote(dataset_urn, safe='')}", + value=dataset_edge, + ) + return self + + def remove_dataset_edge(self, dataset: Union[str, Urn]) -> "DashboardPatchBuilder": + """ + Removes a dataset edge from the DashboardPatchBuilder. + + Args: + dataset: The dataset to remove, specified as a string or Urn object. + + Returns: + The DashboardPatchBuilder instance. + """ + self._add_patch( + DashboardInfo.ASPECT_NAME, + "remove", + path=f"/datasetEdges/{dataset}", + value={}, + ) + return self + + def set_dataset_edges(self, datasets: List[Edge]) -> "DashboardPatchBuilder": + """ + Sets the dataset edges for the DashboardPatchBuilder. + + Args: + datasets: A list of Edge objects representing the dataset edges. + + Returns: + The DashboardPatchBuilder instance. + + Raises: + ValueError: If any of the input edges are not of type 'Datset'. + + Notes: + This method replaces all existing datasets with the given inputs. + """ + self._ensure_urn_type("dataset", datasets, "dataset edges") + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/datasetEdges", + value=datasets, + ) + return self + + def add_chart_edge(self, chart: Union[Edge, Urn, str]) -> "DashboardPatchBuilder": + """ + Adds a chart edge to the DashboardPatchBuilder. + + Args: + chart: The dataset, which can be an Edge object, Urn object, or a string. + + Returns: + The DashboardPatchBuilder instance. + + Raises: + ValueError: If the edge is not a Chart urn. + + Notes: + If `chart` is an Edge object, it is used directly. If `chart` is a Urn object or string, + it is converted to an Edge object and added with default audit stamps. + """ + if isinstance(chart, Edge): + chart_urn: str = chart.destinationUrn + chart_edge: Edge = chart + elif isinstance(chart, (Urn, str)): + chart_urn = str(chart) + if not chart_urn.startswith("urn:li:chart:"): + raise ValueError(f"Input {chart} is not a Chart urn") + + chart_edge = Edge( + destinationUrn=chart_urn, + created=self._mint_auditstamp(), + lastModified=self._mint_auditstamp(), + ) + + self._ensure_urn_type("dataset", [chart_edge], "add_chart_edge") + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path=f"/chartEdges/{quote(chart_urn, safe='')}", + value=chart_edge, + ) + return self + + def remove_chart_edge(self, chart: Union[str, Urn]) -> "DashboardPatchBuilder": + """ + Removes an chart edge from the DashboardPatchBuilder. + + Args: + chart: The chart to remove, specified as a string or Urn object. + + Returns: + The DashboardPatchBuilder instance. + """ + self._add_patch( + DashboardInfo.ASPECT_NAME, + "remove", + path=f"/chartEdges/{chart}", + value={}, + ) + return self + + def set_chart_edges(self, charts: List[Edge]) -> "DashboardPatchBuilder": + """ + Sets the chart edges for the DashboardPatchBuilder. + + Args: + charts: A list of Edge objects representing the chart edges. + + Returns: + The DashboardPatchBuilder instance. + + Raises: + ValueError: If any of the edges are not of type 'chart'. 
+ + Notes: + This method replaces all existing charts with the given charts. + """ + self._ensure_urn_type("chart", charts, "set_charts") + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/chartEdges", + value=charts, + ) + return self + + def add_tag(self, tag: Tag) -> "DashboardPatchBuilder": + """ + Adds a tag to the DashboardPatchBuilder. + + Args: + tag: The Tag object representing the tag to be added. + + Returns: + The DashboardPatchBuilder instance. + """ + self._add_patch( + GlobalTags.ASPECT_NAME, "add", path=f"/tags/{tag.tag}", value=tag + ) + return self + + def remove_tag(self, tag: Union[str, Urn]) -> "DashboardPatchBuilder": + """ + Removes a tag from the DashboardPatchBuilder. + + Args: + tag: The tag to remove, specified as a string or Urn object. + + Returns: + The DashboardPatchBuilder instance. + """ + if isinstance(tag, str) and not tag.startswith("urn:li:tag:"): + tag = TagUrn.create_from_id(tag) + self._add_patch(GlobalTags.ASPECT_NAME, "remove", path=f"/tags/{tag}", value={}) + return self + + def add_term(self, term: Term) -> "DashboardPatchBuilder": + """ + Adds a glossary term to the DashboardPatchBuilder. + + Args: + term: The Term object representing the glossary term to be added. + + Returns: + The DashboardPatchBuilder instance. + """ + self._add_patch( + GlossaryTerms.ASPECT_NAME, "add", path=f"/terms/{term.urn}", value=term + ) + return self + + def remove_term(self, term: Union[str, Urn]) -> "DashboardPatchBuilder": + """ + Removes a glossary term from the DashboardPatchBuilder. + + Args: + term: The term to remove, specified as a string or Urn object. + + Returns: + The DashboardPatchBuilder instance. + """ + if isinstance(term, str) and not term.startswith("urn:li:glossaryTerm:"): + term = "urn:li:glossaryTerm:" + term + self._add_patch( + GlossaryTerms.ASPECT_NAME, "remove", path=f"/terms/{term}", value={} + ) + return self + + def set_custom_properties( + self, custom_properties: Dict[str, str] + ) -> "DashboardPatchBuilder": + """ + Sets the custom properties for the DashboardPatchBuilder. + + Args: + custom_properties: A dictionary containing the custom properties to be set. + + Returns: + The DashboardPatchBuilder instance. + + Notes: + This method replaces all existing custom properties with the given dictionary. + """ + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/customProperties", + value=custom_properties, + ) + return self + + def add_custom_property(self, key: str, value: str) -> "DashboardPatchBuilder": + """ + Adds a custom property to the DashboardPatchBuilder. + + Args: + key: The key of the custom property. + value: The value of the custom property. + + Returns: + The DashboardPatchBuilder instance. + """ + self.custom_properties_patch_helper.add_property(key, value) + return self + + def remove_custom_property(self, key: str) -> "DashboardPatchBuilder": + """ + Removes a custom property from the DashboardPatchBuilder. + + Args: + key: The key of the custom property to remove. + + Returns: + The DashboardPatchBuilder instance. 
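Putting the dashboard edge methods together, a hypothetical usage sketch (urns invented for illustration):

```python
# Hypothetical sketch: build PATCH MCPs that link one dataset and one chart
# to a dashboard via the new builder.
from datahub.emitter.mce_builder import make_dashboard_urn
from datahub.specific.dashboard import DashboardPatchBuilder

mcps = (
    DashboardPatchBuilder(make_dashboard_urn(platform="looker", name="sales_overview"))
    .add_dataset_edge("urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)")
    .add_chart_edge("urn:li:chart:(looker,revenue_by_region)")
    .build()
)
# Each resulting MCP carries changeType=PATCH and a json-patch aspect, so it
# can be handed to any existing emitter unchanged.
```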
+ """ + self.custom_properties_patch_helper.remove_property(key) + return self diff --git a/metadata-ingestion/src/datahub/specific/datajob.py b/metadata-ingestion/src/datahub/specific/datajob.py index 7ebaee6b918c1..0338a1320c15b 100644 --- a/metadata-ingestion/src/datahub/specific/datajob.py +++ b/metadata-ingestion/src/datahub/specific/datajob.py @@ -207,7 +207,7 @@ def set_input_datajobs(self, inputs: List[Edge]) -> "DataJobPatchBuilder": self._ensure_urn_type("dataJob", inputs, "input datajobs") self._add_patch( DataJobInputOutput.ASPECT_NAME, - "replace", + "add", path="/inputDatajobEdges", value=inputs, ) @@ -290,7 +290,7 @@ def set_input_datasets(self, inputs: List[Edge]) -> "DataJobPatchBuilder": self._ensure_urn_type("dataset", inputs, "set_input_datasets") self._add_patch( DataJobInputOutput.ASPECT_NAME, - "replace", + "add", path="/inputDatasetEdges", value=inputs, ) @@ -375,7 +375,7 @@ def set_output_datasets(self, outputs: List[Edge]) -> "DataJobPatchBuilder": self._ensure_urn_type("dataset", outputs, "set_output_datasets") self._add_patch( DataJobInputOutput.ASPECT_NAME, - "replace", + "add", path="/outputDatasetEdges", value=outputs, ) @@ -463,7 +463,7 @@ def set_input_dataset_fields(self, inputs: List[Edge]) -> "DataJobPatchBuilder": self._ensure_urn_type("schemaField", inputs, "set_input_dataset_fields") self._add_patch( DataJobInputOutput.ASPECT_NAME, - "replace", + "add", path="/inputDatasetFields", value=inputs, ) @@ -551,7 +551,7 @@ def set_output_dataset_fields(self, outputs: List[Edge]) -> "DataJobPatchBuilder self._ensure_urn_type("schemaField", outputs, "set_output_dataset_fields") self._add_patch( DataJobInputOutput.ASPECT_NAME, - "replace", + "add", path="/outputDatasetFields", value=outputs, ) @@ -636,7 +636,7 @@ def set_custom_properties( """ self._add_patch( DataJobInfo.ASPECT_NAME, - "replace", + "add", path="/customProperties", value=custom_properties, ) diff --git a/metadata-ingestion/src/datahub/specific/dataproduct.py b/metadata-ingestion/src/datahub/specific/dataproduct.py index bb49ac47b3ef8..2c174e0c9a6cb 100644 --- a/metadata-ingestion/src/datahub/specific/dataproduct.py +++ b/metadata-ingestion/src/datahub/specific/dataproduct.py @@ -85,7 +85,7 @@ def remove_term(self, term: Union[str, Urn]) -> "DataProductPatchBuilder": def set_name(self, name: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, - "replace", + "add", path="/name", value=name, ) @@ -94,7 +94,7 @@ def set_name(self, name: str) -> "DataProductPatchBuilder": def set_description(self, description: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, - "replace", + "add", path="/description", value=description, ) @@ -105,7 +105,7 @@ def set_custom_properties( ) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, - "replace", + "add", path="/customProperties", value=custom_properties, ) @@ -124,7 +124,7 @@ def set_assets( ) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, - "replace", + "add", path="/assets", value=assets, ) @@ -151,7 +151,7 @@ def remove_asset(self, asset_urn: str) -> "DataProductPatchBuilder": def set_external_url(self, external_url: str) -> "DataProductPatchBuilder": self._add_patch( DataProductProperties.ASPECT_NAME, - "replace", + "add", path="/externalUrl", value=external_url, ) diff --git a/metadata-ingestion/src/datahub/specific/dataset.py b/metadata-ingestion/src/datahub/specific/dataset.py index 294a80572669b..62ee4fc57b61b 100644 
--- a/metadata-ingestion/src/datahub/specific/dataset.py +++ b/metadata-ingestion/src/datahub/specific/dataset.py @@ -143,7 +143,7 @@ def remove_upstream_lineage( def set_upstream_lineages(self, upstreams: List[Upstream]) -> "DatasetPatchBuilder": self._add_patch( - UpstreamLineage.ASPECT_NAME, "replace", path="/upstreams", value=upstreams + UpstreamLineage.ASPECT_NAME, "add", path="/upstreams", value=upstreams ) return self @@ -297,7 +297,7 @@ def set_description( DatasetProperties.ASPECT_NAME if not editable else EditableDatasetProperties.ASPECT_NAME, - "replace", + "add", path="/description", value=description, ) @@ -308,7 +308,7 @@ def set_custom_properties( ) -> "DatasetPatchBuilder": self._add_patch( DatasetProperties.ASPECT_NAME, - "replace", + "add", path="/customProperties", value=custom_properties, ) @@ -326,7 +326,7 @@ def set_display_name(self, display_name: str) -> "DatasetPatchBuilder": if display_name is not None: self._add_patch( DatasetProperties.ASPECT_NAME, - "replace", + "add", path="/name", value=display_name, ) diff --git a/metadata-ingestion/src/datahub/specific/ownership.py b/metadata-ingestion/src/datahub/specific/ownership.py index 334b45a67437f..c2a3874a3a33f 100644 --- a/metadata-ingestion/src/datahub/specific/ownership.py +++ b/metadata-ingestion/src/datahub/specific/ownership.py @@ -43,6 +43,6 @@ def remove_owner( def set_owners(self, owners: List[OwnerClass]) -> "OwnershipPatchHelper": self._parent._add_patch( - OwnershipClass.ASPECT_NAME, "replace", path="/owners", value=owners + OwnershipClass.ASPECT_NAME, "add", path="/owners", value=owners ) return self diff --git a/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json b/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json index 97c2330f58bc7..66bc2ce0c2a0c 100644 --- a/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json +++ b/metadata-ingestion/tests/unit/api/entities/dataproducts/golden_dataproduct_out_upsert.json @@ -5,7 +5,7 @@ "changeType": "PATCH", "aspectName": "dataProductProperties", "aspect": { - "value": "[{\"op\": \"replace\", \"path\": \"/name\", \"value\": \"Pet of the Week Campaign\"}, {\"op\": \"replace\", \"path\": \"/assets\", \"value\": [{\"destinationUrn\": \"urn:li:container:DATABASE\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": \"urn:li:container:SCHEMA\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": \"urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}]}, {\"op\": \"replace\", \"path\": \"/customProperties\", \"value\": {\"version\": \"2.0\", \"classification\": \"pii\"}}, {\"op\": \"replace\", \"path\": \"/externalUrl\", \"value\": \"https://github.com/datahub-project/datahub\"}]", + "value": "[{\"op\": \"add\", \"path\": \"/name\", \"value\": \"Pet of the Week Campaign\"}, {\"op\": \"add\", \"path\": \"/assets\", \"value\": [{\"destinationUrn\": \"urn:li:container:DATABASE\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": \"urn:li:container:SCHEMA\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}, {\"destinationUrn\": 
\"urn:li:mlFeatureTable:(urn:li:dataPlatform:feast,test_feature_table_all_feature_dtypes)\", \"created\": {\"time\": 1681455600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"yaml\"}}]}, {\"op\": \"add\", \"path\": \"/customProperties\", \"value\": {\"version\": \"2.0\", \"classification\": \"pii\"}}, {\"op\": \"add\", \"path\": \"/externalUrl\", \"value\": \"https://github.com/datahub-project/datahub\"}]", "contentType": "application/json-patch+json" } }, diff --git a/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json b/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json index ed5a7723ac2bf..bcc619a09401e 100644 --- a/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json +++ b/metadata-ingestion/tests/unit/patch/complex_dataset_patch.json @@ -7,7 +7,7 @@ "aspect": { "json": [ { - "op": "replace", + "op": "add", "path": "/description", "value": "test description" }, diff --git a/metadata-ingestion/tests/unit/patch/test_patch_builder.py b/metadata-ingestion/tests/unit/patch/test_patch_builder.py index f05c4978f8644..e68f948be8aa0 100644 --- a/metadata-ingestion/tests/unit/patch/test_patch_builder.py +++ b/metadata-ingestion/tests/unit/patch/test_patch_builder.py @@ -3,7 +3,12 @@ import pytest -from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.emitter.mce_builder import ( + make_chart_urn, + make_dashboard_urn, + make_dataset_urn, + make_tag_urn, +) from datahub.ingestion.sink.file import write_metadata_file from datahub.metadata.schema_classes import ( DatasetLineageTypeClass, @@ -15,6 +20,8 @@ TagAssociationClass, UpstreamClass, ) +from datahub.specific.chart import ChartPatchBuilder +from datahub.specific.dashboard import DashboardPatchBuilder from datahub.specific.dataset import DatasetPatchBuilder @@ -80,3 +87,41 @@ def test_complex_dataset_patch( pytestconfig.rootpath / "tests/unit/patch/complex_dataset_patch.json" ).read_text() ) + + +def test_basic_chart_patch_builder(): + patcher = ChartPatchBuilder( + make_chart_urn(platform="hive", name="fct_users_created") + ).add_tag(TagAssociationClass(tag=make_tag_urn("test_tag"))) + + assert patcher.build() == [ + MetadataChangeProposalClass( + entityType="chart", + entityUrn="urn:li:chart:(hive,fct_users_created)", + changeType="PATCH", + aspectName="globalTags", + aspect=GenericAspectClass( + value=b'[{"op": "add", "path": "/tags/urn:li:tag:test_tag", "value": {"tag": "urn:li:tag:test_tag"}}]', + contentType="application/json-patch+json", + ), + ), + ] + + +def test_basic_dashboard_patch_builder(): + patcher = DashboardPatchBuilder( + make_dashboard_urn(platform="hive", name="fct_users_created") + ).add_tag(TagAssociationClass(tag=make_tag_urn("test_tag"))) + + assert patcher.build() == [ + MetadataChangeProposalClass( + entityType="dashboard", + entityUrn="urn:li:dashboard:(hive,fct_users_created)", + changeType="PATCH", + aspectName="globalTags", + aspect=GenericAspectClass( + value=b'[{"op": "add", "path": "/tags/urn:li:tag:test_tag", "value": {"tag": "urn:li:tag:test_tag"}}]', + contentType="application/json-patch+json", + ), + ), + ] diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/chart/ChartInfoPatchBuilder.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/chart/ChartInfoPatchBuilder.java new file mode 100644 index 0000000000000..0655d2b3eb8eb --- /dev/null +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/chart/ChartInfoPatchBuilder.java @@ -0,0 +1,41 
@@ +package datahub.client.patch.chart; + +import static com.linkedin.metadata.Constants.*; +import static datahub.client.patch.common.PatchUtil.*; + +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.linkedin.common.urn.Urn; +import datahub.client.patch.AbstractMultiFieldPatchBuilder; +import datahub.client.patch.PatchOperationType; +import javax.annotation.Nonnull; +import org.apache.commons.lang3.tuple.ImmutableTriple; + +public class ChartInfoPatchBuilder extends AbstractMultiFieldPatchBuilder { + private static final String INPUT_EDGES_PATH_START = "/inputEdges/"; + + // Simplified with just Urn + public ChartInfoPatchBuilder addInputEdge(@Nonnull Urn urn) { + ObjectNode value = createEdgeValue(urn); + + pathValues.add( + ImmutableTriple.of(PatchOperationType.ADD.getValue(), INPUT_EDGES_PATH_START + urn, value)); + return this; + } + + public ChartInfoPatchBuilder removeInputEdge(@Nonnull Urn urn) { + pathValues.add( + ImmutableTriple.of( + PatchOperationType.REMOVE.getValue(), INPUT_EDGES_PATH_START + urn, null)); + return this; + } + + @Override + protected String getAspectName() { + return CHART_INFO_ASPECT_NAME; + } + + @Override + protected String getEntityType() { + return CHART_ENTITY_NAME; + } +} diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/common/PatchUtil.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/common/PatchUtil.java new file mode 100644 index 0000000000000..69db36c6e038c --- /dev/null +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/common/PatchUtil.java @@ -0,0 +1,84 @@ +package datahub.client.patch.common; + +import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; +import static com.linkedin.metadata.Constants.*; + +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.linkedin.common.Edge; +import com.linkedin.common.urn.Urn; +import javax.annotation.Nonnull; + +public class PatchUtil { + private PatchUtil() {} + + private static final String TIME_KEY = "time"; + private static final String ACTOR_KEY = "actor"; + private static final String IMPERSONATOR_KEY = "impersonator"; + private static final String MESSAGE_KEY = "message"; + private static final String LAST_MODIFIED_KEY = "lastModified"; + private static final String CREATED_KEY = "created"; + private static final String DESTINATION_URN_KEY = "destinationUrn"; + private static final String SOURCE_URN_KEY = "sourceUrn"; + + private static final String PROPERTIES_KEY = "properties"; + + public static ObjectNode createEdgeValue(@Nonnull Edge edge) { + ObjectNode value = instance.objectNode(); + + ObjectNode created = instance.objectNode(); + if (edge.getCreated() == null) { + created.put(TIME_KEY, System.currentTimeMillis()).put(ACTOR_KEY, UNKNOWN_ACTOR); + } else { + created + .put(TIME_KEY, edge.getCreated().getTime()) + .put(ACTOR_KEY, edge.getCreated().getActor().toString()); + if (edge.getCreated().getImpersonator() != null) { + created.put(IMPERSONATOR_KEY, edge.getCreated().getImpersonator().toString()); + } + if (edge.getCreated().getMessage() != null) { + created.put(MESSAGE_KEY, edge.getCreated().getMessage()); + } + } + value.set(CREATED_KEY, created); + + ObjectNode lastModified = instance.objectNode(); + if (edge.getLastModified() == null) { + lastModified.put(TIME_KEY, System.currentTimeMillis()).put(ACTOR_KEY, UNKNOWN_ACTOR); + } else { + lastModified + .put(TIME_KEY, edge.getLastModified().getTime()) + .put(ACTOR_KEY, 
edge.getLastModified().getActor().toString()); + if (edge.getLastModified().getImpersonator() != null) { + lastModified.put(IMPERSONATOR_KEY, edge.getLastModified().getImpersonator().toString()); + } + if (edge.getLastModified().getMessage() != null) { + lastModified.put(MESSAGE_KEY, edge.getLastModified().getMessage()); + } + } + value.set(LAST_MODIFIED_KEY, lastModified); + + if (edge.getProperties() != null) { + ObjectNode propertiesNode = instance.objectNode(); + edge.getProperties().forEach((k, v) -> propertiesNode.set(k, instance.textNode(v))); + value.set(PROPERTIES_KEY, propertiesNode); + } + + value.put(DESTINATION_URN_KEY, edge.getDestinationUrn().toString()); + if (edge.getSourceUrn() != null) { + value.put(SOURCE_URN_KEY, edge.getSourceUrn().toString()); + } + + return value; + } + + public static ObjectNode createEdgeValue(@Nonnull Urn urn) { + ObjectNode value = instance.objectNode(); + ObjectNode auditStamp = instance.objectNode(); + auditStamp.put(TIME_KEY, System.currentTimeMillis()).put(ACTOR_KEY, UNKNOWN_ACTOR); + + value.put(DESTINATION_URN_KEY, urn.toString()).set(LAST_MODIFIED_KEY, auditStamp); + value.set(CREATED_KEY, auditStamp); + + return value; + } +} diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dashboard/DashboardInfoPatchBuilder.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dashboard/DashboardInfoPatchBuilder.java new file mode 100644 index 0000000000000..cadde582f1c64 --- /dev/null +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/dashboard/DashboardInfoPatchBuilder.java @@ -0,0 +1,103 @@ +package datahub.client.patch.dashboard; + +import static com.linkedin.metadata.Constants.*; +import static datahub.client.patch.common.PatchUtil.*; + +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.linkedin.common.Edge; +import com.linkedin.common.urn.ChartUrn; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.common.urn.Urn; +import datahub.client.patch.AbstractMultiFieldPatchBuilder; +import datahub.client.patch.PatchOperationType; +import javax.annotation.Nonnull; +import org.apache.commons.lang3.tuple.ImmutableTriple; + +public class DashboardInfoPatchBuilder + extends AbstractMultiFieldPatchBuilder { + private static final String CHART_EDGES_PATH_START = "/chartEdges/"; + private static final String DATASET_EDGES_PATH_START = "/datasetEdges/"; + + // Simplified with just Urn + public DashboardInfoPatchBuilder addChartEdge(@Nonnull ChartUrn urn) { + ObjectNode value = createEdgeValue(urn); + + pathValues.add( + ImmutableTriple.of(PatchOperationType.ADD.getValue(), CHART_EDGES_PATH_START + urn, value)); + return this; + } + + public DashboardInfoPatchBuilder removeChartEdge(@Nonnull ChartUrn urn) { + pathValues.add( + ImmutableTriple.of( + PatchOperationType.REMOVE.getValue(), CHART_EDGES_PATH_START + urn, null)); + return this; + } + + public DashboardInfoPatchBuilder addDatasetEdge(@Nonnull DatasetUrn urn) { + ObjectNode value = createEdgeValue(urn); + + pathValues.add( + ImmutableTriple.of( + PatchOperationType.ADD.getValue(), DATASET_EDGES_PATH_START + urn, value)); + return this; + } + + public DashboardInfoPatchBuilder removeDatasetEdge(@Nonnull DatasetUrn urn) { + pathValues.add( + ImmutableTriple.of( + PatchOperationType.REMOVE.getValue(), DATASET_EDGES_PATH_START + urn, null)); + return this; + } + + // Full Edge modification + public DashboardInfoPatchBuilder addEdge(@Nonnull Edge edge) { + ObjectNode value = 
createEdgeValue(edge); + String path = getEdgePath(edge); + + pathValues.add(ImmutableTriple.of(PatchOperationType.ADD.getValue(), path, value)); + return this; + } + + public DashboardInfoPatchBuilder removeEdge(@Nonnull Edge edge) { + String path = getEdgePath(edge); + + pathValues.add(ImmutableTriple.of(PatchOperationType.REMOVE.getValue(), path, null)); + return this; + } + + /** + * Determines Edge path based on supplied Urn, if not a valid entity type throws + * IllegalArgumentException + * + * @param edge + * @return + * @throws IllegalArgumentException if destinationUrn is an invalid entity type + */ + private String getEdgePath(@Nonnull Edge edge) { + Urn destinationUrn = edge.getDestinationUrn(); + + if (DATASET_ENTITY_NAME.equals(destinationUrn.getEntityType())) { + return DATASET_EDGES_PATH_START + destinationUrn; + } + + if (CHART_ENTITY_NAME.equals(destinationUrn.getEntityType())) { + return CHART_EDGES_PATH_START + destinationUrn; + } + + // TODO: Output Data Jobs not supported by aspect, add here if this changes + + throw new IllegalArgumentException( + String.format("Unsupported entity type: %s", destinationUrn.getEntityType())); + } + + @Override + protected String getAspectName() { + return DASHBOARD_INFO_ASPECT_NAME; + } + + @Override + protected String getEntityType() { + return DASHBOARD_ENTITY_NAME; + } +} diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/datajob/DataJobInputOutputPatchBuilder.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/datajob/DataJobInputOutputPatchBuilder.java index 0fb0454533fc0..fc250daffe916 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/datajob/DataJobInputOutputPatchBuilder.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch/datajob/DataJobInputOutputPatchBuilder.java @@ -2,6 +2,7 @@ import static com.fasterxml.jackson.databind.node.JsonNodeFactory.*; import static com.linkedin.metadata.Constants.*; +import static datahub.client.patch.common.PatchUtil.*; import com.fasterxml.jackson.databind.node.ObjectNode; import com.fasterxml.jackson.databind.node.TextNode; @@ -20,21 +21,9 @@ public class DataJobInputOutputPatchBuilder private static final String INPUT_DATA_JOB_EDGES_PATH_START = "/inputDatajobEdges/"; private static final String INPUT_DATASET_EDGES_PATH_START = "/inputDatasetEdges/"; private static final String OUTPUT_DATASET_EDGES_PATH_START = "/outputDatasetEdges/"; - - private static final String DESTINATION_URN_KEY = "destinationUrn"; - private static final String SOURCE_URN_KEY = "sourceUrn"; - private static final String LAST_MODIFIED_KEY = "lastModified"; - private static final String CREATED_KEY = "created"; - private static final String PROPERTIES_KEY = "properties"; - private static final String INPUT_DATASET_FIELDS_PATH_START = "/inputDatasetFields/"; private static final String OUTPUT_DATASET_FIELDS_PATH_START = "/outputDatasetFields/"; - private static final String TIME_KEY = "time"; - private static final String ACTOR_KEY = "actor"; - private static final String IMPERSONATOR_KEY = "impersonator"; - private static final String MESSAGE_KEY = "message"; - // Simplified with just Urn public DataJobInputOutputPatchBuilder addInputDatajobEdge(@Nonnull DataJobUrn dataJobUrn) { ObjectNode value = createEdgeValue(dataJobUrn); @@ -144,66 +133,6 @@ public DataJobInputOutputPatchBuilder removeEdge( return this; } - private ObjectNode createEdgeValue(@Nonnull Urn urn) { - ObjectNode 
value = instance.objectNode(); - ObjectNode auditStamp = instance.objectNode(); - auditStamp.put(TIME_KEY, System.currentTimeMillis()).put(ACTOR_KEY, UNKNOWN_ACTOR); - - value.put(DESTINATION_URN_KEY, urn.toString()).set(LAST_MODIFIED_KEY, auditStamp); - value.set(CREATED_KEY, auditStamp); - - return value; - } - - private ObjectNode createEdgeValue(@Nonnull Edge edge) { - ObjectNode value = instance.objectNode(); - - ObjectNode created = instance.objectNode(); - if (edge.getCreated() == null) { - created.put(TIME_KEY, System.currentTimeMillis()).put(ACTOR_KEY, UNKNOWN_ACTOR); - } else { - created - .put(TIME_KEY, edge.getCreated().getTime()) - .put(ACTOR_KEY, edge.getCreated().getActor().toString()); - if (edge.getCreated().getImpersonator() != null) { - created.put(IMPERSONATOR_KEY, edge.getCreated().getImpersonator().toString()); - } - if (edge.getCreated().getMessage() != null) { - created.put(MESSAGE_KEY, edge.getCreated().getMessage()); - } - } - value.set(CREATED_KEY, created); - - ObjectNode lastModified = instance.objectNode(); - if (edge.getLastModified() == null) { - lastModified.put(TIME_KEY, System.currentTimeMillis()).put(ACTOR_KEY, UNKNOWN_ACTOR); - } else { - lastModified - .put(TIME_KEY, edge.getLastModified().getTime()) - .put(ACTOR_KEY, edge.getLastModified().getActor().toString()); - if (edge.getLastModified().getImpersonator() != null) { - lastModified.put(IMPERSONATOR_KEY, edge.getLastModified().getImpersonator().toString()); - } - if (edge.getLastModified().getMessage() != null) { - lastModified.put(MESSAGE_KEY, edge.getLastModified().getMessage()); - } - } - value.set(LAST_MODIFIED_KEY, lastModified); - - if (edge.getProperties() != null) { - ObjectNode propertiesNode = instance.objectNode(); - edge.getProperties().forEach((k, v) -> propertiesNode.set(k, instance.textNode(v))); - value.set(PROPERTIES_KEY, propertiesNode); - } - - value.put(DESTINATION_URN_KEY, edge.getDestinationUrn().toString()); - if (edge.getSourceUrn() != null) { - value.put(SOURCE_URN_KEY, edge.getSourceUrn().toString()); - } - - return value; - } - /** * Determines Edge path based on supplied Urn, if not a valid entity type throws * IllegalArgumentException diff --git a/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java b/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java index 563742990f546..5bd10245899e4 100644 --- a/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java +++ b/metadata-integration/java/datahub-client/src/test/java/datahub/client/patch/PatchTest.java @@ -8,6 +8,7 @@ import com.linkedin.common.GlossaryTermAssociation; import com.linkedin.common.OwnershipType; import com.linkedin.common.TagAssociation; +import com.linkedin.common.urn.ChartUrn; import com.linkedin.common.urn.CorpuserUrn; import com.linkedin.common.urn.DataJobUrn; import com.linkedin.common.urn.DataPlatformUrn; @@ -22,7 +23,9 @@ import datahub.client.MetadataWriteResponse; import datahub.client.file.FileEmitter; import datahub.client.file.FileEmitterConfig; +import datahub.client.patch.chart.ChartInfoPatchBuilder; import datahub.client.patch.common.OwnershipPatchBuilder; +import datahub.client.patch.dashboard.DashboardInfoPatchBuilder; import datahub.client.patch.dataflow.DataFlowInfoPatchBuilder; import datahub.client.patch.datajob.DataJobInfoPatchBuilder; import datahub.client.patch.datajob.DataJobInputOutputPatchBuilder; @@ -551,4 +554,90 @@ public void testLocalDataJobInputAddEdge() { 
System.out.println(Arrays.asList(e.getStackTrace())); } } + + @Test + @Ignore + public void testLocalChartInfoAdd() { + RestEmitter restEmitter = new RestEmitter(RestEmitterConfig.builder().build()); + try { + MetadataChangeProposal chartInfoPatch = + new ChartInfoPatchBuilder() + .urn(UrnUtils.getUrn("urn:li:chart:(dashboardTool,chartId)")) + .addInputEdge( + DatasetUrn.createFromString( + "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleHiveDataset,PROD)")) + .build(); + Future response = restEmitter.emit(chartInfoPatch); + + System.out.println(response.get().getResponseContent()); + + } catch (URISyntaxException | IOException | ExecutionException | InterruptedException e) { + System.out.println(Arrays.asList(e.getStackTrace())); + } + } + + @Test + @Ignore + public void testLocalChartInfoRemove() { + RestEmitter restEmitter = new RestEmitter(RestEmitterConfig.builder().build()); + try { + MetadataChangeProposal chartInfoPatch = + new ChartInfoPatchBuilder() + .urn(UrnUtils.getUrn("urn:li:chart:(dashboardTool,chartId)")) + .removeInputEdge( + DatasetUrn.createFromString( + "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleHiveDataset,PROD)")) + .build(); + Future response = restEmitter.emit(chartInfoPatch); + + System.out.println(response.get().getResponseContent()); + + } catch (URISyntaxException | IOException | ExecutionException | InterruptedException e) { + System.out.println(Arrays.asList(e.getStackTrace())); + } + } + + @Test + @Ignore + public void testLocalDashboardInfoAdd() { + RestEmitter restEmitter = new RestEmitter(RestEmitterConfig.builder().build()); + try { + MetadataChangeProposal dashboardInfoPatch = + new DashboardInfoPatchBuilder() + .urn(UrnUtils.getUrn("urn:li:dashboard:(dashboardTool,dashboardId)")) + .addDatasetEdge( + DatasetUrn.createFromString( + "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleHiveDataset,PROD)")) + .addChartEdge(ChartUrn.createFromString("urn:li:chart:(dashboartTool, chartId)")) + .build(); + Future response = restEmitter.emit(dashboardInfoPatch); + + System.out.println(response.get().getResponseContent()); + + } catch (URISyntaxException | IOException | ExecutionException | InterruptedException e) { + System.out.println(Arrays.asList(e.getStackTrace())); + } + } + + @Test + @Ignore + public void testLocalDashboardInfoRemove() { + RestEmitter restEmitter = new RestEmitter(RestEmitterConfig.builder().build()); + try { + MetadataChangeProposal dashboardInfoPatch = + new DashboardInfoPatchBuilder() + .urn(UrnUtils.getUrn("urn:li:dashboard:(dashboardTool,dashboardId)")) + .removeDatasetEdge( + DatasetUrn.createFromString( + "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleHiveDataset,PROD)")) + .removeChartEdge(ChartUrn.createFromString("urn:li:chart:(dashboardTool, chartId)")) + .build(); + Future response = restEmitter.emit(dashboardInfoPatch); + + System.out.println(response.get().getResponseContent()); + + } catch (URISyntaxException | IOException | ExecutionException | InterruptedException e) { + System.out.println(Arrays.asList(e.getStackTrace())); + } + } } From 296e41dfed325116c2a5661c32ae27790b28aafd Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 3 Jan 2024 15:58:50 -0600 Subject: [PATCH 35/48] feat(docker): docker compose profiles updates (#9514) Co-authored-by: Harshal Sheth --- docker/build.gradle | 7 ++++++- docker/profiles/README.md | 2 +- docker/profiles/docker-compose.actions.yml | 2 +- docker/profiles/docker-compose.frontend.yml | 4 ++-- 
docker/profiles/docker-compose.gms.yml | 16 ++++++++-------- .../profiles/docker-compose.prerequisites.yml | 18 +++++++++--------- docs/developers.md | 2 +- docs/how/updating-datahub.md | 3 ++- 8 files changed, 30 insertions(+), 24 deletions(-) diff --git a/docker/build.gradle b/docker/build.gradle index 190202620c382..189c4959e0442 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -31,6 +31,11 @@ ext { pg_quickstart_modules = quickstart_modules - [':docker:mysql-setup'] + [':docker:postgres-setup'] } +tasks.register('minDockerCompose2.20', Exec) { + executable 'bash' + args '-c', 'echo -e "$(docker compose version --short)\n2.20"|sort --version-sort --check=quiet --reverse' +} + tasks.register('quickstart') {} tasks.register('quickstartSlim') {} tasks.register('quickstartDebug') {} @@ -118,9 +123,9 @@ tasks.getByName('quickstartDebugComposeUp').dependsOn( ) tasks.withType(ComposeUp).configureEach { shouldRunAfter('quickstartNuke') + dependsOn tasks.named("minDockerCompose2.20") } - task debugReload(type: Exec) { def cmd = ['docker compose -p datahub --profile debug'] + compose_args + ['restart'] + debug_reloadable commandLine 'bash', '-c', cmd.join(" ") diff --git a/docker/profiles/README.md b/docker/profiles/README.md index df09f15cd85ce..fb3c9e3c84a7a 100644 --- a/docker/profiles/README.md +++ b/docker/profiles/README.md @@ -5,7 +5,7 @@ for quickstart use-cases as well as development use-cases. These configurations infrastructure configurations that DataHub can operate on. Requirements: -* Use the profiles requires a modern version of docker. +* Using profiles requires docker compose >= 2.20. * If using the debug/development profiles, you will need to have built the `debug` docker images locally. See the Development Profiles section for more details. 
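As an editorial aside, the `minDockerCompose2.20` Gradle task above gates the compose tasks by feeding the installed version and `2.20` through `sort --version-sort --check=quiet --reverse`, which fails unless the installed version sorts at or above 2.20. The same check sketched in Python (the command and threshold are taken from the diff; the version parsing is illustrative):

```python
# Illustrative re-implementation of the minDockerCompose2.20 guard; assumes
# `docker compose` is on PATH.
import subprocess

def compose_version() -> tuple:
    out = subprocess.run(
        ["docker", "compose", "version", "--short"],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    # e.g. "2.23.0-desktop.1" -> (2, 23, 0); any suffix after "-" is ignored
    return tuple(int(p) for p in out.split("-")[0].split("."))

assert compose_version() >= (2, 20), "docker compose >= 2.20 required for profiles"
```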
```bash diff --git a/docker/profiles/docker-compose.actions.yml b/docker/profiles/docker-compose.actions.yml index a509a6a67d270..676a72bae3201 100644 --- a/docker/profiles/docker-compose.actions.yml +++ b/docker/profiles/docker-compose.actions.yml @@ -1,7 +1,7 @@ x-datahub-actions-service: &datahub-actions-service hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + image: ${DATAHUB_ACTIONS_IMAGE:-${DATAHUB_ACTIONS_REPO:-acryldata}/datahub-actions}:${ACTIONS_VERSION:-head} env_file: datahub-actions/env/docker.env environment: ACTIONS_EXTRA_PACKAGES: ${ACTIONS_EXTRA_PACKAGES:-} diff --git a/docker/profiles/docker-compose.frontend.yml b/docker/profiles/docker-compose.frontend.yml index 80cb4e7b4b596..6e1bbc0be70f5 100644 --- a/docker/profiles/docker-compose.frontend.yml +++ b/docker/profiles/docker-compose.frontend.yml @@ -1,7 +1,7 @@ x-datahub-frontend-service: &datahub-frontend-service hostname: datahub-frontend-react - image: ${DATAHUB_FRONTEND_IMAGE:-linkedin/datahub-frontend-react}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_FRONTEND_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-frontend-react}:${DATAHUB_VERSION:-head} ports: - ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002 env_file: datahub-frontend/env/docker.env @@ -12,7 +12,7 @@ x-datahub-frontend-service: &datahub-frontend-service x-datahub-frontend-service-dev: &datahub-frontend-service-dev <<: *datahub-frontend-service - image: linkedin/datahub-frontend-react:debug + image: ${DATAHUB_FRONTEND_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-frontend-react}:debug ports: - ${DATAHUB_MAPPED_FRONTEND_DEBUG_PORT:-5002}:5002 - ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002 diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml index 01602c8b906b9..93072a76d4041 100644 --- a/docker/profiles/docker-compose.gms.yml +++ b/docker/profiles/docker-compose.gms.yml @@ -54,7 +54,7 @@ x-datahub-dev-telemetry-env: &datahub-dev-telemetry-env ################################# x-datahub-system-update-service: &datahub-system-update-service hostname: datahub-system-update - image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_UPGRADE_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-upgrade}:${DATAHUB_VERSION:-head} command: - -u - SystemUpdate @@ -67,7 +67,7 @@ x-datahub-system-update-service: &datahub-system-update-service x-datahub-system-update-service-dev: &datahub-system-update-service-dev <<: *datahub-system-update-service - image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:debug + image: ${DATAHUB_UPGRADE_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-upgrade}:debug ports: - ${DATAHUB_MAPPED_UPGRADE_DEBUG_PORT:-5003}:5003 environment: &datahub-system-update-dev-env @@ -85,7 +85,7 @@ x-datahub-system-update-service-dev: &datahub-system-update-service-dev ################################# x-datahub-gms-service: &datahub-gms-service hostname: datahub-gms - image: ${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_GMS_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-gms}:${DATAHUB_VERSION:-head} ports: - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 env_file: datahub-gms/env/docker.env @@ -102,7 +102,7 @@ x-datahub-gms-service: &datahub-gms-service x-datahub-gms-service-dev: &datahub-gms-service-dev <<: *datahub-gms-service - image: ${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:debug + image: ${DATAHUB_GMS_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-gms}:debug ports: - ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001 - 
${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 @@ -128,7 +128,7 @@ x-datahub-gms-service-dev: &datahub-gms-service-dev ################################# x-datahub-mae-consumer-service: &datahub-mae-consumer-service hostname: datahub-mae-consumer - image: ${DATAHUB_MAE_CONSUMER_IMAGE:-linkedin/datahub-mae-consumer}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_MAE_CONSUMER_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-mae-consumer}:${DATAHUB_VERSION:-head} ports: - 9091:9091 env_file: datahub-mae-consumer/env/docker.env @@ -137,7 +137,7 @@ x-datahub-mae-consumer-service: &datahub-mae-consumer-service x-datahub-mae-consumer-service-dev: &datahub-mae-consumer-service-dev <<: *datahub-mae-consumer-service - image: ${DATAHUB_MAE_CONSUMER_IMAGE:-linkedin/datahub-mae-consumer}:debug + image: ${DATAHUB_MAE_CONSUMER_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-mae-consumer}:debug environment: <<: [*datahub-dev-telemetry-env, *datahub-mae-consumer-env] volumes: @@ -151,7 +151,7 @@ x-datahub-mae-consumer-service-dev: &datahub-mae-consumer-service-dev ################################# x-datahub-mce-consumer-service: &datahub-mce-consumer-service hostname: datahub-mce-consumer - image: ${DATAHUB_MCE_CONSUMER_IMAGE:-linkedin/datahub-mce-consumer}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_MCE_CONSUMER_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-mce-consumer}:${DATAHUB_VERSION:-head} ports: - 9090:9090 env_file: datahub-mce-consumer/env/docker.env @@ -160,7 +160,7 @@ x-datahub-mce-consumer-service: &datahub-mce-consumer-service x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev <<: *datahub-mce-consumer-service - image: ${DATAHUB_MCE_CONSUMER_IMAGE:-linkedin/datahub-mce-consumer}:debug + image: ${DATAHUB_MCE_CONSUMER_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-mce-consumer}:debug environment: <<: [*datahub-dev-telemetry-env, *datahub-mce-consumer-env] volumes: diff --git a/docker/profiles/docker-compose.prerequisites.yml b/docker/profiles/docker-compose.prerequisites.yml index d90d4a252f993..232239c6c70d0 100644 --- a/docker/profiles/docker-compose.prerequisites.yml +++ b/docker/profiles/docker-compose.prerequisites.yml @@ -128,7 +128,7 @@ services: container_name: mysql-setup profiles: *mysql-profiles-quickstart hostname: mysql-setup - image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_MYSQL_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mysql-setup}:${DATAHUB_VERSION:-head} env_file: mysql-setup/env/docker.env depends_on: mysql: @@ -139,7 +139,7 @@ services: <<: *mysql-setup container_name: mysql-setup-dev profiles: *mysql-profiles-dev - image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:debug + image: ${DATAHUB_MYSQL_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-mysql-setup}:debug postgres: container_name: postgres profiles: *postgres-profiles @@ -162,7 +162,7 @@ services: container_name: postgres-setup profiles: *postgres-profiles-quickstart hostname: postgres-setup - image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-acryldata/datahub-postgres-setup}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-postgres-setup}:${DATAHUB_VERSION:-head} env_file: postgres-setup/env/docker.env depends_on: postgres: @@ -173,7 +173,7 @@ services: <<: *postgres-setup container_name: postgres-setup-dev profiles: *postgres-profiles-dev - image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-acryldata/datahub-postgres-setup}:debug + image: 
${DATAHUB_POSTGRES_SETUP_IMAGE:-${DATAHUB_REPO:-acryldata}/datahub-postgres-setup}:debug cassandra: container_name: cassandra profiles: *cassandra-profiles @@ -267,7 +267,7 @@ services: container_name: kafka-setup profiles: *profiles-quickstart hostname: kafka-setup - image: ${DATAHUB_KAFKA_SETUP_IMAGE:-linkedin/datahub-kafka-setup}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_KAFKA_SETUP_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-kafka-setup}:${DATAHUB_VERSION:-head} env_file: kafka-setup/env/docker.env environment: &kafka-setup-env DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-false} @@ -285,7 +285,7 @@ services: environment: <<: *kafka-setup-env DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-true} - image: ${DATAHUB_KAFKA_SETUP_IMAGE:-linkedin/datahub-kafka-setup}:debug + image: ${DATAHUB_KAFKA_SETUP_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-kafka-setup}:debug elasticsearch: container_name: elasticsearch profiles: *elasticsearch-profiles @@ -311,7 +311,7 @@ services: - esdata:/usr/share/elasticsearch/data elasticsearch-setup-dev: &elasticsearch-setup-dev container_name: elasticsearch-setup-dev - image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:debug + image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-elasticsearch-setup}:debug profiles: *elasticsearch-profiles hostname: elasticsearch-setup env_file: elasticsearch-setup/env/docker.env @@ -351,7 +351,7 @@ services: container_name: opensearch-setup profiles: *opensearch-profiles-quickstart hostname: opensearch-setup - image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-head} + image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-head} environment: <<: *search-datastore-environment USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true} @@ -365,7 +365,7 @@ services: container_name: opensearch-setup-dev profiles: *opensearch-profiles-dev hostname: opensearch-setup-dev - image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:debug + image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-${DATAHUB_REPO:-linkedin}/datahub-elasticsearch-setup}:debug environment: <<: *search-datastore-environment USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true} diff --git a/docs/developers.md b/docs/developers.md index fe007a56ddc68..4e31aceeb4382 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -9,7 +9,7 @@ title: "Local Development" - [Java 17 JDK](https://openjdk.org/projects/jdk/17/) - [Python 3.10](https://www.python.org/downloads/release/python-3100/) - [Docker](https://www.docker.com/) -- [Docker Compose](https://docs.docker.com/compose/) +- [Docker Compose >=2.20](https://docs.docker.com/compose/) - Docker engine with at least 8GB of memory to run tests. ::: diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 61ad2d623d72a..fb082bea7d151 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -8,7 +8,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - Updating MySQL version for quickstarts to 8.2, may cause quickstart issues for existing instances. 
- Neo4j 5.x, may require migration from 4.x -- Build now requires JDK17 (Runtime Java 11) +- Build requires JDK17 (Runtime Java 11) +- Build requires Docker Compose >= 2.20 ### Potential Downtime From 424057862790b520e6d6e7d9d0a04f52aa46e500 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 3 Jan 2024 17:16:16 -0500 Subject: [PATCH 36/48] feat(ui): switch to vite and vitest (#9451) --- .github/workflows/metadata-io.yml | 3 - .github/workflows/spark-smoke-test.yml | 3 +- build.gradle | 4 + datahub-frontend/build.gradle | 18 - datahub-frontend/conf/routes | 11 +- datahub-web-react/.env | 4 +- datahub-web-react/.eslintrc.js | 3 +- datahub-web-react/build.gradle | 66 +- datahub-web-react/craco.config.js | 75 - datahub-web-react/datahub-frontend.graphql | 389 - datahub-web-react/{public => }/index.html | 14 +- datahub-web-react/package.json | 59 +- .../public/{ => assets}/favicon.ico | Bin .../public/{ => assets}/logo.png | Bin datahub-web-react/public/manifest.json | 2 +- datahub-web-react/src/App.less | 5 +- datahub-web-react/src/App.test.tsx | 15 +- datahub-web-react/src/App.tsx | 35 +- datahub-web-react/src/Mocks.tsx | 12 + datahub-web-react/src/app/Routes.tsx | 4 +- .../src/app/analytics/analytics.ts | 2 +- .../src/app/domain/DomainIcon.tsx | 2 +- .../src/app/entity/dataJob/tabs/RunsTab.tsx | 2 +- .../entity/dataset/profile/OperationsTab.tsx | 2 +- .../dataset/profile/__tests__/Schema.test.tsx | 70 +- .../__tests__/SchemaDescriptionField.test.tsx | 4 +- .../__tests__/PlatformContent.test.tsx | 6 +- .../embed/UpstreamHealth/FailingEntity.tsx | 2 +- .../embed/UpstreamHealth/UpstreamHealth.tsx | 2 +- .../__tests__/DocumentationTab.test.tsx | 8 +- .../editor/__tests__/Editor.test.tsx | 2 +- .../Entity/__tests__/DataJobFlowTab.test.tsx | 6 +- .../entity/user/__tests__/UserHeader.test.tsx | 11 - .../ingest/source/builder/RecipeBuilder.tsx | 8 +- .../source/builder/RecipeForm/FormField.tsx | 8 +- .../source/builder/RecipeForm/RecipeForm.tsx | 8 +- .../RecipeForm/SecretField/SecretField.tsx | 16 +- .../TestConnection/TestConnectionModal.tsx | 2 +- .../app/ingest/source/builder/YamlEditor.tsx | 3 +- .../lineage/__tests__/LineageEdges.test.tsx | 22 +- .../__tests__/LineageEntityView.test.tsx | 2 +- .../lineage/__tests__/LineageTree.test.tsx | 12 +- .../policy/_tests_/policyUtils.test.tsx | 175 +- .../src/app/preview/DefaultPreviewCard.tsx | 4 +- .../__tests__/Recommendations.test.tsx | 1 + .../src/app/search/ToggleSidebarButton.tsx | 4 +- .../__tests__/FilterRendererRegistry.test.tsx | 6 +- .../src/app/search/filters/utils.tsx | 2 +- .../src/app/search/sidebar/EntityLink.tsx | 2 +- .../app/search/sorting/SearchSortSelect.tsx | 2 +- datahub-web-react/src/conf/Global.ts | 1 - .../src/conf/theme/global-variables.less | 26 +- .../src/graphql-mock/createServer.ts | 12 - datahub-web-react/src/graphql-mock/server.ts | 84 - datahub-web-react/src/index.tsx | 3 +- datahub-web-react/src/react-app-env.d.ts | 1 - datahub-web-react/src/setupProxy.js | 37 - datahub-web-react/src/setupTests.ts | 21 +- .../utils/test-utils/TestPageContainer.tsx | 2 +- datahub-web-react/src/vite-env.d.ts | 2 + datahub-web-react/tsconfig.json | 5 +- datahub-web-react/vite.config.ts | 100 + datahub-web-react/yarn.lock | 8860 +++-------------- smoke-test/tests/cypress/package-lock.json | 2031 ---- .../tests/read_only/test_services_up.py | 2 +- 65 files changed, 1905 insertions(+), 10400 deletions(-) delete mode 100644 datahub-web-react/craco.config.js delete mode 100644 datahub-web-react/datahub-frontend.graphql rename 
datahub-web-react/{public => }/index.html (66%) rename datahub-web-react/public/{ => assets}/favicon.ico (100%) rename datahub-web-react/public/{ => assets}/logo.png (100%) delete mode 100644 datahub-web-react/src/graphql-mock/createServer.ts delete mode 100644 datahub-web-react/src/graphql-mock/server.ts delete mode 100644 datahub-web-react/src/react-app-env.d.ts delete mode 100644 datahub-web-react/src/setupProxy.js create mode 100644 datahub-web-react/src/vite-env.d.ts create mode 100644 datahub-web-react/vite.config.ts delete mode 100644 smoke-test/tests/cypress/package-lock.json diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index 96229642244b6..c964352c3e129 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -40,9 +40,6 @@ jobs: python-version: "3.10" cache: "pip" - name: Gradle build (and test) - # there is some race condition in gradle build, which makes gradle never terminate in ~30% of the runs - # running build first without datahub-web-react:yarnBuild and then with it is 100% stable - # datahub-frontend:unzipAssets depends on datahub-web-react:yarnBuild but gradle does not know about it run: | ./gradlew :metadata-io:test - uses: actions/upload-artifact@v3 diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 94692bd3c2336..bd99905a513d6 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -51,8 +51,7 @@ jobs: -x :datahub-web-react:yarnLint \ -x :datahub-web-react:yarnGenerate \ -x :datahub-web-react:yarnInstall \ - -x :datahub-web-react:yarnQuickBuild \ - -x :datahub-web-react:copyAssets \ + -x :datahub-web-react:yarnBuild \ -x :datahub-web-react:distZip \ -x :datahub-web-react:jar - uses: actions/upload-artifact@v3 diff --git a/build.gradle b/build.gradle index bb01a15a7db8d..4680598165d28 100644 --- a/build.gradle +++ b/build.gradle @@ -325,6 +325,10 @@ subprojects { } plugins.withType(JavaPlugin).configureEach { + if (project.name == 'datahub-web-react') { + return + } + dependencies { implementation externalDependency.annotationApi constraints { diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index 437c72e6394ea..1174c5c5cfd5d 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -1,5 +1,4 @@ plugins { - id "io.github.kobylynskyi.graphql.codegen" version "4.1.1" id 'scala' id 'com.palantir.docker' id 'org.gradle.playframework' @@ -39,23 +38,6 @@ artifacts { archives myTar } -graphqlCodegen { - // For options: https://github.com/kobylynskyi/graphql-java-codegen/blob/master/docs/codegen-options.md - graphqlSchemaPaths = ["$projectDir/conf/datahub-frontend.graphql".toString()] - outputDir = new File("$projectDir/app/graphql") - packageName = "generated" - generateApis = true - modelValidationAnnotation = "" - customTypesMapping = [ - Long: "Long", - ] -} - -tasks.withType(Checkstyle) { - exclude "**/generated/**" -} - - /* PLAY UPGRADE NOTE Generates the distribution jars under the expected names. 
The playFramework plugin only accepts certain name values diff --git a/datahub-frontend/conf/routes b/datahub-frontend/conf/routes index 3102c26497fed..6b53a2789e7cc 100644 --- a/datahub-frontend/conf/routes +++ b/datahub-frontend/conf/routes @@ -36,11 +36,14 @@ PUT /openapi/*path c HEAD /openapi/*path controllers.Application.proxy(path: String, request: Request) PATCH /openapi/*path controllers.Application.proxy(path: String, request: Request) -# Map static resources from the /public folder to the /assets URL path -GET /assets/*file controllers.Assets.at(path="/public", file) - # Analytics route POST /track controllers.TrackingController.track(request: Request) -# Wildcard route accepts any routes and delegates to serveAsset which in turn serves the React Bundle +# Known React asset routes +GET /assets/*file controllers.Assets.at(path="/public/assets", file) +GET /node_modules/*file controllers.Assets.at(path="/public/node_modules", file) +GET /manifest.json controllers.Assets.at(path="/public", file="manifest.json") +GET /robots.txt controllers.Assets.at(path="/public", file="robots.txt") + +# Wildcard route accepts any routes and delegates to serveAsset which in turn serves the React Bundle's index.html GET /*path controllers.Application.index(path) diff --git a/datahub-web-react/.env b/datahub-web-react/.env index e5529bbdaa56d..7c02340752104 100644 --- a/datahub-web-react/.env +++ b/datahub-web-react/.env @@ -1,5 +1,3 @@ -PUBLIC_URL=/assets REACT_APP_THEME_CONFIG=theme_light.config.json SKIP_PREFLIGHT_CHECK=true -BUILD_PATH=build/yarn -REACT_APP_PROXY_TARGET=http://localhost:9002 \ No newline at end of file +REACT_APP_PROXY_TARGET=http://localhost:9002 diff --git a/datahub-web-react/.eslintrc.js b/datahub-web-react/.eslintrc.js index 2806942dd1053..e48dfdb23a4e7 100644 --- a/datahub-web-react/.eslintrc.js +++ b/datahub-web-react/.eslintrc.js @@ -5,7 +5,7 @@ module.exports = { 'airbnb-typescript', 'airbnb/hooks', 'plugin:@typescript-eslint/recommended', - 'plugin:jest/recommended', + 'plugin:vitest/recommended', 'prettier', ], plugins: ['@typescript-eslint'], @@ -46,6 +46,7 @@ module.exports = { argsIgnorePattern: '^_', }, ], + 'vitest/prefer-to-be': 'off', }, settings: { react: { diff --git a/datahub-web-react/build.gradle b/datahub-web-react/build.gradle index 72821d8b97dc0..c0355b935137a 100644 --- a/datahub-web-react/build.gradle +++ b/datahub-web-react/build.gradle @@ -19,7 +19,7 @@ node { version = '21.2.0' // Version of Yarn to use. - yarnVersion = '1.22.1' + yarnVersion = '1.22.21' // Base URL for fetching node distributions (set nodeDistBaseUrl if you have a mirror). if (project.hasProperty('nodeDistBaseUrl')) { @@ -44,10 +44,33 @@ node { */ task yarnInstall(type: YarnTask) { args = ['install'] + + // The node_modules directory can contain built artifacts, so + // it's not really safe to cache it. 
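+ // Declaring inputs and outputs below still lets Gradle's up-to-date check skip this task when yarn.lock and package.json are unchanged; cacheIf { false } only opts the task out of the build cache, not out of incremental builds.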
+ outputs.cacheIf { false } + + inputs.files( + file('yarn.lock'), + file('package.json'), + ) + outputs.dir('node_modules') } task yarnGenerate(type: YarnTask, dependsOn: yarnInstall) { args = ['run', 'generate'] + + outputs.cacheIf { true } + + inputs.files( + yarnInstall.inputs.files, + file('codegen.yml'), + project.fileTree(dir: "../datahub-graphql-core/src/main/resources/", include: "*.graphql"), + project.fileTree(dir: "src", include: "**/*.graphql"), + ) + + outputs.files( + project.fileTree(dir: "src", include: "**/*.generated.ts"), + ) } task yarnServe(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { @@ -55,7 +78,8 @@ task yarnServe(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { } task yarnTest(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { - args = ['run', 'test', '--watchAll', 'false'] + // Explicitly runs in non-watch mode. + args = ['run', 'test', 'run'] } task yarnLint(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { @@ -68,13 +92,24 @@ task yarnLintFix(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { args = ['run', 'lint-fix'] } -task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnTest, yarnLint]) { - args = ['run', 'build'] -} - -task yarnQuickBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { +task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) { environment = [NODE_OPTIONS: "--max-old-space-size=3072 --openssl-legacy-provider"] args = ['run', 'build'] + + outputs.cacheIf { true } + inputs.files( + file('index.html'), + project.fileTree(dir: "src"), + project.fileTree(dir: "public"), + + yarnInstall.inputs.files, + yarnGenerate.outputs.files, + + file('.env'), + file('vite.config.ts'), + file('tsconfig.json'), + ) + outputs.dir('dist') } task cleanExtraDirs { @@ -82,9 +117,8 @@ task cleanExtraDirs { delete 'dist' delete 'tmp' delete 'just' - delete 'src/types.generated.ts' delete fileTree('../datahub-frontend/public') - delete fileTree(dir: 'src/graphql', include: '*.generated.ts') + delete fileTree(dir: 'src', include: '*.generated.ts') } clean.finalizedBy(cleanExtraDirs) @@ -93,24 +127,16 @@ configurations { } distZip { - dependsOn yarnQuickBuild + dependsOn yarnBuild archiveFileName = "datahub-web-react-${archiveVersion}.${archiveExtension}" from 'dist' } -task copyAssets(dependsOn: distZip) { - doLast { - copy { - from zipTree(distZip.outputs.files.first()) - into "../datahub-frontend/public" - } - } -} - jar { - dependsOn distZip, copyAssets + dependsOn distZip into('public') { from zipTree(distZip.outputs.files.first()) } archiveClassifier = 'assets' } +build.dependsOn jar diff --git a/datahub-web-react/craco.config.js b/datahub-web-react/craco.config.js deleted file mode 100644 index 6ede45902128f..0000000000000 --- a/datahub-web-react/craco.config.js +++ /dev/null @@ -1,75 +0,0 @@ -/* eslint-disable @typescript-eslint/no-var-requires */ -require('dotenv').config(); -const { whenProd } = require('@craco/craco'); -const CracoAntDesignPlugin = require('craco-antd'); -const path = require('path'); -const CopyWebpackPlugin = require('copy-webpack-plugin'); - -// eslint-disable-next-line import/no-dynamic-require -const themeConfig = require(`./src/conf/theme/${process.env.REACT_APP_THEME_CONFIG}`); - -function addLessPrefixToKeys(styles) { - const output = {}; - Object.keys(styles).forEach((key) => { - output[`@${key}`] = styles[key]; - }); - return output; -} - -module.exports = { - webpack: { - configure: { - optimization: whenProd(() => ({ - splitChunks: { - cacheGroups: { - 
vendor: { - test: /[\\/]node_modules[\\/]/, - name: 'vendors', - chunks: 'all', - }, - }, - }, - })), - // Webpack 5 no longer automatically pollyfill core Node.js modules - resolve: { fallback: { fs: false } }, - // Ignore Webpack 5's missing source map warnings from node_modules - ignoreWarnings: [{ module: /node_modules/, message: /source-map-loader/ }], - }, - plugins: { - add: [ - // Self host images by copying them to the build directory - new CopyWebpackPlugin({ - patterns: [{ from: 'src/images', to: 'platforms' }], - }), - // Copy monaco-editor files to the build directory - new CopyWebpackPlugin({ - patterns: [ - { from: 'node_modules/monaco-editor/min/vs/', to: 'monaco-editor/vs' }, - { from: 'node_modules/monaco-editor/min-maps/vs/', to: 'monaco-editor/min-maps/vs' }, - ], - }), - ], - }, - }, - plugins: [ - { - plugin: CracoAntDesignPlugin, - options: { - customizeThemeLessPath: path.join(__dirname, 'src/conf/theme/global-variables.less'), - customizeTheme: addLessPrefixToKeys(themeConfig.styles), - }, - }, - ], - jest: { - configure: { - // Use dist files instead of source files - moduleNameMapper: { - '^d3-interpolate-path': `d3-interpolate-path/build/d3-interpolate-path`, - '^d3-(.*)$': `d3-$1/dist/d3-$1`, - '^lib0/((?!dist).*)$': 'lib0/dist/$1.cjs', - '^y-protocols/(.*)$': 'y-protocols/dist/$1.cjs', - '\\.(css|less)$': '/src/__mocks__/styleMock.js', - }, - }, - }, -}; diff --git a/datahub-web-react/datahub-frontend.graphql b/datahub-web-react/datahub-frontend.graphql deleted file mode 100644 index 6df3c387e14fe..0000000000000 --- a/datahub-web-react/datahub-frontend.graphql +++ /dev/null @@ -1,389 +0,0 @@ -scalar Long - -schema { - query: Query - mutation: Mutation -} - -type Query { - dataset(urn: String!): Dataset - user(urn: String!): CorpUser - search(input: SearchInput!): SearchResults - autoComplete(input: AutoCompleteInput!): AutoCompleteResults - browse(input: BrowseInput!): BrowseResults - browsePaths(input: BrowsePathsInput!): [[String!]!] -} - -type Mutation { - logIn(username: String!, password: String!): CorpUser - updateDataset(input: DatasetUpdateInput!): Dataset -} - -input DatasetUpdateInput { - urn: String! - ownership: OwnershipUpdate -} - -input OwnershipUpdate { - owners: [OwnerUpdate!] -} - -input OwnerUpdate { - # The owner URN, eg urn:li:corpuser:1 - owner: String! - - # The owner role type - type: OwnershipType! -} - -enum OwnershipSourceType { - AUDIT - DATABASE - FILE_SYSTEM - ISSUE_TRACKING_SYSTEM - MANUAL - SERVICE - SOURCE_CONTROL - OTHER -} - -type OwnershipSource { - """ - The type of the source - """ - type: OwnershipSourceType! - - """ - A reference URL for the source - """ - url: String -} - -enum OwnershipType { - """ - A person or group that is in charge of developing the code - """ - DEVELOPER - - """ - A person or group that is owning the data - """ - DATAOWNER - - """ - A person or a group that overseas the operation, e.g. a DBA or SRE. - """ - DELEGATE - - """ - A person, group, or service that produces/generates the data - """ - PRODUCER - - """ - A person, group, or service that consumes the data - """ - CONSUMER - - """ - A person or a group that has direct business interest - """ - STAKEHOLDER -} - -type Owner { - """ - Owner object - """ - owner: CorpUser! - - """ - The type of the ownership - """ - type: OwnershipType - - """ - Source information for the ownership - """ - source: OwnershipSource -} - -type Ownership { - owners: [Owner!] - - lastModified: Long! 
-} - -enum FabricType { - """ - Designates development fabrics - """ - DEV - - """ - Designates early-integration (staging) fabrics - """ - EI - - """ - Designates production fabrics - """ - PROD - - """ - Designates corporation fabrics - """ - CORP -} - -enum PlatformNativeType { - """ - Table - """ - TABLE - - """ - View - """ - VIEW - - """ - Directory in file system - """ - DIRECTORY - - """ - Stream - """ - STREAM - - """ - Bucket in key value store - """ - BUCKET -} - -type PropertyTuple { - key: String! - value: String -} - -type SubTypes { - typeNames: [String!] -} - -type Dataset { - urn: String! - - platform: String! - - name: String! - - origin: FabricType! - - description: String - - uri: String - - platformNativeType: PlatformNativeType - - tags: [String!]! - - properties: [PropertyTuple!] - - createdTime: Long! - - modifiedTime: Long! - - ownership: Ownership - - subTypes: SubTypes -} - -type CorpUserInfo { - active: Boolean! - - displayName: String - - email: String! - - title: String - - manager: CorpUser - - departmentId: Long - - departmentName: String - - firstName: String - - lastName: String - - fullName: String - - countryCode: String -} - -type CorpUserEditableInfo { - aboutMe: String - - teams: [String!] - - skills: [String!] - - pictureLink: String -} - -type CorpUser { - urn: String! - - username: String! - - info: CorpUserInfo - - editableInfo: CorpUserEditableInfo -} - -type CorpGroup implements Entity { - """ - The unique user URN - """ - urn: String! - - """ - GMS Entity Type - """ - type: EntityType! - - """ - group name e.g. wherehows-dev, ask_metadata - """ - name: String - - """ - Information of the corp group - """ - info: CorpGroupInfo -} - - -type CorpGroupInfo { - """ - email of this group - """ - email: String! - - """ - owners of this group - """ - admins: [String!]! - - """ - List of ldap urn in this group. - """ - members: [String!]! - - """ - List of groups in this group. - """ - groups: [String!]! -} - -enum EntityType { - DATASET - USER - DATA_FLOW - DATA_JOB - CORP_USER - CORP_GROUP -} - -# Search Input -input SearchInput { - type: EntityType! - query: String! - start: Int - count: Int - filters: [FacetFilterInput!] -} - -input FacetFilterInput { - field: String! # Facet Field Name - value: String! # Facet Value -} - -# Search Output -type SearchResults { - start: Int! - count: Int! - total: Int! - elements: [SearchResult!]! - facets: [FacetMetadata!] -} - -union SearchResult = Dataset | CorpUser - -type FacetMetadata { - field: String! - aggregations: [AggregationMetadata!]! -} - -type AggregationMetadata { - value: String! - count: Long! -} - -# Autocomplete Input -input AutoCompleteInput { - type: EntityType! - query: String! - field: String # Field name - limit: Int - filters: [FacetFilterInput!] -} - -# Autocomplete Output -type AutoCompleteResults { - query: String! - suggestions: [String!]! -} - -# Browse Inputs -input BrowseInput { - type: EntityType! - path: [String!] - start: Int - count: Int - filters: [FacetFilterInput!] -} - -# Browse Output -type BrowseResults { - entities: [BrowseResultEntity!]! - start: Int! - count: Int! - total: Int! - metadata: BrowseResultMetadata! -} - -type BrowseResultEntity { - name: String! - urn: String! -} - -type BrowseResultMetadata { - path: [String!] - groups: [BrowseResultGroup!]! - totalNumEntities: Long! -} - -type BrowseResultGroup { - name: String! - count: Long! -} - -# Browse Paths Input -input BrowsePathsInput { - type: EntityType! - urn: String! 
-} diff --git a/datahub-web-react/public/index.html b/datahub-web-react/index.html similarity index 66% rename from datahub-web-react/public/index.html rename to datahub-web-react/index.html index ead3a0aba82cb..9490881246e12 100644 --- a/datahub-web-react/public/index.html +++ b/datahub-web-react/index.html @@ -2,7 +2,7 @@ - + @@ -10,21 +10,13 @@ manifest.json provides metadata used when your web app is installed on a user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/ --> - - + DataHub
+
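Note: the patch continues beyond this excerpt, and the body of the newly created datahub-web-react/vite.config.ts (100 lines in the diffstat above) is not shown. As a rough sketch only, and not the actual file from this PR, a Vite + Vitest config consistent with the surrounding changes (the /assets routes added to conf/routes, the REACT_APP_PROXY_TARGET kept in .env, the dist directory that distZip now consumes, and the src/setupTests.ts retained in the diffstat) might look like this; every option and path below is an assumption:

```ts
/// <reference types="vitest" />
import { defineConfig } from 'vite';
import react from '@vitejs/plugin-react';

// Hypothetical sketch only; not the vite.config.ts added by this patch.
export default defineConfig({
    plugins: [react()],
    // Assumption: the Play frontend mounts the bundle under /assets (see conf/routes).
    base: '/assets/',
    build: {
        // distZip in datahub-web-react/build.gradle zips the `dist` directory.
        outDir: 'dist',
    },
    server: {
        proxy: {
            // Assumption: API calls are forwarded to the Play server during `yarn start`.
            '/api': process.env.REACT_APP_PROXY_TARGET || 'http://localhost:9002',
        },
    },
    test: {
        // Vitest replaces the CRA/Jest setup; `yarn test run` executes once, non-watch.
        environment: 'jsdom',
        globals: true,
        setupFiles: './src/setupTests.ts',
    },
});
```

Keeping the vitest `test` block inside the same config is what lets the yarnTest Gradle task's `yarn test run` share plugins and module resolution with `yarn start` and `yarn build`.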