From 4c83f10cdbd18c7a806061e87eeee024dc9bb5d8 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 29 Feb 2024 16:13:43 -0800 Subject: [PATCH 01/13] feat(ingest): throw codegen error on duplicate class names (#9960) --- metadata-ingestion/scripts/avro_codegen.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index bd4988f990534a..fbf45c08bb61e4 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -148,11 +148,27 @@ def merge_schemas(schemas_obj: List[dict]) -> str: # Combine schemas as a "union" of all of the types. merged = ["null"] + schemas_obj + # Check that we don't have the same class name in multiple namespaces. + names_to_spaces: Dict[str, str] = {} + # Patch add_name method to NOT complain about duplicate names. class NamesWithDups(avro.schema.Names): def add_name(self, name_attr, space_attr, new_schema): + to_add = avro.schema.Name(name_attr, space_attr, self.default_namespace) + assert to_add.name + assert to_add.space assert to_add.fullname + + if to_add.name in names_to_spaces: + if names_to_spaces[to_add.name] != to_add.space: + raise ValueError( + f"Duplicate name {to_add.name} in namespaces {names_to_spaces[to_add.name]} and {to_add.space}. " + "This will cause conflicts in the generated code." 
+ ) + else: + names_to_spaces[to_add.name] = to_add.space + self.names[to_add.fullname] = new_schema return to_add From 05593f443e1af7abc33a83b93424ffb437584fc7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 29 Feb 2024 18:05:14 -0800 Subject: [PATCH 02/13] feat(docker): respect pip mirrors with uv (#9963) --- docker/datahub-ingestion-base/Dockerfile | 16 +++++++++------- docker/datahub-ingestion/Dockerfile | 18 +++++++++++------- docker/datahub-ingestion/Dockerfile-slim-only | 19 +++++++++---------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 220c6f7d448aff..bfd4ee1143f5ef 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -5,7 +5,7 @@ ARG BASE_IMAGE=base ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG DEBIAN_REPO_URL=https://deb.debian.org/debian -ARG PIP_MIRROR_URL=null +ARG PIP_MIRROR_URL=https://pypi.python.org/simple FROM golang:1-alpine3.18 AS dockerize-binary @@ -26,15 +26,18 @@ RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION FROM python:3.10 as base -ARG DEBIAN_REPO_URL -ARG PIP_MIRROR_URL ARG GITHUB_REPO_URL ENV DEBIAN_FRONTEND noninteractive -# Optionally set corporate mirror for apk and pip +# Optionally set corporate mirror for deb +ARG DEBIAN_REPO_URL RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi + +# Optionally set corporate mirror for pip +ARG PIP_MIRROR_URL +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi +ENV UV_INDEX_URL=${PIP_MIRROR_URL} RUN apt-get update && apt-get install -y -qq \ python3-ldap \ 
@@ -67,8 +70,7 @@ USER datahub ENV VIRTUAL_ENV=/datahub-ingestion/.venv ENV PATH="${VIRTUAL_ENV}/bin:$PATH" RUN python3 -m venv $VIRTUAL_ENV && \ - uv pip install --no-cache -r requirements.txt && \ - pip uninstall -y acryl-datahub + uv pip install --no-cache -r requirements.txt ENTRYPOINT [ "/entrypoint.sh" ] diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 6c8829557837cf..3f29417dca0d78 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -2,12 +2,22 @@ ARG APP_ENV=full ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=head -ARG PIP_MIRROR_URL=null ARG DEBIAN_REPO_URL=https://deb.debian.org/debian +ARG PIP_MIRROR_URL=https://pypi.python.org/simple FROM $BASE_IMAGE:$DOCKER_VERSION as base + +# Optionally set corporate mirror for deb +USER 0 +ARG DEBIAN_REPO_URL +RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi USER datahub +# Optionally set corporate mirror for pip +ARG PIP_MIRROR_URL +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi +ENV UV_INDEX_URL=${PIP_MIRROR_URL} + COPY --chown=datahub ./metadata-ingestion /datahub-ingestion COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin @@ -19,23 +29,17 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS cat airflow-plugin/src/datahub_airflow_plugin/__init__.py | grep __version__ FROM base as slim-install -ARG PIP_MIRROR_URL -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi RUN uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] 
@ ." FROM base as full-install-build -ARG PIP_MIRROR_URL -ARG DEBIAN_REPO_URL USER 0 -RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi RUN apt-get update && apt-get install -y -qq maven USER datahub COPY ./docker/datahub-ingestion/pyspark_jars.sh . -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi RUN uv pip install --no-cache "acryl-datahub[base,all] @ ." "acryl-datahub-airflow-plugin[plugin-v2] @ ./airflow-plugin" && \ datahub --version RUN ./pyspark_jars.sh diff --git a/docker/datahub-ingestion/Dockerfile-slim-only b/docker/datahub-ingestion/Dockerfile-slim-only index ba43bd3c3c6be7..a5f2a93e8a27bd 100644 --- a/docker/datahub-ingestion/Dockerfile-slim-only +++ b/docker/datahub-ingestion/Dockerfile-slim-only @@ -1,26 +1,25 @@ # Defining environment ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=head-slim -ARG PIP_MIRROR_URL=null +ARG PIP_MIRROR_URL=https://pypi.python.org/simple FROM $BASE_IMAGE:$DOCKER_VERSION as base -USER 0 +USER datahub + +# Optionally set corporate mirror for apk and pip +ARG PIP_MIRROR_URL +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi +ENV UV_INDEX_URL=${PIP_MIRROR_URL} -COPY ./metadata-ingestion /datahub-ingestion +COPY --chown=datahub ./metadata-ingestion /datahub-ingestion ARG RELEASE_VERSION WORKDIR /datahub-ingestion RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \ - cat src/datahub/__init__.py && \ - chown -R datahub /datahub-ingestion - -USER datahub + cat src/datahub/__init__.py FROM base as slim-install -ARG PIP_MIRROR_URL - -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi RUN uv pip install --no-cache 
"acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ." && \ datahub --version From 6eb5f80a9a6d5409443307b22fd81b2c1b6022e6 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Fri, 1 Mar 2024 16:31:21 +0100 Subject: [PATCH 03/13] Openlineage endpoint and Spark Lineage Beta Plugin (#9870) Co-authored-by: David Leifker --- build.gradle | 3 + docs-website/sidebars.js | 4 + .../AbstractMultiFieldPatchBuilder.java | 9 + .../DataJobInputOutputPatchBuilder.java | 36 +- .../builder/UpstreamLineagePatchBuilder.java | 99 +- .../patch/template/AspectTemplateEngine.java | 1 - .../java/datahub-client/build.gradle | 1 + .../java/datahub-event/build.gradle | 61 + .../java/datahub/event/EventFormatter.java | 0 .../event/EventValidationException.java | 0 .../event/MetadataChangeProposalWrapper.java | 0 .../java/datahub/event/StringEscapeUtils.java | 0 .../datahub/event/UpsertAspectRequest.java | 0 .../resources/JavaSpring/interface.mustache | 21 + .../main/resources/JavaSpring/model.mustache | 41 + .../main/resources/JavaSpring/pojo.mustache | 166 ++ .../resources/MetadataChangeProposal.avsc | 174 ++ .../src/main/resources/client.properties | 1 + .../datahub/event/EventFormatterTest.java | 53 + .../MetadataChangeProposalWrapperTest.java | 76 + .../resources/golden_files/mcps_golden.json | 1543 +++++++++++++++++ .../java/openlineage-converter/README.md | 8 + .../java/openlineage-converter/build.gradle | 55 + .../config/DatahubOpenlineageConfig.java | 43 + .../converter/OpenLineageToDataHub.java | 826 +++++++++ .../dataset/CatalogTableDataset.java | 13 + .../openlineage/dataset/DatahubDataset.java | 19 + .../openlineage/dataset/DatahubJob.java | 445 +++++ .../openlineage/dataset/Dataset.java | 29 + .../openlineage/dataset/HdfsPathDataset.java | 206 +++ .../openlineage/dataset/HdfsPlatform.java | 32 + .../openlineage/dataset/PathSpec.java | 20 + 
.../openlineage/dataset/SparkDataset.java | 33 + .../openlineage/utils/DatahubUtils.java | 37 + .../openlineage/HdfsPathDatasetTest.java | 271 +++ .../src/test/resources/log4j.properties | 8 + .../java/spark-lineage-beta/README.md | 343 ++++ .../java/spark-lineage-beta/build.gradle | 254 +++ .../spark-lineage-beta/scripts/check_jar.sh | 55 + .../datahub/spark/DatahubEventEmitter.java | 414 +++++ .../datahub/spark/DatahubSparkListener.java | 230 +++ .../spark/conf/DatahubEmitterConfig.java | 5 + .../spark/conf/RestDatahubEmitterConfig.java | 18 + .../datahub/spark/conf/SparkAppContext.java | 14 + .../datahub/spark/conf/SparkConfigParser.java | 323 ++++ .../datahub/spark/conf/SparkLineageConf.java | 54 + .../SparkStreamingEventToDatahub.java | 194 +++ .../lifecycle/OpenLineageRunEventBuilder.java | 493 ++++++ .../plan/LogicalRelationDatasetBuilder.java | 220 +++ .../spark/agent/util/PathUtils.java | 207 +++ .../spark/agent/util/PlanUtils.java | 343 ++++ .../plan/catalog/IcebergHandler.java | 192 ++ .../datahub/spark/HdfsPathDatasetTest.java | 291 ++++ .../SparkStreamingEventToDatahubTest.java | 486 ++++++ .../src/test/resources/log4j.properties | 10 + .../ol_events/sample_failed_spark.json | 104 ++ .../resources/ol_events/sample_spark.json | 114 ++ .../log4j-defaults.properties | 9 + metadata-service/openapi-servlet/build.gradle | 36 +- .../openapi/config/SpringWebConfig.java | 11 + .../config/OpenLineageServletConfig.java | 29 + .../controller/LineageApiImpl.java | 86 + .../openlineage/mapping/RunEventMapper.java | 37 + .../openlineage/model/LineageBody.java | 26 + .../resources/openlineage/openlineage.json | 413 +++++ settings.gradle | 3 + 66 files changed, 9328 insertions(+), 20 deletions(-) create mode 100644 metadata-integration/java/datahub-event/build.gradle rename metadata-integration/java/{datahub-client => datahub-event}/src/main/java/datahub/event/EventFormatter.java (100%) rename metadata-integration/java/{datahub-client => 
datahub-event}/src/main/java/datahub/event/EventValidationException.java (100%) rename metadata-integration/java/{datahub-client => datahub-event}/src/main/java/datahub/event/MetadataChangeProposalWrapper.java (100%) rename metadata-integration/java/{datahub-client => datahub-event}/src/main/java/datahub/event/StringEscapeUtils.java (100%) rename metadata-integration/java/{datahub-client => datahub-event}/src/main/java/datahub/event/UpsertAspectRequest.java (100%) create mode 100644 metadata-integration/java/datahub-event/src/main/resources/JavaSpring/interface.mustache create mode 100644 metadata-integration/java/datahub-event/src/main/resources/JavaSpring/model.mustache create mode 100644 metadata-integration/java/datahub-event/src/main/resources/JavaSpring/pojo.mustache create mode 100644 metadata-integration/java/datahub-event/src/main/resources/MetadataChangeProposal.avsc create mode 100644 metadata-integration/java/datahub-event/src/main/resources/client.properties create mode 100644 metadata-integration/java/datahub-event/src/test/java/datahub/event/EventFormatterTest.java create mode 100644 metadata-integration/java/datahub-event/src/test/java/datahub/event/MetadataChangeProposalWrapperTest.java create mode 100644 metadata-integration/java/datahub-event/src/test/resources/golden_files/mcps_golden.json create mode 100644 metadata-integration/java/openlineage-converter/README.md create mode 100644 metadata-integration/java/openlineage-converter/build.gradle create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/CatalogTableDataset.java create mode 100644 
metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubDataset.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/Dataset.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPathDataset.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPlatform.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/PathSpec.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/SparkDataset.java create mode 100644 metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/utils/DatahubUtils.java create mode 100644 metadata-integration/java/openlineage-converter/src/test/java/io/datahubproject/openlineage/HdfsPathDatasetTest.java create mode 100644 metadata-integration/java/openlineage-converter/src/test/resources/log4j.properties create mode 100644 metadata-integration/java/spark-lineage-beta/README.md create mode 100644 metadata-integration/java/spark-lineage-beta/build.gradle create mode 100755 metadata-integration/java/spark-lineage-beta/scripts/check_jar.sh create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubEventEmitter.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/DatahubEmitterConfig.java create mode 100644 
metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/RestDatahubEmitterConfig.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkAppContext.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkConfigParser.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkLineageConf.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/converter/SparkStreamingEventToDatahub.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/OpenLineageRunEventBuilder.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/plan/LogicalRelationDatasetBuilder.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PathUtils.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark3/agent/lifecycle/plan/catalog/IcebergHandler.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/HdfsPathDatasetTest.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/SparkStreamingEventToDatahubTest.java create mode 100644 metadata-integration/java/spark-lineage-beta/src/test/resources/log4j.properties create mode 100644 metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_failed_spark.json create mode 100644 metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_spark.json create mode 100644 
metadata-integration/java/spark-lineage-beta/src/test/resources/org.apache.spark/log4j-defaults.properties create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/config/OpenLineageServletConfig.java create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/mapping/RunEventMapper.java create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/model/LineageBody.java create mode 100644 metadata-service/openapi-servlet/src/main/resources/openlineage/openlineage.json diff --git a/build.gradle b/build.gradle index 228a8a9f5ff0a0..fe6af725be7704 100644 --- a/build.gradle +++ b/build.gradle @@ -52,6 +52,8 @@ buildscript { ext.hazelcastVersion = '5.3.6' ext.ebeanVersion = '12.16.1' ext.googleJavaFormatVersion = '1.18.1' + ext.openLineageVersion = '1.5.0' + ext.logbackClassicJava8 = '1.2.12' ext.docker_registry = 'linkedin' @@ -176,6 +178,7 @@ project.ext.externalDependency = [ 'kafkaClients': "org.apache.kafka:kafka-clients:$kafkaVersion", 'snappy': 'org.xerial.snappy:snappy-java:1.1.10.4', 'logbackClassic': "ch.qos.logback:logback-classic:$logbackClassic", + 'logbackClassicJava8' : "ch.qos.logback:logback-classic:$logbackClassicJava8", 'slf4jApi': "org.slf4j:slf4j-api:$slf4jVersion", 'log4jCore': "org.apache.logging.log4j:log4j-core:$log4jVersion", 'log4jApi': "org.apache.logging.log4j:log4j-api:$log4jVersion", diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 86a3b9176e632e..13bda5d735f3e7 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -299,6 +299,7 @@ module.exports = { "metadata-integration/java/spark-lineage/README", "metadata-ingestion/integration_docs/great-expectations", "metadata-integration/java/datahub-protobuf/README", + 
"metadata-integration/java/spark-lineage-beta/README", //"metadata-ingestion/source-docs-template", { type: "autogenerated", @@ -746,6 +747,9 @@ module.exports = { //"docs/how/build-metadata-service", //"docs/how/graph-onboarding", //"docs/demo/graph-onboarding", + //"metadata-integration/java/spark-lineage/README", + // "metadata-integration/java/spark-lineage-beta/README.md + // "metadata-integration/java/openlineage-converter/README" //"metadata-ingestion-modules/airflow-plugin/README" // "metadata-ingestion/schedule_docs/datahub", // we can delete this // TODO: change the titles of these, removing the "What is..." portion from the sidebar" diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/AbstractMultiFieldPatchBuilder.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/AbstractMultiFieldPatchBuilder.java index 165a4d26c339cc..3920d5dc2ea15f 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/AbstractMultiFieldPatchBuilder.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/AbstractMultiFieldPatchBuilder.java @@ -12,6 +12,7 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; +import javax.annotation.Nonnull; import org.apache.commons.lang3.tuple.ImmutableTriple; public abstract class AbstractMultiFieldPatchBuilder> { @@ -64,6 +65,14 @@ public T urn(Urn urn) { */ protected abstract String getEntityType(); + protected static String encodeValue(@Nonnull String value) { + return value.replace("~ ", "~0").replace("/", "~1"); + } + + protected static String encodeValueUrn(@Nonnull Urn urn) { + return encodeValue(urn.toString()); + } + /** * Overrides basic behavior to construct multiple patches based on properties * diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java 
b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java index 8e2168e5b6a338..6fffb17521ddb7 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/DataJobInputOutputPatchBuilder.java @@ -32,7 +32,7 @@ public DataJobInputOutputPatchBuilder addInputDatajobEdge(@Nonnull DataJobUrn da pathValues.add( ImmutableTriple.of( PatchOperationType.ADD.getValue(), - INPUT_DATA_JOB_EDGES_PATH_START + dataJobUrn, + INPUT_DATA_JOB_EDGES_PATH_START + encodeValue(dataJobUrn.toString()), value)); return this; } @@ -41,7 +41,7 @@ public DataJobInputOutputPatchBuilder removeInputDatajobEdge(@Nonnull DataJobUrn pathValues.add( ImmutableTriple.of( PatchOperationType.REMOVE.getValue(), - INPUT_DATA_JOB_EDGES_PATH_START + dataJobUrn, + INPUT_DATA_JOB_EDGES_PATH_START + encodeValue(dataJobUrn.toString()), null)); return this; } @@ -51,7 +51,9 @@ public DataJobInputOutputPatchBuilder addInputDatasetEdge(@Nonnull DatasetUrn da pathValues.add( ImmutableTriple.of( - PatchOperationType.ADD.getValue(), INPUT_DATASET_EDGES_PATH_START + datasetUrn, value)); + PatchOperationType.ADD.getValue(), + INPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()), + value)); return this; } @@ -59,7 +61,7 @@ public DataJobInputOutputPatchBuilder removeInputDatasetEdge(@Nonnull DatasetUrn pathValues.add( ImmutableTriple.of( PatchOperationType.REMOVE.getValue(), - INPUT_DATASET_EDGES_PATH_START + datasetUrn, + INPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()), null)); return this; } @@ -70,7 +72,7 @@ public DataJobInputOutputPatchBuilder addOutputDatasetEdge(@Nonnull DatasetUrn d pathValues.add( ImmutableTriple.of( PatchOperationType.ADD.getValue(), - OUTPUT_DATASET_EDGES_PATH_START + datasetUrn, + OUTPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()), value)); return 
this; } @@ -79,7 +81,7 @@ public DataJobInputOutputPatchBuilder removeOutputDatasetEdge(@Nonnull DatasetUr pathValues.add( ImmutableTriple.of( PatchOperationType.REMOVE.getValue(), - OUTPUT_DATASET_EDGES_PATH_START + datasetUrn, + OUTPUT_DATASET_EDGES_PATH_START + encodeValue(datasetUrn.toString()), null)); return this; } @@ -88,7 +90,9 @@ public DataJobInputOutputPatchBuilder addInputDatasetField(@Nonnull Urn urn) { TextNode textNode = instance.textNode(urn.toString()); pathValues.add( ImmutableTriple.of( - PatchOperationType.ADD.getValue(), INPUT_DATASET_FIELDS_PATH_START + urn, textNode)); + PatchOperationType.ADD.getValue(), + INPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()), + textNode)); return this; } @@ -96,7 +100,9 @@ public DataJobInputOutputPatchBuilder addInputDatasetField(@Nonnull Urn urn) { public DataJobInputOutputPatchBuilder removeInputDatasetField(@Nonnull Urn urn) { pathValues.add( ImmutableTriple.of( - PatchOperationType.REMOVE.getValue(), INPUT_DATASET_FIELDS_PATH_START + urn, null)); + PatchOperationType.REMOVE.getValue(), + INPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()), + null)); return this; } @@ -104,7 +110,9 @@ public DataJobInputOutputPatchBuilder addOutputDatasetField(@Nonnull Urn urn) { TextNode textNode = instance.textNode(urn.toString()); pathValues.add( ImmutableTriple.of( - PatchOperationType.ADD.getValue(), OUTPUT_DATASET_FIELDS_PATH_START + urn, textNode)); + PatchOperationType.ADD.getValue(), + OUTPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()), + textNode)); return this; } @@ -112,7 +120,9 @@ public DataJobInputOutputPatchBuilder addOutputDatasetField(@Nonnull Urn urn) { public DataJobInputOutputPatchBuilder removeOutputDatasetField(@Nonnull Urn urn) { pathValues.add( ImmutableTriple.of( - PatchOperationType.REMOVE.getValue(), OUTPUT_DATASET_FIELDS_PATH_START + urn, null)); + PatchOperationType.REMOVE.getValue(), + OUTPUT_DATASET_FIELDS_PATH_START + encodeValue(urn.toString()), + null)); 
return this; } @@ -147,17 +157,17 @@ private String getEdgePath(@Nonnull Edge edge, LineageDirection direction) { if (DATASET_ENTITY_NAME.equals(destinationUrn.getEntityType()) && LineageDirection.UPSTREAM.equals(direction)) { - return INPUT_DATASET_EDGES_PATH_START + destinationUrn; + return INPUT_DATASET_EDGES_PATH_START + encodeValue(destinationUrn.toString()); } if (DATASET_ENTITY_NAME.equals(destinationUrn.getEntityType()) && LineageDirection.DOWNSTREAM.equals(direction)) { - return INPUT_DATASET_EDGES_PATH_START + destinationUrn; + return INPUT_DATASET_EDGES_PATH_START + encodeValue(destinationUrn.toString()); } if (DATA_JOB_ENTITY_NAME.equals(destinationUrn.getEntityType()) && LineageDirection.UPSTREAM.equals(direction)) { - return INPUT_DATA_JOB_EDGES_PATH_START + destinationUrn; + return INPUT_DATA_JOB_EDGES_PATH_START + encodeValue(destinationUrn.toString()); } // TODO: Output Data Jobs not supported by aspect, add here if this changes diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java index a5fd3ac0ba89db..40b6de68f7b56a 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/builder/UpstreamLineagePatchBuilder.java @@ -9,6 +9,8 @@ import com.linkedin.common.urn.DatasetUrn; import com.linkedin.common.urn.Urn; import com.linkedin.dataset.DatasetLineageType; +import com.linkedin.dataset.FineGrainedLineageDownstreamType; +import com.linkedin.dataset.FineGrainedLineageUpstreamType; import com.linkedin.metadata.aspect.patch.PatchOperationType; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -40,14 +42,60 @@ public UpstreamLineagePatchBuilder addUpstream( pathValues.add( ImmutableTriple.of( - PatchOperationType.ADD.getValue(), 
UPSTREAMS_PATH_START + datasetUrn, value)); + PatchOperationType.ADD.getValue(), + UPSTREAMS_PATH_START + encodeValueUrn(datasetUrn), + value)); return this; } public UpstreamLineagePatchBuilder removeUpstream(@Nonnull DatasetUrn datasetUrn) { pathValues.add( ImmutableTriple.of( - PatchOperationType.REMOVE.getValue(), UPSTREAMS_PATH_START + datasetUrn, null)); + PatchOperationType.REMOVE.getValue(), + UPSTREAMS_PATH_START + encodeValueUrn(datasetUrn), + null)); + return this; + } + + /** + * Adds a field as a fine grained upstream + * + * @param schemaFieldUrn a schema field to be marked as upstream, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for + * full confidence + * @param transformationOperation string operation type that describes the transformation + * operation happening in the lineage edge + * @param type the upstream lineage type, either Field or Field Set + * @return this builder + */ + public UpstreamLineagePatchBuilder addFineGrainedUpstreamField( + @Nonnull Urn schemaFieldUrn, + @Nullable Float confidenceScore, + @Nonnull String transformationOperation, + @Nullable FineGrainedLineageUpstreamType type) { + Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore); + String finalType; + if (type == null) { + // Default to set of fields if not explicitly a single field + finalType = FineGrainedLineageUpstreamType.FIELD_SET.toString(); + } else { + finalType = type.toString(); + } + + pathValues.add( + ImmutableTriple.of( + PatchOperationType.ADD.getValue(), + FINE_GRAINED_PATH_START + + transformationOperation + + "/" + + "upstreamType" + + "/" + + finalType + + "/" + + encodeValueUrn(schemaFieldUrn), + instance.numberNode(finalConfidenceScore))); + return this; } @@ -91,9 +139,50 @@ public UpstreamLineagePatchBuilder addFineGrainedUpstreamField( + "/" + finalQueryUrn + "/" - + upstreamSchemaField, - fineGrainedLineageNode)); + + 
encodeValueUrn(upstreamSchemaField), + instance.numberNode(finalConfidenceScore))); + + return this; + } + /** + * Adds a field as a fine grained downstream + * + * @param schemaFieldUrn a schema field to be marked as downstream, format: + * urn:li:schemaField(DATASET_URN, COLUMN NAME) + * @param confidenceScore optional, confidence score for the lineage edge. Defaults to 1.0 for + * full confidence + * @param transformationOperation string operation type that describes the transformation + * operation happening in the lineage edge + * @param type the downstream lineage type, either Field or Field Set + * @return this builder + */ + public UpstreamLineagePatchBuilder addFineGrainedDownstreamField( + @Nonnull Urn schemaFieldUrn, + @Nullable Float confidenceScore, + @Nonnull String transformationOperation, + @Nullable FineGrainedLineageDownstreamType type) { + Float finalConfidenceScore = getConfidenceScoreOrDefault(confidenceScore); + String finalType; + if (type == null) { + // Default to set of fields if not explicitly a single field + finalType = FineGrainedLineageDownstreamType.FIELD_SET.toString(); + } else { + finalType = type.toString(); + } + + pathValues.add( + ImmutableTriple.of( + PatchOperationType.ADD.getValue(), + FINE_GRAINED_PATH_START + + transformationOperation + + "/" + + "downstreamType" + + "/" + + finalType + + "/" + + encodeValueUrn(schemaFieldUrn), + instance.numberNode(finalConfidenceScore))); return this; } @@ -142,7 +231,7 @@ public UpstreamLineagePatchBuilder removeFineGrainedUpstreamField( + "/" + finalQueryUrn + "/" - + upstreamSchemaFieldUrn, + + encodeValueUrn(upstreamSchemaFieldUrn), null)); return this; diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/AspectTemplateEngine.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/AspectTemplateEngine.java index e9d09085e7eb5e..4613396109cc10 100644 --- 
a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/AspectTemplateEngine.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/AspectTemplateEngine.java @@ -81,7 +81,6 @@ public RecordTemplate getDefaultTemplate(String aspectSpecName) { public RecordTemplate applyPatch( RecordTemplate recordTemplate, Patch jsonPatch, AspectSpec aspectSpec) throws JsonProcessingException, JsonPatchException { - Template template = getTemplate(aspectSpec); return template.applyPatch(recordTemplate, jsonPatch); } diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 873943fd437813..eee84b1f8c8274 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -16,6 +16,7 @@ jar.enabled = false // Since we only want to build shadow jars, disabling the re dependencies { implementation project(':entity-registry') + implementation project(':metadata-integration:java:datahub-event') implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } diff --git a/metadata-integration/java/datahub-event/build.gradle b/metadata-integration/java/datahub-event/build.gradle new file mode 100644 index 00000000000000..a516b9d43da4b9 --- /dev/null +++ b/metadata-integration/java/datahub-event/build.gradle @@ -0,0 +1,61 @@ +plugins { + id("com.palantir.git-version") apply false + id 'java' + id 'com.github.johnrengelman.shadow' + id 'jacoco' + id 'signing' + id 'io.codearte.nexus-staging' + id 'maven-publish' +} + +apply from: "../versioning.gradle" + +dependencies { + implementation project(':metadata-models') + implementation project(path: ':metadata-models', configuration: "dataTemplate") + + implementation externalDependency.slf4jApi + implementation externalDependency.jacksonDataBind + runtimeOnly externalDependency.jna + + compileOnly externalDependency.lombok + 
annotationProcessor externalDependency.lombok + // VisibleForTesting + compileOnly externalDependency.guava + testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.testContainers + testImplementation externalDependency.httpAsyncClient + testRuntimeOnly externalDependency.logbackClassicJava8 +} + +jacocoTestReport { + dependsOn test // tests are required to run before generating the report +} + +task copyAvroSchemas { + dependsOn(':metadata-events:mxe-schemas:renameNamespace') + copy { + from file('../../../metadata-events/mxe-schemas/src/renamed/avro/com/linkedin/mxe/MetadataChangeProposal.avsc') + into file('./src/main/resources') + } +} + +compileJava.dependsOn copyAvroSchemas + +test { + // to avoid simultaneous executions of tests when complete build is run + mustRunAfter(":metadata-io:test") + useJUnit() + finalizedBy jacocoTestReport +} + +// task sourcesJar(type: Jar) { +// archiveClassifier = 'sources' +// from sourceSets.main.allSource +//} + +//task javadocJar(type: Jar) { +// archiveClassifier = 'javadoc' +// from javadoc +//} diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/event/EventFormatter.java b/metadata-integration/java/datahub-event/src/main/java/datahub/event/EventFormatter.java similarity index 100% rename from metadata-integration/java/datahub-client/src/main/java/datahub/event/EventFormatter.java rename to metadata-integration/java/datahub-event/src/main/java/datahub/event/EventFormatter.java diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/event/EventValidationException.java b/metadata-integration/java/datahub-event/src/main/java/datahub/event/EventValidationException.java similarity index 100% rename from metadata-integration/java/datahub-client/src/main/java/datahub/event/EventValidationException.java rename to metadata-integration/java/datahub-event/src/main/java/datahub/event/EventValidationException.java 
diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/event/MetadataChangeProposalWrapper.java b/metadata-integration/java/datahub-event/src/main/java/datahub/event/MetadataChangeProposalWrapper.java similarity index 100% rename from metadata-integration/java/datahub-client/src/main/java/datahub/event/MetadataChangeProposalWrapper.java rename to metadata-integration/java/datahub-event/src/main/java/datahub/event/MetadataChangeProposalWrapper.java diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/event/StringEscapeUtils.java b/metadata-integration/java/datahub-event/src/main/java/datahub/event/StringEscapeUtils.java similarity index 100% rename from metadata-integration/java/datahub-client/src/main/java/datahub/event/StringEscapeUtils.java rename to metadata-integration/java/datahub-event/src/main/java/datahub/event/StringEscapeUtils.java diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/event/UpsertAspectRequest.java b/metadata-integration/java/datahub-event/src/main/java/datahub/event/UpsertAspectRequest.java similarity index 100% rename from metadata-integration/java/datahub-client/src/main/java/datahub/event/UpsertAspectRequest.java rename to metadata-integration/java/datahub-event/src/main/java/datahub/event/UpsertAspectRequest.java diff --git a/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/interface.mustache b/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/interface.mustache new file mode 100644 index 00000000000000..ae2faa2ce0b49a --- /dev/null +++ b/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/interface.mustache @@ -0,0 +1,21 @@ +{{#jackson}} +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +{{/jackson}} +/** +* {{#description}}{{.}}{{/description}}{{^description}}{{classname}}{{/description}} +*/ +{{#jackson}} +@JsonTypeInfo( + use = 
JsonTypeInfo.Id.NAME, + include = JsonTypeInfo.As.EXISTING_PROPERTY, + property = "__type") +@JsonSubTypes({ + {{#subTypes}} + @JsonSubTypes.Type(value = {{classname}}.class, name = "{{classname}}"){{^@last}},{{/@last}} + {{/subTypes}} +}) +{{/jackson}} +public interface {{{classname}}} { + +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/model.mustache b/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/model.mustache new file mode 100644 index 00000000000000..a048f249a6b3de --- /dev/null +++ b/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/model.mustache @@ -0,0 +1,41 @@ +package {{package}}; + +{{^x-is-composed-model}} +import java.util.Objects; +{{#imports}}import {{import}}; +{{/imports}} +{{#serializableModel}} +import java.io.Serializable; +{{/serializableModel}} +{{#useBeanValidation}} +import org.springframework.validation.annotation.Validated; +import javax.validation.Valid; +import com.fasterxml.jackson.annotation.JsonInclude; +import javax.validation.constraints.*; +{{/useBeanValidation}} +{{#jackson}} +{{#withXml}} +import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlRootElement; +import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlProperty; +{{/withXml}} +{{/jackson}} +{{#withXml}} +import javax.xml.bind.annotation.*; +{{/withXml}} +{{/x-is-composed-model}} + +{{#models}} +{{#model}} +{{#isComposedModel}} +{{>interface}} +{{/isComposedModel}} +{{^isComposedModel}} +{{#isEnum}} +{{>enumOuterClass}} +{{/isEnum}} +{{^isEnum}} +{{>pojo}} +{{/isEnum}} +{{/isComposedModel}} +{{/model}} +{{/models}} \ No newline at end of file diff --git a/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/pojo.mustache b/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/pojo.mustache new file mode 100644 index 00000000000000..8747765dcb031d --- /dev/null +++ 
b/metadata-integration/java/datahub-event/src/main/resources/JavaSpring/pojo.mustache @@ -0,0 +1,166 @@ +{{#if hasVars}} +{{else}} +import com.fasterxml.jackson.annotation.JsonProperty; +import io.swagger.v3.oas.annotations.media.Schema; +{{/if}} +/** + * {{#description}}{{.}}{{/description}}{{^description}}{{classname}}{{/description}} + */{{#description}} +{{#useOas2}}@ApiModel{{/useOas2}}{{^useOas2}}@Schema{{/useOas2}}(description = "{{{description}}}"){{/description}} +{{#useBeanValidation}}@Validated{{/useBeanValidation}} +{{>generatedAnnotation}}{{#discriminator}}{{>typeInfoAnnotation}}{{/discriminator}}{{>xmlAnnotation}} +@JsonInclude(JsonInclude.Include.NON_NULL) +public class {{classname}} {{#parent}}extends {{{parent}}}{{/parent}} {{#serializableModel}}implements Serializable {{#interfaceModels}}, {{classname}}{{^@last}}, {{/@last}}{{#@last}} {{/@last}}{{/interfaceModels}}{{/serializableModel}}{{^serializableModel}}{{#interfaceModels}}{{#@first}}implements {{/@first}}{{classname}}{{^@last}}, {{/@last}}{{#@last}}{{/@last}}{{/interfaceModels}}{{/serializableModel}} { +{{#serializableModel}} + private static final long serialVersionUID = 1L; + +{{/serializableModel}} +{{#if interfaceModels}} + + @JsonProperty(value = "__type", defaultValue = "{{classname}}") + private String __type = "{{classname}}"; + + /** + * Name of this subclass in SimpleClassName format + * @return __type + **/ + @Schema(required = true, description = "Name of this subclass in SimpleClassName format", allowableValues = {"{{classname}}"}, + defaultValue = "{{classname}}") + @NotNull + public String get__type() { + return __type; + } +{{/if}} + + {{#vars}} + {{#baseItems this}} + {{#isEnum}} +{{>enumClass}} + {{/isEnum}} + {{/baseItems}} + {{#jackson}} + {{#vendorExtensions.x-is-discriminator-property}} + @JsonTypeId + {{/vendorExtensions.x-is-discriminator-property}} + {{^vendorExtensions.x-is-discriminator-property}} + @JsonProperty("{{baseName}}"){{#withXml}} + 
@JacksonXmlProperty({{#isXmlAttribute}}isAttribute = true, {{/isXmlAttribute}}{{#xmlNamespace}}namespace="{{xmlNamespace}}", {{/xmlNamespace}}localName = "{{#xmlName}}{{xmlName}}{{/xmlName}}{{^xmlName}}{{baseName}}{{/xmlName}}"){{/withXml}} + {{/vendorExtensions.x-is-discriminator-property}} + {{/jackson}} + {{#gson}} + @SerializedName("{{baseName}}") + {{/gson}} + {{#isContainer}} + {{#useBeanValidation}}@Valid{{/useBeanValidation}} + private {{{datatypeWithEnum}}} {{name}}{{#required}} = {{{defaultValue}}}{{/required}}{{^required}} = null{{/required}}; + {{/isContainer}} + {{^isContainer}} + private {{{datatypeWithEnum}}} {{name}} = {{{defaultValue}}}; + {{/isContainer}} + + {{/vars}} + {{#vars}} + public {{classname}} {{name}}({{{datatypeWithEnum}}} {{name}}) { + this.{{name}} = {{name}}; + return this; + } + {{#isListContainer}} + + public {{classname}} add{{nameInCamelCase}}Item({{{items.datatypeWithEnum}}} {{name}}Item) { + {{^required}} + if (this.{{name}} == null) { + this.{{name}} = {{{defaultValue}}}; + } + {{/required}} + this.{{name}}.add({{name}}Item); + return this; + } + {{/isListContainer}} + {{#isMapContainer}} + + public {{classname}} put{{nameInCamelCase}}Item(String key, {{{items.datatypeWithEnum}}} {{name}}Item) { + {{^required}} + if (this.{{name}} == null) { + this.{{name}} = {{{defaultValue}}}; + } + {{/required}} + this.{{name}}.put(key, {{name}}Item); + return this; + } + {{/isMapContainer}} + + /** + {{#description}} + * {{{description}}} + {{/description}} + {{^description}} + * Get {{name}} + {{/description}} + {{#minimum}} + * minimum: {{minimum}} + {{/minimum}} + {{#maximum}} + * maximum: {{maximum}} + {{/maximum}} + * @return {{name}} + **/ + {{#vendorExtensions.extraAnnotation}} + {{{vendorExtensions.extraAnnotation}}} + {{/vendorExtensions.extraAnnotation}} + {{#useOas2}} + @ApiModelProperty({{#example}}example = "{{{example}}}", {{/example}}{{#required}}required = {{required}}, {{/required}}{{#isReadOnly}}readOnly = 
{{{isReadOnly}}}, {{/isReadOnly}}value = "{{{description}}}") + {{/useOas2}} + {{^useOas2}} + @Schema({{#example}}example = "{{{example}}}", {{/example}}{{#required}}required = {{required}}, {{/required}}{{#isReadOnly}}accessMode = Schema.AccessMode.READ_ONLY, {{/isReadOnly}}description = "{{{description}}}") + {{/useOas2}} + {{#useBeanValidation}}{{>beanValidation}}{{/useBeanValidation}} public {{{datatypeWithEnum}}} {{getter}}() { + return {{name}}; + } + + public void {{setter}}({{{datatypeWithEnum}}} {{name}}) { + this.{{name}} = {{name}}; + } + + {{/vars}} + + @Override + public boolean equals(java.lang.Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + }{{#hasVars}} + {{classname}} {{classVarName}} = ({{classname}}) o; + return {{#vars}}Objects.equals(this.{{name}}, {{classVarName}}.{{name}}){{#hasMore}} && + {{/hasMore}}{{/vars}}{{#parent}} && + super.equals(o){{/parent}};{{/hasVars}}{{^hasVars}} + return true;{{/hasVars}} + } + + @Override + public int hashCode() { + return Objects.hash({{#vars}}{{name}}{{#hasMore}}, {{/hasMore}}{{/vars}}{{#parent}}{{#hasVars}}, {{/hasVars}}super.hashCode(){{/parent}}); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("class {{classname}} {\n"); + {{#parent}}sb.append(" ").append(toIndentedString(super.toString())).append("\n");{{/parent}} + {{#vars}}sb.append(" {{name}}: ").append(toIndentedString({{name}})).append("\n"); + {{/vars}}sb.append("}"); + return sb.toString(); + } + + /** + * Convert the given object to string with each line indented by 4 spaces + * (except the first line). 
+ */ + private String toIndentedString(java.lang.Object o) { + if (o == null) { + return "null"; + } + return o.toString().replace("\n", "\n "); + } +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-event/src/main/resources/MetadataChangeProposal.avsc b/metadata-integration/java/datahub-event/src/main/resources/MetadataChangeProposal.avsc new file mode 100644 index 00000000000000..64216636af26d1 --- /dev/null +++ b/metadata-integration/java/datahub-event/src/main/resources/MetadataChangeProposal.avsc @@ -0,0 +1,174 @@ +{ + "type" : "record", + "name" : "MetadataChangeProposal", + "namespace" : "com.linkedin.pegasus2avro.mxe", + "doc" : "Kafka event for proposing a metadata change for an entity. A corresponding MetadataChangeLog is emitted when the change is accepted and committed, otherwise a FailedMetadataChangeProposal will be emitted instead.", + "fields" : [ { + "name" : "auditHeader", + "type" : [ "null", { + "type" : "record", + "name" : "KafkaAuditHeader", + "namespace" : "com.linkedin.events", + "doc" : "This header records information about the context of an event as it is emitted into kafka and is intended to be used by the kafka audit application. For more information see go/kafkaauditheader", + "fields" : [ { + "name" : "time", + "type" : "long", + "doc" : "The time at which the event was emitted into kafka.", + "compliance" : [ { + "policy" : "EVENT_TIME" + } ] + }, { + "name" : "server", + "type" : "string", + "doc" : "The fully qualified name of the host from which the event is being emitted.", + "compliance" : "NONE" + }, { + "name" : "instance", + "type" : [ "null", "string" ], + "doc" : "The instance on the server from which the event is being emitted. e.g. i001", + "default" : null, + "compliance" : "NONE" + }, { + "name" : "appName", + "type" : "string", + "doc" : "The name of the application from which the event is being emitted. 
see go/appname", + "compliance" : "NONE" + }, { + "name" : "messageId", + "type" : { + "type" : "fixed", + "name" : "UUID", + "size" : 16 + }, + "doc" : "A unique identifier for the message", + "compliance" : "NONE" + }, { + "name" : "auditVersion", + "type" : [ "null", "int" ], + "doc" : "The version that is being used for auditing. In version 0, the audit trail buckets events into 10 minute audit windows based on the EventHeader timestamp. In version 1, the audit trail buckets events as follows: if the schema has an outer KafkaAuditHeader, use the outer audit header timestamp for bucketing; else if the EventHeader has an inner KafkaAuditHeader use that inner audit header's timestamp for bucketing", + "default" : null, + "compliance" : "NONE" + }, { + "name" : "fabricUrn", + "type" : [ "null", "string" ], + "doc" : "The fabricUrn of the host from which the event is being emitted. Fabric Urn in the format of urn:li:fabric:{fabric_name}. See go/fabric.", + "default" : null, + "compliance" : "NONE" + }, { + "name" : "clusterConnectionString", + "type" : [ "null", "string" ], + "doc" : "This is a String that the client uses to establish some kind of connection with the Kafka cluster. The exact format of it depends on specific versions of clients and brokers. This information could potentially identify the fabric and cluster with which the client is producing to or consuming from.", + "default" : null, + "compliance" : "NONE" + } ] + } ], + "doc" : "Kafka audit header. 
Currently remains unused in the open source.", + "default" : null + }, { + "name" : "entityType", + "type" : "string", + "doc" : "Type of the entity being written to" + }, { + "name" : "entityUrn", + "type" : [ "null", "string" ], + "doc" : "Urn of the entity being written", + "default" : null, + "java" : { + "class" : "com.linkedin.pegasus2avro.common.urn.Urn" + } + }, { + "name" : "entityKeyAspect", + "type" : [ "null", { + "type" : "record", + "name" : "GenericAspect", + "doc" : "Generic record structure for serializing an Aspect", + "fields" : [ { + "name" : "value", + "type" : "bytes", + "doc" : "The value of the aspect, serialized as bytes." + }, { + "name" : "contentType", + "type" : "string", + "doc" : "The content type, which represents the fashion in which the aspect was serialized.\nThe only type currently supported is application/json." + } ] + } ], + "doc" : "Key aspect of the entity being written", + "default" : null + }, { + "name" : "changeType", + "type" : { + "type" : "enum", + "name" : "ChangeType", + "namespace" : "com.linkedin.pegasus2avro.events.metadata", + "doc" : "Descriptor for a change action", + "symbols" : [ "UPSERT", "CREATE", "UPDATE", "DELETE", "PATCH", "RESTATE" ], + "symbolDocs" : { + "CREATE" : "NOT SUPPORTED YET\ninsert if not exists. otherwise fail", + "DELETE" : "NOT SUPPORTED YET\ndelete action", + "PATCH" : "NOT SUPPORTED YET\npatch the changes instead of full replace", + "RESTATE" : "Restate an aspect, eg. in a index refresh.", + "UPDATE" : "NOT SUPPORTED YET\nupdate if exists. otherwise fail", + "UPSERT" : "insert if not exists. 
otherwise update" + } + }, + "doc" : "Type of change being proposed" + }, { + "name" : "aspectName", + "type" : [ "null", "string" ], + "doc" : "Aspect of the entity being written to\nNot filling this out implies that the writer wants to affect the entire entity\nNote: This is only valid for CREATE, UPSERT, and DELETE operations.", + "default" : null + }, { + "name" : "aspect", + "type" : [ "null", "GenericAspect" ], + "doc" : "The value of the new aspect.", + "default" : null + }, { + "name" : "systemMetadata", + "type" : [ "null", { + "type" : "record", + "name" : "SystemMetadata", + "doc" : "Metadata associated with each metadata change that is processed by the system", + "fields" : [ { + "name" : "lastObserved", + "type" : [ "long", "null" ], + "doc" : "The timestamp the metadata was observed at", + "default" : 0 + }, { + "name" : "runId", + "type" : [ "string", "null" ], + "doc" : "The original run id that produced the metadata. Populated in case of batch-ingestion.", + "default" : "no-run-id-provided" + }, { + "name" : "lastRunId", + "type" : [ "string", "null" ], + "doc" : "The last run id that produced the metadata. Populated in case of batch-ingestion.", + "default" : "no-run-id-provided" + }, { + "name" : "pipelineName", + "type" : [ "null", "string" ], + "doc" : "The ingestion pipeline id that produced the metadata. 
Populated in case of batch ingestion.", + "default" : null + }, { + "name" : "registryName", + "type" : [ "null", "string" ], + "doc" : "The model registry name that was used to process this event", + "default" : null + }, { + "name" : "registryVersion", + "type" : [ "null", "string" ], + "doc" : "The model registry version that was used to process this event", + "default" : null + }, { + "name" : "properties", + "type" : [ "null", { + "type" : "map", + "values" : "string" + } ], + "doc" : "Additional properties", + "default" : null + } ] + } ], + "doc" : "A string->string map of custom properties that one might want to attach to an event", + "default" : null + } ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-event/src/main/resources/client.properties b/metadata-integration/java/datahub-event/src/main/resources/client.properties new file mode 100644 index 00000000000000..880e86993549db --- /dev/null +++ b/metadata-integration/java/datahub-event/src/main/resources/client.properties @@ -0,0 +1 @@ +clientVersion=@fullVersion@ diff --git a/metadata-integration/java/datahub-event/src/test/java/datahub/event/EventFormatterTest.java b/metadata-integration/java/datahub-event/src/test/java/datahub/event/EventFormatterTest.java new file mode 100644 index 00000000000000..0b2a4500e019d5 --- /dev/null +++ b/metadata-integration/java/datahub-event/src/test/java/datahub/event/EventFormatterTest.java @@ -0,0 +1,53 @@ +package datahub.event; + +import com.linkedin.dataset.DatasetProperties; +import com.linkedin.mxe.MetadataChangeProposal; +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import org.junit.Test; +import org.testng.Assert; + +public class EventFormatterTest { + + @Test + public void testPartialMCPW() throws URISyntaxException, IOException, EventValidationException { + MetadataChangeProposalWrapper metadataChangeProposalWrapper = + MetadataChangeProposalWrapper.builder() + 
.entityType("dataset") + .entityUrn("urn:li:foo") + .upsert() + .aspect(new DatasetProperties().setDescription("A test dataset")) + .build(); + EventFormatter eventFormatter = new EventFormatter(); + MetadataChangeProposal mcp = eventFormatter.convert(metadataChangeProposalWrapper); + Assert.assertEquals(mcp.getAspect().getContentType(), "application/json"); + String content = mcp.getAspect().getValue().asString(StandardCharsets.UTF_8); + Assert.assertEquals(content, "{\"description\":\"A test dataset\"}"); + } + + @Test + public void testUtf8Encoding() throws URISyntaxException, IOException { + + MetadataChangeProposalWrapper mcpw = + MetadataChangeProposalWrapper.builder() + .entityType("dataset") + .entityUrn( + "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my-dataset.user-table,PROD)") + .upsert() + .aspect( + new DatasetProperties() + .setDescription( + "This is the canonical User profile dataset œ∑´´†¥¨ˆˆπ“‘åß∂ƒ©˙∆˚¬…æΩ≈ç√∫˜˜≤≥ç")) + .build(); + EventFormatter eventFormatter = new EventFormatter(); + MetadataChangeProposal mcp = eventFormatter.convert(mcpw); + Assert.assertEquals(mcp.getAspect().getContentType(), "application/json"); + String content = mcp.getAspect().getValue().asString(StandardCharsets.UTF_8); + String expectedContent = + "{\"description\":\"This is the canonical User profile dataset \\u0153\\u2211\\u00B4\\u00B4" + + "\\u2020\\u00A5\\u00A8\\u02C6\\u02C6\\u03C0\\u201C\\u2018\\u00E5\\u00DF\\u2202\\u0192\\u00A9\\u02D9\\u2206" + + "\\u02DA\\u00AC\\u2026\\u00E6\\u03A9\\u2248\\u00E7\\u221A\\u222B\\u02DC\\u02DC\\u2264\\u2265\\u00E7\"}"; + Assert.assertEquals(content, expectedContent); + } +} diff --git a/metadata-integration/java/datahub-event/src/test/java/datahub/event/MetadataChangeProposalWrapperTest.java b/metadata-integration/java/datahub-event/src/test/java/datahub/event/MetadataChangeProposalWrapperTest.java new file mode 100644 index 00000000000000..3a333abc5cb108 --- /dev/null +++ 
b/metadata-integration/java/datahub-event/src/test/java/datahub/event/MetadataChangeProposalWrapperTest.java @@ -0,0 +1,76 @@ +package datahub.event; + +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.dataset.DatasetProperties; +import java.net.URISyntaxException; +import org.junit.Assert; +import org.junit.Test; + +public class MetadataChangeProposalWrapperTest { + + /** We should throw errors on validation as exceptions */ + @Test + public void testBuilderExceptions() { + try { + MetadataChangeProposalWrapper.create( + b -> b.entityType("dataset").entityUrn("foo") // bad urn should throw exception + ); + Assert.fail("Should throw an exception"); + } catch (EventValidationException e) { + Assert.assertTrue( + "Underlying exception should be a URI syntax issue", + e.getCause() instanceof URISyntaxException); + } catch (Exception e) { + Assert.fail("Should not throw any other exception"); + } + } + + @Test + public void testAspectInferenceSuccess() throws EventValidationException { + MetadataChangeProposalWrapper mcpw = + MetadataChangeProposalWrapper.create( + b -> + b.entityType("dataset") + .entityUrn("urn:li:dataset:(foo,bar,PROD)") + .upsert() + .aspect(new DatasetProperties())); + Assert.assertEquals(mcpw.getAspectName(), "datasetProperties"); + } + + /** + * We throw exceptions on using the regular builder pattern + * + * @throws URISyntaxException + * @throws EventValidationException + */ + @Test(expected = EventValidationException.class) + public void testAspectInferenceFailure() throws URISyntaxException, EventValidationException { + MetadataChangeProposalWrapper mcpw = + MetadataChangeProposalWrapper.builder() + .entityType("dataset") + .entityUrn("urn:li:dataset:(foo,bar,PROD)") + .upsert() + .aspect(new AuditStamp().setActor(Urn.createFromString("urn:li:corpUser:jdoe"))) + .build(); + } + + /** + * We throw exceptions on using the lambda builder pattern + * + * @throws URISyntaxException + * @throws 
EventValidationException + */ + @Test(expected = EventValidationException.class) + public void testAspectInferenceFailureLambda() + throws URISyntaxException, EventValidationException { + Urn actorUrn = Urn.createFromString("urn:li:corpUser:jdoe"); + MetadataChangeProposalWrapper mcpw = + MetadataChangeProposalWrapper.create( + b -> + b.entityType("dataset") + .entityUrn("urn:li:dataset:(foo,bar,PROD)") + .upsert() + .aspect(new AuditStamp().setActor(actorUrn))); + } +} diff --git a/metadata-integration/java/datahub-event/src/test/resources/golden_files/mcps_golden.json b/metadata-integration/java/datahub-event/src/test/resources/golden_files/mcps_golden.json new file mode 100644 index 00000000000000..9af97f6c680431 --- /dev/null +++ b/metadata-integration/java/datahub-event/src/test/resources/golden_files/mcps_golden.json @@ -0,0 +1,1543 @@ +[ + { + "aspect": { + "contentType": "application/json", + "value": "{\"customProperties\": {\"platform\": \"mysql\", \"instance\": \"PROD\", \"database\": \"metagalaxy\"}, \"name\": \"metagalaxy\"}" + }, + "aspectName": "containerProperties", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:17751259af32dd0385cad799df608c40", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"platform\": \"urn:li:dataPlatform:mysql\"}" + }, + "aspectName": "dataPlatformInstance", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:17751259af32dd0385cad799df608c40", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"Database\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:17751259af32dd0385cad799df608c40", + "systemMetadata": { + "lastObserved": 1586847600000, + 
"runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"domains\": [\"urn:li:domain:sales\"]}" + }, + "aspectName": "domains", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:17751259af32dd0385cad799df608c40", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"customProperties\": {\"platform\": \"mysql\", \"instance\": \"PROD\", \"database\": \"metagalaxy\", \"schema\": \"datacharmer\"}, \"name\": \"datacharmer\"}" + }, + "aspectName": "containerProperties", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:ba408413d97771e6470c16f9869f2e0d", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"platform\": \"urn:li:dataPlatform:mysql\"}" + }, + "aspectName": "dataPlatformInstance", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:ba408413d97771e6470c16f9869f2e0d", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"Schema\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:ba408413d97771e6470c16f9869f2e0d", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:17751259af32dd0385cad799df608c40\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:ba408413d97771e6470c16f9869f2e0d", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": 
"application/json", + "value": "{\"container\": \"urn:li:container:ba408413d97771e6470c16f9869f2e0d\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.employees,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + }, + "description": "这是一个很好的描述", + "name": "employees", + "tags": [ + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "fields": [ + { + "fieldPath": "emp_no", + "isPartOfKey": true, + "nativeDataType": "INTEGER()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "birth_date", + "isPartOfKey": false, + "nativeDataType": "DATE()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": { + } + } + } + }, + { + "fieldPath": "first_name", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=14)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "last_name", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=16)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "gender", + "isPartOfKey": false, + "nativeDataType": "ENUM('M', 'F')", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.EnumType": { + } + } + } + }, 
+ { + "fieldPath": "hire_date", + "isPartOfKey": false, + "nativeDataType": "DATE()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": { + } + } + } + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": "datacharmer.employees", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.employees,PROD)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"table\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.employees,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:ba408413d97771e6470c16f9869f2e0d\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.salaries,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + }, + "name": "salaries", + "tags": [ + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "fields": [ + { + "fieldPath": "emp_no", + "isPartOfKey": true, + 
"nativeDataType": "INTEGER()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "salary", + "isPartOfKey": false, + "nativeDataType": "INTEGER()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "from_date", + "isPartOfKey": true, + "nativeDataType": "DATE()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": { + } + } + } + }, + { + "fieldPath": "to_date", + "isPartOfKey": false, + "nativeDataType": "DATE()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": { + } + } + } + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": "datacharmer.salaries", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.salaries,PROD)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"table\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.salaries,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"customProperties\": {\"platform\": \"mysql\", \"instance\": \"PROD\", \"database\": \"metagalaxy\", \"schema\": \"metagalaxy\"}, \"name\": \"metagalaxy\"}" + }, + "aspectName": "containerProperties", + "changeType": "UPSERT", + "entityType": "container", + 
"entityUrn": "urn:li:container:593ea3998729fdae4bdfb42206561a3a", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"platform\": \"urn:li:dataPlatform:mysql\"}" + }, + "aspectName": "dataPlatformInstance", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:593ea3998729fdae4bdfb42206561a3a", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"Schema\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:593ea3998729fdae4bdfb42206561a3a", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:17751259af32dd0385cad799df608c40\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:593ea3998729fdae4bdfb42206561a3a", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:593ea3998729fdae4bdfb42206561a3a\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_aspect,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + }, + "name": "metadata_aspect", + "tags": [ + ] + } + }, + { + 
"com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "fields": [ + { + "fieldPath": "urn", + "isPartOfKey": true, + "nativeDataType": "VARCHAR(length=500)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "aspect", + "isPartOfKey": true, + "nativeDataType": "VARCHAR(length=200)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "version", + "isPartOfKey": true, + "nativeDataType": "BIGINT()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "metadata", + "isPartOfKey": false, + "nativeDataType": "LONGTEXT()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "createdon", + "isPartOfKey": false, + "nativeDataType": "DATETIME(fsp=6)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": { + } + } + } + }, + { + "fieldPath": "createdby", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=255)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "createdfor", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=255)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": 
"metagalaxy.metadata_aspect", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_aspect,PROD)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"table\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_aspect,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"domains\": [\"urn:li:domain:sales\"]}" + }, + "aspectName": "domains", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_aspect,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:593ea3998729fdae4bdfb42206561a3a\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + }, + "description": "This is a table comment", + "name": "metadata_index", + "tags": [ + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "fields": [ + { + "fieldPath": "id", + "isPartOfKey": true, + "nativeDataType": "BIGINT()", + 
"nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "description": "This is a column comment about URNs", + "fieldPath": "urn", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=200)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "aspect", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=150)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "path", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=150)", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "longVal", + "isPartOfKey": false, + "nativeDataType": "BIGINT()", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "stringVal", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=200)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "doubleVal", + "isPartOfKey": false, + "nativeDataType": "DOUBLE(asdecimal=True)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": "metagalaxy.metadata_index", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)" + } + }, + "systemMetadata": { + 
"lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"table\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"domains\": [\"urn:li:domain:sales\"]}" + }, + "aspectName": "domains", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:593ea3998729fdae4bdfb42206561a3a\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "is_view": "True", + "view_definition": "CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `metadata_index_view` AS select `metadata_index`.`id` AS `id`,`metadata_index`.`urn` AS `urn`,`metadata_index`.`path` AS `path`,`metadata_index`.`doubleVal` AS `doubleVal` from `metadata_index`" + }, + "name": "metadata_index_view", + "tags": [ + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + 
"fields": [ + { + "fieldPath": "id", + "isPartOfKey": false, + "nativeDataType": "BIGINT()", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "urn", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=200)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "path", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=150)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "doubleVal", + "isPartOfKey": false, + "nativeDataType": "DOUBLE(asdecimal=True)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": "metagalaxy.metadata_index_view", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"view\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"materialized\": false, \"viewLogic\": \"CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `metadata_index_view` 
AS select `metadata_index`.`id` AS `id`,`metadata_index`.`urn` AS `urn`,`metadata_index`.`path` AS `path`,`metadata_index`.`doubleVal` AS `doubleVal` from `metadata_index`\", \"viewLanguage\": \"SQL\"}" + }, + "aspectName": "viewProperties", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"domains\": [\"urn:li:domain:sales\"]}" + }, + "aspectName": "domains", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"customProperties\": {\"platform\": \"mysql\", \"instance\": \"PROD\", \"database\": \"metagalaxy\", \"schema\": \"northwind\"}, \"name\": \"northwind\"}" + }, + "aspectName": "containerProperties", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:36bfb6eae3f7972efbcb56dedecdfba6", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"platform\": \"urn:li:dataPlatform:mysql\"}" + }, + "aspectName": "dataPlatformInstance", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:36bfb6eae3f7972efbcb56dedecdfba6", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"Schema\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:36bfb6eae3f7972efbcb56dedecdfba6", + "systemMetadata": { + 
"lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:17751259af32dd0385cad799df608c40\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:36bfb6eae3f7972efbcb56dedecdfba6", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:36bfb6eae3f7972efbcb56dedecdfba6\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + }, + "name": "customers", + "tags": [ + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "fields": [ + { + "fieldPath": "id", + "isPartOfKey": true, + "nativeDataType": "INTEGER()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "company", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=50)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "last_name", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=50)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + 
{ + "fieldPath": "first_name", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=50)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "email_address", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=50)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "priority", + "isPartOfKey": false, + "nativeDataType": "FLOAT()", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": "northwind.customers", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"table\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:36bfb6eae3f7972efbcb56dedecdfba6\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + 
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + }, + "name": "orders", + "tags": [ + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "fields": [ + { + "fieldPath": "id", + "isPartOfKey": true, + "nativeDataType": "INTEGER()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + }, + { + "fieldPath": "description", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=50)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + }, + { + "fieldPath": "customer_id", + "isPartOfKey": false, + "nativeDataType": "INTEGER()", + "nullable": false, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": { + } + } + } + } + ], + "foreignKeys": [ + { + "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "foreignFields": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD),id)" + ], + "name": "fk_order_customer", + "sourceFields": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD),customer_id)" + ] + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": "northwind.orders", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + 
"aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"table\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"customProperties\": {\"platform\": \"mysql\", \"instance\": \"PROD\", \"database\": \"metagalaxy\", \"schema\": \"test_cases\"}, \"name\": \"test_cases\"}" + }, + "aspectName": "containerProperties", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:989c003cbe689094c2b5c340a67f62be", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"platform\": \"urn:li:dataPlatform:mysql\"}" + }, + "aspectName": "dataPlatformInstance", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:989c003cbe689094c2b5c340a67f62be", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"Schema\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:989c003cbe689094c2b5c340a67f62be", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"container\": \"urn:li:container:17751259af32dd0385cad799df608c40\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "container", + "entityUrn": "urn:li:container:989c003cbe689094c2b5c340a67f62be", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": 
"{\"container\": \"urn:li:container:989c003cbe689094c2b5c340a67f62be\"}" + }, + "aspectName": "container", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.test_empty,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + }, + "name": "test_empty", + "tags": [ + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "created": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "fields": [ + { + "fieldPath": "dummy", + "isPartOfKey": false, + "nativeDataType": "VARCHAR(length=50)", + "nullable": true, + "recursive": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": { + } + } + } + } + ], + "hash": "", + "lastModified": { + "actor": "urn:li:corpuser:unknown", + "time": 0 + }, + "platform": "urn:li:dataPlatform:mysql", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "schemaName": "test_cases.test_empty", + "version": 0 + } + } + ], + "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.test_empty,PROD)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"typeNames\": [\"table\"]}" + }, + "aspectName": "subTypes", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.test_empty,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"timestampMillis\": 1586847600000, 
\"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 10, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"10001\", \"10002\", \"10003\", \"10004\", \"10005\", \"10006\", \"10007\", \"10008\", \"10009\", \"10010\"]}, {\"fieldPath\": \"birth_date\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1952-04-19\", \"max\": \"1964-06-02\", \"sampleValues\": [\"1953-09-02\", \"1964-06-02\", \"1959-12-03\", \"1954-05-01\", \"1955-01-21\", \"1953-04-20\", \"1957-05-23\", \"1958-02-19\", \"1952-04-19\", \"1963-06-01\"]}, {\"fieldPath\": \"first_name\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Georgi\", \"Bezalel\", \"Parto\", \"Chirstian\", \"Kyoichi\", \"Anneke\", \"Tzvetan\", \"Saniya\", \"Sumant\", \"Duangkaew\"]}, {\"fieldPath\": \"last_name\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Facello\", \"Simmel\", \"Bamford\", \"Koblick\", \"Maliniak\", \"Preusig\", \"Zielinski\", \"Kalloufi\", \"Peac\", \"Piveteau\"]}, {\"fieldPath\": \"gender\", \"uniqueCount\": 2, \"uniqueProportion\": 0.2, \"nullCount\": 0, \"nullProportion\": 0.0, \"distinctValueFrequencies\": [{\"value\": \"M\", \"frequency\": 5}, {\"value\": \"F\", \"frequency\": 5}], \"sampleValues\": [\"M\", \"F\", \"M\", \"M\", \"M\", \"F\", \"F\", \"M\", \"F\", \"F\"]}, {\"fieldPath\": \"hire_date\", \"uniqueCount\": 10, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"1994-09-15\", \"sampleValues\": [\"1986-06-26\", \"1985-11-21\", \"1986-08-28\", \"1986-12-01\", \"1989-09-12\", \"1989-06-02\", \"1989-02-10\", \"1994-09-15\", \"1985-02-18\", \"1989-08-24\"]}]}" + }, + "aspectName": 
"datasetProfile", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.employees,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 112, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 0.08928571428571429, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"10001\", \"max\": \"10010\", \"mean\": \"10005.3125\", \"median\": \"10005.0\", \"stdev\": \"2.834889609688869\", \"distinctValueFrequencies\": [{\"value\": \"10001\", \"frequency\": 17}, {\"value\": \"10002\", \"frequency\": 6}, {\"value\": \"10003\", \"frequency\": 7}, {\"value\": \"10004\", \"frequency\": 16}, {\"value\": \"10005\", \"frequency\": 13}, {\"value\": \"10006\", \"frequency\": 12}, {\"value\": \"10007\", \"frequency\": 14}, {\"value\": \"10008\", \"frequency\": 3}, {\"value\": \"10009\", \"frequency\": 18}, {\"value\": \"10010\", \"frequency\": 6}], \"sampleValues\": [\"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10002\", \"10002\", \"10002\"]}, {\"fieldPath\": \"salary\", \"uniqueCount\": 111, \"uniqueProportion\": 0.9910714285714286, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"40000\", \"max\": \"94692\", \"mean\": \"68303.11607142857\", \"median\": \"69544.0\", \"stdev\": \"15505.291475014095\", \"sampleValues\": [\"60117\", \"62102\", \"66074\", \"66596\", \"66961\", \"71046\", \"74333\", \"75286\", \"75994\", \"76884\", \"80013\", \"81025\", \"81097\", \"84917\", \"85112\", \"85097\", \"88958\", \"65909\", \"65909\", \"67534\"]}, {\"fieldPath\": \"from_date\", 
\"uniqueCount\": 106, \"uniqueProportion\": 0.9464285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"2002-06-22\", \"sampleValues\": [\"1986-06-26\", \"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"1996-08-03\", \"1997-08-03\", \"1998-08-03\"]}, {\"fieldPath\": \"to_date\", \"uniqueCount\": 99, \"uniqueProportion\": 0.8839285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1986-02-18\", \"max\": \"9999-01-01\", \"sampleValues\": [\"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"9999-01-01\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\"]}]}" + }, + "aspectName": "datasetProfile", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datacharmer.salaries,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 5, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"1\", \"2\", \"3\", \"4\", \"5\"]}, {\"fieldPath\": \"company\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Company A\", \"Company B\", \"Company C\", \"Company D\", \"Company E\"]}, {\"fieldPath\": \"last_name\", \"uniqueCount\": 5, \"uniqueProportion\": 
1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Axen\", \"Bedecs\", \"Donnell\", \"Gratacos Solsona\", \"Lee\"]}, {\"fieldPath\": \"first_name\", \"uniqueCount\": 5, \"uniqueProportion\": 1.0, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Anna\", \"Antonio\", \"Christina\", \"Martin\", \"Thomas\"]}, {\"fieldPath\": \"email_address\", \"uniqueCount\": 0, \"nullCount\": 5, \"nullProportion\": 1.0, \"sampleValues\": []}, {\"fieldPath\": \"priority\", \"uniqueCount\": 3, \"uniqueProportion\": 0.75, \"nullCount\": 1, \"nullProportion\": 0.2, \"min\": \"3.8\", \"max\": \"4.9\", \"mean\": \"4.175000011920929\", \"median\": \"4.0\", \"distinctValueFrequencies\": [{\"value\": \"3.8\", \"frequency\": 1}, {\"value\": \"4.0\", \"frequency\": 2}, {\"value\": \"4.9\", \"frequency\": 1}], \"sampleValues\": [\"4.0\", \"4.9\", \"4.0\", \"3.8\"]}]}" + }, + "aspectName": "datasetProfile", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.customers,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + "value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 0, \"columnCount\": 3, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}, {\"fieldPath\": \"description\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}, {\"fieldPath\": \"customer_id\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}]}" + }, + "aspectName": "datasetProfile", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,northwind.orders,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + }, + { + "aspect": { + "contentType": "application/json", + 
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 0, \"columnCount\": 1, \"fieldProfiles\": [{\"fieldPath\": \"dummy\", \"uniqueCount\": 0, \"nullCount\": 0, \"sampleValues\": []}]}" + }, + "aspectName": "datasetProfile", + "changeType": "UPSERT", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,test_cases.test_empty,PROD)", + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test" + } + } +] diff --git a/metadata-integration/java/openlineage-converter/README.md b/metadata-integration/java/openlineage-converter/README.md new file mode 100644 index 00000000000000..fbf93406fd4450 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/README.md @@ -0,0 +1,8 @@ +# Openlineage Converter + +## Overview +It converts arbitary Openlineage events to a DataHub Aspects. + +## Known Issues +- Currently, it was tested only with Spark and Airflow events. +- Due to Openlineage's stateless nature, it is possible not all the inputs or outputs captured. 
diff --git a/metadata-integration/java/openlineage-converter/build.gradle b/metadata-integration/java/openlineage-converter/build.gradle new file mode 100644 index 00000000000000..4fb35f6b3c563c --- /dev/null +++ b/metadata-integration/java/openlineage-converter/build.gradle @@ -0,0 +1,55 @@ +apply plugin: 'java' +apply plugin: 'com.github.johnrengelman.shadow' +apply plugin: 'signing' +apply plugin: 'maven-publish' +apply plugin: 'jacoco' +apply from: '../versioning.gradle' + +repositories { + mavenLocal() + mavenCentral() +} + +dependencies { + implementation project(path: ':entity-registry') + implementation project(path: ':metadata-integration:java:datahub-event') + implementation project(path: ':metadata-models') + implementation project(path: ':metadata-models', configuration: "dataTemplate") + compileOnly "io.openlineage:openlineage-java:$openLineageVersion" + + implementation externalDependency.slf4jApi + implementation externalDependency.commonsLang + compileOnly externalDependency.lombok + annotationProcessor externalDependency.lombok + + implementation externalDependency.json + + testImplementation externalDependency.junit + // Use JUnit Jupiter for testing. 
+ testImplementation 'org.junit.jupiter:junit-jupiter:5.9.2' +} + +jacocoTestReport { + dependsOn test // tests are required to run before generating the report +} + +test { + forkEvery = 1 + useJUnit() + finalizedBy jacocoTestReport +} + +test { + useJUnit() + finalizedBy jacocoTestReport +} + +//task sourcesJar(type: Jar) { +// classifier 'sources' +// from sourceSets.main.allJava +//} + +//task javadocJar(type: Jar, dependsOn: javadoc) { +// classifier 'javadoc' +// from javadoc.destinationDir +//} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java new file mode 100644 index 00000000000000..b8d4d53511cd0b --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/config/DatahubOpenlineageConfig.java @@ -0,0 +1,43 @@ +package io.datahubproject.openlineage.config; + +import com.linkedin.common.FabricType; +import com.linkedin.common.urn.DataJobUrn; +import io.datahubproject.openlineage.dataset.PathSpec; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import lombok.Builder; +import lombok.Getter; + +@Builder +@Getter +public class DatahubOpenlineageConfig { + @Builder.Default private final boolean isStreaming = false; + @Builder.Default private final String pipelineName = null; + private final String platformInstance; + private final String commonDatasetPlatformInstance; + private final String platform; + @Builder.Default private final Map> pathSpecs = new HashMap<>(); + private final String filePartitionRegexpPattern; + @Builder.Default private final FabricType fabricType = FabricType.PROD; + private final boolean materializeDataset; + private final boolean includeSchemaMetadata; + @Builder.Default 
private final boolean captureColumnLevelLineage = true; + @Builder.Default private final DataJobUrn parentJobUrn = null; + // This is disabled until column level patch support won't be fixed in GMS + @Builder.Default private final boolean usePatch = false; + + public List getPathSpecsForPlatform(String platform) { + if ((pathSpecs == null) || (pathSpecs.isEmpty())) { + return Collections.emptyList(); + } + + return pathSpecs.values().stream() + .filter( + specs -> specs.stream().anyMatch(pathSpec -> pathSpec.getPlatform().equals(platform))) + .flatMap(List::stream) + .collect(Collectors.toList()); + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java new file mode 100644 index 00000000000000..038e8d33a97c44 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/converter/OpenLineageToDataHub.java @@ -0,0 +1,826 @@ +package io.datahubproject.openlineage.converter; + +import com.linkedin.common.AuditStamp; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.Edge; +import com.linkedin.common.EdgeArray; +import com.linkedin.common.GlobalTags; +import com.linkedin.common.Owner; +import com.linkedin.common.OwnerArray; +import com.linkedin.common.Ownership; +import com.linkedin.common.OwnershipSource; +import com.linkedin.common.OwnershipSourceType; +import com.linkedin.common.OwnershipType; +import com.linkedin.common.TagAssociation; +import com.linkedin.common.TagAssociationArray; +import com.linkedin.common.TimeStamp; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.DataFlowUrn; +import com.linkedin.common.urn.DataJobUrn; +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; +import 
com.linkedin.common.urn.TagUrn; +import com.linkedin.common.urn.TupleKey; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.template.StringMap; +import com.linkedin.datajob.DataFlowInfo; +import com.linkedin.datajob.DataJobInfo; +import com.linkedin.datajob.DataJobInputOutput; +import com.linkedin.dataprocess.DataProcessInstanceProperties; +import com.linkedin.dataprocess.DataProcessInstanceRelationships; +import com.linkedin.dataprocess.DataProcessInstanceRunEvent; +import com.linkedin.dataprocess.DataProcessInstanceRunResult; +import com.linkedin.dataprocess.DataProcessRunStatus; +import com.linkedin.dataprocess.RunResultType; +import com.linkedin.dataset.DatasetLineageType; +import com.linkedin.dataset.FineGrainedLineage; +import com.linkedin.dataset.FineGrainedLineageArray; +import com.linkedin.dataset.FineGrainedLineageDownstreamType; +import com.linkedin.dataset.FineGrainedLineageUpstreamType; +import com.linkedin.dataset.Upstream; +import com.linkedin.dataset.UpstreamArray; +import com.linkedin.dataset.UpstreamLineage; +import com.linkedin.domain.Domains; +import com.linkedin.schema.MapType; +import com.linkedin.schema.MySqlDDL; +import com.linkedin.schema.NullType; +import com.linkedin.schema.NumberType; +import com.linkedin.schema.SchemaField; +import com.linkedin.schema.SchemaFieldArray; +import com.linkedin.schema.SchemaFieldDataType; +import com.linkedin.schema.SchemaMetadata; +import com.linkedin.schema.StringType; +import com.linkedin.schema.TimeType; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.dataset.DatahubDataset; +import io.datahubproject.openlineage.dataset.DatahubJob; +import io.datahubproject.openlineage.dataset.HdfsPathDataset; +import io.datahubproject.openlineage.dataset.HdfsPlatform; +import io.openlineage.client.OpenLineage; +import io.openlineage.client.OpenLineageClientUtils; +import java.io.IOException; +import 
java.net.URI; +import java.net.URISyntaxException; +import java.time.ZonedDateTime; +import java.util.Arrays; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import lombok.extern.slf4j.Slf4j; +import org.json.JSONArray; +import org.json.JSONException; + +@Slf4j +public class OpenLineageToDataHub { + + public static final String FILE_NAMESPACE = "file"; + public static final String SCHEME_SEPARATOR = "://"; + public static final String URN_LI_CORPUSER = "urn:li:corpuser:"; + public static final String URN_LI_CORPUSER_DATAHUB = URN_LI_CORPUSER + "datahub"; + public static final String URN_LI_DATA_PROCESS_INSTANCE = "urn:li:dataProcessInstance:"; + + public static final Map PLATFORM_MAP = + Stream.of( + new String[][] { + {"awsathena", "athena"}, {"sqlserver", "mssql"}, + }) + .collect(Collectors.toMap(data -> data[0], data -> data[1])); + + private OpenLineageToDataHub() {} + + public static Optional convertOpenlineageDatasetToDatasetUrn( + OpenLineage.Dataset dataset, DatahubOpenlineageConfig mappingConfig) { + String namespace = dataset.getNamespace(); + String datasetName = dataset.getName(); + + String platform; + if (namespace.contains(SCHEME_SEPARATOR)) { + try { + URI datasetUri; + if (!namespace.endsWith("/") && !datasetName.startsWith("/")) { + datasetUri = new URI(namespace + "/" + datasetName); + } else { + datasetUri = new URI(namespace + datasetName); + } + if (PLATFORM_MAP.containsKey(datasetUri.getScheme())) { + platform = PLATFORM_MAP.get(datasetUri.getScheme()); + } else { + platform = datasetUri.getScheme(); + } + datasetName = datasetUri.getPath(); + if (HdfsPlatform.isFsPlatformPrefix(platform)) { + try { + HdfsPathDataset hdfsPathDataset = HdfsPathDataset.create(datasetUri, mappingConfig); + return 
Optional.of(hdfsPathDataset.urn()); + } catch (InstantiationException e) { + log.warn( + "Unable to create urn from namespace: {} and dataset {}.", namespace, datasetName); + return Optional.empty(); + } + } else { + datasetName = dataset.getName(); + } + } catch (URISyntaxException e) { + log.warn("Unable to create URI from namespace: {} and dataset {}.", namespace, datasetName); + return Optional.empty(); + } + } else { + platform = namespace; + datasetName = dataset.getName(); + } + + if (mappingConfig.getCommonDatasetPlatformInstance() != null) { + datasetName = mappingConfig.getCommonDatasetPlatformInstance() + "." + datasetName; + } + + return Optional.of( + new DatasetUrn(new DataPlatformUrn(platform), datasetName, mappingConfig.getFabricType())); + } + + public static GlobalTags generateTags(List tags) { + tags.sort(String::compareToIgnoreCase); + GlobalTags globalTags = new GlobalTags(); + TagAssociationArray tagAssociationArray = new TagAssociationArray(); + for (String tag : tags) { + TagAssociation tagAssociation = new TagAssociation(); + tagAssociation.setTag(new TagUrn(tag)); + tagAssociationArray.add(tagAssociation); + } + globalTags.setTags(tagAssociationArray); + return globalTags; + } + + public static Domains generateDomains(List domains) { + domains.sort(String::compareToIgnoreCase); + Domains datahubDomains = new Domains(); + UrnArray domainArray = new UrnArray(); + for (String domain : domains) { + try { + domainArray.add(Urn.createFromString(domain)); + } catch (URISyntaxException e) { + log.warn("Unable to create domain urn for domain urn: {}", domain); + } + } + datahubDomains.setDomains(domainArray); + return datahubDomains; + } + + public static Urn dataPlatformInstanceUrn(String platform, String instance) + throws URISyntaxException { + return new Urn( + "dataPlatformInstance", + new TupleKey(Arrays.asList(new DataPlatformUrn(platform).toString(), instance))); + } + + public static DatahubJob convertRunEventToJob( + OpenLineage.RunEvent 
event, DatahubOpenlineageConfig datahubConf) + throws IOException, URISyntaxException { + DatahubJob.DatahubJobBuilder jobBuilder = DatahubJob.builder(); + + if (event.getEventTime() != null) { + jobBuilder.eventTime(event.getEventTime().toInstant().toEpochMilli()); + } + + log.info("Emitting lineage: {}", OpenLineageClientUtils.toJson(event)); + DataFlowInfo dfi = convertRunEventToDataFlowInfo(event, datahubConf.getPipelineName()); + + DataFlowUrn dataFlowUrn = + getFlowUrn( + event.getJob().getNamespace(), + event.getJob().getName(), + null, + event.getProducer(), + datahubConf); + jobBuilder.flowUrn(dataFlowUrn); + + if (datahubConf.getPlatformInstance() != null) { + DataPlatformInstance dpi = + new DataPlatformInstance() + .setPlatform(new DataPlatformUrn(dataFlowUrn.getOrchestratorEntity())) + .setInstance( + dataPlatformInstanceUrn( + dataFlowUrn.getOrchestratorEntity(), datahubConf.getPlatformInstance())); + jobBuilder.flowPlatformInstance(dpi); + } + + StringMap customProperties = generateCustomProperties(event, true); + dfi.setCustomProperties(customProperties); + + String description = getDescription(event); + if (description != null) { + dfi.setDescription(description); + } + jobBuilder.dataFlowInfo(dfi); + + Ownership ownership = generateOwnership(event); + jobBuilder.flowOwnership(ownership); + + GlobalTags tags = generateTags(event); + jobBuilder.flowGlobalTags(tags); + + DatahubJob datahubJob = jobBuilder.build(); + convertJobToDataJob(datahubJob, event, datahubConf); + return datahubJob; + } + + static void forEachValue(Map source, StringMap customProperties) { + for (final Map.Entry entry : source.entrySet()) { + if (entry.getValue() instanceof Map) { + forEachValue((Map) entry.getValue(), customProperties); + } else { + customProperties.put(entry.getKey(), entry.getValue().toString()); + } + } + } + + private static Ownership generateOwnership(OpenLineage.RunEvent event) { + Ownership ownership = new Ownership(); + OwnerArray owners = new 
OwnerArray(); + if ((event.getJob().getFacets() != null) + && (event.getJob().getFacets().getOwnership() != null)) { + for (OpenLineage.OwnershipJobFacetOwners ownerFacet : + event.getJob().getFacets().getOwnership().getOwners()) { + Owner owner = new Owner(); + try { + owner.setOwner(Urn.createFromString(URN_LI_CORPUSER + ":" + ownerFacet.getName())); + owner.setType(OwnershipType.DEVELOPER); + OwnershipSource source = new OwnershipSource(); + source.setType(OwnershipSourceType.SERVICE); + owner.setSource(source); + owners.add(owner); + } catch (URISyntaxException e) { + log.warn("Unable to create owner urn for owner: {}", ownerFacet.getName()); + } + } + } + ownership.setOwners(owners); + try { + AuditStamp auditStamp = new AuditStamp(); + auditStamp.setActor(Urn.createFromString(URN_LI_CORPUSER_DATAHUB)); + auditStamp.setTime(System.currentTimeMillis()); + ownership.setLastModified(auditStamp); + } catch (URISyntaxException e) { + log.warn("Unable to create actor urn for actor: {}", URN_LI_CORPUSER_DATAHUB); + } + return ownership; + } + + private static String getDescription(OpenLineage.RunEvent event) { + if (event.getJob().getFacets() != null + && event.getJob().getFacets().getDocumentation() != null) { + return event.getJob().getFacets().getDocumentation().getDescription(); + } + return null; + } + + private static UpstreamLineage getFineGrainedLineage( + OpenLineage.Dataset dataset, DatahubOpenlineageConfig mappingConfig) { + FineGrainedLineageArray fgla = new FineGrainedLineageArray(); + UpstreamArray upstreams = new UpstreamArray(); + + if ((dataset.getFacets() == null) || (dataset.getFacets().getColumnLineage() == null)) { + return null; + } + + OpenLineage.ColumnLineageDatasetFacet columLineage = dataset.getFacets().getColumnLineage(); + Set> fields = + columLineage.getFields().getAdditionalProperties().entrySet(); + for (Map.Entry field : fields) { + FineGrainedLineage fgl = new FineGrainedLineage(); + + UrnArray upstreamFields = new UrnArray(); + 
UrnArray downstreamsFields = new UrnArray(); + Optional datasetUrn = + convertOpenlineageDatasetToDatasetUrn(dataset, mappingConfig); + datasetUrn.ifPresent( + urn -> + downstreamsFields.add( + UrnUtils.getUrn("urn:li:schemaField:" + "(" + urn + "," + field.getKey() + ")"))); + OpenLineage.StaticDatasetBuilder staticDatasetBuilder = + new OpenLineage.StaticDatasetBuilder(); + field + .getValue() + .getInputFields() + .forEach( + inputField -> { + OpenLineage.Dataset staticDataset = + staticDatasetBuilder + .name(inputField.getName()) + .namespace(inputField.getNamespace()) + .build(); + Optional urn = + convertOpenlineageDatasetToDatasetUrn(staticDataset, mappingConfig); + if (urn.isPresent()) { + Urn datasetFieldUrn = + UrnUtils.getUrn( + "urn:li:schemaField:" + + "(" + + urn.get() + + "," + + inputField.getField() + + ")"); + upstreamFields.add(datasetFieldUrn); + upstreams.add( + new Upstream().setDataset(urn.get()).setType(DatasetLineageType.TRANSFORMED)); + } + }); + + // fgl.set(upstreamFields); + upstreamFields.sort(Comparator.comparing(Urn::toString)); + fgl.setUpstreams(upstreamFields); + fgl.setConfidenceScore(0.5f); + fgl.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + + downstreamsFields.sort(Comparator.comparing(Urn::toString)); + fgl.setDownstreams(downstreamsFields); + fgl.setDownstreamType(FineGrainedLineageDownstreamType.FIELD_SET); + fgla.add(fgl); + } + + UpstreamLineage upstreamLineage = new UpstreamLineage(); + upstreamLineage.setFineGrainedLineages(fgla); + upstreamLineage.setUpstreams(upstreams); + return upstreamLineage; + } + + private static GlobalTags generateTags(OpenLineage.RunEvent event) { + if (event.getRun().getFacets() == null + || event.getRun().getFacets().getAdditionalProperties() == null + || event.getRun().getFacets().getAdditionalProperties().get("airflow") == null + || event + .getRun() + .getFacets() + .getAdditionalProperties() + .get("airflow") + .getAdditionalProperties() + .get("dag") + == null) { + return 
null; + } + Map airflowProperties = + event + .getRun() + .getFacets() + .getAdditionalProperties() + .get("airflow") + .getAdditionalProperties(); + Map dagProperties = (Map) airflowProperties.get("dag"); + if (dagProperties.get("tags") != null) { + try { + JSONArray arr = new JSONArray(((String) dagProperties.get("tags")).replace("'", "\"")); + LinkedList tags = new LinkedList<>(); + for (int i = 0; i < arr.length(); i++) { + tags.add(arr.getString(i)); + } + return generateTags(tags); + } catch (JSONException e) { + log.warn("Unable to parse tags from airflow properties: {}", e.getMessage()); + return null; + } + } + + return null; + } + + private static StringMap generateCustomProperties( + OpenLineage.RunEvent event, boolean flowProperties) { + StringMap customProperties = new StringMap(); + if ((event.getRun().getFacets() != null) + && (event.getRun().getFacets().getProcessing_engine() != null)) { + if (event.getRun().getFacets().getProcessing_engine().getName() != null) { + customProperties.put( + "processingEngine", event.getRun().getFacets().getProcessing_engine().getName()); + } + + customProperties.put( + "processingEngine", event.getRun().getFacets().getProcessing_engine().getName()); + if (event.getRun().getFacets().getProcessing_engine().getVersion() != null) { + customProperties.put( + "processingEngineVersion", + event.getRun().getFacets().getProcessing_engine().getVersion()); + } + if (event.getRun().getFacets().getProcessing_engine().getOpenlineageAdapterVersion() + != null) { + customProperties.put( + "openlineageAdapterVersion", + event.getRun().getFacets().getProcessing_engine().getOpenlineageAdapterVersion()); + } + } + + if ((event.getRun().getFacets() == null) + || (event.getRun().getFacets().getAdditionalProperties() == null)) { + return customProperties; + } + + for (Map.Entry entry : + event.getRun().getFacets().getAdditionalProperties().entrySet()) { + switch (entry.getKey()) { + case "spark_version": + { + if 
(entry.getValue().getAdditionalProperties().get("spark-version") != null) { + customProperties.put( + "spark-version", + (String) entry.getValue().getAdditionalProperties().get("spark-version")); + } + if (entry.getValue().getAdditionalProperties().get("openlineage-spark-version") + != null) { + customProperties.put( + "openlineage-spark-version", + (String) + entry.getValue().getAdditionalProperties().get("openlineage-spark-version")); + } + } + break; + case "spark_properties": + { + if (entry.getValue() != null) { + Map sparkProperties = + (Map) + entry.getValue().getAdditionalProperties().get("properties"); + log.info("Spark properties: {}, Properties: {}", entry.getValue(), sparkProperties); + if (sparkProperties != null) { + forEachValue(sparkProperties, customProperties); + } + } + } + break; + case "airflow": + { + Map airflowProperties; + if (flowProperties) { + airflowProperties = + (Map) entry.getValue().getAdditionalProperties().get("dag"); + } else { + airflowProperties = + (Map) entry.getValue().getAdditionalProperties().get("task"); + } + forEachValue(airflowProperties, customProperties); + } + break; + case "unknownSourceAttribute": + { + if (!flowProperties) { + List> unknownItems = + (List>) + entry.getValue().getAdditionalProperties().get("unknownItems"); + for (Map item : unknownItems) { + forEachValue(item, customProperties); + } + } + } + break; + default: + break; + } + } + return customProperties; + } + + private static void convertJobToDataJob( + DatahubJob datahubJob, OpenLineage.RunEvent event, DatahubOpenlineageConfig datahubConf) + throws URISyntaxException, IOException { + + OpenLineage.Job job = event.getJob(); + DataJobInfo dji = new DataJobInfo(); + + if (job.getName().contains(".")) { + + String jobName = job.getName().substring(job.getName().indexOf(".") + 1); + dji.setName(jobName); + } else { + dji.setName(job.getName()); + } + + String jobProcessingEngine = null; + if ((event.getRun().getFacets() != null) + && 
(event.getRun().getFacets().getProcessing_engine() != null)) { + jobProcessingEngine = event.getRun().getFacets().getProcessing_engine().getName(); + } + + DataFlowUrn flowUrn = + getFlowUrn( + event.getJob().getNamespace(), + event.getJob().getName(), + jobProcessingEngine, + event.getProducer(), + datahubConf); + + dji.setFlowUrn(flowUrn); + dji.setType(DataJobInfo.Type.create(flowUrn.getOrchestratorEntity())); + + DataJobUrn dataJobUrn = new DataJobUrn(flowUrn, job.getName()); + datahubJob.setJobUrn(dataJobUrn); + StringMap customProperties = generateCustomProperties(event, false); + dji.setCustomProperties(customProperties); + + TimeStamp timestamp = new TimeStamp(); + + if (event.getEventTime() != null) { + dji.setCreated(timestamp.setTime(event.getEventTime().toInstant().toEpochMilli())); + } + + String description = getDescription(event); + if (description != null) { + dji.setDescription(description); + } + datahubJob.setJobInfo(dji); + DataJobInputOutput inputOutput = new DataJobInputOutput(); + + processJobInputs(datahubJob, event, datahubConf); + + processJobOutputs(datahubJob, event, datahubConf); + + DataProcessInstanceRunEvent dpire = processDataProcessInstanceResult(event); + datahubJob.setDataProcessInstanceRunEvent(dpire); + + DataProcessInstanceProperties dpiProperties = getJobDataProcessInstanceProperties(event); + datahubJob.setDataProcessInstanceProperties(dpiProperties); + + processParentJob(event, job, inputOutput, datahubConf); + + DataProcessInstanceRelationships dataProcessInstanceRelationships = + new DataProcessInstanceRelationships(); + dataProcessInstanceRelationships.setParentTemplate(dataJobUrn); + dataProcessInstanceRelationships.setUpstreamInstances(new UrnArray()); + datahubJob.setDataProcessInstanceRelationships(dataProcessInstanceRelationships); + + try { + Urn dpiUrn = + Urn.createFromString(URN_LI_DATA_PROCESS_INSTANCE + event.getRun().getRunId().toString()); + datahubJob.setDataProcessInstanceUrn(dpiUrn); + } catch 
(URISyntaxException e) { + throw new RuntimeException("Unable to create dataprocess instance urn:" + e); + } + } + + private static DataProcessInstanceProperties getJobDataProcessInstanceProperties( + OpenLineage.RunEvent event) throws URISyntaxException { + DataProcessInstanceProperties dpiProperties = new DataProcessInstanceProperties(); + dpiProperties.setName(event.getRun().getRunId().toString()); + AuditStamp auditStamp = new AuditStamp(); + if (event.getEventTime() != null) { + auditStamp.setTime(event.getEventTime().toInstant().toEpochMilli()); + } + auditStamp.setActor(Urn.createFromString(URN_LI_CORPUSER_DATAHUB)); + dpiProperties.setCreated(auditStamp); + return dpiProperties; + } + + public static Edge createEdge(Urn urn, ZonedDateTime eventTime) { + Edge edge = new Edge(); + edge.setLastModified(createAuditStamp(eventTime)); + edge.setDestinationUrn(urn); + return edge; + } + + public static AuditStamp createAuditStamp(ZonedDateTime eventTime) { + AuditStamp auditStamp = new AuditStamp(); + if (eventTime != null) { + auditStamp.setTime(eventTime.toInstant().toEpochMilli()); + } else { + auditStamp.setTime(System.currentTimeMillis()); + } + try { + auditStamp.setActor(Urn.createFromString(URN_LI_CORPUSER_DATAHUB)); + } catch (URISyntaxException e) { + throw new RuntimeException("Unable to create actor urn:" + e); + } + return auditStamp; + } + + private static void processParentJob( + OpenLineage.RunEvent event, + OpenLineage.Job job, + DataJobInputOutput inputOutput, + DatahubOpenlineageConfig datahubConf) { + if ((event.getRun().getFacets() != null) && (event.getRun().getFacets().getParent() != null)) { + DataJobUrn parentDataJobUrn = + new DataJobUrn( + getFlowUrn( + event.getRun().getFacets().getParent().getJob().getNamespace(), + event.getRun().getFacets().getParent().getJob().getName(), + null, + event.getRun().getFacets().getParent().get_producer(), + datahubConf), + job.getName()); + + Edge edge = createEdge(parentDataJobUrn, 
event.getEventTime()); + EdgeArray array = new EdgeArray(); + array.add(edge); + inputOutput.setInputDatajobEdges(array); + } + } + + private static void processJobInputs( + DatahubJob datahubJob, OpenLineage.RunEvent event, DatahubOpenlineageConfig datahubConf) { + for (OpenLineage.InputDataset input : + event.getInputs().stream() + .filter(input -> input.getFacets() != null) + .distinct() + .collect(Collectors.toList())) { + Optional datasetUrn = convertOpenlineageDatasetToDatasetUrn(input, datahubConf); + if (datasetUrn.isPresent()) { + DatahubDataset.DatahubDatasetBuilder builder = DatahubDataset.builder(); + builder.urn(datasetUrn.get()); + if (datahubConf.isMaterializeDataset()) { + builder.schemaMetadata(getSchemaMetadata(input)); + } + if (datahubConf.isCaptureColumnLevelLineage()) { + UpstreamLineage upstreamLineage = getFineGrainedLineage(input, datahubConf); + if (upstreamLineage != null) { + builder.lineage(upstreamLineage); + } + } + datahubJob.getInSet().add(builder.build()); + } + } + } + + private static void processJobOutputs( + DatahubJob datahubJob, OpenLineage.RunEvent event, DatahubOpenlineageConfig datahubConf) { + for (OpenLineage.OutputDataset output : + event.getOutputs().stream() + .filter(input -> input.getFacets() != null) + .distinct() + .collect(Collectors.toList())) { + Optional datasetUrn = convertOpenlineageDatasetToDatasetUrn(output, datahubConf); + if (datasetUrn.isPresent()) { + DatahubDataset.DatahubDatasetBuilder builder = DatahubDataset.builder(); + builder.urn(datasetUrn.get()); + if (datahubConf.isMaterializeDataset()) { + builder.schemaMetadata(getSchemaMetadata(output)); + } + if (datahubConf.isCaptureColumnLevelLineage()) { + UpstreamLineage upstreamLineage = getFineGrainedLineage(output, datahubConf); + if (upstreamLineage != null) { + builder.lineage(upstreamLineage); + } + } + datahubJob.getOutSet().add(builder.build()); + } + } + } + + private static DataProcessInstanceRunEvent processDataProcessInstanceResult( + 
OpenLineage.RunEvent event) { + DataProcessInstanceRunEvent dpire = new DataProcessInstanceRunEvent(); + + DataProcessInstanceRunResult result = new DataProcessInstanceRunResult(); + switch (event.getEventType()) { + case COMPLETE: + dpire.setStatus(DataProcessRunStatus.COMPLETE); + result.setType(RunResultType.SUCCESS); + result.setNativeResultType(event.getEventType().toString()); + if (event.getEventTime() != null) { + dpire.setTimestampMillis(event.getEventTime().toInstant().toEpochMilli()); + } + dpire.setResult(result); + break; + case FAIL: + case ABORT: + dpire.setStatus(DataProcessRunStatus.COMPLETE); + result.setType(RunResultType.FAILURE); + result.setNativeResultType(event.getEventType().toString()); + if (event.getEventTime() != null) { + dpire.setTimestampMillis(event.getEventTime().toInstant().toEpochMilli()); + } + dpire.setResult(result); + break; + case START: + case RUNNING: + dpire.setStatus(DataProcessRunStatus.STARTED); + // result.setNativeResultType(event.getEventType().toString()); + if (event.getEventTime() != null) { + dpire.setTimestampMillis(event.getEventTime().toInstant().toEpochMilli()); + } + break; + case OTHER: + default: + result.setNativeResultType(event.getEventType().toString()); + if (event.getEventTime() != null) { + dpire.setTimestampMillis(event.getEventTime().toInstant().toEpochMilli()); + } + result.setType(RunResultType.$UNKNOWN); + dpire.setResult(result); + break; + } + return dpire; + } + + public static String getFlowName(String jobName, String flowName) { + String[] nameSplit = jobName.split("\\."); + if (flowName != null) { + return flowName; + } else { + return nameSplit[0]; + } + } + + public static DataFlowUrn getFlowUrn( + String namespace, + String jobName, + String processingEngine, + URI producer, + DatahubOpenlineageConfig datahubOpenlineageConfig) { + String producerName = null; + if (producer != null) { + producerName = producer.toString(); + } + + String orchestrator = getOrchestrator(processingEngine, 
producerName); + String flowName = datahubOpenlineageConfig.getPipelineName(); + if (datahubOpenlineageConfig.getPlatformInstance() != null) { + namespace = datahubOpenlineageConfig.getPlatformInstance(); + } + return (new DataFlowUrn(orchestrator, getFlowName(jobName, flowName), namespace)); + } + + public static DataFlowInfo convertRunEventToDataFlowInfo( + OpenLineage.RunEvent event, String flowName) throws IOException { + DataFlowInfo dataFlowInfo = new DataFlowInfo(); + dataFlowInfo.setName(getFlowName(event.getJob().getName(), flowName)); + return dataFlowInfo; + } + + private static String getOrchestrator(String processingEngine, String producer) { + String regex = "https://github.com/OpenLineage/OpenLineage/.*/(.*)$"; + Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE); + String orchestrator = null; + if (processingEngine != null) { + orchestrator = processingEngine.toLowerCase(); + } else if (producer != null) { + Matcher matcher = pattern.matcher(producer); + if ((matcher.matches()) && ((matcher.groupCount() == 1))) { + orchestrator = matcher.group(1); + } else if (producer.startsWith("https://github.com/apache/airflow/")) { + orchestrator = "airflow"; + } + } + if (orchestrator == null) { + throw new RuntimeException("Unable to determine orchestrator"); + } + return orchestrator; + } + + public static SchemaFieldDataType.Type convertOlFieldTypeToDHFieldType( + String openLineageFieldType) { + switch (openLineageFieldType) { + case "string": + return SchemaFieldDataType.Type.create(new StringType()); + case "long": + case "int": + return SchemaFieldDataType.Type.create(new NumberType()); + case "timestamp": + return SchemaFieldDataType.Type.create(new TimeType()); + case "struct": + return SchemaFieldDataType.Type.create(new MapType()); + default: + return SchemaFieldDataType.Type.create(new NullType()); + } + } + + public static SchemaMetadata getSchemaMetadata(OpenLineage.Dataset dataset) { + SchemaFieldArray schemaFieldArray = new 
SchemaFieldArray(); + if ((dataset.getFacets() == null) || (dataset.getFacets().getSchema() == null)) { + return null; + } + dataset + .getFacets() + .getSchema() + .getFields() + .forEach( + field -> { + SchemaField schemaField = new SchemaField(); + schemaField.setFieldPath(field.getName()); + schemaField.setNativeDataType(field.getType()); + schemaField.setType( + new SchemaFieldDataType() + .setType(convertOlFieldTypeToDHFieldType(field.getType()))); + schemaFieldArray.add(schemaField); + }); + SchemaMetadata schemaMetadata = new SchemaMetadata(); + schemaMetadata.setPlatformSchema(new SchemaMetadata.PlatformSchema()); + schemaMetadata.setSchemaName(""); + schemaMetadata.setVersion(1L); + schemaMetadata.setHash(""); + + MySqlDDL ddl = new MySqlDDL(); + ddl.setTableSchema(OpenLineageClientUtils.toJson(dataset.getFacets().getSchema().getFields())); + SchemaMetadata.PlatformSchema platformSchema = new SchemaMetadata.PlatformSchema(); + platformSchema.setMySqlDDL(ddl); + schemaMetadata.setPlatformSchema(platformSchema); + + schemaMetadata.setPlatform(new DataPlatformUrn(dataset.getNamespace())); + + schemaMetadata.setFields(schemaFieldArray); + return schemaMetadata; + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/CatalogTableDataset.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/CatalogTableDataset.java new file mode 100644 index 00000000000000..ce2952a93945b9 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/CatalogTableDataset.java @@ -0,0 +1,13 @@ +package io.datahubproject.openlineage.dataset; + +import com.linkedin.common.FabricType; +import lombok.ToString; + +@ToString +public class CatalogTableDataset extends SparkDataset { + + public CatalogTableDataset( + String dsName, String platformInstance, String platform, FabricType fabricType) { + super(platform, 
platformInstance, dsName, fabricType); + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubDataset.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubDataset.java new file mode 100644 index 00000000000000..5c4b809571d371 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubDataset.java @@ -0,0 +1,19 @@ +package io.datahubproject.openlineage.dataset; + +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.dataset.UpstreamLineage; +import com.linkedin.schema.SchemaMetadata; +import lombok.Builder; +import lombok.Getter; +import lombok.Setter; +import lombok.ToString; + +@Getter +@Builder +@Setter +@ToString +public class DatahubDataset { + DatasetUrn urn; + SchemaMetadata schemaMetadata; + UpstreamLineage lineage; +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java new file mode 100644 index 00000000000000..3682a42bb3571c --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/DatahubJob.java @@ -0,0 +1,445 @@ +package io.datahubproject.openlineage.dataset; + +import static io.datahubproject.openlineage.converter.OpenLineageToDataHub.*; + +import com.linkedin.common.DataJobUrnArray; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.DatasetUrnArray; +import com.linkedin.common.Edge; +import com.linkedin.common.EdgeArray; +import com.linkedin.common.GlobalTags; +import com.linkedin.common.Ownership; +import com.linkedin.common.Status; +import com.linkedin.common.TagAssociation; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.DataFlowUrn; 
+import com.linkedin.common.urn.DataJobUrn; +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.DataTemplate; +import com.linkedin.data.template.StringMap; +import com.linkedin.datajob.DataFlowInfo; +import com.linkedin.datajob.DataJobInfo; +import com.linkedin.datajob.DataJobInputOutput; +import com.linkedin.dataprocess.DataProcessInstanceInput; +import com.linkedin.dataprocess.DataProcessInstanceOutput; +import com.linkedin.dataprocess.DataProcessInstanceProperties; +import com.linkedin.dataprocess.DataProcessInstanceRelationships; +import com.linkedin.dataprocess.DataProcessInstanceRunEvent; +import com.linkedin.dataset.FineGrainedLineage; +import com.linkedin.dataset.Upstream; +import com.linkedin.domain.Domains; +import com.linkedin.metadata.aspect.patch.builder.DataJobInputOutputPatchBuilder; +import com.linkedin.metadata.aspect.patch.builder.GlobalTagsPatchBuilder; +import com.linkedin.metadata.aspect.patch.builder.UpstreamLineagePatchBuilder; +import com.linkedin.metadata.key.DatasetKey; +import com.linkedin.mxe.MetadataChangeProposal; +import datahub.event.EventFormatter; +import datahub.event.MetadataChangeProposalWrapper; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import java.io.IOException; +import java.nio.charset.Charset; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; +import lombok.Builder; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.Setter; +import lombok.ToString; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; + +@EqualsAndHashCode +@Getter +@Setter 
+@Builder +@ToString +@Slf4j +public class DatahubJob { + public static final String DATASET_ENTITY_TYPE = "dataset"; + public static final String DATA_FLOW_ENTITY_TYPE = "dataFlow"; + public static final String DATA_PROCESS_INSTANCE_ENTITY_TYPE = "dataProcessInstance"; + public static final String DATAFLOW_ENTITY_TYPE = "dataflow"; + public static final String DATAJOB_ENTITY_TYPE = "dataJob"; + DataFlowUrn flowUrn; + DataFlowInfo dataFlowInfo; + DataJobUrn jobUrn; + DataJobInfo jobInfo; + Ownership flowOwnership; + GlobalTags flowGlobalTags; + Domains flowDomains; + DataPlatformInstance flowPlatformInstance; + DataProcessInstanceRunEvent dataProcessInstanceRunEvent; + DataProcessInstanceProperties dataProcessInstanceProperties; + DataProcessInstanceRelationships dataProcessInstanceRelationships; + Urn dataProcessInstanceUrn; + + final Set inSet = new TreeSet<>(new DataSetComparator()); + final Set outSet = new TreeSet<>(new DataSetComparator()); + final Set parentJobs = new TreeSet<>(new DataJobUrnComparator()); + final Map datasetProperties = new HashMap<>(); + long startTime; + long endTime; + long eventTime; + final EventFormatter eventFormatter = new EventFormatter(); + + public static MetadataChangeProposalWrapper materializeDataset(DatasetUrn datasetUrn) { + DatasetKey datasetAspect = new DatasetKey().setOrigin(datasetUrn.getOriginEntity()); + datasetAspect + .setName(datasetUrn.getDatasetNameEntity()) + .setPlatform(new DataPlatformUrn(datasetUrn.getPlatformEntity().getPlatformNameEntity())); + + return MetadataChangeProposalWrapper.create( + b -> + b.entityType(DATASET_ENTITY_TYPE).entityUrn(datasetUrn).upsert().aspect(datasetAspect)); + } + + public List toMcps(DatahubOpenlineageConfig config) throws IOException { + List mcps = new ArrayList<>(); + + // Generate and add DataFlow Aspect + log.info("Generating MCPs for job: {}", jobUrn); + addAspectToMcps(flowUrn, DATA_FLOW_ENTITY_TYPE, dataFlowInfo, mcps); + generateStatus(flowUrn, DATA_FLOW_ENTITY_TYPE, 
mcps); + + // Generate and add PlatformInstance Aspect + if (flowPlatformInstance != null) { + addAspectToMcps(flowUrn, DATA_FLOW_ENTITY_TYPE, flowPlatformInstance, mcps); + } + + // Generate and add Properties Aspect + StringMap customProperties = new StringMap(); + if (!jobInfo.getCustomProperties().isEmpty()) { + customProperties.putAll(jobInfo.getCustomProperties()); + } + + if (startTime > 0) { + customProperties.put("startTime", String.valueOf(Instant.ofEpochMilli(startTime))); + } + + if (endTime > 0) { + customProperties.put("endTime", String.valueOf(Instant.ofEpochMilli(endTime))); + } + log.info("Setting custom properties for job: {}", jobUrn); + jobInfo.setCustomProperties(customProperties); + addAspectToMcps(jobUrn, DATAJOB_ENTITY_TYPE, jobInfo, mcps); + generateStatus(jobUrn, DATAJOB_ENTITY_TYPE, mcps); + + // Generate and add tags Aspect + generateFlowGlobalTagsAspect(flowUrn, flowGlobalTags, config, mcps); + + // Generate and add domain Aspect + generateFlowDomainsAspect(mcps, customProperties); + + log.info( + "Adding input and output to {} Number of outputs: {}, Number of inputs {}", + jobUrn, + outSet.size(), + inSet.size()); + + // Generate Input and Outputs + Pair inputsTuple = processUpstreams(config, mcps); + UrnArray inputUrnArray = inputsTuple.getLeft(); + EdgeArray inputEdges = inputsTuple.getRight(); + + Pair outputTuple = processDownstreams(config, mcps); + UrnArray outputUrnArray = outputTuple.getLeft(); + EdgeArray outputEdges = outputTuple.getRight(); + + // Generate and add DataJobInputOutput Aspect + generateDataJobInputOutputMcp(inputEdges, outputEdges, config, mcps); + + // Generate and add DataProcessInstance Aspect + generateDataProcessInstanceMcp(inputUrnArray, outputUrnArray, mcps); + + log.info("Mcp generation finished for urn {}", jobUrn); + return mcps; + } + + private void generateDataJobInputOutputMcp( + EdgeArray inputEdges, + EdgeArray outputEdges, + DatahubOpenlineageConfig config, + List mcps) { + DataJobInputOutput 
dataJobInputOutput = new DataJobInputOutput(); + log.info("Adding DataJob edges to {}", jobUrn); + if (config.isUsePatch()) { + /* Patch mode: add each input, output and parent-job edge through the patch builder. */ DataJobInputOutputPatchBuilder dataJobInputOutputPatchBuilder = + new DataJobInputOutputPatchBuilder().urn(jobUrn); + for (DatahubDataset dataset : inSet) { + dataJobInputOutputPatchBuilder.addInputDatasetEdge(dataset.getUrn()); + } + for (DatahubDataset dataset : outSet) { + dataJobInputOutputPatchBuilder.addOutputDatasetEdge(dataset.getUrn()); + } + for (DataJobUrn parentJob : parentJobs) { + dataJobInputOutputPatchBuilder.addInputDatajobEdge(parentJob); + } + MetadataChangeProposal dataJobInputOutputMcp = dataJobInputOutputPatchBuilder.build(); + log.info( + "dataJobInputOutputMcp: {}", + dataJobInputOutputMcp.getAspect().getValue().asString(Charset.defaultCharset())); + /* Emit the MCP that was just built and logged instead of building the patch a second time. */ mcps.add(dataJobInputOutputMcp); + + } else { + /* Upsert mode: write the full aspect using edge arrays and empty legacy dataset arrays. */ dataJobInputOutput.setInputDatasetEdges(inputEdges); + dataJobInputOutput.setInputDatasets(new DatasetUrnArray()); + dataJobInputOutput.setOutputDatasetEdges(outputEdges); + dataJobInputOutput.setOutputDatasets(new DatasetUrnArray()); + DataJobUrnArray parentDataJobUrnArray = new DataJobUrnArray(); + parentDataJobUrnArray.addAll(parentJobs); + + log.info( + "Adding input data jobs {} Number of jobs: {}", jobUrn, parentDataJobUrnArray.size()); + dataJobInputOutput.setInputDatajobs(parentDataJobUrnArray); + addAspectToMcps(jobUrn, DATAJOB_ENTITY_TYPE, dataJobInputOutput, mcps); + } + } + + /** Adds the input and output aspects for dataProcessInstanceUrn to mcps, followed by the optional properties, run-event and relationships aspects. */ private void generateDataProcessInstanceMcp( + UrnArray inputUrnArray, UrnArray outputUrnArray, List mcps) { + DataProcessInstanceInput dataProcessInstanceInput = new DataProcessInstanceInput(); + dataProcessInstanceInput.setInputs(inputUrnArray); + + DataProcessInstanceOutput dataProcessInstanceOutput = new DataProcessInstanceOutput(); + dataProcessInstanceOutput.setOutputs(outputUrnArray); + + addAspectToMcps( + dataProcessInstanceUrn, DATA_PROCESS_INSTANCE_ENTITY_TYPE, dataProcessInstanceInput, mcps); + addAspectToMcps(
+ dataProcessInstanceUrn, DATA_PROCESS_INSTANCE_ENTITY_TYPE, dataProcessInstanceOutput, mcps); + + if (dataProcessInstanceProperties != null) { + log.info("Adding dataProcessInstanceProperties to {}", jobUrn); + addAspectToMcps( + dataProcessInstanceUrn, + DATA_PROCESS_INSTANCE_ENTITY_TYPE, + dataProcessInstanceProperties, + mcps); + } + + generateDataProcessInstanceRunEvent(mcps); + generateDataProcessInstanceRelationship(mcps); + } + + private Pair processDownstreams( + DatahubOpenlineageConfig config, List mcps) { + UrnArray outputUrnArray = new UrnArray(); + EdgeArray outputEdges = new EdgeArray(); + + outSet.forEach( + dataset -> { + outputUrnArray.add(dataset.getUrn()); + if (config.isMaterializeDataset()) { + try { + mcps.add(eventFormatter.convert(materializeDataset(dataset.getUrn()))); + generateStatus(dataset.getUrn(), DATASET_ENTITY_TYPE, mcps); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + Edge edge = + createEdge( + dataset.getUrn(), + ZonedDateTime.ofInstant(Instant.ofEpochMilli(eventTime), ZoneOffset.UTC)); + outputEdges.add(edge); + + if ((dataset.getSchemaMetadata() != null) && (config.isIncludeSchemaMetadata())) { + addAspectToMcps( + dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getSchemaMetadata(), mcps); + } + + if (dataset.getLineage() != null) { + if (config.isUsePatch()) { + UpstreamLineagePatchBuilder upstreamLineagePatchBuilder = + new UpstreamLineagePatchBuilder().urn(dataset.getUrn()); + for (Upstream upstream : dataset.getLineage().getUpstreams()) { + upstreamLineagePatchBuilder.addUpstream(upstream.getDataset(), upstream.getType()); + } + + log.info("Adding FineGrainedLineage to {}", dataset.getUrn()); + for (FineGrainedLineage fineGrainedLineage : + Objects.requireNonNull(dataset.getLineage().getFineGrainedLineages())) { + for (Urn upstream : Objects.requireNonNull(fineGrainedLineage.getUpstreams())) { + upstreamLineagePatchBuilder.addFineGrainedUpstreamField( + upstream, + 
fineGrainedLineage.getConfidenceScore(), + StringUtils.defaultIfEmpty( + fineGrainedLineage.getTransformOperation(), "TRANSFORM"), + fineGrainedLineage.getUpstreamType()); + } + for (Urn downstream : Objects.requireNonNull(fineGrainedLineage.getDownstreams())) { + upstreamLineagePatchBuilder.addFineGrainedDownstreamField( + downstream, + fineGrainedLineage.getConfidenceScore(), + StringUtils.defaultIfEmpty( + fineGrainedLineage.getTransformOperation(), "TRANSFORM"), + fineGrainedLineage.getDownstreamType()); + } + } + MetadataChangeProposal mcp = upstreamLineagePatchBuilder.build(); + log.info( + "upstreamLineagePatch: {}", + mcp.getAspect().getValue().asString(Charset.defaultCharset())); + mcps.add(mcp); + } else { + addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getLineage(), mcps); + } + } + }); + return Pair.of(outputUrnArray, outputEdges); + } + + private Pair processUpstreams( + DatahubOpenlineageConfig config, List mcps) { + UrnArray inputUrnArray = new UrnArray(); + EdgeArray inputEdges = new EdgeArray(); + + inSet.forEach( + dataset -> { + inputUrnArray.add(dataset.getUrn()); + Edge edge = + createEdge( + dataset.getUrn(), + ZonedDateTime.ofInstant(Instant.ofEpochMilli(eventTime), ZoneOffset.UTC)); + inputEdges.add(edge); + + if (config.isMaterializeDataset()) { + try { + mcps.add(eventFormatter.convert(materializeDataset(dataset.getUrn()))); + generateStatus(dataset.getUrn(), DATASET_ENTITY_TYPE, mcps); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + if (dataset.getSchemaMetadata() != null && config.isIncludeSchemaMetadata()) { + addAspectToMcps( + dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getSchemaMetadata(), mcps); + } + + if (dataset.getLineage() != null) { + addAspectToMcps(dataset.getUrn(), DATASET_ENTITY_TYPE, dataset.getLineage(), mcps); + } + }); + return Pair.of(inputUrnArray, inputEdges); + } + + private void generateFlowDomainsAspect( + List mcps, StringMap customProperties) { + if (flowDomains != 
null) { + MetadataChangeProposalWrapper domains = + MetadataChangeProposalWrapper.create( + b -> + /* Use the same "dataFlow" entity type as every other aspect emitted for flowUrn. */ b.entityType(DATA_FLOW_ENTITY_TYPE) + .entityUrn(flowUrn) + .upsert() + .aspect(flowDomains)); + try { + mcps.add(eventFormatter.convert(domains)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + /** Adds the flow's global tags to mcps when present: as a tag patch when config.isUsePatch() is true, otherwise as a full upsert of the aspect. */ private void generateFlowGlobalTagsAspect( + Urn flowUrn, + GlobalTags flowGlobalTags, + DatahubOpenlineageConfig config, + List mcps) { + if (flowGlobalTags != null) { + if (config.isUsePatch()) { + GlobalTagsPatchBuilder globalTagsPatchBuilder = new GlobalTagsPatchBuilder().urn(flowUrn); + for (TagAssociation tag : flowGlobalTags.getTags()) { + globalTagsPatchBuilder.addTag(tag.getTag(), null); + } + globalTagsPatchBuilder.urn(flowUrn); + mcps.add(globalTagsPatchBuilder.build()); + } else { + addAspectToMcps(flowUrn, DATA_FLOW_ENTITY_TYPE, flowGlobalTags, mcps); + } + } + } + + /** Adds a Status aspect with removed=false for the given entity. */ private void generateStatus(Urn entityUrn, String entityType, List mcps) { + Status statusInfo = new Status().setRemoved(false); + addAspectToMcps(entityUrn, entityType, statusInfo, mcps); + } + + /** Wraps the aspect in an upsert proposal for the entity, converts it with eventFormatter and appends it to mcps; an IOException from the conversion is rethrown as RuntimeException. */ private void addAspectToMcps( + Urn entityUrn, String entityType, DataTemplate aspect, List mcps) { + MetadataChangeProposalWrapper mcpw = + MetadataChangeProposalWrapper.create( + b -> b.entityType(entityType).entityUrn(entityUrn).upsert().aspect(aspect)); + try { + mcps.add(eventFormatter.convert(mcpw)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** Adds the dataProcessInstanceRelationships aspect for dataProcessInstanceUrn when it is set. */ private void generateDataProcessInstanceRelationship(List mcps) { + if (dataProcessInstanceRelationships != null) { + log.info("Adding dataProcessInstanceRelationships to {}", jobUrn); + try { + mcps.add( + eventFormatter.convert( + MetadataChangeProposalWrapper.create( + b -> + b.entityType(DATA_PROCESS_INSTANCE_ENTITY_TYPE) + .entityUrn(dataProcessInstanceUrn) + .upsert() + .aspect(dataProcessInstanceRelationships)))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + private
void generateDataProcessInstanceRunEvent(List mcps) { + if (dataProcessInstanceRunEvent != null) { + log.info("Adding dataProcessInstanceRunEvent to {}", jobUrn); + try { + mcps.add( + eventFormatter.convert( + MetadataChangeProposalWrapper.create( + b -> + b.entityType(DATA_PROCESS_INSTANCE_ENTITY_TYPE) + .entityUrn(dataProcessInstanceUrn) + .upsert() + .aspect(dataProcessInstanceRunEvent)))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } +} + +class DataSetComparator implements Comparator { + + @Override + public int compare(DatahubDataset dataset1, DatahubDataset dataset2) { + return dataset1.urn.toString().compareTo(dataset2.getUrn().toString()); + } +} + +class DataJobUrnComparator implements Comparator { + + @Override + public int compare(DataJobUrn urn1, DataJobUrn urn2) { + return urn1.toString().compareTo(urn2.toString()); + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/Dataset.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/Dataset.java new file mode 100644 index 00000000000000..11336301f72ad2 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/Dataset.java @@ -0,0 +1,29 @@ +package io.datahubproject.openlineage.dataset; + +import com.linkedin.common.FabricType; +import com.linkedin.common.urn.DatasetUrn; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.utils.DatahubUtils; +import lombok.EqualsAndHashCode; + +@EqualsAndHashCode +public abstract class Dataset { + + private final DatasetUrn urn; + + public Dataset(String platform, String platformInstance, String name, FabricType fabricType) { + super(); + this.urn = DatahubUtils.createDatasetUrn(platform, platformInstance, name, fabricType); + } + + public Dataset(String platform, String name, DatahubOpenlineageConfig 
datahubConfig) { + super(); + this.urn = + DatahubUtils.createDatasetUrn( + platform, datahubConfig.getPlatformInstance(), name, datahubConfig.getFabricType()); + } + + public DatasetUrn urn() { + return urn; + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPathDataset.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPathDataset.java new file mode 100644 index 00000000000000..0d0868afedfd9a --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPathDataset.java @@ -0,0 +1,206 @@ +package io.datahubproject.openlineage.dataset; + +import com.linkedin.common.FabricType; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import java.net.URI; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import lombok.ToString; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; + +@ToString +@Slf4j +public class HdfsPathDataset extends SparkDataset { + private final String datasetPath; + + private static final String TABLE = "{table}"; + private static final String TABLE_MARKER = "/{table}"; + private static final String TABLE_MARKER_REGEX = "/\\{table\\}"; + + private static final String URI_SPLITTER = "://"; + + public HdfsPathDataset( + String platform, + String name, + String platformInstance, + FabricType fabricType, + String datasetPath) { + super(platform, platformInstance, name, fabricType); + this.datasetPath = datasetPath; + } + + public HdfsPathDataset(String pathUri, String platformInstance, FabricType fabricType) { + super("hdfs", platformInstance, pathUri, fabricType); + this.datasetPath = pathUri; + } + + public HdfsPathDataset(String pathUri, DatahubOpenlineageConfig datahubConf) { + super("hdfs", pathUri, datahubConf); + this.datasetPath = pathUri; + } + + public HdfsPathDataset( + 
String platform, String name, String datasetPath, DatahubOpenlineageConfig datahubConf) { + super(platform, name, datahubConf); + this.datasetPath = datasetPath; + } + + public String getDatasetPath() { + return datasetPath; + } + + public static HdfsPathDataset create(URI path, DatahubOpenlineageConfig datahubConf) + throws InstantiationException { + String pathUri = path.toString(); + pathUri = StringUtils.stripEnd(pathUri, "/"); + String platform; + try { + platform = getPlatform(pathUri); + + if (datahubConf.getPathSpecs() == null) { + log.info("No path_spec_list configuration found for platform {}.", platform); + } else { + + // Filter out path specs that don't match the platform + for (PathSpec pathSpec : datahubConf.getPathSpecsForPlatform(platform)) { + log.debug("Checking match for path_alias: " + pathSpec.getAlias()); + + String rawName = getRawNameFromUri(pathUri, pathSpec.getPathSpecList()); + if (rawName != null) { + String platformInstance = + pathSpec.platformInstance.orElseGet(datahubConf::getCommonDatasetPlatformInstance); + FabricType fabricType = datahubConf.getFabricType(); + return new HdfsPathDataset( + platform, getDatasetName(rawName), platformInstance, fabricType, rawName); + } + } + } + if (datahubConf.getPathSpecs() == null) { + log.info("No path_spec_list configuration found for platform {}.", platform); + } + String rawName = getRawNameFromUri(pathUri, null); + if (rawName == null) { + String partitionRegexp = datahubConf.getFilePartitionRegexpPattern(); + if (partitionRegexp != null) { + rawName = getRawNameWithoutPartition(pathUri, partitionRegexp); + } else { + rawName = pathUri; + } + } + String datasetName = getDatasetName(rawName); + + // If platform is file then we want to keep the trailing slash + if (platform.equals("file")) { + datasetName = stripPrefix(rawName); + } + return new HdfsPathDataset(platform, datasetName, rawName, datahubConf); + } catch (IllegalArgumentException e) { + return new HdfsPathDataset("hdfs", pathUri, 
pathUri, datahubConf); + } + } + + private static String getDatasetName(String rawName) throws IllegalArgumentException { + return stripPrefix(rawName).replaceFirst("^/", ""); + } + + private static String getRawNameFromUri(String pathUri, List pathSpecs) { + + if (pathSpecs == null || pathSpecs.isEmpty()) { + log.info( + "No path_spec_list configuration found. Falling back to creating dataset name with complete uri"); + } else { + for (String pathSpec : pathSpecs) { + String uri = getMatchedUri(pathUri, pathSpec); + if (uri != null) { + return uri; + } + } + log.info( + "None of the path specs matched for path {} from pathSpecs: {}.", + pathUri, + String.join(",", pathSpecs)); + } + return null; + } + + private static String getRawNameWithoutPartition(String pathUri, String partitionRegexp) { + String result = pathUri.replaceAll(partitionRegexp + "$", ""); + // Remove trailing slash + return result.replaceAll("/$", ""); + } + + private static String[] getSplitUri(String pathUri) throws IllegalArgumentException { + if (pathUri.contains(URI_SPLITTER)) { + String[] split = pathUri.split(URI_SPLITTER); + if (split.length == 2) { + return split; + } + } + throw new IllegalArgumentException("Path URI is not as per expected format: " + pathUri); + } + + private static String getPlatform(String pathUri) throws IllegalArgumentException { + String prefix = getSplitUri(pathUri)[0]; + return HdfsPlatform.getPlatformFromPrefix(prefix); + } + + private static String stripPrefix(String pathUri) throws IllegalArgumentException { + + return getSplitUri(pathUri)[1]; + } + + static String getMatchedUri(String pathUri, String pathSpec) { + if (pathSpec.contains(TABLE_MARKER)) { + String miniSpec = pathSpec.split(TABLE_MARKER_REGEX)[0] + TABLE_MARKER; + String[] specFolderList = miniSpec.split("/"); + String[] pathFolderList = pathUri.split("/"); + StringBuilder uri = new StringBuilder(); + if (pathFolderList.length >= specFolderList.length) { + for (int i = 0; i < 
specFolderList.length; i++) { + if (specFolderList[i].equals(pathFolderList[i]) || specFolderList[i].equals("*")) { + uri.append(pathFolderList[i]).append("/"); + } else if (specFolderList[i].equals(TABLE)) { + uri.append(pathFolderList[i]); + log.debug("Actual path [" + pathUri + "] matched with path_spec [" + pathSpec + "]"); + return uri.toString(); + } else { + break; + } + } + } + log.debug("No path spec matched with actual path [" + pathUri + "]"); + } else { + log.warn("Invalid path spec [" + pathSpec + "]. Path spec should contain {table}"); + } + return null; + } + + public enum HdfsPlatform { + S3(Arrays.asList("s3", "s3a", "s3n"), "s3"), + GCS(Arrays.asList("gs", "gcs"), "gcs"), + ABFS(Arrays.asList("abfs", "abfss"), "abfs"), + DBFS(Collections.singletonList("dbfs"), "dbfs"), + FILE(Collections.singletonList("file"), "file"), + // default platform + HDFS(Collections.emptyList(), "hdfs"); + + public final List prefixes; + public final String platform; + + HdfsPlatform(List prefixes, String platform) { + this.prefixes = prefixes; + this.platform = platform; + } + + public static String getPlatformFromPrefix(String prefix) { + for (HdfsPlatform e : values()) { + if (e.prefixes.contains(prefix)) { + return e.platform; + } + } + return HDFS.platform; + } + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPlatform.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPlatform.java new file mode 100644 index 00000000000000..dcaf34f9d7b0fd --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/HdfsPlatform.java @@ -0,0 +1,32 @@ +package io.datahubproject.openlineage.dataset; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public enum HdfsPlatform { + S3(Arrays.asList("s3", "s3a", "s3n"), "s3"), + GCS(Arrays.asList("gs", "gcs"), 
"gcs"), + ABFS(Arrays.asList("abfs", "abfss"), "abfs"), + DBFS(Collections.singletonList("dbfs"), "dbfs"), + FILE(Collections.singletonList("file"), "file"), + // default platform + HDFS(Collections.emptyList(), "hdfs"); + + public final List prefixes; + public final String platform; + + HdfsPlatform(List prefixes, String platform) { + this.prefixes = prefixes; + this.platform = platform; + } + + public static boolean isFsPlatformPrefix(String prefix) { + for (HdfsPlatform e : values()) { + if (e.prefixes.contains(prefix)) { + return true; + } + } + return false; + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/PathSpec.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/PathSpec.java new file mode 100644 index 00000000000000..1a015cabc46cc5 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/PathSpec.java @@ -0,0 +1,20 @@ +package io.datahubproject.openlineage.dataset; + +import java.util.List; +import java.util.Optional; +import lombok.Builder; +import lombok.Getter; +import lombok.Setter; +import lombok.ToString; + +@Builder +@Getter +@Setter +@ToString +public class PathSpec { + final String alias; + final String platform; + @Builder.Default final String env = "PROD"; + final List pathSpecList; + @Builder.Default final Optional platformInstance = Optional.empty(); +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/SparkDataset.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/SparkDataset.java new file mode 100644 index 00000000000000..07663bb8c49e27 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/dataset/SparkDataset.java @@ -0,0 +1,33 @@ +package io.datahubproject.openlineage.dataset; + 
+import com.linkedin.common.FabricType; +import com.linkedin.common.urn.DatasetUrn; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.utils.DatahubUtils; +import lombok.EqualsAndHashCode; + +@EqualsAndHashCode +public abstract class SparkDataset { + + private final DatasetUrn urn; + + public SparkDataset( + String platform, String platformInstance, String name, FabricType fabricType) { + super(); + this.urn = DatahubUtils.createDatasetUrn(platform, platformInstance, name, fabricType); + } + + public SparkDataset(String platform, String name, DatahubOpenlineageConfig datahubConfig) { + super(); + this.urn = + DatahubUtils.createDatasetUrn( + platform, + datahubConfig.getCommonDatasetPlatformInstance(), + name, + datahubConfig.getFabricType()); + } + + public DatasetUrn urn() { + return urn; + } +} diff --git a/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/utils/DatahubUtils.java b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/utils/DatahubUtils.java new file mode 100644 index 00000000000000..6f504a489f53f9 --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/main/java/io/datahubproject/openlineage/utils/DatahubUtils.java @@ -0,0 +1,37 @@ +package io.datahubproject.openlineage.utils; + +import com.linkedin.common.FabricType; +import com.linkedin.common.urn.DataFlowUrn; +import com.linkedin.common.urn.DataJobUrn; +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.metadata.key.DatasetKey; +import datahub.event.MetadataChangeProposalWrapper; + +public class DatahubUtils { + private DatahubUtils() {} + + public static DataFlowUrn flowUrn(String platformId, String appName) { + return new DataFlowUrn("spark", appName, platformId); + } + + public static DataJobUrn jobUrn(DataFlowUrn flowUrn, String jobName) { + return new DataJobUrn(flowUrn, 
jobName); + } + + /** Builds an upsert proposal carrying the DatasetKey aspect (name, platform, origin) derived from the given dataset URN. */ public static MetadataChangeProposalWrapper generateDatasetMcp(DatasetUrn datasetUrn) { + /* Take the origin from the URN itself so the key aspect matches non-PROD dataset URNs too. */ DatasetKey datasetAspect = new DatasetKey().setOrigin(datasetUrn.getOriginEntity()); + datasetAspect + .setName(datasetUrn.getDatasetNameEntity()) + .setPlatform(new DataPlatformUrn(datasetUrn.getPlatformEntity().getPlatformNameEntity())); + + return MetadataChangeProposalWrapper.create( + b -> b.entityType("dataset").entityUrn(datasetUrn).upsert().aspect(datasetAspect)); + } + + /** Creates a dataset URN for the platform and fabric; when platformInstance is non-null the dataset name becomes platformInstance + "." + name. */ public static DatasetUrn createDatasetUrn( + String platform, String platformInstance, String name, FabricType fabricType) { + String datasetName = platformInstance == null ? name : platformInstance + "." + name; + return new DatasetUrn(new DataPlatformUrn(platform), datasetName, fabricType); + } +} diff --git a/metadata-integration/java/openlineage-converter/src/test/java/io/datahubproject/openlineage/HdfsPathDatasetTest.java b/metadata-integration/java/openlineage-converter/src/test/java/io/datahubproject/openlineage/HdfsPathDatasetTest.java new file mode 100644 index 00000000000000..e8981aeb0be59c --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/test/java/io/datahubproject/openlineage/HdfsPathDatasetTest.java @@ -0,0 +1,271 @@ +package io.datahubproject.openlineage; + +import com.linkedin.common.FabricType; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.dataset.HdfsPathDataset; +import io.datahubproject.openlineage.dataset.PathSpec; +import io.datahubproject.openlineage.dataset.SparkDataset; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; +import org.junit.Assert; +import org.junit.Test; + +public class HdfsPathDatasetTest { + + @Test + public void testNoPathSpecList() + throws InstantiationException, IllegalArgumentException,
URISyntaxException { + DatahubOpenlineageConfig config = + DatahubOpenlineageConfig.builder().fabricType(FabricType.PROD).build(); + SparkDataset dataset = + HdfsPathDataset.create(new URI("s3://my-bucket/foo/tests/bar.avro"), config); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)", + dataset.urn().toString()); + } + + @Test + public void testPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig config = + DatahubOpenlineageConfig.builder() + .pathSpecs( + new HashMap>() { + { + put( + "s3", + Collections.singletonList( + PathSpec.builder() + .env("PROD") + .platform("s3") + .pathSpecList( + new LinkedList<>( + Arrays.asList( + "s3a://wrong-my-bucket/foo/{table}", + "s3a://my-bucket/foo/{table}"))) + .build())); + } + }) + .fabricType(FabricType.PROD) + .build(); + + SparkDataset dataset = + HdfsPathDataset.create(new URI("s3a://my-bucket/foo/tests/bar.avro"), config); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests,PROD)", + dataset.urn().toString()); + } + + @Test + public void testNoMatchPathSpecListWithFolder() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig datahubConfig = + DatahubOpenlineageConfig.builder().fabricType(FabricType.PROD).build(); + + String gcsPath = + "gcs://gcs-spike-standard-offerwall-dev-useast1/events_creation_timestamp_enhanced"; + String expectedUrn = + "urn:li:dataset:(urn:li:dataPlatform:gcs,gcs-spike-standard-offerwall-dev-useast1/events_creation_timestamp_enhanced,PROD)"; + + SparkDataset dataset = HdfsPathDataset.create(new URI(gcsPath), datahubConfig); + Assert.assertEquals(expectedUrn, dataset.urn().toString()); + } + + @Test + public void testNoMatchPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig datahubConfig = + 
DatahubOpenlineageConfig.builder() + .pathSpecs( + new HashMap>() { + { + put( + "s3", + Collections.singletonList( + PathSpec.builder() + .env("PROD") + .platform("s3") + .pathSpecList( + new LinkedList<>( + Collections.singletonList( + "s3a://wrong-my-bucket/foo/{table}"))) + .build())); + } + }) + .fabricType(FabricType.PROD) + .build(); + + SparkDataset dataset = + HdfsPathDataset.create(new URI("s3a://my-bucket/foo/tests/bar.avro"), datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)", + dataset.urn().toString()); + } + + @Test + public void testPathSpecListPlatformInstance() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig datahubConfig = + DatahubOpenlineageConfig.builder() + .commonDatasetPlatformInstance("instance") + .pathSpecs( + new HashMap>() { + { + put( + "s3", + Collections.singletonList( + PathSpec.builder() + .env("PROD") + .platform("s3") + .pathSpecList( + new LinkedList<>( + Arrays.asList( + "s3a://wrong-my-bucket/foo/{table}", + "s3a://my-bucket/foo/{table}"))) + .build())); + } + }) + .fabricType(FabricType.PROD) + .build(); + + SparkDataset dataset = + HdfsPathDataset.create(new URI("s3a://my-bucket/foo/tests/bar.avro"), datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,instance.my-bucket/foo/tests,PROD)", + dataset.urn().toString()); + } + + @Test + public void testPathSpecListPathSpecPlatformInstance() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig datahubConfig = + DatahubOpenlineageConfig.builder() + .commonDatasetPlatformInstance("instance") + .pathSpecs( + new HashMap>() { + { + put( + "s3", + Collections.singletonList( + PathSpec.builder() + .env("PROD") + .platform("s3") + .platformInstance(Optional.of("s3Instance")) + .pathSpecList( + new LinkedList<>( + Arrays.asList( + "s3a://wrong-my-bucket/foo/{table}", + 
"s3a://my-bucket/foo/{table}"))) + .build())); + } + }) + .fabricType(FabricType.PROD) + .build(); + + SparkDataset dataset = + HdfsPathDataset.create(new URI("s3a://my-bucket/foo/tests/bar.avro"), datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,s3Instance.my-bucket/foo/tests,PROD)", + dataset.urn().toString()); + } + + @Test + public void testPathAliasList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig datahubConfig = + DatahubOpenlineageConfig.builder() + .pathSpecs( + new HashMap>() { + { + put( + "s3", + Collections.singletonList( + PathSpec.builder() + .env("PROD") + .platform("s3") + .pathSpecList( + new LinkedList<>( + Collections.singletonList("s3a://my-bucket/{table}"))) + .build())); + } + }) + .fabricType(FabricType.PROD) + .build(); + + SparkDataset dataset = + HdfsPathDataset.create(new URI("s3a://my-bucket/foo/tests/bar.avro"), datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo,PROD)", dataset.urn().toString()); + } + + // ==================================================================== + // GCS tests + // ==================================================================== + @Test + public void testGcsNoPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig datahubConfig = + DatahubOpenlineageConfig.builder() + .pathSpecs( + new HashMap>() { + { + put( + "s3", + Collections.singletonList( + PathSpec.builder() + .env("PROD") + .platform("gcs") + .pathSpecList( + new LinkedList<>( + Arrays.asList("s3a://wrong-my-bucket/{table}"))) + .build())); + } + }) + .fabricType(FabricType.PROD) + .build(); + + SparkDataset dataset = + HdfsPathDataset.create(new URI("gs://my-bucket/foo/tests/bar.avro"), datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo/tests/bar.avro,PROD)", + dataset.urn().toString()); + } + + 
@Test + public void testGcsPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + DatahubOpenlineageConfig datahubConfig = + DatahubOpenlineageConfig.builder() + .pathSpecs( + new HashMap>() { + { + put( + "s3", + Collections.singletonList( + PathSpec.builder() + .env("PROD") + .platform("gcs") + .pathSpecList( + new LinkedList<>( + Arrays.asList( + "s3a://wrong-my-bucket/foo/{table}", + "gs://my-bucket/foo/{table}"))) + .build())); + } + }) + .fabricType(FabricType.PROD) + .build(); + + SparkDataset dataset = + HdfsPathDataset.create(new URI("gs://my-bucket/foo/tests/bar.avro"), datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo/tests,PROD)", + dataset.urn().toString()); + } +} diff --git a/metadata-integration/java/openlineage-converter/src/test/resources/log4j.properties b/metadata-integration/java/openlineage-converter/src/test/resources/log4j.properties new file mode 100644 index 00000000000000..3b61ea91e5516a --- /dev/null +++ b/metadata-integration/java/openlineage-converter/src/test/resources/log4j.properties @@ -0,0 +1,8 @@ +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# datahub to info +log4j.logger.io.datahub=DEBUG \ No newline at end of file diff --git a/metadata-integration/java/spark-lineage-beta/README.md b/metadata-integration/java/spark-lineage-beta/README.md new file mode 100644 index 00000000000000..6a520071ba7978 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/README.md @@ -0,0 +1,343 @@ +# Spark + +To integrate Spark with DataHub, we provide a lightweight Java agent that listens for Spark application and job events +and pushes metadata out to DataHub in real-time. 
The agent listens to events such as application start/end, and +SQLExecution start/end to create pipelines (i.e. DataFlow) and tasks (i.e. DataJob) in Datahub along with lineage to +datasets that are being read from and written to. Read on to learn how to configure this for different Spark scenarios. + +## Configuring Spark agent + +The Spark agent can be configured using a config file or while creating a Spark Session. If you are using Spark on +Databricks, refer to [Configuration Instructions for Databricks](#configuration-instructions--databricks). + +### Before you begin: Versions and Release Notes + +Versioning of the jar artifact will follow the semantic versioning of the +main [DataHub repo](https://github.com/datahub-project/datahub) and release notes will be +available [here](https://github.com/datahub-project/datahub/releases). +Always check [the Maven central repository](https://search.maven.org/search?q=a:acryl-spark-lineage) for the latest +released version. + +### Configuration Instructions: spark-submit + +When running jobs using spark-submit, the agent needs to be configured in the config file.
+ +```text +#Configuring DataHub spark agent jar +spark.jars.packages io.acryl:acryl-spark-lineage:0.2.1 +spark.extraListeners datahub.spark.DatahubSparkListener +spark.datahub.rest.server http://localhost:8080 +``` + +## spark-submit command line + +```sh +spark-submit --packages io.acryl:acryl-spark-lineage:0.2.1 --conf "spark.extraListeners=datahub.spark.DatahubSparkListener" my_spark_job_to_run.py +``` + +### Configuration Instructions: Amazon EMR + +Set the following spark-defaults configuration properties as +stated [here](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-configure.html) + +```text +spark.jars.packages io.acryl:acryl-spark-lineage:0.2.1 +spark.extraListeners datahub.spark.DatahubSparkListener +spark.datahub.rest.server https://your_datahub_host/gms +#If you have authentication set up then you also need to specify the Datahub access token +spark.datahub.rest.token yourtoken +``` + +### Configuration Instructions: Notebooks + +When running interactive jobs from a notebook, the listener can be configured while building the Spark Session. + +```python +spark = (SparkSession.builder +.master("spark://spark-master:7077") +.appName("test-application") +.config("spark.jars.packages", "io.acryl:acryl-spark-lineage:0.2.1") +.config("spark.extraListeners", "datahub.spark.DatahubSparkListener") +.config("spark.datahub.rest.server", "http://localhost:8080") +.enableHiveSupport() +.getOrCreate()) +``` + +### Configuration Instructions: Standalone Java Applications + +The configuration for standalone Java apps is very similar. + +```java +spark =SparkSession. + +builder() + . + +appName("test-application") + . + +config("spark.master","spark://spark-master:7077") + . + +config("spark.jars.packages","io.acryl:acryl-spark-lineage:0.2.1") + . + +config("spark.extraListeners","datahub.spark.DatahubSparkListener") + . + +config("spark.datahub.rest.server","http://localhost:8080") + . + +enableHiveSupport() + . 
+ +getOrCreate(); + ``` + +### Configuration Instructions: Databricks + +The Spark agent can be configured using Databricks +Cluster [Spark configuration](https://docs.databricks.com/clusters/configure.html#spark-configuration) +and [Init script](https://docs.databricks.com/clusters/configure.html#init-scripts). + +[Databricks Secrets](https://docs.databricks.com/security/secrets/secrets.html) can be leveraged to store sensitive +information like tokens. + +- Download the `acryl-spark-lineage` jar + from [the Maven central repository](https://s01.oss.sonatype.org/content/groups/public/io/acryl/acryl-spark-lineage/). +- Create `init.sh` with below content + + ```sh + #!/bin/bash + cp /dbfs/datahub/acryl-spark-lineage*.jar /databricks/jars + ``` + +- Install and configure [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html). +- Copy jar and init script to Databricks File System(DBFS) using Databricks CLI. + + ```sh + databricks fs mkdirs dbfs:/datahub + databricks fs cp --overwrite acryl-spark-lineage*.jar dbfs:/datahub + databricks fs cp --overwrite init.sh dbfs:/datahub + ``` + +- Open Databricks Cluster configuration page. Click the **Advanced Options** toggle. Click the **Spark** tab. Add below + configurations under `Spark Config`. + + ```text + spark.extraListeners datahub.spark.DatahubSparkListener + spark.datahub.rest.server http://localhost:8080 + spark.datahub.databricks.cluster cluster-name + ``` + +- Click the **Init Scripts** tab. Set cluster init script as `dbfs:/datahub/init.sh`. + +- Configuring DataHub authentication token + + - Add below config in cluster spark config. + + ```text + spark.datahub.rest.token <token> + ``` + + - Alternatively, Databricks secrets can be used to secure token. + - Create secret using Databricks CLI. 
+ + ```sh + databricks secrets create-scope --scope datahub --initial-manage-principal users + databricks secrets put --scope datahub --key rest-token + databricks secrets list --scope datahub <<Edit prompted file with token value>> + ``` + + - Add in spark config + + ```text + spark.datahub.rest.token {{secrets/datahub/rest-token}} + ``` + +## Configuration Options + +| Field | Required | Default | Description | +|---------------------------------------------------------------------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| spark.jars.packages | ✅ | | Set with latest/required version, e.g. io.acryl:acryl-spark-lineage:0.2.1 | +| spark.extraListeners | ✅ | | datahub.spark.DatahubSparkListener | +| spark.datahub.rest.server | ✅ | | Datahub server url eg: http://localhost:8080 | +| spark.datahub.rest.token | | | Authentication token. | +| spark.datahub.rest.disable_ssl_verification | | false | Disable SSL certificate validation. Caution: Only use this if you know what you are doing! | +| spark.datahub.metadata.pipeline.platformInstance | | | Pipeline level platform instance | +| spark.datahub.metadata.dataset.platformInstance | | | dataset level platform instance | +| spark.datahub.metadata.dataset.env | | PROD | [Supported values](https://datahubproject.io/docs/graphql/enums#fabrictype). In all other cases, will fallback to PROD | +| spark.datahub.metadata.table.hive_platform_alias | | hive | By default, datahub assigns Hive-like tables to the Hive platform. If you are using Glue as your Hive metastore, set this config flag to `glue` | +| spark.datahub.metadata.include_scheme | | true | Include scheme from the path URI (e.g. hdfs://, s3://) in the dataset URN. 
We recommend setting this value to false, it is set to true for backwards compatibility with previous versions | +| spark.datahub.metadata.remove_partition_pattern | | | Remove partition pattern. (e.g. /partition=\d+) It changes database/table/partition=123 to database/table | +| spark.datahub.coalesce_jobs | | true | Only one datajob(task) will be emitted containing all input and output datasets for the spark application | +| spark.datahub.parent.datajob_urn | | | Specified dataset will be set as upstream dataset for datajob created. Effective only when spark.datahub.coalesce_jobs is set to true | +| spark.datahub.metadata.dataset.materialize | | false | Materialize Datasets in DataHub | +| spark.datahub.platform.s3.path_spec_list | | | List of pathspec per platform | +| spark.datahub.metadata.dataset.experimental_include_schema_metadata | | false | Emit dataset schema metadata based on the spark | +| spark.datahub.flow_name | | | If it is set it will be used as the DataFlow name otherwise it uses spark app name as flow_name | +| spark.datahub.partition_regexp_pattern | | | Strip partition part from the path if path end matches with the specified regexp. Example `year=.*/month=.*/day=.*` | +| spark.datahub.tags | | | Comma separated list of tags to attach to the DataFlow | +| spark.datahub.domains | | | Comma separated list of domain urns to attach to the DataFlow | +| spark.datahub.stage_metadata_coalescing | | | Normally it coalesces and sends metadata at the onApplicationEnd event, which is never called on Databricks. You should enable this on Databricks if you want a coalesced run. | +| spark.datahub.patch.enabled | | | Set this to true to send lineage as a patch, which appends rather than overwrites existing Dataset lineage edges. By default it is enabled. +| + +## What to Expect: The Metadata Model + +As of current writing, the Spark agent produces metadata related to the Spark job, tasks and lineage edges to datasets. + +- A pipeline is created per Spark application (identified by its appName). 
+- A task is created per unique Spark query execution within an app. + +For Spark on Databricks, + +- A pipeline is created per + - cluster_identifier: specified with spark.datahub.databricks.cluster + - applicationID: on every restart of the cluster new spark applicationID will be created. +- A task is created per unique Spark query execution. + +### Custom properties & relating to Spark UI + +The following custom properties in pipelines and tasks relate to the Spark UI: + +- appName and appId in a pipeline can be used to determine the Spark application +- description and SQLQueryId in a task can be used to determine the Query Execution within the application on the SQL + tab of Spark UI +- Other custom properties of pipelines and tasks capture the start and end times of execution etc. + +For Spark on Databricks, pipeline start time is the cluster start time. + +### Spark versions supported + +Supports Spark 3.x series. + +### Environments tested with + +This initial release has been tested with the following environments: + +- spark-submit of Python/Java applications to local and remote servers +- Standalone Java applications +- Databricks Standalone Cluster + +Testing with Databricks Standard and High-concurrency Cluster is not done yet. + +### Configuring Hdfs based dataset URNs + +Spark emits lineage between datasets. It has its own logic for generating urns. Python sources emit metadata of +datasets. To link these 2 things, urns generated by both have to match. +This section will help you to match urns to that of other ingestion sources. +By default, URNs are created using +template `urn:li:dataset:(urn:li:dataPlatform:<$platform>,<platformInstance>.<name>,<env>)`. We can configure these 4 +things to generate the desired urn. + +**Platform**: +Hdfs-based platforms supported explicitly: + +- AWS S3 (s3) +- Google Cloud Storage (gcs) +- Local file system (local) + All other platforms will have "hdfs" as a platform. + +**Name**: +By default, the name is the complete path. 
For Hdfs-based datasets, tables can be at different levels in the path than +that of the actual file read due to various reasons like partitioning and sharding. 'path_spec' is used to alter the +name. +{table} marker is used to specify the table level. Below are a few examples. One can specify multiple path_specs for +different paths specified in the `path_spec_list`. Each actual path is matched against all path_specs present in the +list. The first one to match will be used to generate the urn. + +**path_spec Examples** + +``` +spark.datahub.platform.s3.path_spec_list=s3://my-bucket/foo/{table}/year=*/month=*/day=*/*,s3://my-other-bucket/foo/{table}/year=*/month=*/day=*/* +``` + +| Absolute path | path_spec | Urn | +|--------------------------------------|----------------------------------|------------------------------------------------------------------------------| +| s3://my-bucket/foo/tests/bar.avro | Not provided | urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD) | +| s3://my-bucket/foo/tests/bar.avro | s3://my-bucket/foo/{table}/* | urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests,PROD) | +| s3://my-bucket/foo/tests/bar.avro | s3://my-bucket/foo/tests/{table} | urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD) | +| gs://my-bucket/foo/tests/bar.avro | gs://my-bucket/{table}/*/* | urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD) | +| gs://my-bucket/foo/tests/bar.avro | gs://my-bucket/{table} | urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo,PROD) | +| file:///my-bucket/foo/tests/bar.avro | file:///my-bucket/*/*/{table} | urn:li:dataset:(urn:li:dataPlatform:local,my-bucket/foo/tests/bar.avro,PROD) | + +**platform instance and env:** + +The default value for env is 'PROD' and the platform instance is None. env and platform instances can be set for all +datasets using configurations 'spark.datahub.metadata.dataset.env' and 'spark.datahub.metadata.dataset.platformInstance'. 
+If spark is processing data that belongs to a different env or platform instance, then 'path_alias' can be used to +specify `path_spec` specific values of these. 'path_alias' groups the 'path_spec_list', its env, and platform instance +together. + +path_alias_list Example: + +The below example explains the configuration of the case, where files from 2 buckets are being processed in a single +spark application and files from my-bucket are supposed to have "instance-1" as platform instance and "PROD" as env, and +files from bucket2 should have env "DEV" in their dataset URNs. + +``` +spark.datahub.platform.s3.path_alias_list : path1,path2 +spark.datahub.platform.s3.path1.env : PROD +spark.datahub.platform.s3.path1.path_spec_list: s3://my-bucket/*/*/{table} +spark.datahub.platform.s3.path1.platform_instance : instance-1 +spark.datahub.platform.s3.path2.env: DEV +spark.datahub.platform.s3.path2.path_spec_list: s3://bucket2/*/{table} +``` + +### Important notes on usage + +- It is advisable to ensure appName is used appropriately to ensure you can trace lineage from a pipeline back to your + source code. +- If multiple apps with the same appName run concurrently, dataset-lineage will be captured correctly but the + custom-properties e.g. app-id, SQLQueryId would be unreliable. We expect this to be quite rare. +- If spark execution fails, then an empty pipeline would still get created, but it may not have any tasks. +- For HDFS sources, the folder (name) is regarded as the dataset (name) to align with typical storage of parquet/csv + formats. + +### Debugging + +- The following info logs are generated + +On Spark context startup + +```text +YY/MM/DD HH:mm:ss INFO DatahubSparkListener: DatahubSparkListener initialised. 
+YY/MM/DD HH:mm:ss INFO SparkContext: Registered listener datahub.spark.DatahubSparkListener +``` + +On application start + +```text +YY/MM/DD HH:mm:ss INFO DatahubSparkListener: Application started: SparkListenerApplicationStart(AppName,Some(local-1644489736794),1644489735772,user,None,None) +YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: GMS url +YY/MM/DD HH:mm:ss INFO McpEmitter: REST Emitter Configuration: Token XXXXX +``` + +On pushing data to server + +```text +YY/MM/DD HH:mm:ss INFO McpEmitter: MetadataWriteResponse(success=true, responseContent={"value":""}, underlyingResponse=HTTP/1.1 200 OK [Date: day, DD month year HH:mm:ss GMT, Content-Type: application/json, X-RestLi-Protocol-Version: 2.0.0, Content-Length: 97, Server: Jetty(9.4.46.v20220331)] [Content-Length: 97,Chunked: false]) +``` + +On application end + +```text +YY/MM/DD HH:mm:ss INFO DatahubSparkListener: Application ended : AppName AppID +``` + +- To enable debugging logs, add below configuration in log4j.properties file + +```properties +log4j.logger.datahub.spark=DEBUG +log4j.logger.datahub.client.rest=DEBUG +``` + +## How to build +Use Java 8 to build the project. The project uses Gradle as the build tool. 
To build the project, run the following command: + +```shell +./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage-beta:shadowJar +``` +## Known limitations + diff --git a/metadata-integration/java/spark-lineage-beta/build.gradle b/metadata-integration/java/spark-lineage-beta/build.gradle new file mode 100644 index 00000000000000..4cd2ddfec3dfcf --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/build.gradle @@ -0,0 +1,254 @@ +plugins { + id("com.palantir.git-version") apply false +} +apply plugin: 'java' +apply plugin: 'com.github.johnrengelman.shadow' +apply plugin: 'signing' +apply plugin: 'io.codearte.nexus-staging' +apply plugin: 'maven-publish' +apply plugin: 'jacoco' +apply from: '../versioning.gradle' + +jar.enabled = false // Since we only want to build shadow jars, disabling the regular jar creation + +//to rename artifacts for publish +project.archivesBaseName = 'acryl-spark-lineage' + +//mark implementaion dependencies which needs to excluded along with transitive dependencies from shadowjar +//functionality is exactly same as "implementation" +configurations { + provided + implementation.extendsFrom provided +} + +dependencies { + constraints { + provided(externalDependency.hadoopMapreduceClient) { + because 'Needed for tie breaking of guava version need for spark and wiremock' + } + provided(externalDependency.hadoopCommon) { + because 'required for org.apache.hadoop.util.StopWatch' + } + provided(externalDependency.commonsIo) { + because 'required for org.apache.commons.io.Charsets that is used internally' + } + } + + provided(externalDependency.sparkSql) + provided(externalDependency.sparkHive) + implementation 'org.slf4j:slf4j-log4j12:2.0.7' + implementation externalDependency.httpAsyncClient + implementation externalDependency.logbackClassicJava8 + implementation externalDependency.typesafeConfig + implementation externalDependency.commonsLang + + implementation externalDependency.slf4jApi + compileOnly 
externalDependency.lombok + annotationProcessor externalDependency.lombok + + implementation externalDependency.typesafeConfig + implementation externalDependency.json + + implementation project(path: ':metadata-integration:java:datahub-client', configuration: 'shadow') + implementation project(path: ':metadata-integration:java:openlineage-converter', configuration: 'shadow') + + //implementation "io.acryl:datahub-client:0.10.2" + implementation "io.openlineage:openlineage-spark:$openLineageVersion" + compileOnly "org.apache.iceberg:iceberg-spark3-runtime:0.12.1" + compileOnly "org.apache.spark:spark-sql_2.12:3.1.3" + + testCompileOnly externalDependency.lombok + testAnnotationProcessor externalDependency.lombok + + // Tests need a concrete log4j available. Providing it here + testImplementation 'org.apache.logging.log4j:log4j-api:2.17.1' + testImplementation 'org.slf4j:slf4j-log4j12:2.0.7' + + testImplementation(externalDependency.postgresql) { + exclude group: "com.fasterxml.jackson.core" + } + + testImplementation externalDependency.mockito + + testImplementation(externalDependency.mockServer) { + exclude group: "com.fasterxml.jackson.core" + } // older version to allow older guava + testImplementation(externalDependency.mockServerClient) { + exclude group: "com.fasterxml.jackson.core" + } // older version to allow older guava + + testImplementation(externalDependency.testContainersPostgresql) +} + +tasks.register('checkShadowJar', Exec) { + commandLine 'sh', '-c', 'scripts/check_jar.sh' +} + +shadowJar { + zip64 = true + archiveClassifier = '' + mergeServiceFiles() + + def exclude_modules = project + .configurations + .provided + .resolvedConfiguration + .getLenientConfiguration() + .getAllModuleDependencies() + .collect { + it.name + } + dependencies { + + exclude(dependency { + exclude_modules.contains(it.name) + }) + } + + // preventing java multi-release JAR leakage + // https://github.com/johnrengelman/shadow/issues/729 + exclude('module-info.class', 
'META-INF/versions/**', 'LICENSE', 'NOTICE') + + // prevent jni conflict with spark + exclude '**/libzstd-jni.*' + exclude '**/com_github_luben_zstd_*' + + relocate 'avro.com', 'io.acryl.shaded.avro.com' + relocate 'org.json', 'io.acryl.shaded.org.json' + relocate 'com.github', 'io.acryl.shaded.com.github' + relocate 'avroutil1', 'io.acryl.shaded.avroutil1' + relocate 'com.sun.activation', 'io.acryl.shaded.com.sun.activation' + relocate 'com.sun.codemodel', 'io.acryl.shaded.com.sun.codemodel' + relocate 'com.sun.mail', 'io.acryl.shaded.com.sun.mail' + relocate 'com.fasterxml.jackson', 'datahub.spark2.shaded.jackson' + relocate 'org.slf4j', 'datahub.spark2.shaded.org.slf4j' + // + relocate 'org.apache.http', 'io.acryl.shaded.http' + relocate 'org.apache.commons.codec', 'datahub.spark2.shaded.o.a.c.codec' + relocate 'org.apache.commons.compress', 'datahub.spark2.shaded.o.a.c.compress' + relocate 'org.apache.commons.lang3', 'datahub.spark2.shaded.o.a.c.lang3' + relocate 'mozilla', 'datahub.spark2.shaded.mozilla' + relocate 'com.typesafe', 'datahub.spark2.shaded.typesafe' + relocate 'io.opentracing', 'datahub.spark2.shaded.io.opentracing' + relocate 'io.netty', 'datahub.spark2.shaded.io.netty' + relocate 'ch.randelshofer', 'datahub.spark2.shaded.ch.randelshofer' + relocate 'ch.qos', 'datahub.spark2.shaded.ch.qos' + relocate 'org.springframework', 'io.acryl.shaded.org.springframework' + relocate 'com.fasterxml.jackson', 'io.acryl.shaded.jackson' + relocate 'org.yaml', 'io.acryl.shaded.org.yaml' // Required for shading snakeyaml + relocate 'net.jcip.annotations', 'io.acryl.shaded.annotations' + relocate 'javassist', 'io.acryl.shaded.javassist' + relocate 'edu.umd.cs.findbugs', 'io.acryl.shaded.findbugs' + relocate 'org.antlr', 'io.acryl.shaded.org.antlr' + relocate 'antlr', 'io.acryl.shaded.antlr' + relocate 'com.google.common', 'io.acryl.shaded.com.google.common' + relocate 'org.apache.commons', 'io.acryl.shaded.org.apache.commons' + relocate 'org.reflections', 
'io.acryl.shaded.org.reflections' + relocate 'st4hidden', 'io.acryl.shaded.st4hidden' + relocate 'org.stringtemplate', 'io.acryl.shaded.org.stringtemplate' + relocate 'org.abego.treelayout', 'io.acryl.shaded.treelayout' + relocate 'org.slf4j', 'io.acryl.shaded.slf4j' + relocate 'javax.annotation', 'io.acryl.shaded.javax.annotation' + relocate 'com.github.benmanes.caffeine', 'io.acryl.shaded.com.github.benmanes.caffeine' + relocate 'org.checkerframework', 'io.acryl.shaded.org.checkerframework' + relocate 'com.google.errorprone', 'io.acryl.shaded.com.google.errorprone' + relocate 'com.sun.jna', 'io.acryl.shaded.com.sun.jna' +} + +checkShadowJar { + dependsOn shadowJar +} + +jacocoTestReport { + dependsOn test // tests are required to run before generating the report +} + +test { + forkEvery = 1 + useJUnit() + finalizedBy jacocoTestReport +} + +assemble { + dependsOn shadowJar +} + +task integrationTest(type: Exec, dependsOn: [shadowJar, ':docker:quickstart']) { + environment "RUN_QUICKSTART", "false" + commandLine "spark-smoke-test/smoke.sh" +} + +task sourcesJar(type: Jar) { + archiveClassifier = 'sources' + from sourceSets.main.allJava +} + +task javadocJar(type: Jar, dependsOn: javadoc) { + archiveClassifier = 'javadoc' + from javadoc.destinationDir +} + +publishing { + publications { + shadow(MavenPublication) { publication -> + project.shadow.component(publication) + pom { + name = 'Acryl Spark Lineage' + group = 'io.acryl' + artifactId = 'acryl-spark-lineage' + description = 'Library to push data lineage from spark to datahub' + url = 'https://datahubproject.io' + artifacts = [shadowJar, javadocJar, sourcesJar] + + scm { + connection = 'scm:git:git://github.com/datahub-project/datahub.git' + developerConnection = 'scm:git:ssh://github.com:datahub-project/datahub.git' + url = 'https://github.com/datahub-project/datahub.git' + } + + licenses { + license { + name = 'The Apache License, Version 2.0' + url = 'http://www.apache.org/licenses/LICENSE-2.0.txt' + } + } + 
+ developers { + developer { + id = 'datahub' + name = 'Datahub' + email = 'datahub@acryl.io' + } + } + } + } + } + + repositories { + maven { + def releasesRepoUrl = "https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/" + def snapshotsRepoUrl = "https://s01.oss.sonatype.org/content/repositories/snapshots/" + def ossrhUsername = System.getenv('RELEASE_USERNAME') + def ossrhPassword = System.getenv('RELEASE_PASSWORD') + credentials { + username ossrhUsername + password ossrhPassword + } + url = version.endsWith('SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl + } + } +} + +signing { + def signingKey = findProperty("signingKey") + def signingPassword = System.getenv("SIGNING_PASSWORD") + useInMemoryPgpKeys(signingKey, signingPassword) + sign publishing.publications.shadow +} + +nexusStaging { + serverUrl = "https://s01.oss.sonatype.org/service/local/" + //required only for projects registered in Sonatype after 2021-02-24 + username = System.getenv("NEXUS_USERNAME") + password = System.getenv("NEXUS_PASSWORD") +} diff --git a/metadata-integration/java/spark-lineage-beta/scripts/check_jar.sh b/metadata-integration/java/spark-lineage-beta/scripts/check_jar.sh new file mode 100755 index 00000000000000..41b09e0705b897 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/scripts/check_jar.sh @@ -0,0 +1,55 @@ +# This script checks the shadow jar to ensure that we only have allowed classes being exposed through the jar +set -x +libName=acryl-spark-lineage +jarishFile=$(find build/libs -name "${libName}*.jar" -exec ls -1rt "{}" +;) +jarFiles=$(echo "$jarishFile" | grep -v sources | grep -v javadoc | tail -n 1) +for jarFile in ${jarFiles}; do + jar -tvf $jarFile |\ + grep -v "log4j.xml" |\ + grep -v "log4j2.xml" |\ + grep -v "org/apache/log4j" |\ + grep -v "io/acryl/" |\ + grep -v "datahub/shaded" |\ + grep -v "licenses" |\ + grep -v "META-INF" |\ + grep -v "com/linkedin" |\ + grep -v "com/datahub" |\ + grep -v "datahub" |\ + grep -v "entity-registry" 
|\ + grep -v "pegasus/" |\ + grep -v "legacyPegasusSchemas/" |\ + grep -v " com/$" |\ + grep -v "git.properties" |\ + grep -v " org/$" |\ + grep -v " io/$" |\ + grep -v "git.properties" |\ + grep -v "org/aopalliance" |\ + grep -v "javax/" |\ + grep -v "io/swagger" |\ + grep -v "JavaSpring" |\ + grep -v "java-header-style.xml" |\ + grep -v "xml-header-style.xml" |\ + grep -v "license.header" |\ + grep -v "module-info.class" |\ + grep -v "com/google/" |\ + grep -v "org/codehaus/" |\ + grep -v "client.properties" |\ + grep -v "kafka" |\ + grep -v "win/" |\ + grep -v "include/" |\ + grep -v "linux/" |\ + grep -v "darwin" |\ + grep -v "MetadataChangeProposal.avsc" |\ + grep -v "io.openlineage" |\ + grep -v "org.apache" |\ + grep -v "aix" + + +if [ $? -ne 0 ]; then + echo "✅ No unexpected class paths found in ${jarFile}" +else + echo "💥 Found unexpected class paths in ${jarFile}" + exit 1 +fi +done +exit 0 diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubEventEmitter.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubEventEmitter.java new file mode 100644 index 00000000000000..6b430c5c2ab262 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubEventEmitter.java @@ -0,0 +1,414 @@ +package datahub.spark; + +import static datahub.spark.converter.SparkStreamingEventToDatahub.*; +import static io.datahubproject.openlineage.converter.OpenLineageToDataHub.*; +import static io.datahubproject.openlineage.utils.DatahubUtils.*; + +import com.linkedin.common.GlobalTags; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.DataJobUrn; +import com.linkedin.data.template.StringMap; +import com.linkedin.dataprocess.DataProcessInstanceRelationships; +import com.linkedin.dataprocess.RunResultType; +import com.linkedin.domain.Domains; +import com.linkedin.mxe.MetadataChangeProposal; +import datahub.client.Emitter; +import 
datahub.client.rest.RestEmitter; +import datahub.event.EventFormatter; +import datahub.event.MetadataChangeProposalWrapper; +import datahub.spark.conf.RestDatahubEmitterConfig; +import datahub.spark.conf.SparkLineageConf; +import io.datahubproject.openlineage.converter.OpenLineageToDataHub; +import io.datahubproject.openlineage.dataset.DatahubDataset; +import io.datahubproject.openlineage.dataset.DatahubJob; +import io.openlineage.client.OpenLineage; +import io.openlineage.client.OpenLineageClientUtils; +import io.openlineage.spark.agent.ArgumentParser; +import io.openlineage.spark.agent.EventEmitter; +import java.io.IOException; +import java.net.URISyntaxException; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import lombok.extern.slf4j.Slf4j; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.streaming.StreamingQueryProgress; + +@Slf4j +public class DatahubEventEmitter extends EventEmitter { + private final AtomicBoolean streaming = new AtomicBoolean(false); + + private final List _datahubJobs = new LinkedList<>(); + private final Map schemaMap = new HashMap<>(); + private SparkLineageConf datahubConf; + + private EventFormatter eventFormatter = new EventFormatter(); + + public DatahubEventEmitter() throws URISyntaxException { + super(ArgumentParser.parse(new SparkConf())); + } + + private Optional getEmitter() { + Optional emitter = Optional.empty(); + if (datahubConf.getDatahubEmitterConfig() != null) { + if (datahubConf.getDatahubEmitterConfig() instanceof RestDatahubEmitterConfig) 
{ + RestDatahubEmitterConfig datahubRestEmitterConfig = + (RestDatahubEmitterConfig) datahubConf.getDatahubEmitterConfig(); + emitter = Optional.of(new RestEmitter(datahubRestEmitterConfig.getRestEmitterConfig())); + } else { + log.error( + "DataHub Transport {} not recognized. DataHub Lineage emission will not work", + RestDatahubEmitterConfig.class.getName()); + } + } else { + log.error("No Transport set. DataHub Lineage emission will not work"); + } + return emitter; + } + + public Optional convertOpenLineageRunEventToDatahubJob(OpenLineage.RunEvent event) { + Optional datahubJob = Optional.empty(); + try { + log.debug("Emitting lineage: {}", OpenLineageClientUtils.toJson(event)); + if (!isStreaming()) { + datahubJob = + Optional.ofNullable(convertRunEventToJob(event, datahubConf.getOpenLineageConf())); + if (!datahubJob.isPresent()) { + return datahubJob; + } + log.info( + "Converted Job: {}, from {}", datahubJob.get(), OpenLineageClientUtils.toJson(event)); + _datahubJobs.add(datahubJob.get()); + return datahubJob; + } + } catch (IOException | URISyntaxException e) { + throw new RuntimeException("Error: " + e.getMessage(), e); + } + return datahubJob; + } + + public void emit(OpenLineage.RunEvent event) { + long startTime = System.currentTimeMillis(); + // We have to serialize and deserialize the event to make sure the event is in the correct + // format + event = OpenLineageClientUtils.runEventFromJson(OpenLineageClientUtils.toJson(event)); + Optional job = convertOpenLineageRunEventToDatahubJob(event); + if (!job.isPresent()) { + return; + } + + if (!datahubConf.getTags().isEmpty()) { + GlobalTags tags = OpenLineageToDataHub.generateTags(datahubConf.getTags()); + job.get().setFlowGlobalTags(tags); + } + + if (!datahubConf.getDomains().isEmpty()) { + Domains domains = OpenLineageToDataHub.generateDomains(datahubConf.getDomains()); + job.get().setFlowDomains(domains); + } + + if (isStreaming()) { + log.info("Streaming mode is enabled. 
Skipping lineage emission."); + return; + } + if (!datahubConf.isCoalesceEnabled()) { + log.info("Emitting lineage"); + try { + emitMcps(job.get().toMcps(datahubConf.getOpenLineageConf())); + } catch (IOException e) { + throw new RuntimeException(e); + } + log.debug( + "Emitting non-coalesced lineage completed successfully: {}", + OpenLineageClientUtils.toJson(event)); + } + if (datahubConf.isCoalesceEnabled() && datahubConf.isEmitCoalescePeriodically()) { + log.info("Emitting coalesced lineage periodically"); + emitCoalesced(); + log.debug( + "Collecting coalesced lineage periodically completed successfully: {}", + OpenLineageClientUtils.toJson(event)); + } + long elapsedTime = System.currentTimeMillis() - startTime; + log.info("Collecting lineage successfully in {} ms", elapsedTime); + } + + public void emitCoalesced() { + long startTime = System.currentTimeMillis(); + + if (isStreaming()) { + log.info("Streaming mode is enabled. Skipping lineage emission."); + return; + } + + if (datahubConf.isCoalesceEnabled()) { + List mcps = generateCoalescedMcps(); + log.info("Emitting Coalesced lineage completed successfully"); + emitMcps(mcps); + } + long elapsedTime = System.currentTimeMillis() - startTime; + log.info("Emitting coalesced lineage completed in {} ms", elapsedTime); + } + + public List generateCoalescedMcps() { + List mcps = new ArrayList<>(); + + if (_datahubJobs.isEmpty()) { + log.warn("No lineage events to emit. 
Maybe the spark job finished premaraturely?"); + return mcps; + } + + DatahubJob datahubJob = DatahubJob.builder().build(); + AtomicLong minStartTime = new AtomicLong(Long.MAX_VALUE); + AtomicLong maxEndTime = new AtomicLong(); + _datahubJobs.forEach( + storedDatahubJob -> { + log.info("Merging job stored job {} to {}", storedDatahubJob, datahubJob); + DataJobUrn jobUrn = + jobUrn( + storedDatahubJob.getFlowUrn(), storedDatahubJob.getFlowUrn().getFlowIdEntity()); + datahubJob.setJobUrn(jobUrn); + datahubJob.setFlowUrn(storedDatahubJob.getFlowUrn()); + datahubJob.setFlowPlatformInstance(storedDatahubJob.getFlowPlatformInstance()); + if ((datahubJob.getJobInfo() == null) && (storedDatahubJob.getJobInfo() != null)) { + datahubJob.setJobInfo(storedDatahubJob.getJobInfo()); + datahubJob.getJobInfo().setName(storedDatahubJob.getFlowUrn().getFlowIdEntity()); + } + if (storedDatahubJob.getJobInfo() != null + && storedDatahubJob.getJobInfo().getCustomProperties() != null) { + if (datahubJob.getJobInfo().getCustomProperties() == null) { + datahubJob + .getJobInfo() + .setCustomProperties(storedDatahubJob.getJobInfo().getCustomProperties()); + } else { + Map mergedProperties = + Stream.of( + datahubJob.getJobInfo().getCustomProperties(), + storedDatahubJob.getJobInfo().getCustomProperties()) + .flatMap(map -> map.entrySet().stream()) + .collect( + Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (v1, v2) -> v1)); + datahubJob.getJobInfo().setCustomProperties(new StringMap(mergedProperties)); + } + } + if (datahubJob.getDataFlowInfo() == null) { + datahubJob.setDataFlowInfo(storedDatahubJob.getDataFlowInfo()); + } + + if (storedDatahubJob.getStartTime() < minStartTime.get()) { + minStartTime.set(storedDatahubJob.getStartTime()); + } + + if (storedDatahubJob.getEndTime() > maxEndTime.get()) { + maxEndTime.set(storedDatahubJob.getEndTime()); + } + + mergeDatasets(storedDatahubJob.getOutSet(), datahubJob.getOutSet()); + + mergeDatasets(storedDatahubJob.getInSet(), 
datahubJob.getInSet()); + + mergeDataProcessInstance(datahubJob, storedDatahubJob); + + mergeCustomProperties(datahubJob, storedDatahubJob); + }); + + datahubJob.setStartTime(minStartTime.get()); + datahubJob.setEndTime(maxEndTime.get()); + if (!datahubConf.getTags().isEmpty()) { + GlobalTags tags = OpenLineageToDataHub.generateTags(datahubConf.getTags()); + datahubJob.setFlowGlobalTags(tags); + } + + if (!datahubConf.getDomains().isEmpty()) { + Domains domains = OpenLineageToDataHub.generateDomains(datahubConf.getDomains()); + datahubJob.setFlowDomains(domains); + } + try { + if (datahubConf.getOpenLineageConf().getParentJobUrn() != null) { + datahubJob.getParentJobs().add(datahubConf.getOpenLineageConf().getParentJobUrn()); + } + } catch (ClassCastException e) { + log.warn( + datahubConf.getOpenLineageConf().getParentJobUrn() + + " is not a valid Datajob URN. Skipping setting up upstream job."); + } + + log.info("Generating MCPs for job: {}", datahubJob); + try { + return datahubJob.toMcps(datahubConf.getOpenLineageConf()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void mergeDatasets( + Set storedDatahubJob, Set datahubJob) { + for (DatahubDataset dataset : storedDatahubJob) { + Optional oldDataset = + datahubJob.stream().filter(ds -> ds.getUrn().equals(dataset.getUrn())).findFirst(); + if (oldDataset.isPresent()) { + if (dataset.getSchemaMetadata() != null) { + oldDataset.get().setSchemaMetadata(dataset.getSchemaMetadata()); + } + if (dataset.getLineage() != null) { + oldDataset.get().setLineage(dataset.getLineage()); + } + } else { + datahubJob.add(dataset); + } + } + } + + private static void mergeDataProcessInstance(DatahubJob datahubJob, DatahubJob storedDatahubJob) { + // To merge multiple events into one DataProcess we should do the following steps: + // 1. A run is only in SUCCESS if all the process instance status are SUCCESS + // 2. 
A run is in failed state if any of the run events is in FAILED/UNKNOWN/SKIPPED state + // + // We should set as id the first event to make sure it won't change if we ingest periodically + // coalesced data + // Todo: Status can be SUCCESS only if all the process instance status are SUCCESS + if (datahubJob.getDataProcessInstanceUrn() == null) { + datahubJob.setDataProcessInstanceUrn(storedDatahubJob.getDataProcessInstanceUrn()); + } + + if (storedDatahubJob.getEventTime() > datahubJob.getEventTime()) { + datahubJob.setEventTime(storedDatahubJob.getEventTime()); + datahubJob.setDataProcessInstanceProperties( + storedDatahubJob.getDataProcessInstanceProperties()); + DataProcessInstanceRelationships dataProcessInstanceRelationships = + new DataProcessInstanceRelationships(); + dataProcessInstanceRelationships.setParentTemplate(datahubJob.getJobUrn()); + dataProcessInstanceRelationships.setUpstreamInstances(new UrnArray()); + datahubJob.setDataProcessInstanceRelationships(dataProcessInstanceRelationships); + } + log.info("DataProcessInstanceRunEvent: {}", storedDatahubJob.getDataProcessInstanceRunEvent()); + if ((storedDatahubJob.getDataProcessInstanceRunEvent() != null) + && (storedDatahubJob.getDataProcessInstanceRunEvent().getResult() != null)) { + RunResultType result = + storedDatahubJob.getDataProcessInstanceRunEvent().getResult().getType(); + if (datahubJob.getDataProcessInstanceRunEvent() == null) { + datahubJob.setDataProcessInstanceRunEvent( + storedDatahubJob.getDataProcessInstanceRunEvent()); + } else if (result == RunResultType.FAILURE) { + datahubJob.setDataProcessInstanceRunEvent( + storedDatahubJob.getDataProcessInstanceRunEvent()); + } + } + log.info("DataProcessInstanceRunEvent: {}", datahubJob.getDataProcessInstanceRunEvent()); + } + + private void mergeCustomProperties(DatahubJob datahubJob, DatahubJob storedDatahubJob) { + if (storedDatahubJob.getDataFlowInfo().getCustomProperties() != null) { + if 
(datahubJob.getDataFlowInfo().getCustomProperties() == null) { + datahubJob + .getDataFlowInfo() + .setCustomProperties(storedDatahubJob.getDataFlowInfo().getCustomProperties()); + } else { + Map mergedProperties = + Stream.of( + datahubJob.getDataFlowInfo().getCustomProperties(), + storedDatahubJob.getDataFlowInfo().getCustomProperties()) + .flatMap(map -> map.entrySet().stream()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (v1, v2) -> v1)); + mergedProperties.put("finishedAt", ZonedDateTime.now(ZoneOffset.UTC).toString()); + + if (datahubConf.getSparkAppContext() != null) { + if (datahubConf.getSparkAppContext().getStartTime() != null) { + mergedProperties.put( + "startedAt", + ZonedDateTime.ofInstant( + Instant.ofEpochMilli(datahubConf.getSparkAppContext().getStartTime()), + ZoneOffset.UTC) + .toString()); + } + if (datahubConf.getSparkAppContext().getAppAttemptId() != null) { + mergedProperties.put("attemptId", datahubConf.getSparkAppContext().getAppAttemptId()); + } + if (datahubConf.getSparkAppContext().getSparkUser() != null) { + mergedProperties.put("sparkUser", datahubConf.getSparkAppContext().getSparkUser()); + } + + if (datahubConf.getSparkAppContext().getAppId() != null) { + mergedProperties.put("appId", datahubConf.getSparkAppContext().getAppId()); + } + + if (datahubConf.getSparkAppContext().getDatabricksTags() != null) { + mergedProperties.putAll(datahubConf.getSparkAppContext().getDatabricksTags()); + } + } + datahubJob.getDataFlowInfo().setCustomProperties(new StringMap(mergedProperties)); + } + } + } + + public void emit(StreamingQueryProgress event) throws URISyntaxException { + List mcps = new ArrayList<>(); + for (MetadataChangeProposalWrapper mcpw : + generateMcpFromStreamingProgressEvent(event, datahubConf, schemaMap)) { + try { + mcps.add(eventFormatter.convert(mcpw)); + } catch (IOException e) { + log.error("Failed to convert mcpw to mcp", e); + } + } + emitMcps(mcps); + } + + protected void emitMcps(List mcps) { + 
Optional emitter = getEmitter(); + if (emitter.isPresent()) { + mcps.stream() + .map( + mcp -> { + try { + log.info("emitting mcpw: " + mcp); + return emitter.get().emit(mcp); + } catch (IOException ioException) { + log.error("Failed to emit metadata to DataHub", ioException); + return null; + } + }) + .filter(Objects::nonNull) + .collect(Collectors.toList()) + .forEach( + future -> { + try { + log.info(future.get().toString()); + } catch (InterruptedException | ExecutionException e) { + // log error, but don't impact thread + log.error("Failed to emit metadata to DataHub", e); + } + }); + try { + emitter.get().close(); + } catch (IOException e) { + log.error("Issue while closing emitter" + e); + } + } + } + + public void setConfig(SparkLineageConf sparkConfig) { + this.datahubConf = sparkConfig; + } + + public boolean isStreaming() { + return streaming.get(); + } + + public void setStreaming(boolean enabled) { + streaming.set(enabled); + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java new file mode 100644 index 00000000000000..060402723d1940 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java @@ -0,0 +1,230 @@ +package datahub.spark; + +import static datahub.spark.conf.SparkConfigParser.*; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import datahub.client.rest.RestEmitterConfig; +import datahub.spark.conf.DatahubEmitterConfig; +import datahub.spark.conf.RestDatahubEmitterConfig; +import datahub.spark.conf.SparkAppContext; +import datahub.spark.conf.SparkConfigParser; +import datahub.spark.conf.SparkLineageConf; +import io.openlineage.spark.agent.OpenLineageSparkListener; +import io.openlineage.spark.agent.lifecycle.ContextFactory; +import java.net.URISyntaxException; +import java.time.Instant; 
+import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import org.apache.spark.SparkEnv; +import org.apache.spark.SparkEnv$; +import org.apache.spark.scheduler.SparkListener; +import org.apache.spark.scheduler.SparkListenerApplicationEnd; +import org.apache.spark.scheduler.SparkListenerApplicationStart; +import org.apache.spark.scheduler.SparkListenerEvent; +import org.apache.spark.scheduler.SparkListenerJobEnd; +import org.apache.spark.scheduler.SparkListenerJobStart; +import org.apache.spark.scheduler.SparkListenerTaskEnd; +import org.apache.spark.sql.streaming.StreamingQueryListener; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DatahubSparkListener extends SparkListener { + private static final Logger log = LoggerFactory.getLogger(DatahubSparkListener.class); + private final Map batchLastUpdated = new HashMap(); + private final OpenLineageSparkListener listener; + private final DatahubEventEmitter emitter; + private Config datahubConf = ConfigFactory.empty(); + private SparkAppContext appContext; + + public DatahubSparkListener() throws URISyntaxException { + listener = new OpenLineageSparkListener(); + emitter = new DatahubEventEmitter(); + ContextFactory contextFactory = new ContextFactory(emitter); + OpenLineageSparkListener.init(contextFactory); + } + + private static SparkAppContext getSparkAppContext( + SparkListenerApplicationStart applicationStart) { + SparkAppContext appContext = new SparkAppContext(); + appContext.setAppName(applicationStart.appName()); + if (applicationStart.appAttemptId().isDefined()) { + appContext.setAppAttemptId(applicationStart.appAttemptId().get()); + } + appContext.setSparkUser(applicationStart.sparkUser()); + appContext.setStartTime(applicationStart.time()); + appContext.setAppId(applicationStart.appId().get()); + return appContext; + } + + public void onApplicationStart(SparkListenerApplicationStart applicationStart) { + long startTime = 
System.currentTimeMillis(); + + log.debug("Application start called"); + this.appContext = getSparkAppContext(applicationStart); + + listener.onApplicationStart(applicationStart); + long elapsedTime = System.currentTimeMillis() - startTime; + log.debug("onApplicationStart completed successfully in {} ms", elapsedTime); + } + + public Optional initializeEmitter(Config sparkConf) { + String emitterType = + sparkConf.hasPath(SparkConfigParser.TRANSPORT_KEY) + ? sparkConf.getString(SparkConfigParser.TRANSPORT_KEY) + : "rest"; + if (emitterType.equals("rest")) { + String gmsUrl = + sparkConf.hasPath(SparkConfigParser.GMS_URL_KEY) + ? sparkConf.getString(SparkConfigParser.GMS_URL_KEY) + : "http://localhost:8080"; + String token = + sparkConf.hasPath(SparkConfigParser.GMS_AUTH_TOKEN) + ? sparkConf.getString(SparkConfigParser.GMS_AUTH_TOKEN) + : null; + boolean disableSslVerification = + sparkConf.hasPath(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY) + && sparkConf.getBoolean(SparkConfigParser.DISABLE_SSL_VERIFICATION_KEY); + log.info( + "REST Emitter Configuration: GMS url {}{}", + gmsUrl, + (sparkConf.hasPath(SparkConfigParser.GMS_URL_KEY) ? "" : "(default)")); + if (token != null) { + log.info("REST Emitter Configuration: Token {}", "XXXXX"); + } + if (disableSslVerification) { + log.warn("REST Emitter Configuration: ssl verification will be disabled."); + } + RestEmitterConfig restEmitterConf = + RestEmitterConfig.builder() + .server(gmsUrl) + .token(token) + .disableSslVerification(disableSslVerification) + .build(); + return Optional.of(new RestDatahubEmitterConfig(restEmitterConf)); + } else { + log.error( + "DataHub Transport {} not recognized. 
DataHub Lineage emission will not work", + emitterType); + } + + return Optional.empty(); + } + + private synchronized void loadDatahubConfig(SparkAppContext appContext, Properties properties) { + long startTime = System.currentTimeMillis(); + datahubConf = parseSparkConfig(); + SparkEnv sparkEnv = SparkEnv$.MODULE$.get(); + if (sparkEnv != null) { + log.info("sparkEnv: {}", sparkEnv.conf().toDebugString()); + sparkEnv.conf().set("spark.openlineage.facets.disabled", "[spark_unknown;spark.logicalPlan]"); + } + + if (properties != null) { + datahubConf = parsePropertiesToConfig(properties); + Optional> databricksTags = getDatabricksTags(datahubConf); + this.appContext.setDatabricksTags(databricksTags.orElse(null)); + } + log.info("Datahub configuration: {}", datahubConf.root().render()); + Optional restEmitter = initializeEmitter(datahubConf); + SparkLineageConf sparkLineageConf = + SparkLineageConf.toSparkLineageConf(datahubConf, appContext, restEmitter.orElse(null)); + + emitter.setConfig(sparkLineageConf); + long elapsedTime = System.currentTimeMillis() - startTime; + log.debug("loadDatahubConfig completed successfully in {} ms", elapsedTime); + } + + public void onApplicationEnd(SparkListenerApplicationEnd applicationEnd) { + long startTime = System.currentTimeMillis(); + + log.debug("Application end called"); + listener.onApplicationEnd(applicationEnd); + if (datahubConf.hasPath(STREAMING_JOB) && (datahubConf.getBoolean(STREAMING_JOB))) { + return; + } + emitter.emitCoalesced(); + long elapsedTime = System.currentTimeMillis() - startTime; + log.debug("onApplicationEnd completed successfully in {} ms", elapsedTime); + } + + public void onTaskEnd(SparkListenerTaskEnd taskEnd) { + long startTime = System.currentTimeMillis(); + + log.debug("Task end called"); + listener.onTaskEnd(taskEnd); + long elapsedTime = System.currentTimeMillis() - startTime; + log.debug("onTaskEnd completed successfully in {} ms", elapsedTime); + } + + public void 
onJobEnd(SparkListenerJobEnd jobEnd) { + long startTime = System.currentTimeMillis(); + + log.debug("Job end called"); + listener.onJobEnd(jobEnd); + long elapsedTime = System.currentTimeMillis() - startTime; + log.debug("onJobEnd completed successfully in {} ms", elapsedTime); + } + + public void onJobStart(SparkListenerJobStart jobStart) { + long startTime = System.currentTimeMillis(); + log.debug("Job start called"); + loadDatahubConfig(this.appContext, jobStart.properties()); + listener.onJobStart(jobStart); + long elapsedTime = System.currentTimeMillis() - startTime; + log.debug("onJobStart completed successfully in {} ms", elapsedTime); + } + + public void onOtherEvent(SparkListenerEvent event) { + long startTime = System.currentTimeMillis(); + + log.debug("Other event called {}", event.getClass().getName()); + // Switch to streaming mode if streaming mode is not set, but we get a progress event + if ((event instanceof StreamingQueryListener.QueryProgressEvent) + || (event instanceof StreamingQueryListener.QueryStartedEvent)) { + if (!emitter.isStreaming()) { + if (!datahubConf.hasPath(STREAMING_JOB)) { + log.info("Streaming mode not set explicitly, switching to streaming mode"); + emitter.setStreaming(true); + } else { + emitter.setStreaming(datahubConf.getBoolean(STREAMING_JOB)); + log.info("Streaming mode set to {}", datahubConf.getBoolean(STREAMING_JOB)); + } + } + } + + if (datahubConf.hasPath(STREAMING_JOB) && !datahubConf.getBoolean(STREAMING_JOB)) { + log.info("Not in streaming mode"); + return; + } + + listener.onOtherEvent(event); + + if (event instanceof StreamingQueryListener.QueryProgressEvent) { + int streamingHeartbeatIntervalSec = SparkConfigParser.getStreamingHeartbeatSec(datahubConf); + StreamingQueryListener.QueryProgressEvent queryProgressEvent = + (StreamingQueryListener.QueryProgressEvent) event; + ((StreamingQueryListener.QueryProgressEvent) event).progress().id(); + if 
((batchLastUpdated.containsKey(queryProgressEvent.progress().id().toString())) + && (batchLastUpdated + .get(queryProgressEvent.progress().id().toString()) + .isAfter(Instant.now().minusSeconds(streamingHeartbeatIntervalSec)))) { + log.debug( + "Skipping lineage emit as it was emitted in the last {} seconds", + streamingHeartbeatIntervalSec); + return; + } + try { + batchLastUpdated.put(queryProgressEvent.progress().id().toString(), Instant.now()); + emitter.emit(queryProgressEvent.progress()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + log.debug("Query progress event: {}", queryProgressEvent.progress()); + long elapsedTime = System.currentTimeMillis() - startTime; + log.debug("onOtherEvent completed successfully in {} ms", elapsedTime); + } + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/DatahubEmitterConfig.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/DatahubEmitterConfig.java new file mode 100644 index 00000000000000..1f9a37c068f91d --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/DatahubEmitterConfig.java @@ -0,0 +1,5 @@ +package datahub.spark.conf; + +public interface DatahubEmitterConfig { + String getType(); +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/RestDatahubEmitterConfig.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/RestDatahubEmitterConfig.java new file mode 100644 index 00000000000000..8b02f2682a5116 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/RestDatahubEmitterConfig.java @@ -0,0 +1,18 @@ +package datahub.spark.conf; + +import datahub.client.rest.RestEmitterConfig; +import lombok.Getter; +import lombok.Setter; +import lombok.ToString; + +@Setter +@ToString +@Getter +public class RestDatahubEmitterConfig implements DatahubEmitterConfig { + 
final String type = "rest"; + datahub.client.rest.RestEmitterConfig restEmitterConfig; + + public RestDatahubEmitterConfig(RestEmitterConfig restEmitterConfig) { + this.restEmitterConfig = restEmitterConfig; + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkAppContext.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkAppContext.java new file mode 100644 index 00000000000000..0999eadc669538 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkAppContext.java @@ -0,0 +1,14 @@ +package datahub.spark.conf; + +import java.util.Map; +import lombok.Data; + +@Data +public class SparkAppContext { + String appName; + String appId; + Long startTime; + String sparkUser; + String appAttemptId; + Map databricksTags; +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkConfigParser.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkConfigParser.java new file mode 100644 index 00000000000000..7e10f51feb38a4 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkConfigParser.java @@ -0,0 +1,323 @@ +package datahub.spark.conf; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.linkedin.common.FabricType; +import com.linkedin.common.urn.DataJobUrn; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.dataset.PathSpec; +import java.net.URISyntaxException; +import java.util.Arrays; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.stream.Collectors; 
+import org.apache.spark.SparkConf; +import org.apache.spark.SparkEnv; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class SparkConfigParser { + public static final String PARENT_JOB_KEY = "parent.datajob_urn"; + public static final String TRANSPORT_KEY = "transport"; + public static final String GMS_URL_KEY = "rest.server"; + public static final String GMS_AUTH_TOKEN = "rest.token"; + public static final String DISABLE_SSL_VERIFICATION_KEY = "rest.disable_ssl_verification"; + public static final String COALESCE_KEY = "coalesce_jobs"; + public static final String PATCH_ENABLED = "patch.enabled"; + + public static final String STAGE_METADATA_COALESCING = "stage_metadata_coalescing"; + public static final String STREAMING_JOB = "streaming_job"; + public static final String STREAMING_HEARTBEAT = "streaming_heartbeat"; + public static final String DATAHUB_FLOW_NAME = "flow_name"; + public static final String DATASET_ENV_KEY = "metadata.dataset.env"; + public static final String DATASET_MATERIALIZE_KEY = "metadata.dataset.materialize"; + public static final String DATASET_PLATFORM_INSTANCE_KEY = "metadata.dataset.platformInstance"; + public static final String DATASET_INCLUDE_SCHEMA_METADATA = + "metadata.dataset.experimental_include_schema_metadata"; + public static final String SPARK_PLATFORM_INSTANCE_KEY = "platformInstance"; + public static final String REMOVE_PARTITION_PATTERN = "metadata.remove_partition_pattern"; + public static final String SPARK_APP_NAME = "spark.app.name"; + public static final String SPARK_MASTER = "spark.master"; + public static final String PLATFORM_KEY = "platform"; + public static final String PATH_SPEC_LIST_KEY = "path_spec_list"; + public static final String FILE_PARTITION_REGEXP_PATTERN = "file_partition_regexp"; + public static final String FABRIC_TYPE_KEY = "env"; + public static final String PLATFORM_INSTANCE_KEY = "platformInstance"; + public static final String DATABRICKS_CLUSTER_KEY = "databricks.cluster"; 
+ public static final String PIPELINE_KEY = "metadata.pipeline"; + public static final String PIPELINE_PLATFORM_INSTANCE_KEY = PIPELINE_KEY + ".platformInstance"; + + public static final String TAGS_KEY = "tags"; + + public static final String DOMAINS_KEY = "domains"; + + private static final Logger log = LoggerFactory.getLogger(SparkConfigParser.class); + public static final String SPARK_DATABRICKS_CLUSTER_USAGE_TAGS_CLUSTER_ALL_TAGS = + "spark.databricks.clusterUsageTags.clusterAllTags"; + + private static final ObjectMapper mapper = new ObjectMapper(); + + private SparkConfigParser() {} + + public static Properties moveKeysToRoot(Properties properties, String prefix) { + Properties newProperties = new Properties(); + Enumeration propertyNames = properties.propertyNames(); + + while (propertyNames.hasMoreElements()) { + String key = (String) propertyNames.nextElement(); + String value = properties.getProperty(key); + + if (key.startsWith(prefix)) { + key = key.substring(prefix.length()); + } + + newProperties.setProperty(key, value); + log.info("Setting property {} to {}", key, value); + } + + return newProperties; + } + + public static Config parsePropertiesToConfig(Properties properties) { + properties + .keySet() + .removeIf( + o -> + (!o.toString().startsWith("spark.datahub.") + && !o.toString() + .startsWith(SPARK_DATABRICKS_CLUSTER_USAGE_TAGS_CLUSTER_ALL_TAGS))); + properties = SparkConfigParser.moveKeysToRoot(properties, "spark.datahub."); + return ConfigFactory.parseProperties(properties); + } + + public static Config parseSparkConfig() { + if (SparkEnv.get() == null) { + return ConfigFactory.empty(); + } + + SparkConf conf = SparkEnv.get().conf(); + String propertiesString = + Arrays.stream(conf.getAllWithPrefix("spark.datahub.")) + .map(tup -> tup._1 + "= \"" + tup._2 + "\"") + .collect(Collectors.joining("\n")); + + return ConfigFactory.parseString(propertiesString); + } + + public static Optional> getDatabricksClusterTags( + String 
databricksClusterTags) { + try { + List> list = + mapper.readValue( + databricksClusterTags, new TypeReference>>() {}); + Map hashMap = new HashMap<>(); + for (Map map : list) { + hashMap.put(map.get("key"), map.get("value")); + } + return Optional.of(hashMap); + } catch (Exception e) { + log.warn("Error parsing databricks cluster tags", e); + } + return Optional.empty(); + } + + public static DatahubOpenlineageConfig sparkConfigToDatahubOpenlineageConf( + Config sparkConfig, SparkAppContext sparkAppContext) { + DatahubOpenlineageConfig.DatahubOpenlineageConfigBuilder builder = + DatahubOpenlineageConfig.builder(); + builder.filePartitionRegexpPattern( + SparkConfigParser.getFilePartitionRegexpPattern(sparkConfig)); + builder.fabricType(SparkConfigParser.getCommonFabricType(sparkConfig)); + builder.includeSchemaMetadata(SparkConfigParser.isIncludeSchemaMetadata(sparkConfig)); + builder.materializeDataset(SparkConfigParser.isDatasetMaterialize(sparkConfig)); + builder.pathSpecs(SparkConfigParser.getPathSpecListMap(sparkConfig)); + String pipelineName = SparkConfigParser.getPipelineName(sparkConfig, sparkAppContext); + if (pipelineName != null) { + builder.pipelineName(pipelineName); + } + builder.platformInstance(SparkConfigParser.getPlatformInstance(sparkConfig)); + builder.commonDatasetPlatformInstance(SparkConfigParser.getCommonPlatformInstance(sparkConfig)); + builder.usePatch(SparkConfigParser.isPatchEnabled(sparkConfig)); + try { + String parentJob = SparkConfigParser.getParentJobKey(sparkConfig); + if (parentJob != null) { + builder.parentJobUrn(DataJobUrn.createFromString(parentJob)); + } + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + return builder.build(); + } + + public static FabricType getCommonFabricType(Config datahubConfig) { + String fabricTypeString = + datahubConfig.hasPath(DATASET_ENV_KEY) + ? 
datahubConfig.getString(DATASET_ENV_KEY).toUpperCase() + : "PROD"; + FabricType fabricType = null; + try { + fabricType = FabricType.valueOf(fabricTypeString); + } catch (IllegalArgumentException e) { + log.warn("Invalid env ({}). Setting env to default PROD", fabricTypeString); + fabricType = FabricType.PROD; + } + return fabricType; + } + + public static String getCommonPlatformInstance(Config datahubConfig) { + return datahubConfig.hasPath(DATASET_PLATFORM_INSTANCE_KEY) + ? datahubConfig.getString(DATASET_PLATFORM_INSTANCE_KEY) + : null; + } + + public static Optional> getDatabricksTags(Config datahubConfig) { + return datahubConfig.hasPath(SPARK_DATABRICKS_CLUSTER_USAGE_TAGS_CLUSTER_ALL_TAGS) + ? getDatabricksClusterTags( + datahubConfig.getString(SPARK_DATABRICKS_CLUSTER_USAGE_TAGS_CLUSTER_ALL_TAGS)) + : Optional.empty(); + } + + public static String getParentJobKey(Config datahubConfig) { + return datahubConfig.hasPath(PARENT_JOB_KEY) ? datahubConfig.getString(PARENT_JOB_KEY) : null; + } + + public static String[] getTags(Config datahubConfig) { + return datahubConfig.hasPath(TAGS_KEY) ? datahubConfig.getString(TAGS_KEY).split(",") : null; + } + + public static String[] getDomains(Config datahubConfig) { + return datahubConfig.hasPath(DOMAINS_KEY) + ? datahubConfig.getString(DOMAINS_KEY).split(",") + : null; + } + + public static String getSparkMaster(Config datahubConfig) { + return datahubConfig.hasPath(SPARK_MASTER) + ? datahubConfig + .getString(SPARK_MASTER) + .replaceAll(":", "_") + .replaceAll("/", "_") + .replaceAll(",", "_") + .replaceAll("[_]+", "_") + : "default"; + } + + public static String getRemovePartitionPattern(Config datahubConfig) { + return datahubConfig.hasPath(REMOVE_PARTITION_PATTERN) + ? datahubConfig.getString(REMOVE_PARTITION_PATTERN) + : null; + } + + public static String getSparkAppName(Config datahubConfig) { + return datahubConfig.hasPath(SPARK_APP_NAME) + ? 
datahubConfig.getString(SPARK_APP_NAME) + : "default"; + } + + public static Map> getPathSpecListMap(Config datahubConfig) { + HashMap> pathSpecMap = new HashMap<>(); + + if (datahubConfig.hasPath(PLATFORM_KEY)) { + for (String key : datahubConfig.getConfig(PLATFORM_KEY).root().keySet()) { + String aliasKey = PLATFORM_KEY + "." + key; + List platformSpecs = new LinkedList<>(); + for (String pathSpecKey : datahubConfig.getConfig(aliasKey).root().keySet()) { + PathSpec.PathSpecBuilder pathSpecBuilder = PathSpec.builder(); + pathSpecBuilder.alias(pathSpecKey); + pathSpecBuilder.platform(key); + if (datahubConfig.hasPath(aliasKey + ".env")) { + pathSpecBuilder.env(datahubConfig.getString(aliasKey + ".env")); + } + if (datahubConfig.hasPath(aliasKey + ".platformInstance")) { + pathSpecBuilder.platformInstance( + Optional.ofNullable(datahubConfig.getString(aliasKey + ".platformInstance"))); + } + pathSpecBuilder.pathSpecList( + Arrays.asList(datahubConfig.getString(aliasKey + "." + pathSpecKey).split(","))); + + platformSpecs.add(pathSpecBuilder.build()); + } + pathSpecMap.put(key, platformSpecs); + } + } + return pathSpecMap; + } + + public static String getPlatformInstance(Config pathSpecConfig) { + return pathSpecConfig.hasPath(PLATFORM_INSTANCE_KEY) + ? pathSpecConfig.getString(PLATFORM_INSTANCE_KEY) + : null; + } + + public static String getFilePartitionRegexpPattern(Config config) { + return config.hasPath(FILE_PARTITION_REGEXP_PATTERN) + ? config.getString(FILE_PARTITION_REGEXP_PATTERN) + : null; + } + + public static int getStreamingHeartbeatSec(Config datahubConfig) { + return datahubConfig.hasPath(STREAMING_HEARTBEAT) + ? 
datahubConfig.getInt(STREAMING_HEARTBEAT) + : 5 * 60; + } + + public static boolean isDatasetMaterialize(Config datahubConfig) { + return datahubConfig.hasPath(DATASET_MATERIALIZE_KEY) + && datahubConfig.getBoolean(DATASET_MATERIALIZE_KEY); + } + + public static boolean isIncludeSchemaMetadata(Config datahubConfig) { + return datahubConfig.hasPath(DATASET_INCLUDE_SCHEMA_METADATA) + && datahubConfig.getBoolean(DATASET_INCLUDE_SCHEMA_METADATA); + } + + public static String getPipelineName(Config datahubConfig, SparkAppContext appContext) { + String name = appContext != null && appContext.appName != null ? appContext.appName : null; + if (datahubConfig.hasPath(DATAHUB_FLOW_NAME)) { + name = datahubConfig.getString(DATAHUB_FLOW_NAME); + } + if (datahubConfig.hasPath(DATABRICKS_CLUSTER_KEY)) { + return (datahubConfig.getString(DATABRICKS_CLUSTER_KEY) + "_" + name).replaceAll("[,]", ""); + } + + // TODO: appending of platform instance needs to be done at central location + // like adding constructor to dataflowurl + if (datahubConfig.hasPath(PIPELINE_PLATFORM_INSTANCE_KEY)) { + name = datahubConfig.getString(PIPELINE_PLATFORM_INSTANCE_KEY) + "." 
+ name; + } + return name; + } + + public static boolean isCoalesceEnabled(Config datahubConfig) { + if (!datahubConfig.hasPath(COALESCE_KEY)) { + return true; + } + return datahubConfig.hasPath(COALESCE_KEY) && datahubConfig.getBoolean(COALESCE_KEY); + } + + public static boolean isPatchEnabled(Config datahubConfig) { + if (!datahubConfig.hasPath(PATCH_ENABLED)) { + return true; + } + return datahubConfig.hasPath(PATCH_ENABLED) && datahubConfig.getBoolean(PATCH_ENABLED); + } + + public static boolean isEmitCoalescePeriodically(Config datahubConfig) { + if (!datahubConfig.hasPath(STAGE_METADATA_COALESCING)) { + // if databricks tags are present and stage_metadata_coalescing is not present, then default + // to true for coalescing periodically + // because on DataBricks platform we don't get application stop event + return getDatabricksTags(datahubConfig).isPresent() && isCoalesceEnabled(datahubConfig); + } + + return datahubConfig.hasPath(STAGE_METADATA_COALESCING) + && datahubConfig.getBoolean(STAGE_METADATA_COALESCING); + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkLineageConf.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkLineageConf.java new file mode 100644 index 00000000000000..014cff873bbde9 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/conf/SparkLineageConf.java @@ -0,0 +1,54 @@ +package datahub.spark.conf; + +import com.typesafe.config.Config; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Objects; +import lombok.Builder; +import lombok.Getter; +import lombok.Setter; + +@Getter +@Builder +@Setter +public class SparkLineageConf { + final DatahubOpenlineageConfig openLineageConf; + @Builder.Default final boolean coalesceEnabled = true; + @Builder.Default final boolean emitCoalescePeriodically = 
false; + final SparkAppContext sparkAppContext; + final DatahubEmitterConfig datahubEmitterConfig; + @Builder.Default final List tags = new LinkedList<>(); + + @Builder.Default final List domains = new LinkedList<>(); + + public static SparkLineageConf toSparkLineageConf( + Config sparkConfig, + SparkAppContext sparkAppContext, + DatahubEmitterConfig datahubEmitterConfig) { + SparkLineageConfBuilder builder = SparkLineageConf.builder(); + DatahubOpenlineageConfig datahubOpenlineageConfig = + SparkConfigParser.sparkConfigToDatahubOpenlineageConf(sparkConfig, sparkAppContext); + builder.openLineageConf(datahubOpenlineageConfig); + builder.coalesceEnabled(SparkConfigParser.isCoalesceEnabled(sparkConfig)); + if (SparkConfigParser.getTags(sparkConfig) != null) { + builder.tags(Arrays.asList(Objects.requireNonNull(SparkConfigParser.getTags(sparkConfig)))); + } + + if (SparkConfigParser.getDomains(sparkConfig) != null) { + builder.domains( + Arrays.asList(Objects.requireNonNull(SparkConfigParser.getDomains(sparkConfig)))); + } + + builder.emitCoalescePeriodically(SparkConfigParser.isEmitCoalescePeriodically(sparkConfig)); + if (sparkAppContext != null) { + builder.sparkAppContext(sparkAppContext); + } + + if (datahubEmitterConfig != null) { + builder.datahubEmitterConfig = datahubEmitterConfig; + } + return builder.build(); + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/converter/SparkStreamingEventToDatahub.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/converter/SparkStreamingEventToDatahub.java new file mode 100644 index 00000000000000..f0bfc021bbea92 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/converter/SparkStreamingEventToDatahub.java @@ -0,0 +1,194 @@ +package datahub.spark.converter; + +import static io.datahubproject.openlineage.utils.DatahubUtils.*; + +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; +import 
com.linkedin.common.DatasetUrnArray; +import com.linkedin.common.TimeStamp; +import com.linkedin.common.urn.DataFlowUrn; +import com.linkedin.common.urn.DataJobUrn; +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.data.template.StringMap; +import com.linkedin.datajob.DataFlowInfo; +import com.linkedin.datajob.DataJobInfo; +import com.linkedin.datajob.DataJobInputOutput; +import datahub.event.MetadataChangeProposalWrapper; +import datahub.spark.conf.SparkLineageConf; +import io.datahubproject.openlineage.dataset.HdfsPathDataset; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.sql.streaming.StreamingQueryProgress; + +@Slf4j +public class SparkStreamingEventToDatahub { + private SparkStreamingEventToDatahub() {} + + public static final String DELTA_LAKE_PLATFORM = "delta-lake"; + public static final String FILE_PLATFORM = "file"; + public static final String KAFKA_PLATFORM = "kafka"; + + public static List generateMcpFromStreamingProgressEvent( + StreamingQueryProgress event, + SparkLineageConf conf, + Map schemaMap) { + List mcps = new ArrayList<>(); + + DataFlowInfo dataFlowInfo = new DataFlowInfo(); + dataFlowInfo.setName(conf.getOpenLineageConf().getPipelineName()); + StringMap flowCustomProperties = new StringMap(); + + Long appStartTime; + if (conf.getSparkAppContext() != null) { + appStartTime = conf.getSparkAppContext().getStartTime(); + if (appStartTime != null) { + flowCustomProperties.put("createdAt", appStartTime.toString()); + flowCustomProperties.put("id", event.id().toString()); + dataFlowInfo.setCreated(new TimeStamp().setTime(appStartTime)); + } + } + + flowCustomProperties.put("plan", event.json()); + 
dataFlowInfo.setCustomProperties(flowCustomProperties); + + DataFlowUrn flowUrn = + flowUrn( + conf.getOpenLineageConf().getPlatformInstance(), + conf.getOpenLineageConf().getPipelineName()); + MetadataChangeProposalWrapper dataflowMcp = + MetadataChangeProposalWrapper.create( + b -> b.entityType("dataFlow").entityUrn(flowUrn).upsert().aspect(dataFlowInfo)); + mcps.add(dataflowMcp); + + DataJobInfo dataJobInfo = new DataJobInfo(); + dataJobInfo.setName(conf.getOpenLineageConf().getPipelineName()); + dataJobInfo.setType(DataJobInfo.Type.create("SPARK")); + + StringMap jobCustomProperties = new StringMap(); + jobCustomProperties.put("batchId", Long.toString(event.batchId())); + jobCustomProperties.put("inputRowsPerSecond", Double.toString(event.inputRowsPerSecond())); + jobCustomProperties.put( + "processedRowsPerSecond", Double.toString(event.processedRowsPerSecond())); + jobCustomProperties.put("numInputRows", Long.toString(event.numInputRows())); + dataJobInfo.setCustomProperties(jobCustomProperties); + + DataJobUrn jobUrn = jobUrn(flowUrn, conf.getOpenLineageConf().getPipelineName()); + MetadataChangeProposalWrapper dataJobMcp = + MetadataChangeProposalWrapper.create( + b -> b.entityType("dataJob").entityUrn(jobUrn).upsert().aspect(dataJobInfo)); + mcps.add(dataJobMcp); + + DataJobInputOutput dataJobInputOutput = new DataJobInputOutput(); + + JsonElement root = new JsonParser().parse(event.json()); + DatasetUrnArray inputDatasetUrnArray = new DatasetUrnArray(); + for (JsonElement source : root.getAsJsonObject().get("sources").getAsJsonArray()) { + String description = source.getAsJsonObject().get("description").getAsString(); + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription(description, conf); + if (urn.isPresent()) { + if (inputDatasetUrnArray.contains(urn.get())) { + log.debug("We already have dataset {} in the list, skipping it.", urn.get()); + continue; + } + inputDatasetUrnArray.add(urn.get()); + if 
(conf.getOpenLineageConf().isMaterializeDataset()) { + MetadataChangeProposalWrapper datasetMcp = generateDatasetMcp(urn.get()); + mcps.add(datasetMcp); + if (conf.getOpenLineageConf().isIncludeSchemaMetadata() + && schemaMap.containsKey(urn.get().toString())) { + mcps.add(schemaMap.get(urn.get().toString())); + } + } + } + } + + DatasetUrnArray outputDatasetUrnArray = new DatasetUrnArray(); + String sinkDescription = + root.getAsJsonObject().get("sink").getAsJsonObject().get("description").getAsString(); + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription(sinkDescription, conf); + if (urn.isPresent()) { + MetadataChangeProposalWrapper datasetMcp = generateDatasetMcp(urn.get()); + outputDatasetUrnArray.add(urn.get()); + mcps.add(datasetMcp); + if (conf.getOpenLineageConf().isIncludeSchemaMetadata() + && schemaMap.containsKey(urn.get().toString())) { + mcps.add(schemaMap.get(urn.get().toString())); + } + } + + dataJobInputOutput.setInputDatasets(inputDatasetUrnArray); + dataJobInputOutput.setOutputDatasets(outputDatasetUrnArray); + + MetadataChangeProposalWrapper inputOutputMcp = + MetadataChangeProposalWrapper.create( + b -> b.entityType("dataJob").entityUrn(jobUrn).upsert().aspect(dataJobInputOutput)); + + mcps.add(inputOutputMcp); + return (mcps); + } + + public static Optional generateUrnFromStreamingDescription( + String description, SparkLineageConf sparkLineageConf) { + String pattern = "(.*?)\\[(.*)]"; + Pattern r = Pattern.compile(pattern); + Matcher m = r.matcher(description); + if (m.find()) { + String namespace = m.group(1); + String platform = getDatahubPlatform(namespace); + String path = m.group(2); + log.debug("Streaming description Platform: {}, Path: {}", platform, path); + if (platform.equals(KAFKA_PLATFORM)) { + path = getKafkaTopicFromPath(m.group(2)); + } else if (platform.equals(FILE_PLATFORM) || platform.equals(DELTA_LAKE_PLATFORM)) { + try { + DatasetUrn urn = + HdfsPathDataset.create(new URI(path), 
sparkLineageConf.getOpenLineageConf()).urn(); + return Optional.of(urn); + } catch (InstantiationException e) { + return Optional.empty(); + } catch (URISyntaxException e) { + log.error("Failed to parse path {}", path, e); + return Optional.empty(); + } + } + return Optional.of( + new DatasetUrn( + new DataPlatformUrn(platform), + path, + sparkLineageConf.getOpenLineageConf().getFabricType())); + } else { + return Optional.empty(); + } + } + + public static String getDatahubPlatform(String namespace) { + switch (namespace) { + case "KafkaV2": + return "kafka"; + case "DeltaSink": + return "delta-lake"; + case "CloudFilesSource": + return "dbfs"; + case "FileSink": + case "FileStreamSource": + return "file"; + default: + return namespace; + } + } + + public static String getKafkaTopicFromPath(String path) { + return StringUtils.substringBetween(path, "[", "]"); + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/OpenLineageRunEventBuilder.java b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/OpenLineageRunEventBuilder.java new file mode 100644 index 00000000000000..99643592dc200e --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/OpenLineageRunEventBuilder.java @@ -0,0 +1,493 @@ +/* +/* Copyright 2018-2023 contributors to the OpenLineage project +/* SPDX-License-Identifier: Apache-2.0 +*/ + +package io.openlineage.spark.agent.lifecycle; + +import static io.openlineage.client.OpenLineageClientUtils.mergeFacets; +import static io.openlineage.spark.agent.util.ScalaConversionUtils.fromSeq; +import static io.openlineage.spark.agent.util.ScalaConversionUtils.toScalaFn; + +import io.openlineage.client.OpenLineage; +import io.openlineage.client.OpenLineage.DatasetFacet; +import io.openlineage.client.OpenLineage.DatasetFacets; +import io.openlineage.client.OpenLineage.InputDataset; +import 
io.openlineage.client.OpenLineage.InputDatasetFacet; +import io.openlineage.client.OpenLineage.InputDatasetInputFacets; +import io.openlineage.client.OpenLineage.JobBuilder; +import io.openlineage.client.OpenLineage.JobFacet; +import io.openlineage.client.OpenLineage.OutputDataset; +import io.openlineage.client.OpenLineage.OutputDatasetFacet; +import io.openlineage.client.OpenLineage.OutputDatasetOutputFacets; +import io.openlineage.client.OpenLineage.ParentRunFacet; +import io.openlineage.client.OpenLineage.RunEvent; +import io.openlineage.client.OpenLineage.RunEventBuilder; +import io.openlineage.client.OpenLineage.RunFacet; +import io.openlineage.client.OpenLineage.RunFacets; +import io.openlineage.client.OpenLineage.RunFacetsBuilder; +import io.openlineage.spark.agent.hooks.HookUtils; +import io.openlineage.spark.agent.lifecycle.plan.column.ColumnLevelLineageUtils; +import io.openlineage.spark.agent.lifecycle.plan.column.ColumnLevelLineageVisitor; +import io.openlineage.spark.agent.util.FacetUtils; +import io.openlineage.spark.agent.util.PlanUtils; +import io.openlineage.spark.agent.util.ScalaConversionUtils; +import io.openlineage.spark.api.CustomFacetBuilder; +import io.openlineage.spark.api.OpenLineageContext; +import io.openlineage.spark.api.OpenLineageEventHandlerFactory; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import lombok.AllArgsConstructor; +import lombok.NonNull; +import lombok.extern.slf4j.Slf4j; +import org.apache.spark.rdd.RDD; +import org.apache.spark.scheduler.ActiveJob; +import org.apache.spark.scheduler.JobFailed; +import org.apache.spark.scheduler.SparkListenerJobEnd; +import org.apache.spark.scheduler.SparkListenerJobStart; +import org.apache.spark.scheduler.SparkListenerStageCompleted; +import 
org.apache.spark.scheduler.SparkListenerStageSubmitted; +import org.apache.spark.scheduler.Stage; +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; +import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd; +import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart; +import scala.Function1; +import scala.PartialFunction; + +/** + * Event handler that accepts various {@link org.apache.spark.scheduler.SparkListener} events and + * helps build up an {@link RunEvent} by passing event components to partial functions that know how + * to convert those event components into {@link RunEvent} properties. + * + *

The event types that can be consumed to generate {@link OpenLineage.RunEvent} properties have + * no common supertype, so the generic argument for the function input is simply {@link Object}. The + * types of arguments that may be found include + * + *

    + *
  • {@link org.apache.spark.scheduler.StageInfo} + *
  • {@link Stage} + *
  • {@link RDD} + *
  • {@link ActiveJob} + *
  • {@link org.apache.spark.sql.execution.QueryExecution} + *
+ * + *

These components are extracted from various {@link org.apache.spark.scheduler.SparkListener} + * events, such as {@link SparkListenerStageCompleted}, {@link SparkListenerJobStart}, and {@link + * org.apache.spark.scheduler.SparkListenerTaskEnd}. + * + *

{@link RDD} chains will be _flattened_ so each `RDD` dependency is passed to the builders one + * at a time. This means a builder can directly specify the type of {@link RDD} it handles, such as + * a {@link org.apache.spark.rdd.HadoopRDD} or a {@link + * org.apache.spark.sql.execution.datasources.FileScanRDD}, without having to check the dependencies + * of every {@link org.apache.spark.rdd.MapPartitionsRDD} or {@link + * org.apache.spark.sql.execution.SQLExecutionRDD}. + * + *

Any {@link RunFacet}s and {@link JobFacet}s returned by the {@link CustomFacetBuilder}s are + * appended to the {@link OpenLineage.Run} and {@link OpenLineage.Job}, respectively. + * + *

If any {@link OpenLineage.InputDatasetBuilder}s or {@link + * OpenLineage.OutputDatasetBuilder}s are returned from the partial functions, the {@link + * #inputDatasetBuilders} or {@link #outputDatasetBuilders} will be invoked using the same input + * arguments in order to construct any {@link InputDatasetFacet}s or {@link OutputDatasetFacet}s to + * the returned dataset. {@link InputDatasetFacet}s and {@link OutputDatasetFacet}s will be attached + * to any {@link OpenLineage.InputDatasetBuilder} or {@link OpenLineage.OutputDatasetBuilder} + * found for the event. This is because facets may be constructed from generic information that is + * not specifically tied to a Dataset. For example, {@link + * OpenLineage.OutputStatisticsOutputDatasetFacet}s are created from {@link + * org.apache.spark.executor.TaskMetrics} attached to the last {@link + * org.apache.spark.scheduler.StageInfo} for a given job execution. However, the {@link + * OutputDataset} is constructed by reading the {@link LogicalPlan}. There's no way to tie the + * output metrics in the {@link org.apache.spark.scheduler.StageInfo} to the {@link OutputDataset} + * in the {@link LogicalPlan} except by inference. Similarly, input metrics can be found in the + * {@link org.apache.spark.scheduler.StageInfo} for the stage that reads a dataset and the {@link + * InputDataset} can usually be constructed by walking the {@link RDD} dependency tree for that + * {@link Stage} and finding a {@link org.apache.spark.sql.execution.datasources.FileScanRDD} or + * other concrete implementation. But while there is typically only one {@link InputDataset} read in + * a given stage, there's no guarantee of that and the {@link org.apache.spark.executor.TaskMetrics} + * in the {@link org.apache.spark.scheduler.StageInfo} won't disambiguate. + * + *

If a facet needs to be attached to a specific dataset, the user must take care to construct + * both the Dataset and the Facet in the same builder. + */ +@Slf4j +@AllArgsConstructor +class OpenLineageRunEventBuilder { + + @NonNull private final OpenLineageContext openLineageContext; + + @NonNull + private final Collection>> inputDatasetBuilders; + + @NonNull + private final Collection>> + inputDatasetQueryPlanVisitors; + + @NonNull + private final Collection>> outputDatasetBuilders; + + @NonNull + private final Collection>> + outputDatasetQueryPlanVisitors; + + @NonNull + private final Collection> datasetFacetBuilders; + + @NonNull + private final Collection> + inputDatasetFacetBuilders; + + @NonNull + private final Collection> + outputDatasetFacetBuilders; + + @NonNull private final Collection> runFacetBuilders; + @NonNull private final Collection> jobFacetBuilders; + @NonNull private final Collection columnLineageVisitors; + private final UnknownEntryFacetListener unknownEntryFacetListener = + UnknownEntryFacetListener.getInstance(); + private final Map jobMap = new HashMap<>(); + private final Map stageMap = new HashMap<>(); + + OpenLineageRunEventBuilder(OpenLineageContext context, OpenLineageEventHandlerFactory factory) { + this( + context, + factory.createInputDatasetBuilder(context), + factory.createInputDatasetQueryPlanVisitors(context), + factory.createOutputDatasetBuilder(context), + factory.createOutputDatasetQueryPlanVisitors(context), + factory.createDatasetFacetBuilders(context), + factory.createInputDatasetFacetBuilders(context), + factory.createOutputDatasetFacetBuilders(context), + factory.createRunFacetBuilders(context), + factory.createJobFacetBuilders(context), + factory.createColumnLevelLineageVisitors(context)); + } + + /** + * Add an {@link ActiveJob} and all of its {@link Stage}s to the maps so we can look them up by id + * later. 
+ * + * @param job + */ + void registerJob(ActiveJob job) { + jobMap.put(job.jobId(), job); + stageMap.put(job.finalStage().id(), job.finalStage()); + job.finalStage() + .parents() + .forall( + toScalaFn( + stage -> { + stageMap.put(stage.id(), stage); + return true; + })); + } + + RunEvent buildRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + SparkListenerStageSubmitted event) { + Stage stage = stageMap.get(event.stageInfo().stageId()); + RDD rdd = stage.rdd(); + + List nodes = new ArrayList<>(); + nodes.addAll(Arrays.asList(event.stageInfo(), stage)); + + nodes.addAll(Rdds.flattenRDDs(rdd)); + + return populateRun(parentRunFacet, runEventBuilder, jobBuilder, nodes); + } + + RunEvent buildRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + SparkListenerStageCompleted event) { + Stage stage = stageMap.get(event.stageInfo().stageId()); + RDD rdd = stage.rdd(); + + List nodes = new ArrayList<>(); + nodes.addAll(Arrays.asList(event.stageInfo(), stage)); + + nodes.addAll(Rdds.flattenRDDs(rdd)); + + return populateRun(parentRunFacet, runEventBuilder, jobBuilder, nodes); + } + + RunEvent buildRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + SparkListenerSQLExecutionStart event) { + runEventBuilder.eventType(RunEvent.EventType.START); + return buildRun(parentRunFacet, runEventBuilder, jobBuilder, event, Optional.empty()); + } + + RunEvent buildRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + SparkListenerSQLExecutionEnd event) { + runEventBuilder.eventType(RunEvent.EventType.COMPLETE); + return buildRun(parentRunFacet, runEventBuilder, jobBuilder, event, Optional.empty()); + } + + RunEvent buildRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + SparkListenerJobStart event) { + runEventBuilder.eventType(RunEvent.EventType.START); + return buildRun( + 
parentRunFacet, + runEventBuilder, + jobBuilder, + event, + Optional.ofNullable(jobMap.get(event.jobId()))); + } + + RunEvent buildRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + SparkListenerJobEnd event) { + runEventBuilder.eventType( + event.jobResult() instanceof JobFailed + ? RunEvent.EventType.FAIL + : RunEvent.EventType.COMPLETE); + return buildRun( + parentRunFacet, + runEventBuilder, + jobBuilder, + event, + Optional.ofNullable(jobMap.get(event.jobId()))); + } + + private RunEvent buildRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + Object event, + Optional job) { + List nodes = new ArrayList<>(); + nodes.add(event); + job.ifPresent( + j -> { + nodes.add(j); + nodes.addAll(Rdds.flattenRDDs(j.finalStage().rdd())); + }); + + return populateRun(parentRunFacet, runEventBuilder, jobBuilder, nodes); + } + + private RunEvent populateRun( + Optional parentRunFacet, + RunEventBuilder runEventBuilder, + JobBuilder jobBuilder, + List nodes) { + OpenLineage openLineage = openLineageContext.getOpenLineage(); + + RunFacetsBuilder runFacetsBuilder = openLineage.newRunFacetsBuilder(); + OpenLineage.JobFacetsBuilder jobFacetsBuilder = + openLineageContext.getOpenLineage().newJobFacetsBuilder(); + + parentRunFacet.ifPresent(runFacetsBuilder::parent); + OpenLineage.JobFacets jobFacets = buildJobFacets(nodes, jobFacetBuilders, jobFacetsBuilder); + List inputDatasets = buildInputDatasets(nodes); + List outputDatasets = buildOutputDatasets(nodes); + openLineageContext + .getQueryExecution() + .filter(qe -> !FacetUtils.isFacetDisabled(openLineageContext, "spark_unknown")) + .flatMap(qe -> unknownEntryFacetListener.build(qe.optimizedPlan())) + .ifPresent(facet -> runFacetsBuilder.put("spark_unknown", facet)); + + RunFacets runFacets = buildRunFacets(nodes, runFacetBuilders, runFacetsBuilder); + OpenLineage.RunBuilder runBuilder = + 
openLineage.newRunBuilder().runId(openLineageContext.getRunUuid()).facets(runFacets); + runEventBuilder + .run(runBuilder.build()) + .job(jobBuilder.facets(jobFacets).build()) + .inputs(inputDatasets) + .outputs(outputDatasets); + + HookUtils.preBuild(openLineageContext, runEventBuilder); + return runEventBuilder.build(); + } + + private List buildInputDatasets(List nodes) { + openLineageContext + .getQueryExecution() + .ifPresent( + qe -> { + if (log.isDebugEnabled()) { + log.debug("Traversing optimized plan {}", qe.optimizedPlan().toJSON()); + log.debug("Physical plan executed {}", qe.executedPlan().toJSON()); + } + }); + log.debug( + "Visiting query plan {} with input dataset builders {}", + openLineageContext.getQueryExecution(), + inputDatasetBuilders); + + Function1> inputVisitor = + visitLogicalPlan(PlanUtils.merge(inputDatasetQueryPlanVisitors)); + + List datasets = + Stream.concat( + buildDatasets(nodes, inputDatasetBuilders), + openLineageContext + .getQueryExecution() + .map( + qe -> + fromSeq(qe.optimizedPlan().map(inputVisitor)).stream() + .flatMap(Collection::stream) + .map(((Class) InputDataset.class)::cast)) + .orElse(Stream.empty())) + .collect(Collectors.toList()); + OpenLineage openLineage = openLineageContext.getOpenLineage(); + if (!datasets.isEmpty()) { + Map inputFacetsMap = new HashMap<>(); + nodes.forEach( + event -> inputDatasetFacetBuilders.forEach(fn -> fn.accept(event, inputFacetsMap::put))); + Map datasetFacetsMap = new HashMap<>(); + nodes.forEach( + event -> inputDatasetFacetBuilders.forEach(fn -> fn.accept(event, inputFacetsMap::put))); + return datasets.stream() + .map( + ds -> + openLineage + .newInputDatasetBuilder() + .name(ds.getName()) + .namespace(ds.getNamespace()) + .inputFacets( + mergeFacets( + inputFacetsMap, ds.getInputFacets(), InputDatasetInputFacets.class)) + .facets(mergeFacets(datasetFacetsMap, ds.getFacets(), DatasetFacets.class)) + .build()) + .collect(Collectors.toList()); + } + return datasets; + } + + /** + * 
Returns a {@link Function1} that passes the input {@link LogicalPlan} node to the {@link + * #unknownEntryFacetListener} if the inputVisitor is defined for the input node. + * + * @param inputVisitor + * @param + * @return + */ + private Function1> visitLogicalPlan( + PartialFunction> inputVisitor) { + return ScalaConversionUtils.toScalaFn( + node -> + inputVisitor + .andThen( + toScalaFn( + ds -> { + unknownEntryFacetListener.accept(node); + return ds; + })) + .applyOrElse(node, toScalaFn(n -> Collections.emptyList()))); + } + + private List buildOutputDatasets(List nodes) { + log.debug( + "Visiting query plan {} with output dataset builders {}", + openLineageContext.getQueryExecution(), + outputDatasetBuilders); + Function1> visitor = + visitLogicalPlan(PlanUtils.merge(outputDatasetQueryPlanVisitors)); + List datasets = + Stream.concat( + buildDatasets(nodes, outputDatasetBuilders), + openLineageContext + .getQueryExecution() + .map(qe -> visitor.apply(qe.optimizedPlan())) + .map(Collection::stream) + .orElse(Stream.empty())) + .collect(Collectors.toList()); + + OpenLineage openLineage = openLineageContext.getOpenLineage(); + + if (!datasets.isEmpty()) { + Map outputFacetsMap = new HashMap<>(); + nodes.forEach( + event -> + outputDatasetFacetBuilders.forEach(fn -> fn.accept(event, outputFacetsMap::put))); + Map datasetFacetsMap = new HashMap<>(); + nodes.forEach( + event -> datasetFacetBuilders.forEach(fn -> fn.accept(event, datasetFacetsMap::put))); + return datasets.stream() + .map( + ds -> { + Map dsFacetsMap = new HashMap(datasetFacetsMap); + ColumnLevelLineageUtils.buildColumnLineageDatasetFacet( + openLineageContext, ds.getFacets().getSchema()) + .ifPresent(facet -> dsFacetsMap.put("columnLineage", facet)); + return openLineage + .newOutputDatasetBuilder() + .name(ds.getName()) + .namespace(ds.getNamespace()) + .outputFacets( + mergeFacets( + outputFacetsMap, ds.getOutputFacets(), OutputDatasetOutputFacets.class)) + .facets(mergeFacets(dsFacetsMap, 
ds.getFacets(), DatasetFacets.class)) + .build(); + }) + .collect(Collectors.toList()); + } + return datasets; + } + + private Stream buildDatasets( + List nodes, Collection>> builders) { + return nodes.stream() + .flatMap( + event -> + builders.stream() + .filter(pfn -> PlanUtils.safeIsDefinedAt(pfn, event)) + .map(pfn -> PlanUtils.safeApply(pfn, event)) + .flatMap(Collection::stream)); + } + + /** + * Attach facets to a facet container, such as an {@link InputDatasetInputFacets} or an {@link + * OutputDatasetOutputFacets}. Facets returned by a {@link CustomFacetBuilder} may be attached to + * a field in the container, such as {@link InputDatasetInputFacets#dataQualityMetrics} or may be + * attached as a key/value pair in the {@link InputDatasetInputFacets#additionalProperties} map. + * The serialized JSON does not distinguish between these, but the java class does. The Java class + * also has some fields, such as the {@link InputDatasetInputFacets#producer} URI, which need to + * be included in the serialized JSON. + * + *

This methods will generate a new facet container with properties potentially overridden by + * the values set by the custom facet generators. + * + * @param events + * @param builders + * @return + */ + private OpenLineage.JobFacets buildJobFacets( + List events, + Collection> builders, + OpenLineage.JobFacetsBuilder jobFacetsBuilder) { + events.forEach(event -> builders.forEach(fn -> fn.accept(event, jobFacetsBuilder::put))); + return jobFacetsBuilder.build(); + } + + private RunFacets buildRunFacets( + List events, + Collection> builders, + RunFacetsBuilder runFacetsBuilder) { + events.forEach(event -> builders.forEach(fn -> fn.accept(event, runFacetsBuilder::put))); + return runFacetsBuilder.build(); + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/plan/LogicalRelationDatasetBuilder.java b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/plan/LogicalRelationDatasetBuilder.java new file mode 100644 index 00000000000000..dd58b9eaf140b0 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/lifecycle/plan/LogicalRelationDatasetBuilder.java @@ -0,0 +1,220 @@ +/* +/* Copyright 2018-2023 contributors to the OpenLineage project +/* SPDX-License-Identifier: Apache-2.0 +*/ + +package io.openlineage.spark.agent.lifecycle.plan; + +import io.openlineage.client.OpenLineage; +import io.openlineage.client.OpenLineage.DatasetFacetsBuilder; +import io.openlineage.client.utils.DatasetIdentifier; +import io.openlineage.spark.agent.lifecycle.plan.handlers.JdbcRelationHandler; +import io.openlineage.spark.agent.util.PathUtils; +import io.openlineage.spark.agent.util.PlanUtils; +import io.openlineage.spark.api.AbstractQueryPlanDatasetBuilder; +import io.openlineage.spark.api.DatasetFactory; +import io.openlineage.spark.api.OpenLineageContext; +import java.util.ArrayList; +import java.util.Collection; +import 
java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.spark.scheduler.SparkListenerEvent; +import org.apache.spark.sql.catalyst.catalog.CatalogTable; +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; +import org.apache.spark.sql.execution.datasources.HadoopFsRelation; +import org.apache.spark.sql.execution.datasources.LogicalRelation; +import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions; +import org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation; +import scala.collection.JavaConversions; + +/** + * {@link LogicalPlan} visitor that attempts to extract a {@link OpenLineage.Dataset} from a {@link + * LogicalRelation}. The {@link org.apache.spark.sql.sources.BaseRelation} is tested for known + * types, such as {@link HadoopFsRelation} or {@link JDBCRelation}s, as those are easy to extract + * exact dataset information. + * + *

For {@link HadoopFsRelation}s, it is assumed that a single directory maps to a single {@link + * OpenLineage.Dataset}. Any files referenced are replaced by their parent directory and all files + * in a given directory are assumed to belong to the same {@link OpenLineage.Dataset}. Directory + * partitioning is currently not addressed. + * + *

For {@link JDBCRelation}s, {@link OpenLineage.Dataset} naming expects the namespace to be the + * JDBC connection URL (scheme and authority only) and the table name to be the + * <database> + * .<tableName>. + * + *

{@link CatalogTable}s, if present, can be used to describe the {@link OpenLineage.Dataset} if + * its {@link org.apache.spark.sql.sources.BaseRelation} is unknown. + * + *

TODO If a user specifies the {@link JDBCOptions#JDBC_QUERY_STRING()} option, we do not parse + * the sql to determine the specific tables used. Since we return a List of {@link + * OpenLineage.Dataset}s, we can parse the sql and determine each table referenced to return a + * complete list of datasets referenced. + */ +@Slf4j +public class LogicalRelationDatasetBuilder + extends AbstractQueryPlanDatasetBuilder { + + private final DatasetFactory datasetFactory; + + public LogicalRelationDatasetBuilder( + OpenLineageContext context, DatasetFactory datasetFactory, boolean searchDependencies) { + super(context, searchDependencies); + this.datasetFactory = datasetFactory; + } + + @Override + public boolean isDefinedAtLogicalPlan(LogicalPlan x) { + // if a LogicalPlan is a single node plan like `select * from temp`, + // then it's leaf node and should not be considered output node + if (x instanceof LogicalRelation && isSingleNodeLogicalPlan(x) && !searchDependencies) { + return false; + } + + return x instanceof LogicalRelation + && (((LogicalRelation) x).relation() instanceof HadoopFsRelation + || ((LogicalRelation) x).relation() instanceof JDBCRelation + || ((LogicalRelation) x).catalogTable().isDefined()); + } + + private boolean isSingleNodeLogicalPlan(LogicalPlan x) { + return context + .getQueryExecution() + .map(qe -> qe.optimizedPlan()) + .filter(p -> p.equals(x)) + .isPresent() + && (x.children() == null || x.children().isEmpty()); + } + + @Override + public List apply(LogicalRelation logRel) { + if (logRel.catalogTable() != null && logRel.catalogTable().isDefined()) { + return handleCatalogTable(logRel); + } else if (logRel.relation() instanceof HadoopFsRelation) { + return handleHadoopFsRelation(logRel); + } else if (logRel.relation() instanceof JDBCRelation) { + return new JdbcRelationHandler<>(datasetFactory).handleRelation(logRel); + } + throw new IllegalArgumentException( + "Expected logical plan to be either HadoopFsRelation, JDBCRelation, " + + "or 
CatalogTable but was " + + logRel); + } + + private List handleCatalogTable(LogicalRelation logRel) { + CatalogTable catalogTable = logRel.catalogTable().get(); + + DatasetIdentifier di = PathUtils.fromCatalogTable(catalogTable); + + OpenLineage.DatasetFacetsBuilder datasetFacetsBuilder = + context.getOpenLineage().newDatasetFacetsBuilder(); + datasetFacetsBuilder.schema(PlanUtils.schemaFacet(context.getOpenLineage(), logRel.schema())); + datasetFacetsBuilder.dataSource( + PlanUtils.datasourceFacet(context.getOpenLineage(), di.getNamespace())); + + getDatasetVersion(logRel) + .map( + version -> + datasetFacetsBuilder.version( + context.getOpenLineage().newDatasetVersionDatasetFacet(version))); + + return Collections.singletonList(datasetFactory.getDataset(di, datasetFacetsBuilder)); + } + + private List handleHadoopFsRelation(LogicalRelation x) { + HadoopFsRelation relation = (HadoopFsRelation) x.relation(); + try { + return context + .getSparkSession() + .map( + session -> { + Configuration hadoopConfig = + session.sessionState().newHadoopConfWithOptions(relation.options()); + + DatasetFacetsBuilder datasetFacetsBuilder = + context.getOpenLineage().newDatasetFacetsBuilder(); + getDatasetVersion(x) + .map( + version -> + datasetFacetsBuilder.version( + context.getOpenLineage().newDatasetVersionDatasetFacet(version))); + + Collection rootPaths = + JavaConversions.asJavaCollection(relation.location().rootPaths()); + + if (isSingleFileRelation(rootPaths, hadoopConfig)) { + return Collections.singletonList( + datasetFactory.getDataset( + rootPaths.stream().findFirst().get().toUri(), + relation.schema(), + datasetFacetsBuilder)); + } else { + return rootPaths.stream() + .map(p -> PlanUtils.getDirectoryPath(p, hadoopConfig)) + .distinct() + .map( + p -> { + // TODO- refactor this to return a single partitioned dataset based on + // static + // static partitions in the relation + return datasetFactory.getDataset( + p.toUri(), relation.schema(), datasetFacetsBuilder); + }) 
+ .collect(Collectors.toList()); + } + }) + .orElse(Collections.emptyList()); + } catch (Exception e) { + if ("com.databricks.backend.daemon.data.client.adl.AzureCredentialNotFoundException" + .equals(e.getClass().getName())) { + // This is a fallback that can occur when hadoop configurations cannot be + // reached. This occurs in Azure Databricks when credential passthrough + // is enabled and you're attempting to get the data lake credentials. + // The Spark Listener context cannot use the user credentials + // thus we need a fallback. + // This is similar to the InsertIntoHadoopRelationVisitor's process for getting + // Datasets + List inputDatasets = new ArrayList(); + List paths = + new ArrayList<>(JavaConversions.asJavaCollection(relation.location().rootPaths())); + for (Path p : paths) { + inputDatasets.add(datasetFactory.getDataset(p.toUri(), relation.schema())); + } + if (inputDatasets.isEmpty()) { + return Collections.emptyList(); + } else { + return inputDatasets; + } + } else { + throw e; + } + } + } + + private boolean isSingleFileRelation(Collection paths, Configuration hadoopConfig) { + if (paths.size() != 1) { + return false; + } + + try { + Path path = paths.stream().findFirst().get(); + return path.getFileSystem(hadoopConfig).isFile(path); + /* + Unfortunately it seems like on DataBricks this can throw an SparkException as well if credentials are missing. + Like org.apache.spark.SparkException: There is no Credential Scope. 
+ */ + } catch (Exception e) { + return false; + } + } + + protected Optional getDatasetVersion(LogicalRelation x) { + // not implemented + return Optional.empty(); + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PathUtils.java b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PathUtils.java new file mode 100644 index 00000000000000..b72d28ce72dd90 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PathUtils.java @@ -0,0 +1,207 @@ +/* +/* Copyright 2018-2023 contributors to the OpenLineage project +/* SPDX-License-Identifier: Apache-2.0 +*/ + +package io.openlineage.spark.agent.util; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import datahub.spark.conf.SparkAppContext; +import datahub.spark.conf.SparkConfigParser; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.dataset.HdfsPathDataset; +import io.openlineage.client.utils.DatasetIdentifier; +import io.openlineage.client.utils.DatasetIdentifierUtils; +import java.io.File; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.Arrays; +import java.util.Optional; +import java.util.stream.Collectors; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.TableIdentifier; +import org.apache.spark.sql.catalyst.catalog.CatalogTable; +import org.apache.spark.sql.internal.StaticSQLConf; + +@Slf4j +@SuppressWarnings("checkstyle:HideUtilityClassConstructor") +public class PathUtils { + + private static final String DEFAULT_SCHEME = "file"; + public static final String SPARK_OPENLINEAGE_DATASET_REMOVE_PATH_PATTERN = + 
"spark.openlineage.dataset.removePath.pattern"; + public static final String REMOVE_PATTERN_GROUP = "remove"; + + private static Optional sparkConf = Optional.empty(); + + public static DatasetIdentifier fromPath(Path path) { + return fromPath(path, DEFAULT_SCHEME); + } + + public static DatasetIdentifier fromPath(Path path, String defaultScheme) { + return fromURI(path.toUri(), defaultScheme); + } + + public static DatasetIdentifier fromURI(URI location) { + return fromURI(location, DEFAULT_SCHEME); + } + + public static DatasetIdentifier fromURI(URI location, String defaultScheme) { + DatasetIdentifier di = DatasetIdentifierUtils.fromURI(location, defaultScheme); + return new DatasetIdentifier(removePathPattern(di.getName()), di.getNamespace()); + } + + public static DatasetIdentifier fromCatalogTable(CatalogTable catalogTable) { + return fromCatalogTable(catalogTable, loadSparkConf()); + } + + /** + * Create DatasetIdentifier from CatalogTable, using storage's locationURI if it exists. In other + * way, use defaultTablePath. + */ + @SneakyThrows + public static DatasetIdentifier fromCatalogTable( + CatalogTable catalogTable, Optional sparkConf) { + + DatasetIdentifier di; + if (catalogTable.storage() != null && catalogTable.storage().locationUri().isDefined()) { + di = PathUtils.fromURI(catalogTable.storage().locationUri().get(), DEFAULT_SCHEME); + } else { + // try to obtain location + try { + di = prepareDatasetIdentifierFromDefaultTablePath(catalogTable); + } catch (IllegalStateException e) { + // session inactive - no way to find DatasetProvider + throw new IllegalArgumentException( + "Unable to extract DatasetIdentifier from a CatalogTable", e); + } + } + + Optional metastoreUri = extractMetastoreUri(sparkConf); + // TODO: Is the call to "metastoreUri.get()" really needed? + // Java's Optional should prevent the null in the first place. 
+ if (metastoreUri.isPresent() && metastoreUri.get() != null) { + // dealing with Hive tables + DatasetIdentifier symlink = prepareHiveDatasetIdentifier(catalogTable, metastoreUri.get()); + return di.withSymlink( + symlink.getName(), symlink.getNamespace(), DatasetIdentifier.SymlinkType.TABLE); + } else { + return di.withSymlink( + nameFromTableIdentifier(catalogTable.identifier()), + StringUtils.substringBeforeLast(di.getName(), File.separator), + DatasetIdentifier.SymlinkType.TABLE); + } + } + + @SneakyThrows + private static DatasetIdentifier prepareDatasetIdentifierFromDefaultTablePath( + CatalogTable catalogTable) { + URI uri = + SparkSession.active().sessionState().catalog().defaultTablePath(catalogTable.identifier()); + + return PathUtils.fromURI(uri); + } + + @SneakyThrows + private static DatasetIdentifier prepareHiveDatasetIdentifier( + CatalogTable catalogTable, URI metastoreUri) { + String qualifiedName = nameFromTableIdentifier(catalogTable.identifier()); + if (!qualifiedName.startsWith("/")) { + qualifiedName = String.format("/%s", qualifiedName); + } + return PathUtils.fromPath( + new Path(enrichHiveMetastoreURIWithTableName(metastoreUri, qualifiedName))); + } + + @SneakyThrows + public static URI enrichHiveMetastoreURIWithTableName(URI metastoreUri, String qualifiedName) { + return new URI( + "hive", null, metastoreUri.getHost(), metastoreUri.getPort(), qualifiedName, null, null); + } + + /** + * SparkConf does not change through job lifetime but it can get lost once session is closed. 
It's + * good to have it set in case of SPARK-29046 + */ + private static Optional loadSparkConf() { + if (!sparkConf.isPresent() && SparkSession.getDefaultSession().isDefined()) { + sparkConf = Optional.of(SparkSession.getDefaultSession().get().sparkContext().getConf()); + } + return sparkConf; + } + + private static Optional extractMetastoreUri(Optional sparkConf) { + // make sure SparkConf is present + if (!sparkConf.isPresent()) { + return Optional.empty(); + } + + // make sure enableHiveSupport is called + Optional setting = + SparkConfUtils.findSparkConfigKey( + sparkConf.get(), StaticSQLConf.CATALOG_IMPLEMENTATION().key()); + if (!setting.isPresent() || !"hive".equals(setting.get())) { + return Optional.empty(); + } + + return SparkConfUtils.getMetastoreUri(sparkConf.get()); + } + + private static String removeFirstSlashIfSingleSlashInString(String name) { + if (name.chars().filter(x -> x == '/').count() == 1 && name.startsWith("/")) { + return name.substring(1); + } + return name; + } + + private static String removePathPattern(String datasetName) { + // TODO: The reliance on global-mutable state here should be changed + // this led to problems in the PathUtilsTest class, where some tests interfered with others + log.info("Removing path pattern from dataset name {}", datasetName); + Optional conf = loadSparkConf(); + if (!conf.isPresent()) { + return datasetName; + } + try { + String propertiesString = + Arrays.stream(conf.get().getAllWithPrefix("spark.datahub.")) + .map(tup -> tup._1 + "= \"" + tup._2 + "\"") + .collect(Collectors.joining("\n")); + Config datahubConfig = ConfigFactory.parseString(propertiesString); + DatahubOpenlineageConfig datahubOpenlineageConfig = + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext()); + HdfsPathDataset hdfsPath = + HdfsPathDataset.create(new URI(datasetName), datahubOpenlineageConfig); + log.debug("Transformed path is {}", hdfsPath.getDatasetPath()); + return 
hdfsPath.getDatasetPath(); + } catch (InstantiationException e) { + log.warn( + "Unable to convert dataset {} to path the exception was {}", datasetName, e.getMessage()); + return datasetName; + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } + + private static String nameFromTableIdentifier(TableIdentifier identifier) { + // we create name instead of calling `unquotedString` method which includes spark_catalog + // for Spark 3.4 + String name; + if (identifier.database().isDefined()) { + // include database in name + name = String.format("%s.%s", identifier.database().get(), identifier.table()); + } else { + // just table name + name = identifier.table(); + } + + return name; + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java new file mode 100644 index 00000000000000..8d93b0288b5151 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark/agent/util/PlanUtils.java @@ -0,0 +1,343 @@ +/* +/* Copyright 2018-2023 contributors to the OpenLineage project +/* SPDX-License-Identifier: Apache-2.0 +*/ + +package io.openlineage.spark.agent.util; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import datahub.spark.conf.SparkLineageConf; +import io.datahubproject.openlineage.dataset.HdfsPathDataset; +import io.openlineage.client.OpenLineage; +import io.openlineage.spark.agent.Versions; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import lombok.extern.slf4j.Slf4j; +import 
org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.SparkEnv; +import org.apache.spark.package$; +import org.apache.spark.rdd.HadoopRDD; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.execution.datasources.FileScanRDD; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import scala.PartialFunction; +import scala.runtime.AbstractPartialFunction; + +/** + * Utility functions for traversing a {@link + * org.apache.spark.sql.catalyst.plans.logical.LogicalPlan}. + */ +@Slf4j +@SuppressWarnings("checkstyle:HideUtilityClassConstructor") +public class PlanUtils { + + public static final String SLASH_DELIMITER_USER_PASSWORD_REGEX = + "[A-Za-z0-9_%]+//?[A-Za-z0-9_%]*@"; + public static final String COLON_DELIMITER_USER_PASSWORD_REGEX = + "([/|,])[A-Za-z0-9_%]+:?[A-Za-z0-9_%]*@"; + + /** + * Merge a list of {@link PartialFunction}s and return the first value where the function is + * defined or empty list if no function matches the input. + * + * @param fns + * @param arg + * @param + * @param + * @return + */ + public static Collection applyAll( + List>> fns, T arg) { + PartialFunction> fn = merge(fns); + if (fn.isDefinedAt(arg)) { + return fn.apply(arg); + } + return Collections.emptyList(); + } + + /** + * Given a list of {@link PartialFunction}s merge to produce a single function that will test the + * input against each function one by one until a match is found or empty() is returned. 
+ * + * @param fns + * @param + * @param + * @return + */ + public static PartialFunction> merge( + Collection>> fns) { + return new AbstractPartialFunction>() { + @Override + public boolean isDefinedAt(T x) { + return fns.stream() + .filter(pfn -> PlanUtils.safeIsDefinedAt(pfn, x)) + .findFirst() + .isPresent(); + } + + private boolean isDefinedAt(T x, PartialFunction> pfn) { + return PlanUtils.safeIsDefinedAt(pfn, x); + } + + @Override + public Collection apply(T x) { + return fns.stream() + .filter(pfn -> PlanUtils.safeIsDefinedAt(pfn, x)) + .map( + pfn -> { + try { + Collection collection = pfn.apply(x); + if (log.isDebugEnabled()) { + log.debug( + "Visitor {} visited {}, returned {}", + pfn.getClass().getCanonicalName(), + x.getClass().getCanonicalName(), + collection); + } + return collection; + } catch (RuntimeException | NoClassDefFoundError | NoSuchMethodError e) { + log.error("Apply failed:", e); + return null; + } + }) + .filter(Objects::nonNull) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } + }; + } + + /** + * Given a schema, construct a valid {@link OpenLineage.SchemaDatasetFacet}. + * + * @param structType + * @return + */ + public static OpenLineage.SchemaDatasetFacet schemaFacet( + OpenLineage openLineage, StructType structType) { + return openLineage + .newSchemaDatasetFacetBuilder() + .fields(transformFields(openLineage, structType.fields())) + .build(); + } + + private static List transformFields( + OpenLineage openLineage, StructField... fields) { + List list = new ArrayList<>(); + for (StructField field : fields) { + list.add( + openLineage + .newSchemaDatasetFacetFieldsBuilder() + .name(field.name()) + .type(field.dataType().typeName()) + .build()); + } + return list; + } + + /** + * Given a list of attributes, constructs a valid {@link OpenLineage.SchemaDatasetFacet}. 
+ * + * @param attributes + * @return + */ + public static StructType toStructType(List attributes) { + return new StructType( + attributes.stream() + .map( + attr -> + new StructField(attr.name(), attr.dataType(), attr.nullable(), attr.metadata())) + .collect(Collectors.toList()) + .toArray(new StructField[0])); + } + + public static String namespaceUri(URI outputPath) { + return Optional.ofNullable(outputPath.getAuthority()) + .map(a -> String.format("%s://%s", outputPath.getScheme(), a)) + .orElse(outputPath.getScheme()); + } + + /** + * Construct a {@link OpenLineage.DatasourceDatasetFacet} given a namespace for the datasource. + * + * @param namespaceUri + * @return + */ + public static OpenLineage.DatasourceDatasetFacet datasourceFacet( + OpenLineage openLineage, String namespaceUri) { + return openLineage + .newDatasourceDatasetFacetBuilder() + .uri(URI.create(namespaceUri)) + .name(namespaceUri) + .build(); + } + + /** + * Construct a {@link OpenLineage.ParentRunFacet} given the parent job's parentRunId, job name, + * and namespace. 
+ * + * @param parentRunId + * @param parentJob + * @param parentJobNamespace + * @return + */ + public static OpenLineage.ParentRunFacet parentRunFacet( + UUID parentRunId, String parentJob, String parentJobNamespace) { + return new OpenLineage(Versions.OPEN_LINEAGE_PRODUCER_URI) + .newParentRunFacetBuilder() + .run(new OpenLineage.ParentRunFacetRunBuilder().runId(parentRunId).build()) + .job( + new OpenLineage.ParentRunFacetJobBuilder() + .name(parentJob) + .namespace(parentJobNamespace) + .build()) + .build(); + } + + public static Path getDirectoryPath(Path p, Configuration hadoopConf) { + SparkConf conf = SparkEnv.get().conf(); + String propertiesString = + Arrays.stream(conf.getAllWithPrefix("spark.datahub.")) + .map(tup -> tup._1 + "= \"" + tup._2 + "\"") + .collect(Collectors.joining("\n")); + Config datahubConfig = ConfigFactory.parseString(propertiesString); + SparkLineageConf sparkLineageConf = + SparkLineageConf.toSparkLineageConf(datahubConfig, null, null); + HdfsPathDataset hdfsPath = null; + try { + URI uri = new URI(p.toString()); + hdfsPath = HdfsPathDataset.create(uri, sparkLineageConf.getOpenLineageConf()); + log.debug("Path {} transformed to {}", p, hdfsPath.getDatasetPath()); + return new Path(hdfsPath.getDatasetPath()); + } catch (InstantiationException | URISyntaxException e) { + log.warn("Unable to convert path to hdfs path {} the exception was {}", p, e.getMessage()); + return p; + } + + // try { + // if (p.getFileSystem(hadoopConf).getFileStatus(p).isFile()) { + // return p.getParent(); + // } else { + // return p; + // } + // } catch (IOException e) { + // log.warn("Unable to get file system for path ", e); + // return p; + // } + } + + /** + * Given a list of RDDs, it collects list of data location directories. For each RDD, a parent + * directory is taken and list of distinct locations is returned. 
+ * + * @param fileRdds + * @return + */ + public static List findRDDPaths(List> fileRdds) { + return fileRdds.stream() + .flatMap( + rdd -> { + if (rdd instanceof HadoopRDD) { + HadoopRDD hadoopRDD = (HadoopRDD) rdd; + Path[] inputPaths = FileInputFormat.getInputPaths(hadoopRDD.getJobConf()); + Configuration hadoopConf = hadoopRDD.getConf(); + return Arrays.stream(inputPaths) + .map(p -> PlanUtils.getDirectoryPath(p, hadoopConf)); + } else if (rdd instanceof FileScanRDD) { + FileScanRDD fileScanRDD = (FileScanRDD) rdd; + return ScalaConversionUtils.fromSeq(fileScanRDD.filePartitions()).stream() + .flatMap(fp -> Arrays.stream(fp.files())) + .map( + f -> { + if (package$.MODULE$.SPARK_VERSION().compareTo("3.4") > 0) { + // filePath returns SparkPath for Spark 3.4 + return ReflectionUtils.tryExecuteMethod(f, "filePath") + .map(o -> ReflectionUtils.tryExecuteMethod(o, "toPath")) + .map(o -> (Path) o.get()) + .get() + .getParent(); + } else { + return new Path(f.filePath()).getParent(); + } + }); + } else { + log.warn("Unknown RDD class {}", rdd.getClass().getCanonicalName()); + return Stream.empty(); + } + }) + .distinct() + .collect(Collectors.toList()); + } + + /** + * instanceOf alike implementation which does not fail in case of a missing class. 
+ * + * @param instance + * @param classCanonicalName + * @return + */ + public static boolean safeIsInstanceOf(Object instance, String classCanonicalName) { + try { + Class c = Class.forName(classCanonicalName); + return instance.getClass().isAssignableFrom(c); + } catch (ClassNotFoundException e) { + return false; + } + } + + /** + * isDefinedAt method implementation that should never throw an error or exception + * + * @param pfn + * @param x + * @return + */ + public static boolean safeIsDefinedAt(PartialFunction pfn, Object x) { + try { + return pfn.isDefinedAt(x); + } catch (ClassCastException e) { + // do nothing + return false; + } catch (Exception e) { + if (e != null) { + log.debug("isDefinedAt method failed on {}", e); + } + return false; + } catch (NoClassDefFoundError e) { + log.debug("isDefinedAt method failed on {}", e.getMessage()); + return false; + } + } + + /** + * apply method implementation that should never throw an error or exception + * + * @param pfn + * @param x + * @return + */ + public static List safeApply(PartialFunction> pfn, D x) { + try { + return pfn.apply(x); + } catch (Exception | NoClassDefFoundError | NoSuchMethodError e) { + log.info("apply method failed with", e); + return Collections.emptyList(); + } + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark3/agent/lifecycle/plan/catalog/IcebergHandler.java b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark3/agent/lifecycle/plan/catalog/IcebergHandler.java new file mode 100644 index 00000000000000..dcd1cf3fb3aff7 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/io/openlineage/spark3/agent/lifecycle/plan/catalog/IcebergHandler.java @@ -0,0 +1,192 @@ +/* +/* Copyright 2018-2023 contributors to the OpenLineage project +/* SPDX-License-Identifier: Apache-2.0 +*/ + +package io.openlineage.spark3.agent.lifecycle.plan.catalog; + +import io.openlineage.client.OpenLineage; +import 
io.openlineage.client.utils.DatasetIdentifier; +import io.openlineage.spark.agent.util.PathUtils; +import io.openlineage.spark.agent.util.ScalaConversionUtils; +import io.openlineage.spark.agent.util.SparkConfUtils; +import io.openlineage.spark.api.OpenLineageContext; +import java.io.File; +import java.net.URI; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.spark.SparkCatalog; +import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.source.SparkTable; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.connector.catalog.Identifier; +import org.apache.spark.sql.connector.catalog.TableCatalog; + +@Slf4j +public class IcebergHandler implements CatalogHandler { + + private final OpenLineageContext context; + + private static final String TYPE = "type"; + private static final String CATALOG_IMPL = "catalog-impl"; + private static final String IO_IMPL = "io-impl"; + + public IcebergHandler(OpenLineageContext context) { + this.context = context; + } + + @Override + public boolean hasClasses() { + try { + IcebergHandler.class.getClassLoader().loadClass("org.apache.iceberg.catalog.Catalog"); + return true; + } catch (Exception e) { + // swallow- we don't care + } + return false; + } + + @Override + public boolean isClass(TableCatalog tableCatalog) { + return (tableCatalog instanceof SparkCatalog) || (tableCatalog instanceof SparkSessionCatalog); + } + + @Override + public DatasetIdentifier getDatasetIdentifier( + SparkSession session, + TableCatalog tableCatalog, + Identifier identifier, + Map properties) { + String catalogName = tableCatalog.name(); + + String prefix = 
String.format("spark.sql.catalog.%s", catalogName); + Map conf = + ScalaConversionUtils.fromMap(session.conf().getAll()); + log.info(conf.toString()); + Map catalogConf = + conf.entrySet().stream() + .filter(x -> x.getKey().startsWith(prefix)) + .filter(x -> x.getKey().length() > prefix.length()) + .collect( + Collectors.toMap( + x -> x.getKey().substring(prefix.length() + 1), // handle dot after prefix + Map.Entry::getValue)); + + log.info(catalogConf.toString()); + if (catalogConf.isEmpty() + || (!catalogConf.containsKey(TYPE) + && !"org.apache.iceberg.aws.glue.GlueCatalog".equals(catalogConf.get(CATALOG_IMPL)))) { + throw new UnsupportedCatalogException(catalogName); + } + log.info(catalogConf.get(TYPE)); + + String warehouse = catalogConf.get(CatalogProperties.WAREHOUSE_LOCATION); + DatasetIdentifier di; + + if ("org.apache.iceberg.aws.glue.GlueCatalog".equals(catalogConf.get(CATALOG_IMPL))) { + di = new DatasetIdentifier(identifier.toString(), "glue"); + log.info("Glue catalog detected, returning glue dataset identifier {}", di); + return di; + } else { + di = PathUtils.fromPath(new Path(warehouse, identifier.toString())); + } + if (catalogConf.get(TYPE).equals("hive")) { + di.withSymlink( + getHiveIdentifier( + session, catalogConf.get(CatalogProperties.URI), identifier.toString())); + } else if (catalogConf.get(TYPE).equals("hadoop")) { + di.withSymlink( + identifier.toString(), + StringUtils.substringBeforeLast( + di.getName(), File.separator), // parent location from a name becomes a namespace + DatasetIdentifier.SymlinkType.TABLE); + } else if (catalogConf.get(TYPE).equals("rest")) { + di.withSymlink( + getRestIdentifier( + session, catalogConf.get(CatalogProperties.URI), identifier.toString())); + } else if (catalogConf.get(TYPE).equals("nessie")) { + di.withSymlink( + getNessieIdentifier( + session, catalogConf.get(CatalogProperties.URI), identifier.toString())); + } + + return di; + } + + @SneakyThrows + private DatasetIdentifier.Symlink 
getNessieIdentifier( + SparkSession session, @Nullable String confUri, String table) { + + String uri = new URI(confUri).toString(); + return new DatasetIdentifier.Symlink(table, uri, DatasetIdentifier.SymlinkType.TABLE); + } + + @SneakyThrows + private DatasetIdentifier.Symlink getHiveIdentifier( + SparkSession session, @Nullable String confUri, String table) { + String slashPrefixedTable = String.format("/%s", table); + URI uri; + if (confUri == null) { + uri = + SparkConfUtils.getMetastoreUri(session.sparkContext().conf()) + .orElseThrow(() -> new UnsupportedCatalogException("hive")); + } else { + uri = new URI(confUri); + } + DatasetIdentifier metastoreIdentifier = + PathUtils.fromPath( + new Path(PathUtils.enrichHiveMetastoreURIWithTableName(uri, slashPrefixedTable))); + + return new DatasetIdentifier.Symlink( + metastoreIdentifier.getName(), + metastoreIdentifier.getNamespace(), + DatasetIdentifier.SymlinkType.TABLE); + } + + @SneakyThrows + private DatasetIdentifier.Symlink getRestIdentifier( + SparkSession session, @Nullable String confUri, String table) { + + String uri = new URI(confUri).toString(); + return new DatasetIdentifier.Symlink(table, uri, DatasetIdentifier.SymlinkType.TABLE); + } + + @Override + public Optional getStorageDatasetFacet( + Map properties) { + String format = properties.getOrDefault("format", ""); + return Optional.of( + context.getOpenLineage().newStorageDatasetFacet("iceberg", format.replace("iceberg/", ""))); + } + + @SneakyThrows + @Override + public Optional getDatasetVersion( + TableCatalog tableCatalog, Identifier identifier, Map properties) { + SparkTable table; + try { + table = (SparkTable) tableCatalog.loadTable(identifier); + } catch (NoSuchTableException | ClassCastException e) { + log.error("Failed to load table from catalog: {}", identifier, e); + return Optional.empty(); + } + + if (table.table() != null && table.table().currentSnapshot() != null) { + return 
Optional.of(Long.toString(table.table().currentSnapshot().snapshotId())); + } + return Optional.empty(); + } + + @Override + public String getName() { + return "iceberg"; + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/HdfsPathDatasetTest.java b/metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/HdfsPathDatasetTest.java new file mode 100644 index 00000000000000..bed4c197f96912 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/HdfsPathDatasetTest.java @@ -0,0 +1,291 @@ +package datahub.spark; + +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import datahub.spark.conf.SparkAppContext; +import datahub.spark.conf.SparkConfigParser; +import datahub.spark.conf.SparkLineageConf; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.dataset.HdfsPathDataset; +import io.datahubproject.openlineage.dataset.SparkDataset; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.HashMap; +import lombok.extern.slf4j.Slf4j; +import org.junit.Assert; +import org.junit.Test; + +@Slf4j +public class HdfsPathDatasetTest { + + @Test + public void testNoPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + log.warn("Test log"); + SparkDataset dataset = + HdfsPathDataset.create( + new URI("s3://my-bucket/foo/tests/bar.avro"), + sparkLineageConfBuilder.build().getOpenLineageConf()); + Assert.assertEquals( + 
"urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)", + dataset.urn().toString()); + } + + @Test + public void testPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put( + SparkConfigParser.PLATFORM_KEY + + "." + + "s3" + + "." + + SparkConfigParser.PATH_SPEC_LIST_KEY, + String.join( + ",", "s3a://wrong-my-bucket/foo/{table}", "s3a://my-bucket/foo/{table}")); + } + }); + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + SparkDataset dataset = + HdfsPathDataset.create( + new URI("s3a://my-bucket/foo/tests/bar.avro"), + sparkLineageConfBuilder.build().getOpenLineageConf()); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests,PROD)", + dataset.urn().toString()); + } + + @Test + public void testUrisWithPartitionRegexp() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config rawDatahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put(SparkConfigParser.FILE_PARTITION_REGEXP_PATTERN, "year=.*/month=.*/day=.*"); + } + }); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + rawDatahubConfig, new SparkAppContext())); + DatahubOpenlineageConfig datahubConfig = sparkLineageConfBuilder.build().getOpenLineageConf(); + + SparkDataset dataset = + HdfsPathDataset.create( + new URI("s3://bucket-a/kafka_backup/my-table/year=2022/month=10/day=11/my-file.tx"), + datahubConfig); + Assert.assertEquals( + 
"urn:li:dataset:(urn:li:dataPlatform:s3,bucket-a/kafka_backup/my-table,PROD)", + dataset.urn().toString()); + + dataset = + HdfsPathDataset.create( + new URI("s3://bucket-b/kafka_backup/my-table/year=2023/month=11/day=23/my-file.tx"), + datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,bucket-b/kafka_backup/my-table,PROD)", + dataset.urn().toString()); + + dataset = + HdfsPathDataset.create( + new URI( + "s3://bucket-c/my-backup/my-other-folder/my-table/year=2023/month=11/day=23/my-file.tx"), + datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,bucket-c/my-backup/my-other-folder/my-table,PROD)", + dataset.urn().toString()); + + dataset = + HdfsPathDataset.create( + new URI("s3://bucket-d/kafka_backup/my-table/non-partitioned/"), datahubConfig); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,bucket-d/kafka_backup/my-table/non-partitioned,PROD)", + dataset.urn().toString()); + } + + @Test + public void testNoMatchPathSpecListWithFolder() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config rawDatahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + String gcsPath = + "gcs://gcs-spike-standard-offerwall-dev-useast1/events_creation_timestamp_enhanced"; + String expectedUrn = + "urn:li:dataset:(urn:li:dataPlatform:gcs,gcs-spike-standard-offerwall-dev-useast1/events_creation_timestamp_enhanced,PROD)"; + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + rawDatahubConfig, new SparkAppContext())); + DatahubOpenlineageConfig datahubConfig = sparkLineageConfBuilder.build().getOpenLineageConf(); + + SparkDataset dataset = HdfsPathDataset.create(new URI(gcsPath), datahubConfig); + Assert.assertEquals(expectedUrn, dataset.urn().toString()); + } + + 
@Test + public void testNoMatchPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put( + SparkConfigParser.PLATFORM_KEY + + "." + + "s3" + + "." + + SparkConfigParser.PATH_SPEC_LIST_KEY, + String.join(",", "s3a://wrong-my-bucket/foo/{table}")); + } + }); + SparkLineageConf sparkLineageConf = + SparkLineageConf.toSparkLineageConf(datahubConfig, null, null); + SparkDataset dataset = + HdfsPathDataset.create( + new URI("s3a://my-bucket/foo/tests/bar.avro"), sparkLineageConf.getOpenLineageConf()); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo/tests/bar.avro,PROD)", + dataset.urn().toString()); + } + + @Test + public void testPathSpecListPlatformInstance() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put(SparkConfigParser.DATASET_PLATFORM_INSTANCE_KEY, "instance"); + put( + SparkConfigParser.PLATFORM_KEY + + "." + + "s3" + + "." 
+ + SparkConfigParser.PATH_SPEC_LIST_KEY, + String.join( + ",", "s3a://wrong-my-bucket/foo/{table}", "s3a://my-bucket/foo/{table}")); + } + }); + SparkLineageConf sparkLineageConf = + SparkLineageConf.toSparkLineageConf(datahubConfig, null, null); + + SparkDataset dataset = + HdfsPathDataset.create( + new URI("s3a://my-bucket/foo/tests/bar.avro"), sparkLineageConf.getOpenLineageConf()); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,instance.my-bucket/foo/tests,PROD)", + dataset.urn().toString()); + } + + @Test + public void testPathAliasList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put( + SparkConfigParser.PLATFORM_KEY + + "." + + "s3" + + "." + + SparkConfigParser.PATH_SPEC_LIST_KEY, + String.join(",", "s3a://my-bucket/{table}")); + } + }); + SparkLineageConf sparkLineageConf = + SparkLineageConf.toSparkLineageConf(datahubConfig, null, null); + + SparkDataset dataset = + HdfsPathDataset.create( + new URI("s3a://my-bucket/foo/tests/bar.avro"), sparkLineageConf.getOpenLineageConf()); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,my-bucket/foo,PROD)", dataset.urn().toString()); + } + + // ==================================================================== + // GCS tests + // ==================================================================== + @Test + public void testGcsNoPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + SparkLineageConf sparkLineageConf = + SparkLineageConf.toSparkLineageConf(datahubConfig, null, null); + + SparkDataset dataset = + HdfsPathDataset.create( + new URI("gs://my-bucket/foo/tests/bar.avro"), sparkLineageConf.getOpenLineageConf()); + Assert.assertEquals( + 
"urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo/tests/bar.avro,PROD)", + dataset.urn().toString()); + } + + @Test + public void testGcsPathSpecList() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put( + SparkConfigParser.PLATFORM_KEY + + "." + + "gcs" + + "." + + SparkConfigParser.PATH_SPEC_LIST_KEY, + String.join( + ",", "s3a://wrong-my-bucket/foo/{table}", "gs://my-bucket/foo/{table}")); + } + }); + SparkLineageConf sparkLineageConf = + SparkLineageConf.toSparkLineageConf(datahubConfig, null, null); + + SparkDataset dataset = + HdfsPathDataset.create( + new URI("gs://my-bucket/foo/tests/bar.avro"), sparkLineageConf.getOpenLineageConf()); + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo/tests,PROD)", + dataset.urn().toString()); + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/SparkStreamingEventToDatahubTest.java b/metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/SparkStreamingEventToDatahubTest.java new file mode 100644 index 00000000000000..71c43cea408e6d --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/test/java/datahub/spark/SparkStreamingEventToDatahubTest.java @@ -0,0 +1,486 @@ +package datahub.spark; + +import com.linkedin.common.FabricType; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.dataprocess.RunResultType; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import datahub.spark.conf.SparkAppContext; +import datahub.spark.conf.SparkConfigParser; +import datahub.spark.conf.SparkLineageConf; +import datahub.spark.converter.SparkStreamingEventToDatahub; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.converter.OpenLineageToDataHub; +import 
io.datahubproject.openlineage.dataset.DatahubDataset; +import io.datahubproject.openlineage.dataset.DatahubJob; +import io.openlineage.client.OpenLineage; +import io.openlineage.client.OpenLineageClientUtils; +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Optional; +import java.util.stream.Stream; +import junit.framework.TestCase; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.tuple.Triple; +import org.junit.Assert; + +public class SparkStreamingEventToDatahubTest extends TestCase { + public void testGenerateUrnFromStreamingDescriptionFile() throws URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription( + "FileSink[/tmp/streaming_output/]", sparkLineageConfBuilder.build()); + assert (urn.isPresent()); + + assertEquals("hdfs", urn.get().getPlatformEntity().getPlatformNameEntity()); + assertEquals("/tmp/streaming_output", urn.get().getDatasetNameEntity()); + } + + public void testGenerateUrnFromStreamingDescriptionS3File() throws URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription( + 
"FileSink[s3://bucket/streaming_output/]", sparkLineageConfBuilder.build()); + assert (urn.isPresent()); + + assertEquals("s3", urn.get().getPlatformEntity().getPlatformNameEntity()); + assertEquals("bucket/streaming_output", urn.get().getDatasetNameEntity()); + } + + public void testGenerateUrnFromStreamingDescriptionS3AFile() throws URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription( + "FileSink[s3a://bucket/streaming_output/]", sparkLineageConfBuilder.build()); + assert (urn.isPresent()); + + assertEquals("s3", urn.get().getPlatformEntity().getPlatformNameEntity()); + assertEquals("bucket/streaming_output", urn.get().getDatasetNameEntity()); + } + + public void testGenerateUrnFromStreamingDescriptionGCSFile() throws URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription( + "FileSink[gcs://bucket/streaming_output/]", sparkLineageConfBuilder.build()); + assert (urn.isPresent()); + + assertEquals("gcs", urn.get().getPlatformEntity().getPlatformNameEntity()); + assertEquals("bucket/streaming_output", urn.get().getDatasetNameEntity()); + } + + public void testGenerateUrnFromStreamingDescriptionDeltaFile() throws 
URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription( + "DeltaSink[/tmp/streaming_output/]", sparkLineageConfBuilder.build()); + assert (urn.isPresent()); + + assertEquals("hdfs", urn.get().getPlatformEntity().getPlatformNameEntity()); + assertEquals("/tmp/streaming_output", urn.get().getDatasetNameEntity()); + } + + public void testGenerateUrnFromStreamingDescriptionGCSWithPathSpec() + throws InstantiationException, IllegalArgumentException, URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put( + SparkConfigParser.PLATFORM_KEY + + "." + + "gcs" + + "." 
+ + SparkConfigParser.PATH_SPEC_LIST_KEY, + String.join(",", "gcs://my-bucket/foo/{table}/*/*/*")); + } + }); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + SparkStreamingEventToDatahub.generateUrnFromStreamingDescription( + "DeltaSink[gcs://my-bucket/foo/tests/year=2023/month=03/day=11/myfile.parquet]", + sparkLineageConfBuilder.build()); + assert (urn.isPresent()); + + Assert.assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:gcs,my-bucket/foo/tests,PROD)", urn.get().toString()); + } + + public void testGcsDataset() throws URISyntaxException { + OpenLineage.OutputDataset outputDataset = + new OpenLineage.OutputDatasetBuilder() + .namespace("gs://spark-integration-tests") + .name("/spark-integration-test/test_gcs_delta_lake") + .build(); + + Config datahubConfig = ConfigFactory.empty(); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + OpenLineageToDataHub.convertOpenlineageDatasetToDatasetUrn( + outputDataset, sparkLineageConfBuilder.build().getOpenLineageConf()); + assert (urn.isPresent()); + assertEquals( + "spark-integration-tests/spark-integration-test/test_gcs_delta_lake", + urn.get().getDatasetNameEntity()); + } + + public void testGcsDatasetWithoutSlashInName() throws URISyntaxException { + OpenLineage.OutputDataset outputDataset = + new OpenLineage.OutputDatasetBuilder() + .namespace("gs://spark-integration-tests") + .name("spark-integration-test/test_gcs_delta_lake") + .build(); + + Config datahubConfig = ConfigFactory.empty(); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + 
sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + OpenLineageToDataHub.convertOpenlineageDatasetToDatasetUrn( + outputDataset, sparkLineageConfBuilder.build().getOpenLineageConf()); + assert (urn.isPresent()); + assertEquals( + "spark-integration-tests/spark-integration-test/test_gcs_delta_lake", + urn.get().getDatasetNameEntity()); + } + + public void testRemoveFilePrefixFromPath() throws URISyntaxException { + OpenLineage.OutputDataset outputDataset = + new OpenLineage.OutputDatasetBuilder() + .namespace("file") + .name("/tmp/streaming_output/file.txt") + .build(); + + Config datahubConfig = ConfigFactory.empty(); + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + OpenLineageToDataHub.convertOpenlineageDatasetToDatasetUrn( + outputDataset, sparkLineageConfBuilder.build().getOpenLineageConf()); + assert (urn.isPresent()); + assertEquals("/tmp/streaming_output/file.txt", urn.get().getDatasetNameEntity()); + } + + public void testRemoveFilePrefixFromPathWithPlatformInstance() throws URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put(SparkConfigParser.DATASET_PLATFORM_INSTANCE_KEY, "my-platfrom-instance"); + } + }); + + OpenLineage.OutputDataset outputDataset = + new OpenLineage.OutputDatasetBuilder() + .namespace("file") + .name("/tmp/streaming_output/file.txt") + .build(); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + 
OpenLineageToDataHub.convertOpenlineageDatasetToDatasetUrn( + outputDataset, sparkLineageConfBuilder.build().getOpenLineageConf()); + assert (urn.isPresent()); + assertEquals( + "my-platfrom-instance./tmp/streaming_output/file.txt", urn.get().getDatasetNameEntity()); + } + + public void testOpenlineageDatasetWithPathSpec() throws URISyntaxException { + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + put( + SparkConfigParser.PLATFORM_KEY + + "." + + "s3" + + "." + + SparkConfigParser.PATH_SPEC_LIST_KEY, + String.join( + ",", + "s3a://data-482ajm7100-longtailcompanions-demo-795586375822-usw2/kafka_backup/{table}/year=*/month=*/day=*/*")); + } + }); + + OpenLineage.OutputDataset outputDataset = + new OpenLineage.OutputDatasetBuilder() + .namespace("s3a://data-482ajm7100-longtailcompanions-demo-795586375822-usw2") + .name( + "/kafka_backup/482ajm7100-longtailcompanions_MCL_Timeseries/year=2023/month=03/day=23") + .build(); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + OpenLineageToDataHub.convertOpenlineageDatasetToDatasetUrn( + outputDataset, sparkLineageConfBuilder.build().getOpenLineageConf()); + assert (urn.isPresent()); + assertEquals( + "data-482ajm7100-longtailcompanions-demo-795586375822-usw2/kafka_backup/482ajm7100-longtailcompanions_MCL_Timeseries", + urn.get().getDatasetNameEntity()); + } + + public void testOpenlineageTableDataset() throws URISyntaxException { + // https://openlineage.io/docs/spec/naming#dataset-naming + Stream> testCases = + Stream.of( + Triple.of( + "postgres://db.foo.com:6543", + "metrics.sales.orders", + "urn:li:dataset:(urn:li:dataPlatform:postgres,metrics.sales.orders,PROD)"), + Triple.of( + "mysql://db.foo.com:6543", + "metrics.orders", + 
"urn:li:dataset:(urn:li:dataPlatform:mysql,metrics.orders,PROD)"), + Triple.of( + "s3://sales-metrics", + "orders.csv", + "urn:li:dataset:(urn:li:dataPlatform:s3,sales-metrics/orders.csv,PROD)"), + Triple.of( + "gcs://sales-metrics", + "orders.csv", + "urn:li:dataset:(urn:li:dataPlatform:gcs,sales-metrics/orders.csv,PROD)"), + Triple.of( + "hdfs://stg.foo.com:3000", + "salesorders.csv", + "urn:li:dataset:(urn:li:dataPlatform:hdfs,salesorders.csv,PROD)"), + Triple.of( + "bigquery", + "metrics.sales.orders", + "urn:li:dataset:(urn:li:dataPlatform:bigquery,metrics.sales.orders,PROD)"), + Triple.of( + "redshift://examplecluster.XXXXXXXXXXXX.us-west-2.redshift.amazonaws.com:5439", + "metrics.sales.orders", + "urn:li:dataset:(urn:li:dataPlatform:redshift,metrics.sales.orders,PROD)"), + Triple.of( + "awsathena://athena.us-west-2.amazonaws.com", + "metrics.sales.orders", + "urn:li:dataset:(urn:li:dataPlatform:athena,metrics.sales.orders,PROD)"), + Triple.of( + "sqlserver://XXXXXXXXXXXX.sql.azuresynapse.net:1433", + "SQLPool1/sales.orders", + "urn:li:dataset:(urn:li:dataPlatform:mssql,SQLPool1/sales.orders,PROD)"), + Triple.of( + "azurecosmos://XXXXXXXXXXXX.documents.azure.com/dbs", + "metrics.colls.orders", + "urn:li:dataset:(urn:li:dataPlatform:azurecosmos,metrics.colls.orders,PROD)")); + Config datahubConfig = + ConfigFactory.parseMap( + new HashMap() { + { + put(SparkConfigParser.DATASET_ENV_KEY, "PROD"); + } + }); + + testCases.forEach( + args -> { + String namespace = args.getLeft(); + String datasetName = args.getMiddle(); + String expectedUrn = args.getRight(); + + OpenLineage.OutputDataset outputDataset = + new OpenLineage.OutputDatasetBuilder().namespace(namespace).name(datasetName).build(); + + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = + SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + + Optional urn = + 
OpenLineageToDataHub.convertOpenlineageDatasetToDatasetUrn( + outputDataset, sparkLineageConfBuilder.build().getOpenLineageConf()); + assert (urn.isPresent()); + assertEquals(expectedUrn, urn.get().toString()); + }); + } + + public void testProcessOlEvent() throws URISyntaxException, IOException { + OpenLineage.OutputDataset outputDataset = + new OpenLineage.OutputDatasetBuilder() + .namespace("file") + .name("/tmp/streaming_output/file.txt") + .build(); + + Config datahubConfig = ConfigFactory.empty(); + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + String olEvent = + IOUtils.toString( + this.getClass().getResourceAsStream("/ol_events/sample_spark.json"), + StandardCharsets.UTF_8); + + OpenLineage.RunEvent runEvent = OpenLineageClientUtils.runEventFromJson(olEvent); + DatahubJob datahubJob = + OpenLineageToDataHub.convertRunEventToJob( + runEvent, sparkLineageConfBuilder.build().getOpenLineageConf()); + assertNotNull(datahubJob); + } + + public void testProcessOlFailedEvent() throws URISyntaxException, IOException { + + Config datahubConfig = ConfigFactory.empty(); + SparkLineageConf.SparkLineageConfBuilder sparkLineageConfBuilder = SparkLineageConf.builder(); + sparkLineageConfBuilder.openLineageConf( + SparkConfigParser.sparkConfigToDatahubOpenlineageConf( + datahubConfig, new SparkAppContext())); + String olEvent = + IOUtils.toString( + this.getClass().getResourceAsStream("/ol_events/sample_failed_spark.json"), + StandardCharsets.UTF_8); + + OpenLineage.RunEvent runEvent = OpenLineageClientUtils.runEventFromJson(olEvent); + DatahubJob datahubJob = + OpenLineageToDataHub.convertRunEventToJob( + runEvent, sparkLineageConfBuilder.build().getOpenLineageConf()); + assertNotNull(datahubJob); + assertEquals("cloud_trail_log_statistics", datahubJob.getDataFlowInfo().getName()); + 
assertEquals( + RunResultType.FAILURE, datahubJob.getDataProcessInstanceRunEvent().getResult().getType()); + } + + public void testProcessOlEventWithSetFlowname() throws URISyntaxException, IOException { + DatahubOpenlineageConfig.DatahubOpenlineageConfigBuilder builder = + DatahubOpenlineageConfig.builder(); + builder.pipelineName("my_flow_name"); + + String olEvent = + IOUtils.toString( + this.getClass().getResourceAsStream("/ol_events/sample_failed_spark.json"), + StandardCharsets.UTF_8); + + OpenLineage.RunEvent runEvent = OpenLineageClientUtils.runEventFromJson(olEvent); + DatahubJob datahubJob = OpenLineageToDataHub.convertRunEventToJob(runEvent, builder.build()); + assertNotNull(datahubJob); + assertEquals("my_flow_name", datahubJob.getDataFlowInfo().getName()); + for (DatahubDataset dataset : datahubJob.getInSet()) { + assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,aws-cloudtrail-logs-795586375822-837d93fd/AWSLogs/795586375822/CloudTrail/eu-west-1,PROD)", + dataset.getUrn().toString()); + } + for (DatahubDataset dataset : datahubJob.getOutSet()) { + assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,acryl-datahub-offline/tmp/daily_stats2,PROD)", + dataset.getUrn().toString()); + } + + assertEquals( + RunResultType.FAILURE, datahubJob.getDataProcessInstanceRunEvent().getResult().getType()); + } + + public void testProcessOlEventWithSetDatasetFabricType() throws URISyntaxException, IOException { + DatahubOpenlineageConfig.DatahubOpenlineageConfigBuilder builder = + DatahubOpenlineageConfig.builder(); + builder.fabricType(FabricType.DEV); + + String olEvent = + IOUtils.toString( + this.getClass().getResourceAsStream("/ol_events/sample_failed_spark.json"), + StandardCharsets.UTF_8); + + OpenLineage.RunEvent runEvent = OpenLineageClientUtils.runEventFromJson(olEvent); + DatahubJob datahubJob = OpenLineageToDataHub.convertRunEventToJob(runEvent, builder.build()); + + assertNotNull(datahubJob); + + for (DatahubDataset dataset : datahubJob.getInSet()) 
{ + assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,aws-cloudtrail-logs-795586375822-837d93fd/AWSLogs/795586375822/CloudTrail/eu-west-1,DEV)", + dataset.getUrn().toString()); + } + for (DatahubDataset dataset : datahubJob.getOutSet()) { + assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:s3,acryl-datahub-offline/tmp/daily_stats2,DEV)", + dataset.getUrn().toString()); + } + } +} diff --git a/metadata-integration/java/spark-lineage-beta/src/test/resources/log4j.properties b/metadata-integration/java/spark-lineage-beta/src/test/resources/log4j.properties new file mode 100644 index 00000000000000..6c7b54c239c4a5 --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/test/resources/log4j.properties @@ -0,0 +1,10 @@ +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# datahub to info +log4j.logger.io.datahub=DEBUG +log4j.logger.datahub.spark=DEBUG +log4j.logger.io.openlineage=INFO \ No newline at end of file diff --git a/metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_failed_spark.json b/metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_failed_spark.json new file mode 100644 index 00000000000000..a59af01628cbdd --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_failed_spark.json @@ -0,0 +1,104 @@ +{ + "eventTime": "2023-09-27T13:38:39.278Z", + "producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunEvent", + "eventType": "FAIL", + "run": { + "runId": "9f06a36e-97c5-4d4a-9e92-0aef9ed2be4c", + "facets": { + "processing_engine": { + "_producer": 
"https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-1-0/ProcessingEngineRunFacet.json#/$defs/ProcessingEngineRunFacet", + "version": "3.0.3", + "name": "spark" + }, + "environment-properties": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "environment-properties": {} + }, + "spark_version": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "spark-version": "3.0.3" + }, + "spark.exception": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "message": "Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 246, tamass-mbp-2.chello.hu, executor driver): org.apache.spark.SparkException: Task failed while writing rows.\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: 
java.nio.file.AccessDeniedException: delete on s3a://acryl-datahub-offline/tmp/daily_stats2/_temporary/0/_temporary/attempt_202309271538167729286925835797846_0002_m_000000_246: com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 8NSVQ0ZA5AE9C3J6; S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=), S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=:null: AccessDenied: tmp/daily_stats2/_temporary/0/_temporary/attempt_202309271538167729286925835797846_0002_m_000000_246/part-00000-ac0d9627-17c4-47ee-9559-4c52ce9a9975-c000.csv: Access Denied\n\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateMultiObjectDeleteException(S3AUtils.java:460)\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:269)\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:151)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.delete(S3AFileSystem.java:1727)\n\tat org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:637)\n\tat org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:626)\n\tat org.apache.spark.mapred.SparkHadoopMapRedUtil$.performCommit$1(SparkHadoopMapRedUtil.scala:55)\n\tat org.apache.spark.mapred.SparkHadoopMapRedUtil$.commitTask(SparkHadoopMapRedUtil.scala:77)\n\tat org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitTask(HadoopMapReduceCommitProtocol.scala:254)\n\tat org.apache.spark.sql.execution.datasources.FileFormatDataWriter.commit(FileFormatDataWriter.scala:79)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:280)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)\n\tat 
org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)\n\t... 9 more\nCaused by: com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 8NSVQ0ZA5AE9C3J6; S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=), S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=\n\tat com.amazonaws.services.s3.AmazonS3Client.deleteObjects(AmazonS3Client.java:2146)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$deleteObjects$8(S3AFileSystem.java:1420)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:322)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:285)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.deleteObjects(S3AFileSystem.java:1416)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.removeKeys(S3AFileSystem.java:1676)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerDelete(S3AFileSystem.java:1796)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.delete(S3AFileSystem.java:1711)\n\t... 
18 more\n\nDriver stacktrace:", + "stackTrace": "org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 246, tamass-mbp-2.chello.hu, executor driver): org.apache.spark.SparkException: Task failed while writing rows.\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: java.nio.file.AccessDeniedException: delete on s3a://acryl-datahub-offline/tmp/daily_stats2/_temporary/0/_temporary/attempt_202309271538167729286925835797846_0002_m_000000_246: com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 8NSVQ0ZA5AE9C3J6; S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=), S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=:null: AccessDenied: tmp/daily_stats2/_temporary/0/_temporary/attempt_202309271538167729286925835797846_0002_m_000000_246/part-00000-ac0d9627-17c4-47ee-9559-4c52ce9a9975-c000.csv: Access Denied\n\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateMultiObjectDeleteException(S3AUtils.java:460)\n\tat 
org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:269)\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:151)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.delete(S3AFileSystem.java:1727)\n\tat org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:637)\n\tat org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:626)\n\tat org.apache.spark.mapred.SparkHadoopMapRedUtil$.performCommit$1(SparkHadoopMapRedUtil.scala:55)\n\tat org.apache.spark.mapred.SparkHadoopMapRedUtil$.commitTask(SparkHadoopMapRedUtil.scala:77)\n\tat org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitTask(HadoopMapReduceCommitProtocol.scala:254)\n\tat org.apache.spark.sql.execution.datasources.FileFormatDataWriter.commit(FileFormatDataWriter.scala:79)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:280)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)\n\t... 
9 more\nCaused by: com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 8NSVQ0ZA5AE9C3J6; S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=), S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=\n\tat com.amazonaws.services.s3.AmazonS3Client.deleteObjects(AmazonS3Client.java:2146)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$deleteObjects$8(S3AFileSystem.java:1420)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:322)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:285)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.deleteObjects(S3AFileSystem.java:1416)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.removeKeys(S3AFileSystem.java:1676)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerDelete(S3AFileSystem.java:1796)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.delete(S3AFileSystem.java:1711)\n\t... 
18 more\n\nDriver stacktrace:\n\tat org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)\n\tat scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)\n\tat scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)\n\tat scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)\n\tat org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)\n\tat scala.Option.foreach(Option.scala:407)\n\tat org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)\n\tat org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)\n\tat org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)\n\tat org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:200)\n\tat org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:178)\n\tat org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)\n\tat org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)\n\tat 
org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)\n\tat org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)\n\tat org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)\n\tat org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)\n\tat org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:127)\n\tat org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:126)\n\tat org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:962)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:767)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)\n\tat org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:962)\n\tat org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:414)\n\tat org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:398)\n\tat org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:287)\n\tat org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:952)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat 
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\nCaused by: org.apache.spark.SparkException: Task failed while writing rows.\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:296)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:127)\n\tat org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:463)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:466)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 
1 more\nCaused by: java.nio.file.AccessDeniedException: delete on s3a://acryl-datahub-offline/tmp/daily_stats2/_temporary/0/_temporary/attempt_202309271538167729286925835797846_0002_m_000000_246: com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 8NSVQ0ZA5AE9C3J6; S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=), S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=:null: AccessDenied: tmp/daily_stats2/_temporary/0/_temporary/attempt_202309271538167729286925835797846_0002_m_000000_246/part-00000-ac0d9627-17c4-47ee-9559-4c52ce9a9975-c000.csv: Access Denied\n\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateMultiObjectDeleteException(S3AUtils.java:460)\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:269)\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:151)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.delete(S3AFileSystem.java:1727)\n\tat org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:637)\n\tat org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.abortTask(FileOutputCommitter.java:626)\n\tat org.apache.spark.mapred.SparkHadoopMapRedUtil$.performCommit$1(SparkHadoopMapRedUtil.scala:55)\n\tat org.apache.spark.mapred.SparkHadoopMapRedUtil$.commitTask(SparkHadoopMapRedUtil.scala:77)\n\tat org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitTask(HadoopMapReduceCommitProtocol.scala:254)\n\tat org.apache.spark.sql.execution.datasources.FileFormatDataWriter.commit(FileFormatDataWriter.scala:79)\n\tat org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:280)\n\tat org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)\n\tat 
org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)\n\t... 9 more\nCaused by: com.amazonaws.services.s3.model.MultiObjectDeleteException: One or more objects could not be deleted (Service: null; Status Code: 200; Error Code: null; Request ID: 8NSVQ0ZA5AE9C3J6; S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=), S3 Extended Request ID: EC7gYrBqATpz1PTW5yN1sy9VyXW8yIVIrfbuwKUmwXkF4skjjxp6eMjpkHytY4cXII/NQOguMBc=\n\tat com.amazonaws.services.s3.AmazonS3Client.deleteObjects(AmazonS3Client.java:2146)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$deleteObjects$8(S3AFileSystem.java:1420)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:322)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:285)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.deleteObjects(S3AFileSystem.java:1416)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.removeKeys(S3AFileSystem.java:1676)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerDelete(S3AFileSystem.java:1796)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.delete(S3AFileSystem.java:1711)\n\t... 
18 more\n" + } + } + }, + "job": { + "namespace": "default", + "name": "cloud_trail_log_statistics.execute_insert_into_hadoop_fs_relation_command.tmp_daily_stats2", + "facets": {} + }, + "inputs": [ + { + "namespace": "s3a://aws-cloudtrail-logs-795586375822-837d93fd", + "name": "/AWSLogs/795586375822/CloudTrail/eu-west-1", + "facets": { + "dataSource": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/DatasourceDatasetFacet.json#/$defs/DatasourceDatasetFacet", + "name": "s3a://aws-cloudtrail-logs-795586375822-837d93fd", + "uri": "s3a://aws-cloudtrail-logs-795586375822-837d93fd" + }, + "schema": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json#/$defs/SchemaDatasetFacet", + "fields": [ + { + "name": "Records", + "type": "array" + } + ] + } + }, + "inputFacets": {} + } + ], + "outputs": [ + { + "namespace": "s3a://acryl-datahub-offline", + "name": "/tmp/daily_stats2", + "facets": { + "dataSource": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/DatasourceDatasetFacet.json#/$defs/DatasourceDatasetFacet", + "name": "s3a://acryl-datahub-offline", + "uri": "s3a://acryl-datahub-offline" + }, + "schema": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json#/$defs/SchemaDatasetFacet", + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "count", + "type": "long" + } + ] + }, + "lifecycleStateChange": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": 
"https://openlineage.io/spec/facets/1-0-0/LifecycleStateChangeDatasetFacet.json#/$defs/LifecycleStateChangeDatasetFacet", + "lifecycleStateChange": "OVERWRITE" + } + }, + "outputFacets": { + "outputStatistics": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/OutputStatisticsOutputDatasetFacet.json#/$defs/OutputStatisticsOutputDatasetFacet", + "rowCount": 0, + "size": 0 + } + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_spark.json b/metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_spark.json new file mode 100644 index 00000000000000..77a6ebc4044bdf --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/test/resources/ol_events/sample_spark.json @@ -0,0 +1,114 @@ +{ + "eventTime": "2023-09-27T10:22:12.391Z", + "producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunEvent", + "eventType": "START", + "run": { + "runId": "580230ff-c2ef-47c3-bb5f-c395fc93ba56", + "facets": { + "spark_version": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "spark-version": "3.0.3" + }, + "spark_properties": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "properties": { + "spark.master": "local[*]", + "spark.app.name": "SimpleAppJsonNew" + } + }, + "processing_engine": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-1-0/ProcessingEngineRunFacet.json#/$defs/ProcessingEngineRunFacet", + 
"version": "3.0.3", + "name": "spark" + }, + "environment-properties": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunFacet", + "environment-properties": {} + } + } + }, + "job": { + "namespace": "default", + "name": "simple_app_json_new.execute_insert_into_hadoop_fs_relation_command.spark-test_result", + "facets": {} + }, + "inputs": [ + { + "namespace": "file", + "name": "/Users/treff7es/shadow/spark-test/people.json", + "facets": { + "dataSource": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/DatasourceDatasetFacet.json#/$defs/DatasourceDatasetFacet", + "name": "file", + "uri": "file" + }, + "schema": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json#/$defs/SchemaDatasetFacet", + "fields": [ + { + "name": "age", + "type": "long" + }, + { + "name": "name", + "type": "string" + } + ] + } + }, + "inputFacets": {} + } + ], + "outputs": [ + { + "namespace": "file", + "name": "/Users/treff7es/shadow/spark-test/result", + "facets": { + "dataSource": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/DatasourceDatasetFacet.json#/$defs/DatasourceDatasetFacet", + "name": "file", + "uri": "file" + }, + "schema": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/SchemaDatasetFacet.json#/$defs/SchemaDatasetFacet", + "fields": [ + { + "name": "name", + "type": "string" + } + ] + }, + "columnLineage": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": 
"https://openlineage.io/spec/facets/1-0-1/ColumnLineageDatasetFacet.json#/$defs/ColumnLineageDatasetFacet", + "fields": { + "name": { + "inputFields": [ + { + "namespace": "file", + "name": "/Users/treff7es/shadow/spark-test/people.json", + "field": "name" + } + ] + } + } + }, + "lifecycleStateChange": { + "_producer": "https://github.com/OpenLineage/OpenLineage/tree/1.2.2/integration/spark", + "_schemaURL": "https://openlineage.io/spec/facets/1-0-0/LifecycleStateChangeDatasetFacet.json#/$defs/LifecycleStateChangeDatasetFacet", + "lifecycleStateChange": "OVERWRITE" + } + }, + "outputFacets": {} + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/spark-lineage-beta/src/test/resources/org.apache.spark/log4j-defaults.properties b/metadata-integration/java/spark-lineage-beta/src/test/resources/org.apache.spark/log4j-defaults.properties new file mode 100644 index 00000000000000..c2e37900940dfd --- /dev/null +++ b/metadata-integration/java/spark-lineage-beta/src/test/resources/org.apache.spark/log4j-defaults.properties @@ -0,0 +1,9 @@ +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +log4j.logger.datahub.spark=DEBUG +log4j.logger.datahub.client.rest=DEBUG diff --git a/metadata-service/openapi-servlet/build.gradle b/metadata-service/openapi-servlet/build.gradle index 0430d4427528dd..32c40c31df42d7 100644 --- a/metadata-service/openapi-servlet/build.gradle +++ b/metadata-service/openapi-servlet/build.gradle @@ -1,5 +1,6 @@ plugins { id 'java' + id 'org.hidetake.swagger.generator' } dependencies { @@ -38,4 +39,37 @@ dependencies { testImplementation externalDependency.jacksonCore testImplementation externalDependency.jacksonDataBind testImplementation 
externalDependency.springBootStarterWeb -} \ No newline at end of file + + // Openlineage Specific Dependencies + implementation "io.openlineage:openlineage-java:$openLineageVersion" + implementation project(':metadata-integration:java:openlineage-converter') + implementation project(':metadata-integration:java:datahub-event') + swaggerCodegen externalDependency.swaggerCli +} + +sourceSets { + main { + java { + srcDirs = ["$buildDir/openapi/generated/src/main/java", 'src/main/java'] + } + } +} + +// https://github.com/int128/gradle-swagger-generator-plugin#task-type-generateswaggercode +task openApiGenerate(type: GenerateSwaggerCode) { + inputFile = file("$projectDir/src/main/resources/openlineage/openlineage.json") + outputDir = file("$buildDir/openapi/generated") + language = "spring" + components = ["apis"] + additionalProperties = [ + 'group-id' : "io.datahubproject", + 'dateLibrary' : "java8", + 'java11' : "true", + 'interfaceOnly' : 'true', + 'modelPropertyNaming': "original", + 'modelPackage' : "io.datahubproject.openlineage.model", + 'apiPackage' : "io.datahubproject.openlineage.generated.controller", + 'delegatePattern' : "true" + ] +} +tasks.getByName("compileJava").dependsOn(openApiGenerate) \ No newline at end of file diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java index 2336bea565e590..4a149f9ce82cc8 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java @@ -31,6 +31,8 @@ public class SpringWebConfig implements WebMvcConfigurer { private static final Set SCHEMA_REGISTRY_PACKAGES = Set.of("io.datahubproject.openapi.schema.registry"); + private static final Set OPENLINEAGE_PACKAGES = Set.of("io.datahubproject.openlineage"); + 
public static final Set NONDEFAULT_OPENAPI_PACKAGES; static { @@ -38,6 +40,7 @@ public class SpringWebConfig implements WebMvcConfigurer { NONDEFAULT_OPENAPI_PACKAGES.addAll(OPERATIONS_PACKAGES); NONDEFAULT_OPENAPI_PACKAGES.addAll(V2_PACKAGES); NONDEFAULT_OPENAPI_PACKAGES.addAll(SCHEMA_REGISTRY_PACKAGES); + NONDEFAULT_OPENAPI_PACKAGES.addAll(OPENLINEAGE_PACKAGES); } @Override @@ -76,4 +79,12 @@ public GroupedOpenApi openApiGroupV3() { .packagesToScan(V2_PACKAGES.toArray(String[]::new)) .build(); } + + @Bean + public GroupedOpenApi openlineageOpenApiGroup() { + return GroupedOpenApi.builder() + .group("openlineage") + .packagesToScan(OPENLINEAGE_PACKAGES.toArray(String[]::new)) + .build(); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/config/OpenLineageServletConfig.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/config/OpenLineageServletConfig.java new file mode 100644 index 00000000000000..fa1569fa8cf621 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/config/OpenLineageServletConfig.java @@ -0,0 +1,29 @@ +package io.datahubproject.openapi.openlineage.config; + +import io.datahubproject.openapi.openlineage.mapping.RunEventMapper; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class OpenLineageServletConfig { + + @Bean + public RunEventMapper.MappingConfig mappingConfig() { + DatahubOpenlineageConfig datahubOpenlineageConfig = + DatahubOpenlineageConfig.builder() + .isStreaming(false) + .pipelineName(null) + .platformInstance(null) + .commonDatasetPlatformInstance(null) + .platform(null) + .filePartitionRegexpPattern(null) + .materializeDataset(true) + .includeSchemaMetadata(true) + .captureColumnLevelLineage(true) + .usePatch(false) + 
.parentJobUrn(null) + .build(); + return RunEventMapper.MappingConfig.builder().datahubConfig(datahubOpenlineageConfig).build(); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java new file mode 100644 index 00000000000000..b849ff588bee51 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/controller/LineageApiImpl.java @@ -0,0 +1,86 @@ +package io.datahubproject.openapi.openlineage.controller; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthorizerChain; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.mxe.MetadataChangeProposal; +import io.datahubproject.openapi.openlineage.mapping.RunEventMapper; +import io.datahubproject.openlineage.generated.controller.LineageApi; +import io.openlineage.client.OpenLineage; +import io.openlineage.client.OpenLineageClientUtils; +import java.util.Optional; +import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RestController; + +@RestController +@RequestMapping("/openlineage/api/v1") +@Slf4j +public class LineageApiImpl implements LineageApi { + private static final ObjectMapper OBJECT_MAPPER = OpenLineageClientUtils.newObjectMapper(); + + @Autowired private 
RunEventMapper.MappingConfig _mappingConfig; + + // @Autowired + // @Qualifier("javaEntityClient") + // private EntityClient _entityClient; + + @Autowired private EntityServiceImpl _entityService; + + @Autowired private AuthorizerChain _authorizerChain; + + @Value("${authorization.restApiAuthorization:false}") + private boolean restApiAuthorizationEnabled; + + @Override + public Optional getObjectMapper() { + return Optional.of(OBJECT_MAPPER); + } + + @Override + public ResponseEntity postRunEventRaw(String body) { + try { + log.info("Received lineage event: {}", body); + OpenLineage.RunEvent openlineageRunEvent = OpenLineageClientUtils.runEventFromJson(body); + log.info("Deserialized to lineage event: {}", openlineageRunEvent); + return postRunEventRaw(openlineageRunEvent); + } catch (Exception e) { + log.error(e.getMessage(), e); + return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR); + } + } + + public ResponseEntity postRunEventRaw(OpenLineage.RunEvent openlineageRunEvent) { + Authentication authentication = AuthenticationContext.getAuthentication(); + log.info("PostRun received lineage event: {}", openlineageRunEvent); + + RunEventMapper runEventMapper = new RunEventMapper(); + AuditStamp auditStamp = + new AuditStamp() + .setActor(UrnUtils.getUrn(authentication.getActor().toUrnStr())) + .setTime(System.currentTimeMillis()); + try { + for (MetadataChangeProposal mcp : + runEventMapper + .map(openlineageRunEvent, this._mappingConfig) + .collect(Collectors.toList())) { + log.info("Ingesting MCP: {}", mcp); + _entityService.ingestProposal(mcp, auditStamp, true); + } + return new ResponseEntity<>(HttpStatus.OK); + } catch (Exception e) { + // log.error(e.getMessage(), e); + throw new RuntimeException(e); + // return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR); + } + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/mapping/RunEventMapper.java 
b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/mapping/RunEventMapper.java new file mode 100644 index 00000000000000..6704d3d6eea51b --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/mapping/RunEventMapper.java @@ -0,0 +1,37 @@ +package io.datahubproject.openapi.openlineage.mapping; + +import com.linkedin.mxe.MetadataChangeProposal; +import datahub.event.EventFormatter; +import io.datahubproject.openlineage.config.DatahubOpenlineageConfig; +import io.datahubproject.openlineage.converter.OpenLineageToDataHub; +import io.openlineage.client.OpenLineage; +import java.io.IOException; +import java.net.URISyntaxException; +import java.util.stream.Stream; +import lombok.Builder; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class RunEventMapper { + + public RunEventMapper() {} + + public Stream map( + OpenLineage.RunEvent runEvent, RunEventMapper.MappingConfig mappingConfig) { + EventFormatter eventFormatter = new EventFormatter(); + try { + return OpenLineageToDataHub.convertRunEventToJob(runEvent, mappingConfig.getDatahubConfig()) + .toMcps(mappingConfig.datahubConfig) + .stream(); + } catch (IOException | URISyntaxException e) { + throw new RuntimeException(e); + } + } + + @Builder + @Getter + public static class MappingConfig { + DatahubOpenlineageConfig datahubConfig; + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/model/LineageBody.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/model/LineageBody.java new file mode 100644 index 00000000000000..151d4cad7b95ab --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/openlineage/model/LineageBody.java @@ -0,0 +1,26 @@ +package io.datahubproject.openapi.openlineage.model; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import 
com.fasterxml.jackson.annotation.JsonTypeInfo; +import io.openlineage.server.OpenLineage; + +/** LineageBody */ +@JsonTypeInfo( + use = JsonTypeInfo.Id.NAME, + include = JsonTypeInfo.As.PROPERTY, + property = "schemaURL") +@JsonSubTypes({ + @JsonSubTypes.Type(value = OpenLineage.RunEvent.class, name = LineageBody.RUN_EVENT_SCHEMA), + @JsonSubTypes.Type( + value = OpenLineage.DatasetEvent.class, + name = LineageBody.DATASET_EVENT_SCHEMA), + @JsonSubTypes.Type(value = OpenLineage.JobEvent.class, name = LineageBody.JOB_EVENT_SCHEMA) +}) +public interface LineageBody extends OpenLineage.BaseEvent { + String RUN_EVENT_SCHEMA = + "https://openlineage.io/spec/2-0-0/OpenLineage.json#/definitions/RunEvent"; + String DATASET_EVENT_SCHEMA = + "https://openlineage.io/spec/2-0-0/OpenLineage.json#/definitions/DatasetEvent"; + String JOB_EVENT_SCHEMA = + "https://openlineage.io/spec/2-0-0/OpenLineage.json#/definitions/JobEvent"; +} diff --git a/metadata-service/openapi-servlet/src/main/resources/openlineage/openlineage.json b/metadata-service/openapi-servlet/src/main/resources/openlineage/openlineage.json new file mode 100644 index 00000000000000..9492004e9e6c91 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/resources/openlineage/openlineage.json @@ -0,0 +1,413 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "OpenLineage", + "version": "2-0-2", + "description": "OpenLineage is an open source **lineage and metadata collection API** for the data ecosystem.", + "license": { + "name": "Apache 2.0", + "url": "http://www.apache.org/licenses/LICENSE-2.0.html" + } + }, + "paths": { + "/lineage": { + "post": { + "summary": "Send an event related to the state of a run", + "description": "Updates a run state for a job.", + "operationId": "postRunEventRaw", + "tags": [ + "OpenLineage" + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "string" + } + } + } + }, + "responses": { + "200": { + "description": "OK" + } + } + } + } + }, + 
"components": { + "schemas": { + "BaseEvent": { + "type": "object", + "properties": { + "eventTime": { + "description": "the time the event occurred at", + "type": "string", + "format": "date-time" + }, + "producer": { + "description": "URI identifying the producer of this metadata. For example this could be a git url with a given tag or sha", + "type": "string", + "format": "uri", + "example": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client" + }, + "schemaURL": { + "description": "The JSON Pointer (https://tools.ietf.org/html/rfc6901) URL to the corresponding version of the schema definition for this RunEvent", + "type": "string", + "format": "uri", + "example": "https://openlineage.io/spec/0-0-1/OpenLineage.json" + } + }, + "required": [ + "eventTime", + "producer", + "schemaURL" + ] + }, + "BaseFacet": { + "description": "all fields of the base facet are prefixed with _ to avoid name conflicts in facets", + "type": "object", + "properties": { + "_producer": { + "description": "URI identifying the producer of this metadata. 
For example this could be a git url with a given tag or sha", + "type": "string", + "format": "uri", + "example": "https://github.com/OpenLineage/OpenLineage/blob/v1-0-0/client" + }, + "_schemaURL": { + "description": "The JSON Pointer (https://tools.ietf.org/html/rfc6901) URL to the corresponding version of the schema definition for this facet", + "type": "string", + "format": "uri", + "example": "https://openlineage.io/spec/1-0-2/OpenLineage.json#/$defs/BaseFacet" + } + }, + "additionalProperties": true, + "required": [ + "_producer", + "_schemaURL" + ] + }, + "RunFacet": { + "description": "A Run Facet", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/BaseFacet" + } + ] + }, + "Run": { + "type": "object", + "properties": { + "runId": { + "description": "The globally unique ID of the run associated with the job.", + "type": "string", + "format": "uuid" + }, + "facets": { + "description": "The run facets.", + "type": "object", + "anyOf": [ + { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/RunFacet" + } + } + ] + } + }, + "required": [ + "runId" + ] + }, + "JobFacet": { + "description": "A Job Facet", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/BaseFacet" + }, + { + "type": "object", + "properties": { + "_deleted": { + "description": "set to true to delete a facet", + "type": "boolean" + } + } + } + ] + }, + "Job": { + "type": "object", + "properties": { + "namespace": { + "description": "The namespace containing that job", + "type": "string", + "example": "my-scheduler-namespace" + }, + "name": { + "description": "The unique name for that job within that namespace", + "type": "string", + "example": "myjob.mytask" + }, + "facets": { + "description": "The job facets.", + "type": "object", + "anyOf": [ + { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/JobFacet" + } + } + ] + } + }, + "required": [ + "namespace", + "name" + ] + }, + "DatasetFacet": { + 
"description": "A Dataset Facet", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/BaseFacet" + }, + { + "type": "object", + "properties": { + "_deleted": { + "description": "set to true to delete a facet", + "type": "boolean" + } + } + } + ] + }, + "Dataset": { + "type": "object", + "properties": { + "namespace": { + "description": "The namespace containing that dataset", + "type": "string", + "example": "my-datasource-namespace" + }, + "name": { + "description": "The unique name for that dataset within that namespace", + "type": "string", + "example": "instance.schema.table" + }, + "facets": { + "description": "The facets for this dataset", + "type": "object", + "anyOf": [ + { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/DatasetFacet" + } + } + ] + } + }, + "required": [ + "namespace", + "name" + ] + }, + "InputDatasetFacet": { + "description": "An Input Dataset Facet", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/BaseFacet" + } + ] + }, + "InputDataset": { + "description": "An input dataset", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/Dataset" + }, + { + "type": "object", + "properties": { + "inputFacets": { + "description": "The input facets for this dataset.", + "type": "object", + "anyOf": [ + { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/InputDatasetFacet" + } + } + ] + } + } + } + ] + }, + "OutputDatasetFacet": { + "description": "An Output Dataset Facet", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/BaseFacet" + } + ] + }, + "OutputDataset": { + "description": "An output dataset", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/Dataset" + }, + { + "type": "object", + "properties": { + "outputFacets": { + "description": "The output facets for this dataset", + "type": "object", + "anyOf": [ + { + "type": "object", + "additionalProperties": { + "$ref": 
"#/components/schemas/OutputDatasetFacet" + } + } + ] + } + } + } + ] + }, + "RunEvent": { + "allOf": [ + { + "$ref": "#/components/schemas/BaseEvent" + }, + { + "type": "object", + "properties": { + "eventType": { + "description": "the current transition of the run state. It is required to issue 1 START event and 1 of [ COMPLETE, ABORT, FAIL ] event per run. Additional events with OTHER eventType can be added to the same run. For example to send additional metadata after the run is complete", + "type": "string", + "enum": [ + "START", + "RUNNING", + "COMPLETE", + "ABORT", + "FAIL", + "OTHER" + ], + "example": "START|RUNNING|COMPLETE|ABORT|FAIL|OTHER" + }, + "run": { + "$ref": "#/components/schemas/Run" + }, + "job": { + "$ref": "#/components/schemas/Job" + }, + "inputs": { + "description": "The set of **input** datasets.", + "type": "array", + "items": { + "$ref": "#/components/schemas/InputDataset" + } + }, + "outputs": { + "description": "The set of **output** datasets.", + "type": "array", + "items": { + "$ref": "#/components/schemas/OutputDataset" + } + } + }, + "required": [ + "run", + "job" + ] + } + ] + }, + "StaticDataset": { + "description": "A Dataset sent within static metadata events", + "type": "object", + "allOf": [ + { + "$ref": "#/components/schemas/Dataset" + } + ] + }, + "DatasetEvent": { + "allOf": [ + { + "$ref": "#/components/schemas/BaseEvent" + }, + { + "type": "object", + "properties": { + "dataset": { + "$ref": "#/components/schemas/StaticDataset" + } + }, + "required": [ + "dataset" + ], + "not": { + "required": [ + "job", + "run" + ] + } + } + ] + }, + "JobEvent": { + "allOf": [ + { + "$ref": "#/components/schemas/BaseEvent" + }, + { + "type": "object", + "properties": { + "job": { + "$ref": "#/components/schemas/Job" + }, + "inputs": { + "description": "The set of **input** datasets.", + "type": "array", + "items": { + "$ref": "#/components/schemas/InputDataset" + } + }, + "outputs": { + "description": "The set of **output** datasets.", 
+ "type": "array", + "items": { + "$ref": "#/components/schemas/OutputDataset" + } + } + }, + "required": [ + "job" + ], + "not": { + "required": [ + "run" + ] + } + } + ] + } + } + } +} \ No newline at end of file diff --git a/settings.gradle b/settings.gradle index e58c1c851c8f12..57d7d1dbc36f22 100644 --- a/settings.gradle +++ b/settings.gradle @@ -52,7 +52,10 @@ include 'metadata-models-custom' include 'entity-registry:custom-test-model' include 'metadata-integration:java:spark-lineage' include 'metadata-integration:java:datahub-client' +include 'metadata-integration:java:datahub-event' include 'metadata-integration:java:datahub-protobuf' +include 'metadata-integration:java:openlineage-converter' +include 'metadata-integration:java:spark-lineage-beta' include 'ingestion-scheduler' include 'metadata-ingestion-modules:airflow-plugin' include 'smoke-test' From 1b4f31bcfe09850a48be0bfb58b3ee71ab14b93d Mon Sep 17 00:00:00 2001 From: AvaniSiddhapuraAPT <156416042+AvaniSiddhapuraAPT@users.noreply.github.com> Date: Sat, 2 Mar 2024 02:33:47 +0530 Subject: [PATCH 04/13] fix(ingest/json-schema): adding support descriptions for array (#9757) --- .../ingestion/extractor/json_schema_util.py | 30 ++++++++-- .../unit/schema/test_json_schema_util.py | 57 ++++++++++++++++--- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index 52d2e4a8f56e3b..a1fab13fd4bbb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -417,15 +417,35 @@ def _field_from_complex_type( inner_field_path, ) elif datahub_field_type == ArrayTypeClass: - field_path = field_path.expand_type("array", schema) - # default items schema is string + field_path = field_path.expand_type(discriminated_type, schema) + yield SchemaField( + 
fieldPath=field_path.as_string(), + type=type_override or SchemaFieldDataTypeClass(type=ArrayTypeClass()), + nativeDataType=native_type_override + or JsonSchemaTranslator._get_discriminated_type_from_schema(schema), + description=JsonSchemaTranslator._get_description_from_any_schema( + schema + ), + nullable=nullable, + jsonProps=JsonSchemaTranslator._get_jsonprops_for_any_schema( + schema, required=required + ), + isPartOfKey=field_path.is_key_schema, + ) + items_schema = schema.get("items", {"type": "string"}) items_type = JsonSchemaTranslator._get_type_from_schema(items_schema) - field_path._set_parent_type_if_not_exists( - DataHubType(type=ArrayTypeClass, nested_type=items_type) + field_name = items_schema.get("title", None) + if not field_name: + field_name = items_type + inner_field_path = field_path.clone_plus( + FieldElement(type=[], name=field_name, schema_types=[]) ) yield from JsonSchemaTranslator.get_fields( - items_type, items_schema, required=False, base_field_path=field_path + items_type, + items_schema, + required=False, + base_field_path=inner_field_path, ) elif datahub_field_type == MapTypeClass: diff --git a/metadata-ingestion/tests/unit/schema/test_json_schema_util.py b/metadata-ingestion/tests/unit/schema/test_json_schema_util.py index 5e095fc0df8dce..34ccc3d4fb9225 100644 --- a/metadata-ingestion/tests/unit/schema/test_json_schema_util.py +++ b/metadata-ingestion/tests/unit/schema/test_json_schema_util.py @@ -153,15 +153,20 @@ def test_json_schema_with_recursion(): }, } fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) + expected_field_paths = [ { "path": "[version=2.0].[type=TreeNode].[type=integer].value", "type": NumberTypeClass, }, { - "path": "[version=2.0].[type=TreeNode].[type=array].[type=TreeNode].children", + "path": "[version=2.0].[type=TreeNode].[type=array].children", "type": ArrayTypeClass, }, + { + "path": "[version=2.0].[type=TreeNode].[type=array].children.[type=TreeNode].TreeNode", + "type": RecordTypeClass, 
+ }, ] assert_field_paths_match(fields, expected_field_paths) assert_fields_are_valid(fields) @@ -372,8 +377,10 @@ def test_nested_arrays(): fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) expected_field_paths: List[str] = [ - "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar", - "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar.[type=integer].a", + "[version=2.0].[type=NestedArray].[type=array].ar", + "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array", + "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo", + "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo.[type=integer].a", ] assert_field_paths_match(fields, expected_field_paths) assert isinstance(fields[0].type.type, ArrayTypeClass) @@ -496,14 +503,17 @@ def test_needs_disambiguation_nested_union_of_records_with_same_field_name(): }, } fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) + expected_field_paths: List[str] = [ "[version=2.0].[type=ABFooUnion].[type=union].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=integer].f", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=integer].f", ] assert_field_paths_match(fields, expected_field_paths) @@ 
-578,8 +588,10 @@ def test_key_schema_handling(): "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=number].f", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=number].f", ] assert_field_paths_match(fields, expected_field_paths) for f in fields: @@ -664,7 +676,8 @@ def test_simple_array(): fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) expected_field_paths: List[str] = [ - "[version=2.0].[type=ObjectWithArray].[type=array].[type=string].ar", + "[version=2.0].[type=ObjectWithArray].[type=array].ar", + "[version=2.0].[type=ObjectWithArray].[type=array].ar.[type=string].string", ] assert_field_paths_match(fields, expected_field_paths) assert isinstance(fields[0].type.type, ArrayTypeClass) @@ -846,3 +859,31 @@ def test_top_level_trival_allof(): assert json.loads(fields[1].jsonProps or "{}")["required"] is False assert json.loads(fields[2].jsonProps or "{}")["required"] is True assert json.loads(fields[3].jsonProps or "{}")["required"] is False + + +def test_description_extraction(): + schema = { + "$id": "test", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "bar": { + "type": "array", + "items": {"type": "string"}, + "description": "XYZ", + } + }, + } + fields = 
list(JsonSchemaTranslator.get_fields_from_schema(schema)) + expected_field_paths: List[str] = [ + "[version=2.0].[type=object].[type=array].bar", + "[version=2.0].[type=object].[type=array].bar.[type=string].string", + ] + assert_field_paths_match(fields, expected_field_paths) + assert_fields_are_valid(fields) + # Additional check for the description extraction + array_field = next( + field + for field in fields + if field.fieldPath == "[version=2.0].[type=object].[type=array].bar" + ) + assert array_field.description == "XYZ" From 70adf73081e86343f4fc51ca1261da8732dd3398 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 4 Mar 2024 04:31:11 -0800 Subject: [PATCH 05/13] fix(ingest/redshift): fix bug in lineage v2 table renames (#9967) --- .../src/datahub/ingestion/source/redshift/lineage_v2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 3fbba909b25e6c..9105b6f071286f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -1,3 +1,4 @@ +import collections import logging import traceback from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -116,7 +117,9 @@ def build( table_renames, _ = self._lineage_v1._process_table_renames( database=self.database, connection=connection, - all_tables={}, + all_tables=collections.defaultdict( + lambda: collections.defaultdict(set) + ), ) for new_urn, original_urn in table_renames.items(): self.aggregator.add_table_rename( From d987707cdede76614053f82165efe5664bb54f1a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 4 Mar 2024 04:31:39 -0800 Subject: [PATCH 06/13] feat(ingest): speed up to_obj() and validate() (#9969) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 6c7786f30b9ef9..886f455390e5db 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -34,7 +34,7 @@ "importlib_metadata>=4.0.0; python_version < '3.10'", "docker", "expandvars>=0.6.5", - "avro-gen3==0.7.11", + "avro-gen3==0.7.12", # "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3", "avro>=1.11.3,<1.12", "python-dateutil>=2.8.0", From 9dd0c378304e54673c269a7e8eb3fdc67bbe795c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 4 Mar 2024 17:09:22 -0800 Subject: [PATCH 07/13] feat(ingest): fix fspath lint error (#9976) --- metadata-ingestion/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/tests/conftest.py b/metadata-ingestion/tests/conftest.py index eb58ad40abb305..d0716e34ee2b6f 100644 --- a/metadata-ingestion/tests/conftest.py +++ b/metadata-ingestion/tests/conftest.py @@ -69,7 +69,7 @@ def pytest_collection_modifyitems( integration_path = root / "tests/integration" for item in items: - test_path = pathlib.Path(item.fspath) + test_path = item.path if ( "docker_compose_runner" in item.fixturenames # type: ignore[attr-defined] From 67406aa21805aa1179b8928e948fe81635b08201 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Tue, 5 Mar 2024 16:05:26 +0900 Subject: [PATCH 08/13] docs: archive old version before 0.12.0 & fix broken links (#9957) Co-authored-by: Harshal Sheth --- docs-website/docusaurus.config.js | 21 +++++++++++++++++++ docs-website/src/pages/docs/index.js | 2 +- docs-website/versions.json | 4 +--- metadata-ingestion/scripts/docgen.py | 2 +- .../ingestion/source/identity/azure_ad.py | 2 +- .../datahub/ingestion/source/identity/okta.py | 2 +- 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 423096544a2f0d..13fc48d390234c 100644 --- a/docs-website/docusaurus.config.js +++ 
b/docs-website/docusaurus.config.js @@ -114,6 +114,16 @@ module.exports = { type: "docsVersionDropdown", position: "left", dropdownActiveClassDisabled: true, + dropdownItemsAfter: [ + { + href: "https://docs-website-irpoe2osc-acryldata.vercel.app/docs/", + label: "0.11.0", + }, + { + href: "https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/", + label: "0.10.5", + }, + ], }, ], }, @@ -210,6 +220,17 @@ module.exports = { "@docusaurus/preset-classic", { docs: { + lastVersion: "current", + versions: { + current: { + label: "Next", + banner: 'none', + }, + "0.12.0": { + label: "0.12.0", + banner: 'none', + }, + }, path: "genDocs", sidebarPath: require.resolve("./sidebars.js"), ...(!isSaas && { diff --git a/docs-website/src/pages/docs/index.js b/docs-website/src/pages/docs/index.js index 0edd07267b27ec..11f1b3344a3d8f 100644 --- a/docs-website/src/pages/docs/index.js +++ b/docs-website/src/pages/docs/index.js @@ -38,7 +38,7 @@ const deploymentGuideContent = [ { title: "Managed DataHub", platformIcon: "acryl", - to: "docs/saas", + to: "docs/managed-datahub/managed-datahub-overview", }, { title: "Docker", diff --git a/docs-website/versions.json b/docs-website/versions.json index a66607b67ddd55..725fc7457a6982 100644 --- a/docs-website/versions.json +++ b/docs-website/versions.json @@ -1,5 +1,3 @@ [ - "0.12.0", - "0.11.0", - "0.10.5" + "0.12.0" ] diff --git a/metadata-ingestion/scripts/docgen.py b/metadata-ingestion/scripts/docgen.py index 3e4595650d46a5..10679ad770c458 100644 --- a/metadata-ingestion/scripts/docgen.py +++ b/metadata-ingestion/scripts/docgen.py @@ -1018,7 +1018,7 @@ def generate( - [Acryl Data introduces lineage support and automated propagation of governance information for Snowflake in DataHub](https://blog.datahubproject.io/acryl-data-introduces-lineage-support-and-automated-propagation-of-governance-information-for-339c99536561) - [Data in Context: Lineage Explorer in 
DataHub](https://blog.datahubproject.io/data-in-context-lineage-explorer-in-datahub-a53a9a476dc4) - [Harnessing the Power of Data Lineage with DataHub](https://blog.datahubproject.io/harnessing-the-power-of-data-lineage-with-datahub-ad086358dec4) -- [DataHub Lineage Impact Analysis](https://datahubproject.io/docs/next/act-on-metadata/impact-analysis) +- [DataHub Lineage Impact Analysis](../../act-on-metadata/impact-analysis.md) """) print("Lineage Documentation Generation Complete") diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index 5783304da5ef27..7e3ff7d4fb84cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -189,7 +189,7 @@ class AzureADSource(StatefulIngestionSourceBase): would like to take actions like adding them to a group or assigning them a role. For instructions on how to do configure Azure AD OIDC SSO, please read the documentation - [here](https://datahubproject.io/docs/authentication/guides/sso/configure-oidc-react-azure). + [here](../../../authentication/guides/sso/configure-oidc-react.md#create-an-application-registration-in-microsoft-azure-portal). ### Extracting DataHub Users diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index c9b0e4d7de4674..c8a1ec580d0121 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -215,7 +215,7 @@ class OktaSource(StatefulIngestionSourceBase): like to take actions like adding them to a group or assigning them a role. For instructions on how to do configure Okta OIDC SSO, please read the documentation - [here](https://datahubproject.io/docs/authentication/guides/sso/configure-oidc-react-okta). 
+ [here](../../../authentication/guides/sso/configure-oidc-react.md#create-an-application-in-okta-developer-console). ### Extracting DataHub Users From 70656b3c20e6a5590e3208dd154c0e71764c342f Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Tue, 5 Mar 2024 14:47:41 +0530 Subject: [PATCH 09/13] fix(ui/editor): arrows change field when editing description (#9949) --- .../components/legacy/DescriptionModal.tsx | 18 +++++++++++++++++- .../Documentation/components/editor/Editor.tsx | 5 +++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx index 639eab94721560..0e899bc391e0a7 100644 --- a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx @@ -31,6 +31,17 @@ type Props = { export default function UpdateDescriptionModal({ title, description, original, onClose, onSubmit, isAddDesc }: Props) { const [updatedDesc, setDesc] = useState(description || original || ''); + const handleEditorKeyDown = (event: React.KeyboardEvent) => { + if ( + event.key === 'ArrowDown' || + event.key === 'ArrowUp' || + event.key === 'ArrowRight' || + event.key === 'ArrowLeft' + ) { + event.stopPropagation(); + } + }; + return (

- + {!isAddDesc && description && original && ( Original:}> diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx index db56c092c8ccdf..5a02067deb33d1 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/editor/Editor.tsx @@ -41,10 +41,11 @@ type EditorProps = { className?: string; doNotFocus?: boolean; dataTestId?: string; + onKeyDown?: (event: React.KeyboardEvent) => void; }; export const Editor = forwardRef((props: EditorProps, ref) => { - const { content, readOnly, onChange, className, dataTestId } = props; + const { content, readOnly, onChange, className, dataTestId, onKeyDown } = props; const { manager, state, getContext } = useRemirror({ extensions: () => [ new BlockquoteExtension(), @@ -99,7 +100,7 @@ export const Editor = forwardRef((props: EditorProps, ref) => { }, [readOnly, content]); return ( - + {!readOnly && ( From dde9687210bb1a6d2f5cc00ede5c983bc1b010b6 Mon Sep 17 00:00:00 2001 From: gaurav2733 <77378510+gaurav2733@users.noreply.github.com> Date: Tue, 5 Mar 2024 16:06:41 +0530 Subject: [PATCH 10/13] feat(ui/policies): add filter for Active/Inactive/All on policy page (#9958) --- .../app/permissions/policy/ManagePolicies.tsx | 121 +++++++++++++----- 1 file changed, 87 insertions(+), 34 deletions(-) diff --git a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx index 5765babcb575ef..069db8cc4d38c5 100644 --- a/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx +++ b/datahub-web-react/src/app/permissions/policy/ManagePolicies.tsx @@ -1,19 +1,14 @@ import React, { useEffect, useMemo, useState } from 'react'; -import { Button, Empty, message, Pagination, Tag } from 'antd'; +import { Button, Empty, message, 
Pagination, Select, Tag } from 'antd'; import styled from 'styled-components/macro'; import * as QueryString from 'query-string'; import { DeleteOutlined, PlusOutlined } from '@ant-design/icons'; import { useLocation } from 'react-router'; import PolicyBuilderModal from './PolicyBuilderModal'; -import { - Policy, - PolicyState, -} from '../../../types.generated'; +import { AndFilterInput, Policy, PolicyState,FilterOperator } from '../../../types.generated'; import { useAppConfig } from '../../useAppConfig'; import PolicyDetailsModal from './PolicyDetailsModal'; -import { - useListPoliciesQuery, -} from '../../../graphql/policy.generated'; +import { useListPoliciesQuery } from '../../../graphql/policy.generated'; import { Message } from '../../shared/Message'; import { DEFAULT_PAGE_SIZE, EMPTY_POLICY } from './policyUtils'; import TabToolbar from '../../entity/shared/components/styled/TabToolbar'; @@ -73,7 +68,22 @@ const PageContainer = styled.span` flex-direction: column; overflow: auto; `; +const StyledSelect = styled(Select)` + margin-right: 15px; + min-width: 90px; + margin-left: 20px; +`; +const SelectContainer = styled.div` + display: flex; + align-items: flex-start; +`; + +export enum StatusType { + ALL, + ACTIVE, + INACTIVE, +} // TODO: Cleanup the styling. 
export const ManagePolicies = () => { @@ -82,6 +92,11 @@ export const ManagePolicies = () => { const params = QueryString.parse(location.search, { arrayFormat: 'comma' }); const paramsQuery = (params?.query as string) || undefined; const [query, setQuery] = useState(undefined); + const [orFilters, setOrFilters] = useState([ + { and: [{ field: 'state', values: ['ACTIVE'], condition: FilterOperator.Equal }] }, + ]); + const [statusFilter, setStatusFilter] = useState(StatusType.ACTIVE); + useEffect(() => setQuery(paramsQuery), [paramsQuery]); const { @@ -101,8 +116,6 @@ export const ManagePolicies = () => { const [focusPolicyUrn, setFocusPolicyUrn] = useState(undefined); const [focusPolicy, setFocusPolicy] = useState>(EMPTY_POLICY); - - const { loading: policiesLoading, error: policiesError, @@ -114,6 +127,7 @@ export const ManagePolicies = () => { start, count: pageSize, query, + orFilters, }, }, fetchPolicy: (query?.length || 0) > 0 ? 'no-cache' : 'cache-first', @@ -152,11 +166,33 @@ export const ManagePolicies = () => { }; const onEditPolicy = (policy: Policy) => { - setShowPolicyBuilderModal(true); - setFocusPolicyUrn(policy?.urn); - setFocusPolicy({ ...policy }); + setShowPolicyBuilderModal(true); + setFocusPolicyUrn(policy?.urn); + setFocusPolicy({ ...policy }); }; + const onStatusChange = (newStatusFilter: StatusType) => { + setStatusFilter(newStatusFilter); + // Reset page to 1 when filter changes + setPage(1); + const filtersInput: any = []; + let statusValue = ''; + if (newStatusFilter === StatusType.ACTIVE) { + statusValue = 'ACTIVE'; + } else if (newStatusFilter === StatusType.INACTIVE) { + statusValue = 'INACTIVE'; + } + if (statusValue) { + const filter = { field: 'state', values: [statusValue], condition: FilterOperator.Equal }; + filtersInput.push({ and: [filter] }); + } + setOrFilters(filtersInput); + }; + + useEffect(() => { + policiesRefetch(); + }, [orFilters, policiesRefetch]); + const { createPolicyError, updatePolicyError, @@ -175,7 +211,7 @@ 
export const ManagePolicies = () => { ); const updateError = createPolicyError || updatePolicyError || deletePolicyError; - + const tableColumns = [ { title: 'Name', @@ -326,26 +362,43 @@ export const ManagePolicies = () => { Create new policy - null} - onQueryChange={(q) => { - setPage(1); - setQuery(q); - }} - entityRegistry={entityRegistry} - hideRecommendations - /> + + null} + onQueryChange={(q) => { + setPage(1); + setQuery(q); + }} + entityRegistry={entityRegistry} + hideRecommendations + /> + onStatusChange(selection as StatusType)} + style={{ width: 100 }} + > + + All + + + Active + + + Inactive + + + Date: Tue, 5 Mar 2024 18:00:40 +0530 Subject: [PATCH 11/13] feat(ui): add option to add picture link for groups (#9882) Co-authored-by: gaurav2733 --- .../graphql/types/corpgroup/CorpGroupType.java | 4 ++++ .../CorpGroupEditablePropertiesMapper.java | 13 +++++++++++++ .../src/main/resources/entity.graphql | 10 ++++++++++ .../src/app/entity/group/GroupEditModal.tsx | 17 +++++++++++++++++ .../src/app/entity/group/GroupInfoSideBar.tsx | 1 + .../src/app/entity/group/GroupProfile.tsx | 2 +- .../src/app/identity/group/GroupListItem.tsx | 2 +- .../src/app/identity/group/cacheUtils.ts | 3 +++ datahub-web-react/src/graphql/group.graphql | 4 ++++ .../cypress/e2e/settings/managing_groups.js | 13 ++++++------- 10 files changed, 60 insertions(+), 9 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/CorpGroupType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/CorpGroupType.java index 4eb038632c6c69..3e82c543a0098c 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/CorpGroupType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/CorpGroupType.java @@ -7,6 +7,7 @@ import com.datahub.authorization.DisjunctivePrivilegeGroup; import com.google.common.collect.ImmutableList; import 
com.google.common.collect.ImmutableSet; +import com.linkedin.common.url.Url; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.RecordTemplate; @@ -231,6 +232,9 @@ private RecordTemplate mapCorpGroupEditableInfo( if (input.getEmail() != null) { result.setEmail(input.getEmail()); } + if (input.getPictureLink() != null) { + result.setPictureLink(new Url(input.getPictureLink())); + } return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/mappers/CorpGroupEditablePropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/mappers/CorpGroupEditablePropertiesMapper.java index a6e14535cf0b7f..a7fde4f42a6793 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/mappers/CorpGroupEditablePropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/corpgroup/mappers/CorpGroupEditablePropertiesMapper.java @@ -1,9 +1,12 @@ package com.linkedin.datahub.graphql.types.corpgroup.mappers; import com.linkedin.data.template.GetMode; +import com.linkedin.data.template.RecordTemplate; import com.linkedin.datahub.graphql.generated.CorpGroupEditableProperties; import com.linkedin.datahub.graphql.types.mappers.ModelMapper; import javax.annotation.Nonnull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Maps Pegasus {@link RecordTemplate} objects to objects conforming to the GQL schema. 
@@ -14,6 +17,9 @@ public class CorpGroupEditablePropertiesMapper implements ModelMapper< com.linkedin.identity.CorpGroupEditableInfo, CorpGroupEditableProperties> { + private final Logger _logger = + LoggerFactory.getLogger(CorpGroupEditablePropertiesMapper.class.getName()); + public static final CorpGroupEditablePropertiesMapper INSTANCE = new CorpGroupEditablePropertiesMapper(); @@ -29,6 +35,13 @@ public CorpGroupEditableProperties apply( result.setDescription(corpGroupEditableInfo.getDescription(GetMode.DEFAULT)); result.setSlack(corpGroupEditableInfo.getSlack(GetMode.DEFAULT)); result.setEmail(corpGroupEditableInfo.getEmail(GetMode.DEFAULT)); + com.linkedin.common.url.Url pictureLinkObject = + corpGroupEditableInfo.getPictureLink(GetMode.NULL); + String pictureLink = null; + if (pictureLinkObject != null) { + pictureLink = pictureLinkObject.toString(); + } + result.setPictureLink(pictureLink); return result; } } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 855cf19f0cb3b8..ec27bff61f8f3a 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -4077,6 +4077,11 @@ type CorpGroupEditableProperties { Email address for the group """ email: String + + """ + A URL which points to a picture which user wants to set as a profile photo + """ + pictureLink: String } """ @@ -4097,6 +4102,11 @@ input CorpGroupUpdateInput { Email address for the group """ email: String + + """ + A URL which points to a picture which user wants to set as a profile photo + """ + pictureLink: String } """ diff --git a/datahub-web-react/src/app/entity/group/GroupEditModal.tsx b/datahub-web-react/src/app/entity/group/GroupEditModal.tsx index 9db52c7598d1e8..be1289ad3202b3 100644 --- a/datahub-web-react/src/app/entity/group/GroupEditModal.tsx +++ b/datahub-web-react/src/app/entity/group/GroupEditModal.tsx @@ -7,6 +7,7 @@ type PropsData = { 
email: string | undefined; slack: string | undefined; urn: string | undefined; + photoUrl: string | undefined; }; type Props = { @@ -27,6 +28,7 @@ export default function GroupEditModal({ visible, onClose, onSave, editModalData slack: editModalData.slack, email: editModalData.email, urn: editModalData.urn, + photoUrl: editModalData.photoUrl, }); useEffect(() => { @@ -41,6 +43,7 @@ export default function GroupEditModal({ visible, onClose, onSave, editModalData input: { email: data.email, slack: data.slack, + pictureLink: data.photoUrl, }, }, }) @@ -55,6 +58,7 @@ export default function GroupEditModal({ visible, onClose, onSave, editModalData email: '', slack: '', urn: '', + photoUrl: '', }); }) .catch((e) => { @@ -125,6 +129,19 @@ export default function GroupEditModal({ visible, onClose, onSave, editModalData onChange={(event) => setData({ ...data, slack: event.target.value })} /> + + Image URL} + rules={[{ whitespace: true }, { type: 'url', message: 'not valid url' }]} + hasFeedback + > + setData({ ...data, photoUrl: event.target.value })} + /> +
); diff --git a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx index 044b09dc185e53..f4dc03ea0fd324 100644 --- a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx +++ b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx @@ -216,6 +216,7 @@ export default function GroupInfoSidebar({ sideBarData, refetch }: Props) { urn, email, slack, + photoUrl }; // About Text save diff --git a/datahub-web-react/src/app/entity/group/GroupProfile.tsx b/datahub-web-react/src/app/entity/group/GroupProfile.tsx index 11ed31e00003f4..e8001ebccc3b5a 100644 --- a/datahub-web-react/src/app/entity/group/GroupProfile.tsx +++ b/datahub-web-react/src/app/entity/group/GroupProfile.tsx @@ -89,7 +89,7 @@ export default function GroupProfile() { // Side bar data const sideBarData = { - photoUrl: undefined, + photoUrl: data?.corpGroup?.editableProperties?.pictureLink || undefined, avatarName: data?.corpGroup?.properties?.displayName || data?.corpGroup?.name || diff --git a/datahub-web-react/src/app/identity/group/GroupListItem.tsx b/datahub-web-react/src/app/identity/group/GroupListItem.tsx index 74c0a8afb4d02e..e5aada4800253c 100644 --- a/datahub-web-react/src/app/identity/group/GroupListItem.tsx +++ b/datahub-web-react/src/app/identity/group/GroupListItem.tsx @@ -54,7 +54,7 @@ export default function GroupListItem({ group, onDelete, selectRoleOptions, refe - +
{displayName} diff --git a/datahub-web-react/src/app/identity/group/cacheUtils.ts b/datahub-web-react/src/app/identity/group/cacheUtils.ts index 272b9f841d25c8..3674a1e3ebf1e2 100644 --- a/datahub-web-react/src/app/identity/group/cacheUtils.ts +++ b/datahub-web-react/src/app/identity/group/cacheUtils.ts @@ -45,6 +45,9 @@ const createFullGroup = (baseGroup) => { }, memberCount: null, roles: null, + editableProperties: { + pictureLink: null, + }, }; }; diff --git a/datahub-web-react/src/graphql/group.graphql b/datahub-web-react/src/graphql/group.graphql index c8d3ff5e4731c8..7c47a83451a4ee 100644 --- a/datahub-web-react/src/graphql/group.graphql +++ b/datahub-web-react/src/graphql/group.graphql @@ -17,6 +17,7 @@ query getGroup($urn: String!, $membersCount: Int!) { description slack email + pictureLink } properties { displayName @@ -201,6 +202,9 @@ query listGroups($input: ListGroupsInput!) { description email } + editableProperties { + pictureLink + } memberCount: relationships( input: { types: ["IsMemberOfGroup", "IsMemberOfNativeGroup"], direction: INCOMING, start: 0, count: 1 } ) { diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 8421bd288edf07..d9f69cd9a5ec42 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -64,7 +64,6 @@ describe("create and manage group", () => { }); it("update group info", () => { - var expected_name = Cypress.env('ADMIN_USERNAME'); cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.clickOptionWithText(group_name); @@ -79,14 +78,14 @@ describe("create and manage group", () => { cy.waitTextVisible("Changes saved."); cy.contains("Test group description EDITED").should("be.visible"); cy.clickOptionWithText("Add Owners"); - cy.contains("Search for users or groups...").click({ force: true }); - cy.focused().type(expected_name); 
- cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click(); + cy.get('[id="owner"]').click({ force: true }); + cy.focused().type(username); + cy.get(".ant-select-item-option").contains(username, { matchCase: false }).click(); cy.focused().blur(); - cy.contains(expected_name, { matchCase: false }).should("have.length", 1); + cy.contains(username, { matchCase: false }).should("have.length", 1); cy.get('[role="dialog"] button').contains("Done").click(); cy.waitTextVisible("Owners Added"); - cy.contains(expected_name, { matchCase: false }).should("be.visible"); + cy.contains(username, { matchCase: false }).should("be.visible"); cy.clickOptionWithText("Edit Group"); cy.waitTextVisible("Edit Profile"); cy.get("#email").type(`${test_id}@testemail.com`); @@ -97,7 +96,7 @@ describe("create and manage group", () => { cy.waitTextVisible(`#${test_id}`); }); - it("test user verify group participation", () => { + it("test User verify group participation", () => { cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.hideOnboardingTour(); From 5bee25fa209b215dbf5dace5f7eeb2d68c4c9761 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 5 Mar 2024 05:27:41 -0800 Subject: [PATCH 12/13] feat(ingest): add Looks subtype + stop reemitting browsePathV2 (#9978) --- .../src/datahub/ingestion/api/source.py | 3 +- .../datahub/ingestion/api/source_helpers.py | 15 --- .../ingestion/source/common/subtypes.py | 3 + .../ingestion/source/looker/looker_source.py | 8 +- .../integration/looker/expected_output.json | 93 ------------------- .../golden_test_independent_look_ingest.json | 84 ++++++++++++----- 6 files changed, 73 insertions(+), 133 deletions(-) delete mode 100644 metadata-ingestion/tests/integration/looker/expected_output.json diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 0679c884ba0ede..906a431666e17f 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -34,7 +34,6 @@ auto_materialize_referenced_tags, auto_status_aspect, auto_workunit_reporter, - re_emit_browse_path_v2, ) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent @@ -297,7 +296,7 @@ def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor drop_dirs=[s for s in browse_path_drop_dirs if s is not None], dry_run=dry_run, ) - return lambda stream: re_emit_browse_path_v2(browse_path_processor(stream)) + return lambda stream: browse_path_processor(stream) class TestableSource(Source): diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 84051a5887966f..80eb283424d69f 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -197,21 +197,6 @@ def auto_lowercase_urns( yield wu -def re_emit_browse_path_v2( - stream: Iterable[MetadataWorkUnit], -) -> Iterable[MetadataWorkUnit]: - """Re-emit browse paths v2 aspects, to avoid race condition where server overwrites with default.""" - browse_path_v2_workunits = [] - - for wu in stream: - yield wu - if wu.is_primary_source and wu.get_aspect_of_type(BrowsePathsV2Class): - browse_path_v2_workunits.append(wu) - - for wu in browse_path_v2_workunits: - yield wu - - def auto_browse_path_v2( stream: Iterable[MetadataWorkUnit], *, diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py index 1def0dd02097b4..466cc64519f24c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py @@ -48,3 +48,6 @@ class BIContainerSubTypes(str, Enum): 
class BIAssetSubTypes(str, Enum): # Generic SubTypes REPORT = "Report" + + # Looker + LOOKER_LOOK = "Look" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index a66675b8be0b08..e42e7b82bf8aba 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -47,6 +47,7 @@ ) from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.looker import looker_usage from datahub.ingestion.source.looker.looker_common import ( InputFieldElement, @@ -89,6 +90,7 @@ OwnerClass, OwnershipClass, OwnershipTypeClass, + SubTypesClass, ) from datahub.utilities.advanced_thread_executor import BackpressureAwareExecutor @@ -624,7 +626,11 @@ def _make_chart_metadata_events( chart_mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot) proposals: List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]] = [ - chart_mce + chart_mce, + MetadataChangeProposalWrapper( + entityUrn=chart_urn, + aspect=SubTypesClass(typeNames=[BIAssetSubTypes.LOOKER_LOOK]), + ), ] # If extracting embeds is enabled, produce an MCP for embed URL. 
diff --git a/metadata-ingestion/tests/integration/looker/expected_output.json b/metadata-ingestion/tests/integration/looker/expected_output.json deleted file mode 100644 index 96393ce4b531a3..00000000000000 --- a/metadata-ingestion/tests/integration/looker/expected_output.json +++ /dev/null @@ -1,93 +0,0 @@ -[ -{ - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": { - "urn": "urn:li:chart:(looker,dashboard_elements.2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.chart.ChartInfo": { - "customProperties": {}, - "externalUrl": null, - "title": "", - "description": "Some text", - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null - }, - "chartUrl": "https://looker.company.com/x/", - "inputs": [], - "type": null, - "access": null, - "lastRefreshed": null - } - } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "looker-test", - "properties": null - } -}, -{ - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": { - "urn": "urn:li:dashboard:(looker,dashboards.1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dashboard.DashboardInfo": { - "customProperties": {}, - "externalUrl": null, - "title": "foo", - "description": "lorem ipsum", - "charts": [ - "urn:li:chart:(looker,dashboard_elements.2)" - ], - "datasets": [], - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null - }, - "dashboardUrl": "https://looker.company.com/dashboards/1", - "access": null, - "lastRefreshed": null - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": 
false - } - } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "looker-test", - "properties": null - } -} -] diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 9521c9af4bbdcc..e1102466059743 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -34,7 +34,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -49,7 +50,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -109,7 +111,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -169,7 +172,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -212,7 +216,26 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.looks_1)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Look" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -227,7 +250,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -242,7 +266,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - 
"runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -338,7 +363,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -355,7 +381,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -370,7 +397,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -392,7 +420,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -488,7 +517,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -505,7 +535,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -520,7 +551,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -542,7 +574,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -561,7 +594,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -580,7 +614,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -599,7 +634,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -614,7 +650,8 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -629,7 +666,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -644,7 +682,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } }, { @@ -659,7 +698,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "looker-test" + "runId": "looker-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file From 782d33db994e3638722a78286e50c7418cf43ef5 Mon Sep 17 00:00:00 2001 From: AvaniSiddhapuraAPT <156416042+AvaniSiddhapuraAPT@users.noreply.github.com> Date: Tue, 5 Mar 2024 21:00:09 +0530 Subject: [PATCH 13/13] fix(ingest/bigquery): escape special characters for table descriptions (#9932) --- .../ingestion/source/bigquery_v2/bigquery.py | 7 ++- .../source/bigquery_v2/bigquery_helper.py | 19 +++++++ .../bigquery_v2/bigquery_mcp_golden.json | 55 +++++++++++++------ .../unit/test_bigqueryv2_usage_source.py | 36 ++++++++++++ 4 files changed, 98 insertions(+), 19 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index c36b150d3220f9..bcc0aa50ed22e6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -40,6 +40,9 @@ BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( + unquote_and_decode_unicode_escape_seq, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from 
datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, @@ -1073,7 +1076,9 @@ def gen_dataset_workunits( dataset_properties = DatasetProperties( name=datahub_dataset_name.get_table_display_name(), - description=table.comment, + description=unquote_and_decode_unicode_escape_seq(table.comment) + if table.comment + else "", qualifiedName=str(datahub_dataset_name), created=( TimeStamp(time=int(table.created.timestamp() * 1000)) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py new file mode 100644 index 00000000000000..6142c96a5faa1d --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py @@ -0,0 +1,19 @@ +from typing import Optional + + +def unquote_and_decode_unicode_escape_seq( + string: str, + leading_quote: str = '"', + trailing_quote: Optional[str] = None, +) -> str: + """ + If string starts and ends with a quote, unquote it and decode Unicode escape sequences + """ + trailing_quote = trailing_quote if trailing_quote else leading_quote + + if string.startswith(leading_quote) and string.endswith(trailing_quote): + string = string[1:-1] + + cleaned_string = string.encode().decode("unicode-escape") + + return cleaned_string diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index f58eee09aa1cec..da9589d2195ac6 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -100,7 +105,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -115,7 +121,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -130,7 +137,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -147,7 +155,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -162,7 +171,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -182,7 +192,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -197,7 +208,8 @@ }, "systemMetadata": { "lastObserved": 
1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -229,7 +241,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -243,12 +256,14 @@ "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1", "name": "table-1", "qualifiedName": "project-id-1.bigquery-dataset-1.table-1", + "description": "", "tags": [] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -263,7 +278,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -279,7 +295,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -296,7 +313,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -320,7 +338,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00" + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 80f9ab927f887b..8a3fa5ca46ea4a 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -8,6 +8,9 @@ BigQueryTableRef, ) from 
datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( + unquote_and_decode_unicode_escape_seq, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.sql_parsing.schema_resolver import SchemaResolver @@ -176,3 +179,36 @@ def test_bigquery_table_sanitasitation(): assert table_identifier.dataset == "dataset-4567" assert table_identifier.table == "foo_2016*" assert table_identifier.get_table_display_name() == "foo" + + +def test_unquote_and_decode_unicode_escape_seq(): + + # Test with a string that starts and ends with quotes and has Unicode escape sequences + input_string = '"Hello \\u003cWorld\\u003e"' + expected_output = "Hello <World>" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that does not start and end with quotes + input_string = "Hello \\u003cWorld\\u003e" + expected_output = "Hello <World>" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with an empty string + input_string = "" + expected_output = "" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that does not have Unicode escape sequences + input_string = "No escape sequences here" + expected_output = "No escape sequences here" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that starts and ends with quotes but does not have escape sequences + input_string = '"No escape sequences here"' + expected_output = "No escape sequences here" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output