
Commit

Merge branch 'datahub-project:master' into master
asikowitz authored Jul 25, 2024
2 parents 86d8886 + dd732d0 commit b550e3b
Showing 170 changed files with 1,397 additions and 735 deletions.
3 changes: 2 additions & 1 deletion build.gradle
@@ -54,7 +54,7 @@ buildscript {
ext.hazelcastVersion = '5.3.6'
ext.ebeanVersion = '12.16.1'
ext.googleJavaFormatVersion = '1.18.1'
ext.openLineageVersion = '1.16.0'
ext.openLineageVersion = '1.19.0'
ext.logbackClassicJava8 = '1.2.12'

ext.docker_registry = 'acryldata'
@@ -111,6 +111,7 @@ project.ext.externalDependency = [
'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3',
'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.17',
'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:2.0.3',
'awsS3': 'software.amazon.awssdk:s3:2.26.21',
'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.13',
'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:1.0.2',
'awsRds':'software.amazon.awssdk:rds:2.18.24',
2 changes: 1 addition & 1 deletion docs-website/filterTagIndexes.json
@@ -562,7 +562,7 @@
}
},
{
"Path": "docs/metadata-integration/java/spark-lineage-beta",
"Path": "docs/metadata-integration/java/acryl-spark-lineage",
"imgPath": "img/logos/platforms/spark.svg",
"Title": "Spark",
"Description": "Spark is a data processing tool that enables fast and efficient processing of large-scale data sets using distributed computing.",
10 changes: 3 additions & 7 deletions docs-website/sidebars.js
@@ -419,17 +419,13 @@ module.exports = {
},
{
type: "doc",
id: "metadata-integration/java/spark-lineage/README",
label: "Spark (Legacy)",
},
{
type: "doc",
id: "metadata-integration/java/spark-lineage-beta/README",
id: "metadata-integration/java/acryl-spark-lineage/README",
label: "Spark",
},
//"docker/airflow/local_airflow",
"metadata-ingestion/integration_docs/great-expectations",
"metadata-integration/java/datahub-protobuf/README",
//"metadata-integration/java/spark-lineage-legacy/README",
//"metadata-ingestion/source-docs-template",
{
type: "autogenerated",
@@ -886,7 +882,7 @@ module.exports = {
//"docs/how/graph-onboarding",
//"docs/demo/graph-onboarding",
//"metadata-integration/java/spark-lineage/README",
// "metadata-integration/java/spark-lineage-beta/README.md
// "metadata-integration/java/acryl-spark-lineage/README.md
// "metadata-integration/java/openlineage-converter/README"
//"metadata-ingestion-modules/airflow-plugin/README"
//"metadata-ingestion-modules/dagster-plugin/README"
4 changes: 2 additions & 2 deletions docs/cli.md
@@ -193,14 +193,14 @@ datahub init
/Users/user/.datahubenv already exists. Overwrite? [y/N]: y
Configure which datahub instance to connect to
Enter your DataHub host [http://localhost:8080]: http://localhost:8080
Enter your DataHub access token (Supports env vars via `{VAR_NAME}` syntax) []:
Enter your DataHub access token []:

# acryl example
datahub init
/Users/user/.datahubenv already exists. Overwrite? [y/N]: y
Configure which datahub instance to connect to
Enter your DataHub host [http://localhost:8080]: https://<your-instance-id>.acryl.io/gms
Enter your DataHub access token (Supports env vars via `{VAR_NAME}` syntax) []: <token generated from https://<your-instance-id>.acryl.io/settings/tokens>
Enter your DataHub access token []: <token generated from https://<your-instance-id>.acryl.io/settings/tokens>
```
#### Environment variables supported
1 change: 1 addition & 0 deletions docs/how/updating-datahub.md
@@ -80,6 +80,7 @@ New (optional fields `systemMetadata` and `headers`):
### Deprecations

### Other Notable Change
- #10466 - Extends configuration in `~/.datahubenv` to match the `DatahubClientConfig` object definition. See full configuration in https://datahubproject.io/docs/python-sdk/clients/. The CLI should now respect the updated configuration specified in `~/.datahubenv` across its functions and utilities. This means that for systems where SSL certificate verification is disabled, setting `disable_ssl_verification: true` in `~/.datahubenv` will apply to all CLI calls.

## 0.13.1

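Not part of the diff, but for context on the `~/.datahubenv` note above, a minimal sketch of the same options expressed through the Python SDK client config (field names are assumed to match the linked `DatahubClientConfig` docs):

```python
# Illustrative sketch only -- not part of this commit. Builds a graph client with
# the options that ~/.datahubenv now accepts; field names are assumed to match
# DatahubClientConfig as documented in the Python SDK reference.
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

config = DatahubClientConfig(
    server="http://localhost:8080",
    token=None,  # set a personal access token for authenticated deployments
    disable_ssl_verification=True,  # mirrors `disable_ssl_verification: true` in ~/.datahubenv
)

graph = DataHubGraph(config)
print(graph.get_server_config())  # same call the new `datahub check server-config` command makes
```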
8 changes: 4 additions & 4 deletions docs/lineage/openlineage.md
@@ -6,7 +6,7 @@ DataHub, now supports [OpenLineage](https://openlineage.io/) integration. With t

- **REST Endpoint Support**: DataHub now includes a REST endpoint that can understand OpenLineage events. This allows users to send lineage information directly to DataHub, enabling easy integration with various data processing frameworks.

- **[Spark Event Listener Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)**: DataHub provides a Spark Event Listener plugin that seamlessly integrates OpenLineage's Spark plugin. This plugin enhances DataHub's OpenLineage support by offering additional features such as PathSpec support, column-level lineage, patch support and more.
- **[Spark Event Listener Plugin](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)**: DataHub provides a Spark Event Listener plugin that seamlessly integrates OpenLineage's Spark plugin. This plugin enhances DataHub's OpenLineage support by offering additional features such as PathSpec support, column-level lineage, patch support and more.

## OpenLineage Support with DataHub

@@ -73,7 +73,7 @@ The transport should look like this:
#### Known Limitations
With Spark and Airflow we recommend using the Spark Lineage or DataHub's Airflow plugin for tighter integration with DataHub.

- **[PathSpec](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta/#configuring-hdfs-based-dataset-urns) Support**: While the REST endpoint supports OpenLineage messages, full [PathSpec](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta/#configuring-hdfs-based-dataset-urns)) support is not yet available.
- **[PathSpec](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage/#configuring-hdfs-based-dataset-urns) Support**: While the REST endpoint supports OpenLineage messages, full [PathSpec](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage/#configuring-hdfs-based-dataset-urns) support is not yet available.

- **Column-level Lineage**: DataHub's current OpenLineage support does not provide full column-level lineage tracking.
- etc...
@@ -83,10 +83,10 @@ DataHub's Spark Event Listener plugin enhances OpenLineage support by providing

#### How to Use

Follow the guides of the Spark Lineage plugin page for more information on how to set up the Spark Lineage plugin. The guide can be found [here](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)
Follow the guides of the Spark Lineage plugin page for more information on how to set up the Spark Lineage plugin. The guide can be found [here](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)

## References

- [OpenLineage](https://openlineage.io/)
- [DataHub OpenAPI Guide](../api/openapi/openapi-usage-guide.md)
- [DataHub Spark Lineage Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)
- [DataHub Spark Lineage Plugin](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)
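As a rough illustration of the REST endpoint support described in this file, the sketch below posts a minimal OpenLineage run event from Python; the endpoint path and auth header are assumptions, so confirm them against the OpenLineage guide for your deployment:

```python
# Hypothetical example of sending an OpenLineage RunEvent to DataHub's
# OpenLineage REST endpoint. The URL path and bearer-token auth are assumptions;
# check the OpenLineage guide for the exact values in your deployment.
import datetime
import uuid

import requests

event = {
    "eventType": "COMPLETE",
    "eventTime": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "run": {"runId": str(uuid.uuid4())},
    "job": {"namespace": "example_pipeline", "name": "daily_load"},  # placeholder job
    "inputs": [{"namespace": "s3://raw-bucket", "name": "orders"}],
    "outputs": [{"namespace": "s3://curated-bucket", "name": "orders_clean"}],
    "producer": "https://example.com/openlineage-producer",  # placeholder producer URI
}

response = requests.post(
    "http://localhost:8080/openapi/openlineage/api/v1/lineage",  # assumed endpoint path
    json=event,
    headers={"Authorization": "Bearer <personal-access-token>"},  # only if auth is enabled
    timeout=30,
)
response.raise_for_status()
```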
2 changes: 1 addition & 1 deletion metadata-ingestion/README.md
@@ -19,7 +19,7 @@ Integration can be divided into two concepts based on the method:
### Push-based Integration

Push-based integrations allow you to emit metadata directly from your data systems when metadata changes.
Examples of push-based integrations include [Airflow](../docs/lineage/airflow.md), [Spark](../metadata-integration/java/spark-lineage/README.md), [Great Expectations](./integration_docs/great-expectations.md) and [Protobuf Schemas](../metadata-integration/java/datahub-protobuf/README.md). This allows you to get low-latency metadata integration from the "active" agents in your data ecosystem.
Examples of push-based integrations include [Airflow](../docs/lineage/airflow.md), [Spark](../metadata-integration/java/acryl-spark-lineage/README.md), [Great Expectations](./integration_docs/great-expectations.md) and [Protobuf Schemas](../metadata-integration/java/datahub-protobuf/README.md). This allows you to get low-latency metadata integration from the "active" agents in your data ecosystem.

### Pull-based Integration

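To make "push-based" concrete, here is a small sketch (not from the diff) that uses the Python SDK's REST emitter to push a metadata change directly from a data system; the dataset URN and description are illustrative placeholders:

```python
# Minimal push-based example: emit a metadata change directly to DataHub over REST.
# The URN and description below are made-up placeholders.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DatasetPropertiesClass

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example_db.example_table,PROD)",
    aspect=DatasetPropertiesClass(description="Description pushed by the owning job"),
)

emitter.emit(mcp)
```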
2 changes: 1 addition & 1 deletion metadata-ingestion/docs/sources/databricks/README.md
@@ -11,7 +11,7 @@ The alternative way to integrate is via the Hive connector. The [Hive starter re

## Databricks Spark

To complete the picture, we recommend adding push-based ingestion from your Spark jobs to see real-time activity and lineage between your Databricks tables and your Spark jobs. Use the Spark agent to push metadata to DataHub using the instructions [here](../../../../metadata-integration/java/spark-lineage-beta/README.md#configuration-instructions-databricks).
To complete the picture, we recommend adding push-based ingestion from your Spark jobs to see real-time activity and lineage between your Databricks tables and your Spark jobs. Use the Spark agent to push metadata to DataHub using the instructions [here](../../../../metadata-integration/java/acryl-spark-lineage/README.md#configuration-instructions-databricks).

## Watch the DataHub Talk at the Data and AI Summit 2022

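For orientation only, a hedged PySpark sketch of attaching the Spark agent to a session follows; the package coordinates, listener class, and config keys are assumptions, and the acryl-spark-lineage README linked above remains the authoritative reference (including the Databricks-specific setup):

```python
# Illustrative only: attaching the DataHub Spark agent to a SparkSession.
# The package coordinates, listener class name, and config keys are assumptions;
# follow the acryl-spark-lineage README (Databricks instructions linked above)
# for the exact values and for cluster-level configuration on Databricks.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("lineage-enabled-job")
    .config("spark.jars.packages", "io.acryl:acryl-spark-lineage:<version>")  # pin a released version
    .config("spark.extraListeners", "datahub.spark.DatahubSparkListener")
    .config("spark.datahub.rest.server", "http://localhost:8080")
    .getOrCreate()
)

# Reads and writes performed through this session are then reported as lineage.
```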
9 changes: 7 additions & 2 deletions metadata-ingestion/setup.py
@@ -113,6 +113,11 @@
"numpy<2",
}

dbt_common = {
*sqlglot_lib,
"more_itertools",
}

sql_common = (
{
# Required for all SQL sources.
@@ -352,8 +357,8 @@
"datahub-lineage-file": set(),
"datahub-business-glossary": set(),
"delta-lake": {*data_lake_profiling, *delta_lake},
"dbt": {"requests"} | sqlglot_lib | aws_common,
"dbt-cloud": {"requests"} | sqlglot_lib,
"dbt": {"requests"} | dbt_common | aws_common,
"dbt-cloud": {"requests"} | dbt_common,
"druid": sql_common | {"pydruid>=0.6.2"},
"dynamodb": aws_common | classification_lib,
# Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws
10 changes: 10 additions & 0 deletions metadata-ingestion/src/datahub/cli/check_cli.py
@@ -389,3 +389,13 @@ def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
logger.info(f"Extracted {len(queries)} queries to {output}")
else:
click.echo(json.dumps(queries, indent=2))


@check.command()
def server_config() -> None:
"""Print the server config."""
graph = get_default_graph()

server_config = graph.get_server_config()

click.echo(pprint.pformat(server_config))
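A quick way to exercise the new sub-command in-process is click's test runner; this sketch assumes click's default underscore-to-dash naming exposes the function as `server-config` and that a DataHub instance is reachable via `~/.datahubenv`:

```python
# Sketch: invoking the new check sub-command without a shell. Assumes click's
# default naming turns `server_config` into `server-config`, and that a DataHub
# instance is reachable with the connection details from ~/.datahubenv.
from click.testing import CliRunner

from datahub.cli.check_cli import check

result = CliRunner().invoke(check, ["server-config"])
print(result.output)  # pretty-printed server config from graph.get_server_config()
```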