
Commit

Merge branch 'datahub-project:master' into master
asikowitz authored Jul 25, 2024
2 parents 86d8886 + dd732d0 commit b550e3b
Showing 170 changed files with 1,397 additions and 735 deletions.
3 changes: 2 additions & 1 deletion build.gradle
@@ -54,7 +54,7 @@ buildscript {
ext.hazelcastVersion = '5.3.6'
ext.ebeanVersion = '12.16.1'
ext.googleJavaFormatVersion = '1.18.1'
ext.openLineageVersion = '1.16.0'
ext.openLineageVersion = '1.19.0'
ext.logbackClassicJava8 = '1.2.12'

ext.docker_registry = 'acryldata'
@@ -111,6 +111,7 @@ project.ext.externalDependency = [
'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3',
'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.17',
'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:2.0.3',
'awsS3': 'software.amazon.awssdk:s3:2.26.21',
'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.13',
'awsPostgresIamAuth': 'software.amazon.jdbc:aws-advanced-jdbc-wrapper:1.0.2',
'awsRds':'software.amazon.awssdk:rds:2.18.24',
2 changes: 1 addition & 1 deletion docs-website/filterTagIndexes.json
@@ -562,7 +562,7 @@
}
},
{
"Path": "docs/metadata-integration/java/spark-lineage-beta",
"Path": "docs/metadata-integration/java/acryl-spark-lineage",
"imgPath": "img/logos/platforms/spark.svg",
"Title": "Spark",
"Description": "Spark is a data processing tool that enables fast and efficient processing of large-scale data sets using distributed computing.",
10 changes: 3 additions & 7 deletions docs-website/sidebars.js
@@ -419,17 +419,13 @@ module.exports = {
},
{
type: "doc",
id: "metadata-integration/java/spark-lineage/README",
label: "Spark (Legacy)",
},
{
type: "doc",
id: "metadata-integration/java/spark-lineage-beta/README",
id: "metadata-integration/java/acryl-spark-lineage/README",
label: "Spark",
},
//"docker/airflow/local_airflow",
"metadata-ingestion/integration_docs/great-expectations",
"metadata-integration/java/datahub-protobuf/README",
//"metadata-integration/java/spark-lineage-legacy/README",
//"metadata-ingestion/source-docs-template",
{
type: "autogenerated",
@@ -886,7 +882,7 @@ module.exports = {
//"docs/how/graph-onboarding",
//"docs/demo/graph-onboarding",
//"metadata-integration/java/spark-lineage/README",
// "metadata-integration/java/spark-lineage-beta/README.md
// "metadata-integration/java/acryl-spark-lineage/README.md
// "metadata-integration/java/openlineage-converter/README"
//"metadata-ingestion-modules/airflow-plugin/README"
//"metadata-ingestion-modules/dagster-plugin/README"
4 changes: 2 additions & 2 deletions docs/cli.md
@@ -193,14 +193,14 @@ datahub init
/Users/user/.datahubenv already exists. Overwrite? [y/N]: y
Configure which datahub instance to connect to
Enter your DataHub host [http://localhost:8080]: http://localhost:8080
Enter your DataHub access token (Supports env vars via `{VAR_NAME}` syntax) []:
Enter your DataHub access token []:

# acryl example
datahub init
/Users/user/.datahubenv already exists. Overwrite? [y/N]: y
Configure which datahub instance to connect to
Enter your DataHub host [http://localhost:8080]: https://<your-instance-id>.acryl.io/gms
Enter your DataHub access token (Supports env vars via `{VAR_NAME}` syntax) []: <token generated from https://<your-instance-id>.acryl.io/settings/tokens>
Enter your DataHub access token []: <token generated from https://<your-instance-id>.acryl.io/settings/tokens>
```
#### Environment variables supported
1 change: 1 addition & 0 deletions docs/how/updating-datahub.md
@@ -80,6 +80,7 @@ New (optional fields `systemMetadata` and `headers`):
### Deprecations

### Other Notable Change
- #10466 - Extends configuration in `~/.datahubenv` to match the `DatahubClientConfig` object definition. See full configuration in https://datahubproject.io/docs/python-sdk/clients/. The CLI should now respect the updated configuration specified in `~/.datahubenv` across its functions and utilities. This means that for systems where SSL certificate verification is disabled, setting `disable_ssl_verification: true` in `~/.datahubenv` will apply to all CLI calls.

## 0.13.1

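Not part of the diff, but for context on the `~/.datahubenv` note above, a minimal sketch of the same options expressed through the Python SDK client config (field names are assumed to match the linked `DatahubClientConfig` docs):

```python
# Illustrative sketch only -- not part of this commit. Builds a graph client with
# the options that ~/.datahubenv now accepts; field names are assumed to match
# DatahubClientConfig as documented in the Python SDK reference.
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

config = DatahubClientConfig(
    server="http://localhost:8080",
    token=None,  # set a personal access token for authenticated deployments
    disable_ssl_verification=True,  # mirrors `disable_ssl_verification: true` in ~/.datahubenv
)

graph = DataHubGraph(config)
print(graph.get_server_config())  # same call the new `datahub check server-config` command makes
```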
8 changes: 4 additions & 4 deletions docs/lineage/openlineage.md
@@ -6,7 +6,7 @@ DataHub, now supports [OpenLineage](https://openlineage.io/) integration. With t

- **REST Endpoint Support**: DataHub now includes a REST endpoint that can understand OpenLineage events. This allows users to send lineage information directly to DataHub, enabling easy integration with various data processing frameworks.

- **[Spark Event Listener Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)**: DataHub provides a Spark Event Listener plugin that seamlessly integrates OpenLineage's Spark plugin. This plugin enhances DataHub's OpenLineage support by offering additional features such as PathSpec support, column-level lineage, patch support and more.
- **[Spark Event Listener Plugin](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)**: DataHub provides a Spark Event Listener plugin that seamlessly integrates OpenLineage's Spark plugin. This plugin enhances DataHub's OpenLineage support by offering additional features such as PathSpec support, column-level lineage, patch support and more.

## OpenLineage Support with DataHub

@@ -73,7 +73,7 @@ The transport should look like this:
#### Known Limitations
With Spark and Airflow we recommend using the Spark Lineage or DataHub's Airflow plugin for tighter integration with DataHub.

- **[PathSpec](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta/#configuring-hdfs-based-dataset-urns) Support**: While the REST endpoint supports OpenLineage messages, full [PathSpec](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta/#configuring-hdfs-based-dataset-urns)) support is not yet available.
- **[PathSpec](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage/#configuring-hdfs-based-dataset-urns) Support**: While the REST endpoint supports OpenLineage messages, full [PathSpec](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage/#configuring-hdfs-based-dataset-urns) support is not yet available.

- **Column-level Lineage**: DataHub's current OpenLineage support does not provide full column-level lineage tracking.
- etc...
@@ -83,10 +83,10 @@ DataHub's Spark Event Listener plugin enhances OpenLineage support by providing

#### How to Use

Follow the guides of the Spark Lineage plugin page for more information on how to set up the Spark Lineage plugin. The guide can be found [here](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)
Follow the guides of the Spark Lineage plugin page for more information on how to set up the Spark Lineage plugin. The guide can be found [here](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)

## References

- [OpenLineage](https://openlineage.io/)
- [DataHub OpenAPI Guide](../api/openapi/openapi-usage-guide.md)
- [DataHub Spark Lineage Plugin](https://datahubproject.io/docs/metadata-integration/java/spark-lineage-beta)
- [DataHub Spark Lineage Plugin](https://datahubproject.io/docs/metadata-integration/java/acryl-spark-lineage)
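As a rough illustration of the REST endpoint support described in this file, the sketch below posts a minimal OpenLineage run event from Python; the endpoint path and auth header are assumptions, so confirm them against the OpenLineage guide for your deployment:

```python
# Hypothetical example of sending an OpenLineage RunEvent to DataHub's
# OpenLineage REST endpoint. The URL path and bearer-token auth are assumptions;
# check the OpenLineage guide for the exact values in your deployment.
import datetime
import uuid

import requests

event = {
    "eventType": "COMPLETE",
    "eventTime": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    "run": {"runId": str(uuid.uuid4())},
    "job": {"namespace": "example_pipeline", "name": "daily_load"},  # placeholder job
    "inputs": [{"namespace": "s3://raw-bucket", "name": "orders"}],
    "outputs": [{"namespace": "s3://curated-bucket", "name": "orders_clean"}],
    "producer": "https://example.com/openlineage-producer",  # placeholder producer URI
}

response = requests.post(
    "http://localhost:8080/openapi/openlineage/api/v1/lineage",  # assumed endpoint path
    json=event,
    headers={"Authorization": "Bearer <personal-access-token>"},  # only if auth is enabled
    timeout=30,
)
response.raise_for_status()
```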
2 changes: 1 addition & 1 deletion metadata-ingestion/README.md
@@ -19,7 +19,7 @@ Integration can be divided into two concepts based on the method:
### Push-based Integration

Push-based integrations allow you to emit metadata directly from your data systems when metadata changes.
Examples of push-based integrations include [Airflow](../docs/lineage/airflow.md), [Spark](../metadata-integration/java/spark-lineage/README.md), [Great Expectations](./integration_docs/great-expectations.md) and [Protobuf Schemas](../metadata-integration/java/datahub-protobuf/README.md). This allows you to get low-latency metadata integration from the "active" agents in your data ecosystem.
Examples of push-based integrations include [Airflow](../docs/lineage/airflow.md), [Spark](../metadata-integration/java/acryl-spark-lineage/README.md), [Great Expectations](./integration_docs/great-expectations.md) and [Protobuf Schemas](../metadata-integration/java/datahub-protobuf/README.md). This allows you to get low-latency metadata integration from the "active" agents in your data ecosystem.

### Pull-based Integration

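To make "push-based" concrete, here is a small sketch (not from the diff) that uses the Python SDK's REST emitter to push a metadata change directly from a data system; the dataset URN and description are illustrative placeholders:

```python
# Minimal push-based example: emit a metadata change directly to DataHub over REST.
# The URN and description below are made-up placeholders.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DatasetPropertiesClass

emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example_db.example_table,PROD)",
    aspect=DatasetPropertiesClass(description="Description pushed by the owning job"),
)

emitter.emit(mcp)
```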
2 changes: 1 addition & 1 deletion metadata-ingestion/docs/sources/databricks/README.md
@@ -11,7 +11,7 @@ The alternative way to integrate is via the Hive connector. The [Hive starter re

## Databricks Spark

To complete the picture, we recommend adding push-based ingestion from your Spark jobs to see real-time activity and lineage between your Databricks tables and your Spark jobs. Use the Spark agent to push metadata to DataHub using the instructions [here](../../../../metadata-integration/java/spark-lineage-beta/README.md#configuration-instructions-databricks).
To complete the picture, we recommend adding push-based ingestion from your Spark jobs to see real-time activity and lineage between your Databricks tables and your Spark jobs. Use the Spark agent to push metadata to DataHub using the instructions [here](../../../../metadata-integration/java/acryl-spark-lineage/README.md#configuration-instructions-databricks).

## Watch the DataHub Talk at the Data and AI Summit 2022

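For orientation only, a hedged PySpark sketch of attaching the Spark agent to a session follows; the package coordinates, listener class, and config keys are assumptions, and the acryl-spark-lineage README linked above remains the authoritative reference (including the Databricks-specific setup):

```python
# Illustrative only: attaching the DataHub Spark agent to a SparkSession.
# The package coordinates, listener class name, and config keys are assumptions;
# follow the acryl-spark-lineage README (Databricks instructions linked above)
# for the exact values and for cluster-level configuration on Databricks.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("lineage-enabled-job")
    .config("spark.jars.packages", "io.acryl:acryl-spark-lineage:<version>")  # pin a released version
    .config("spark.extraListeners", "datahub.spark.DatahubSparkListener")
    .config("spark.datahub.rest.server", "http://localhost:8080")
    .getOrCreate()
)

# Reads and writes performed through this session are then reported as lineage.
```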
9 changes: 7 additions & 2 deletions metadata-ingestion/setup.py
@@ -113,6 +113,11 @@
"numpy<2",
}

dbt_common = {
*sqlglot_lib,
"more_itertools",
}

sql_common = (
{
# Required for all SQL sources.
@@ -352,8 +357,8 @@
"datahub-lineage-file": set(),
"datahub-business-glossary": set(),
"delta-lake": {*data_lake_profiling, *delta_lake},
"dbt": {"requests"} | sqlglot_lib | aws_common,
"dbt-cloud": {"requests"} | sqlglot_lib,
"dbt": {"requests"} | dbt_common | aws_common,
"dbt-cloud": {"requests"} | dbt_common,
"druid": sql_common | {"pydruid>=0.6.2"},
"dynamodb": aws_common | classification_lib,
# Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws
10 changes: 10 additions & 0 deletions metadata-ingestion/src/datahub/cli/check_cli.py
@@ -389,3 +389,13 @@ def extract_sql_agg_log(query_log_file: str, output: Optional[str]) -> None:
logger.info(f"Extracted {len(queries)} queries to {output}")
else:
click.echo(json.dumps(queries, indent=2))


@check.command()
def server_config() -> None:
"""Print the server config."""
graph = get_default_graph()

server_config = graph.get_server_config()

click.echo(pprint.pformat(server_config))
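A quick way to exercise the new sub-command in-process is click's test runner; this sketch assumes click's default underscore-to-dash naming exposes the function as `server-config` and that a DataHub instance is reachable via `~/.datahubenv`:

```python
# Sketch: invoking the new check sub-command without a shell. Assumes click's
# default naming turns `server_config` into `server-config`, and that a DataHub
# instance is reachable with the connection details from ~/.datahubenv.
from click.testing import CliRunner

from datahub.cli.check_cli import check

result = CliRunner().invoke(check, ["server-config"])
print(result.output)  # pretty-printed server config from graph.get_server_config()
```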