Merge branch 'datahub-project:master' into master
david-leifker authored Nov 6, 2024
2 parents 6ef9fe9 + 32878ab commit 07bef2a
Showing 49 changed files with 12,497 additions and 642 deletions.
4 changes: 4 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/constants.ts
@@ -19,6 +19,7 @@ import clickhouseLogo from '../../../../images/clickhouselogo.png';
import cockroachdbLogo from '../../../../images/cockroachdblogo.png';
import trinoLogo from '../../../../images/trinologo.png';
import dbtLogo from '../../../../images/dbtlogo.png';
import dremioLogo from '../../../../images/dremiologo.png';
import druidLogo from '../../../../images/druidlogo.png';
import elasticsearchLogo from '../../../../images/elasticsearchlogo.png';
import feastLogo from '../../../../images/feastlogo.png';
@@ -52,6 +53,8 @@ export const COCKROACHDB = 'cockroachdb';
export const COCKROACHDB_URN = `urn:li:dataPlatform:${COCKROACHDB}`;
export const DBT = 'dbt';
export const DBT_URN = `urn:li:dataPlatform:${DBT}`;
export const DREMIO = 'dremio';
export const DREMIO_URN = `urn:li:dataPlatform:${DREMIO}`;
export const DRUID = 'druid';
export const DRUID_URN = `urn:li:dataPlatform:${DRUID}`;
export const DYNAMODB = 'dynamodb';
@@ -139,6 +142,7 @@ export const PLATFORM_URN_TO_LOGO = {
[CLICKHOUSE_URN]: clickhouseLogo,
[COCKROACHDB_URN]: cockroachdbLogo,
[DBT_URN]: dbtLogo,
[DREMIO_URN]: dremioLogo,
[DRUID_URN]: druidLogo,
[DYNAMODB_URN]: dynamodbLogo,
[ELASTICSEARCH_URN]: elasticsearchLogo,
8 changes: 8 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -302,5 +302,13 @@
"description": "Configure a custom recipe using YAML.",
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: <source-type>\n config:\n # Source-type specifics config\n <source-configs>"
},
{
"urn": "urn:li:dataPlatform:dremio",
"name": "dremio",
"displayName": "Dremio",
"description": "Import Spaces, Sources, Tables and statistics from Dremio.",
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: dremio\n config:\n # Coordinates\n hostname: null\n port: null\n #true if https, otherwise false\n tls: true\n\n #For cloud instance\n #is_dremio_cloud: True\n #dremio_cloud_project_id: <project_id>\n\n #Credentials with personal access token\n authentication_method: PAT\n password: pass\n\n #Or Credentials with basic auth\n #authentication_method: password\n #username: null\n #password: null\n\n stateful_ingestion:\n enabled: true"
}
]
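For readability, the escaped `recipe` string added above unrolls to roughly the following YAML (indentation reconstructed from the string's newlines, so treat it as a sketch rather than the literal file contents):

```yaml
source:
  type: dremio
  config:
    # Coordinates
    hostname: null
    port: null
    # true if https, otherwise false
    tls: true

    # For cloud instance
    # is_dremio_cloud: True
    # dremio_cloud_project_id: <project_id>

    # Credentials with personal access token
    authentication_method: PAT
    password: pass

    # Or credentials with basic auth
    # authentication_method: password
    # username: null
    # password: null

    stateful_ingestion:
      enabled: true
```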
Binary file added datahub-web-react/src/images/dremiologo.png
45 changes: 27 additions & 18 deletions docs/automations/bigquery-metadata-sync.md
@@ -18,26 +18,31 @@ BigQuery. This automation is exclusively available in DataHub Cloud (Acryl).
- Facilitate compliance efforts by automatically tagging sensitive data columns
- Support data lineage tracking by keeping metadata aligned across platforms

## Capabilities
## Sync Capabilities

- Automatically add DataHub Tags as BigQuery Labels to tables
- Automatically add DataHub Table descriptions to BigQuery Tables
- Automatically add DataHub Column descriptions to BigQuery Columns
- Automatically add DataHub Glossary Terms as Policy Tags to BigQuery Columns (under a **DataHub** taxonomy created in BigQuery)
- Automatically remove Policy Tags/Table Labels when removed in DataHub
| DataHub Source | BigQuery Target | Sync Direction | Notes |
|----------------|-----------------|----------------|--------|
| Table Tags | Table Labels | Bi-directional | Changes in either system are reflected in both |
| Table Descriptions | Table Descriptions | Bi-directional | Changes in either system are reflected in both |
| Column Descriptions | Column Descriptions | Bi-directional | Changes in either system are reflected in both. <br/> The sync doesn't delete table descriptions from BigQuery |
| Column Glossary Terms | Column Policy Tags | DataHub → BigQuery | Created under DataHub taxonomy |

## Setup Instructions

## Required Bigquery Permissions
### 1. Verify Permissions

| Action | Required Permission(s) |
|--------|------------------------|
| Create/update policy tags and taxonomies | `bigquery.taxonomies.create` <br/> `bigquery.taxonomies.update` |
| Assign/remove policy tags from columns | `bigquery.tables.updateTag` |
| Edit table description | `bigquery.tables.update` |
| Edit column description | `bigquery.tables.update` |
| Assign/remove labels from tables | `bigquery.tables.update` |
Ensure your service account has the following permissions:

## Enabling BigQuery Sync Automation
| Task | Required Permissions | Available Role |
|------|---------------------|----------------|
| Policy Tag Management | • `datacatalog.taxonomies.create`<br/>• `datacatalog.taxonomies.update`<br/>• `datacatalog.taxonomies.list`<br/>• `datacatalog.taxonomies.get`<br/>• `bigquery.tables.createTagBinding` | Policy Tag Admin |
| Policy Tag Assignment | `bigquery.tables.updateTag` | - |
| Description Management | `bigquery.tables.update` | - |
| Label Management | `bigquery.tables.update` | - |

**Note**: `bigquery.tables` permissions must be granted in every project where metadata sync is needed.
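One way to provision these is to bundle them into a custom IAM role and grant it to the automation's service account. The snippet below is a hypothetical role definition (usable with `gcloud iam roles create <role-id> --project=<project-id> --file=role.yaml`); the role title and description are illustrative, not part of the official setup:

```yaml
# Hypothetical custom role bundling the permissions listed above.
title: DataHub BigQuery Metadata Sync
description: Permissions used by the DataHub BigQuery metadata sync automation
stage: GA
includedPermissions:
  - datacatalog.taxonomies.create
  - datacatalog.taxonomies.update
  - datacatalog.taxonomies.list
  - datacatalog.taxonomies.get
  - bigquery.tables.createTagBinding
  - bigquery.tables.updateTag
  - bigquery.tables.update
```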

### 2. Enable the Automation

1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar.

@@ -87,7 +92,7 @@ BigQuery. This automation is exclusively available in DataHub Cloud (Acryl).

3. **Finally, click 'Save and Run' to start the automation**

## Propagating for Existing Assets
### 3. Propagating for Existing Assets (Optional)

To ensure that all existing table Tags and Column Glossary Terms are propagated to BigQuery, you can back-fill historical data for existing assets. Note that the initial back-filling process may take some time, depending on the number of BigQuery assets you have.

@@ -131,7 +136,7 @@ A: No, BigQuery Policy Tags are only propagated from DataHub to BigQuery, not vi

It is recommended to avoid enabling `extract_policy_tags_from_catalog` during
ingestion, as this will ingest policy tags as BigQuery labels. Our sync process
propagates Glossary Term assignments to BigQuery as Policy Tags.

In a future release, we plan to remove this restriction to support full bi-directional syncing.
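For illustration, a BigQuery ingestion recipe that follows this recommendation simply leaves the flag disabled. This is a minimal sketch assuming the standard `bigquery` source config; the project id is a placeholder and other required fields are omitted:

```yaml
source:
  type: bigquery
  config:
    project_ids:
      - my-gcp-project # illustrative project id
    # Leave policy-tag extraction disabled so the automation remains the
    # single writer of Policy Tags derived from Glossary Terms.
    extract_policy_tags_from_catalog: false
```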

@@ -159,7 +164,7 @@ a specific area of the Business Glossary.
A: From DataHub to BigQuery, the sync happens instantly (within a few seconds)
when the change occurs in DataHub.

From BigQuery to DataHub, changes are synced when ingestion occurs, and the frequency depends on your custom ingestion schedule. (Visible on the **Integrations** page)

### Q: What happens if there's a conflict between DataHub and BigQuery metadata?

@@ -169,6 +174,10 @@ A: In case of conflicts (e.g., a tag is modified in both systems between syncs),

A: Ensure that the service account used for the automation has the necessary permissions in both DataHub and BigQuery to read and write metadata. See the required BigQuery permissions at the top of the page.

### Q: Can a table description be removed?

A: No. The sync only modifies table descriptions; it won't remove or clear an existing description from a table.

## Related Documentation

- [DataHub Tags Documentation](https://datahubproject.io/docs/tags/)
1 change: 1 addition & 0 deletions docs/cli.md
@@ -705,6 +705,7 @@ Please see our [Integrations page](https://datahubproject.io/integrations) if yo
| [datahub-lineage-file](./generated/ingestion/sources/file-based-lineage.md) | _no additional dependencies_ | Lineage File source |
| [datahub-business-glossary](./generated/ingestion/sources/business-glossary.md) | _no additional dependencies_ | Business Glossary File source |
| [dbt](./generated/ingestion/sources/dbt.md) | _no additional dependencies_ | dbt source |
| [dremio](./generated/ingestion/sources/dremio.md) | `pip install 'acryl-datahub[dremio]'` | Dremio Source |
| [druid](./generated/ingestion/sources/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
| [feast](./generated/ingestion/sources/feast.md) | `pip install 'acryl-datahub[feast]'` | Feast source (0.26.0) |
| [glue](./generated/ingestion/sources/glue.md) | `pip install 'acryl-datahub[glue]'` | AWS Glue source |
2 changes: 2 additions & 0 deletions docs/how/updating-datahub.md
@@ -34,6 +34,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
`datahub delete --platform powerbi --soft` and then re-ingest with the latest CLI version, ensuring the `include_workspace_name_in_dataset_urn` configuration is set to true.

- #11701: The Fivetran `sources_to_database` field is deprecated in favor of setting directly within `sources_to_platform_instance.<key>.database`; see the sketch after this list.
- #11742: For PowerBi ingestion, `use_powerbi_email` is now enabled by default when extracting ownership information.
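A minimal sketch of the Fivetran migration described in #11701, assuming a hypothetical connector id and database name:

```yaml
source:
  type: fivetran
  config:
    # Deprecated:
    # sources_to_database:
    #   my_connector_id: my_source_db
    # Preferred:
    sources_to_platform_instance:
      my_connector_id:
        database: my_source_db
```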

### Breaking Changes

@@ -45,6 +46,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
- #11619 - schema field/column paths can no longer be duplicated within the schema
- #11570 - The `DatahubClientConfig`'s server field no longer defaults to `http://localhost:8080`. Be sure to explicitly set this.
- #11570 - If a `datahub_api` is explicitly passed to a stateful ingestion config provider, it will be used. We previously ignored it if the pipeline context also had a graph object.
- #11518 - DataHub Garbage Collection: Various entities that are soft-deleted (after 10d) or are timeseries *entities* (dataprocess, execution requests) will be removed automatically using logic in the `datahub-gc` ingestion source.

### Potential Downtime

11 changes: 11 additions & 0 deletions metadata-ingestion/docs/sources/dremio/README.md
@@ -0,0 +1,11 @@
### Concept Mapping

The table below shows how entities and concepts in Dremio map to the corresponding DataHub entities:

| Source Concept | DataHub Concept | Notes |
| -------------------------- | --------------- | ---------------------------------------------------------- |
| **Physical Dataset/Table** | `Dataset` | Subtype: `Table` |
| **Virtual Dataset/Views** | `Dataset` | Subtype: `View` |
| **Spaces** | `Container` | Mapped to DataHub’s `Container` entity. Subtype: `Space` |
| **Folders** | `Container` | Mapped as a `Container` in DataHub. Subtype: `Folder` |
| **Sources** | `Container` | Represented as a `Container` in DataHub. Subtype: `Source` |
29 changes: 29 additions & 0 deletions metadata-ingestion/docs/sources/dremio/dremio.md
@@ -0,0 +1,29 @@
### Starter Recipe for a Dremio Cloud Instance

```yaml
source:
  type: dremio
  config:
    # Authentication details
    authentication_method: PAT # Use Personal Access Token for authentication
    password: <your_api_token> # Replace <your_api_token> with your Dremio Cloud API token
    is_dremio_cloud: True # Set to True for Dremio Cloud instances
    dremio_cloud_project_id: <project_id> # Provide the Project ID for Dremio Cloud

    # Enable query lineage tracking
    include_query_lineage: True

    # Optional
    source_mappings:
      - platform: s3
        source_name: samples

    # Optional
    schema_pattern:
      allow:
        - "<source_name>.<table_name>"

sink:
  # Define your sink configuration here
```
25 changes: 25 additions & 0 deletions metadata-ingestion/docs/sources/dremio/dremio_pre.md
@@ -0,0 +1,25 @@
### Setup

This integration pulls metadata directly from the Dremio APIs.

You'll need a Dremio instance up and running with access to the necessary datasets, and API access must be enabled with a valid token.

The API token should have the necessary permissions to **read metadata** and **retrieve lineage**.

#### Steps to Get the Required Information

1. **Generate an API Token**:

- Log in to your Dremio instance.
- Navigate to your user profile in the top-right corner.
- Select **Generate API Token** to create an API token for programmatic access.

2. **Permissions**:

- The token should have **read-only** or **admin** permissions that allow it to:
- View all datasets (physical and virtual).
- Access all spaces, folders, and sources.
- Retrieve dataset and column-level lineage information.

3. **Verify External Data Source Permissions**:
- If Dremio is connected to external data sources (e.g., AWS S3, relational databases), ensure that Dremio has access to the credentials required for querying those sources.
34 changes: 34 additions & 0 deletions metadata-ingestion/docs/sources/dremio/dremio_recipe.yml
@@ -0,0 +1,34 @@
source:
  type: dremio
  config:
    # Coordinates
    hostname: localhost
    port: 9047
    tls: true

    # Credentials with personal access token (recommended)
    authentication_method: PAT
    password: pass
    # OR credentials with basic auth
    # authentication_method: password
    # username: user
    # password: pass

    # For cloud instance
    # is_dremio_cloud: True
    # dremio_cloud_project_id: <project_id>

    include_query_lineage: True

    # Optional
    source_mappings:
      - platform: s3
        source_name: samples

    # Optional
    schema_pattern:
      allow:
        - "<source_name>.<table_name>"

sink:
  # sink configs
3 changes: 3 additions & 0 deletions metadata-ingestion/setup.py
@@ -396,6 +396,7 @@
"delta-lake": {*data_lake_profiling, *delta_lake},
"dbt": {"requests"} | dbt_common | aws_common,
"dbt-cloud": {"requests"} | dbt_common,
"dremio": {"requests"} | sql_common,
"druid": sql_common | {"pydruid>=0.6.2"},
"dynamodb": aws_common | classification_lib,
# Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws
@@ -616,6 +617,7 @@
"clickhouse-usage",
"cockroachdb",
"delta-lake",
"dremio",
"druid",
"elasticsearch",
"feast",
@@ -714,6 +716,7 @@
"s3 = datahub.ingestion.source.s3:S3Source",
"dbt = datahub.ingestion.source.dbt.dbt_core:DBTCoreSource",
"dbt-cloud = datahub.ingestion.source.dbt.dbt_cloud:DBTCloudSource",
"dremio = datahub.ingestion.source.dremio.dremio_source:DremioSource",
"druid = datahub.ingestion.source.sql.druid:DruidSource",
"dynamodb = datahub.ingestion.source.dynamodb.dynamodb:DynamoDBSource",
"elasticsearch = datahub.ingestion.source.elastic_search:ElasticsearchSource",
Empty file.