Merge branch 'datahub-project:master' into master
anshbansal authored Nov 16, 2024
2 parents e0ac314 + 17c9fcf commit 6b6b3ed
Showing 29 changed files with 6,366 additions and 100 deletions.
2 changes: 1 addition & 1 deletion build.gradle
@@ -393,7 +393,7 @@ subprojects {
    implementation externalDependency.annotationApi
    constraints {
      implementation("com.google.googlejavaformat:google-java-format:$googleJavaFormatVersion")
-     implementation('io.netty:netty-all:4.1.114.Final')
+     implementation('io.netty:netty-all:4.1.115.Final')
      implementation('org.apache.commons:commons-compress:1.27.1')
      implementation('org.apache.velocity:velocity-engine-core:2.4')
      implementation('org.hibernate:hibernate-validator:6.0.20.Final')
@@ -1,7 +1,7 @@
import React from 'react';
import { Alert } from 'antd';

-const CSV_FORMAT_LINK = 'https://datahubproject.io/docs/generated/ingestion/sources/csv';
+const CSV_FORMAT_LINK = 'https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher';

export const CSVInfo = () => {
    const link = (
4 changes: 4 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/constants.ts
@@ -36,6 +36,7 @@ import csvLogo from '../../../../images/csv-logo.png';
import qlikLogo from '../../../../images/qliklogo.png';
import sigmaLogo from '../../../../images/sigmalogo.png';
import sacLogo from '../../../../images/saclogo.svg';
import cassandraLogo from '../../../../images/cassandralogo.png';
import datahubLogo from '../../../../images/datahublogo.png';

export const ATHENA = 'athena';
@@ -129,6 +130,8 @@ export const SIGMA = 'sigma';
export const SIGMA_URN = `urn:li:dataPlatform:${SIGMA}`;
export const SAC = 'sac';
export const SAC_URN = `urn:li:dataPlatform:${SAC}`;
export const CASSANDRA = 'cassandra';
export const CASSANDRA_URN = `urn:li:dataPlatform:${CASSANDRA}`;
export const DATAHUB = 'datahub';
export const DATAHUB_GC = 'datahub-gc';
export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file';
@@ -175,6 +178,7 @@ export const PLATFORM_URN_TO_LOGO = {
    [QLIK_SENSE_URN]: qlikLogo,
    [SIGMA_URN]: sigmaLogo,
    [SAC_URN]: sacLogo,
    [CASSANDRA_URN]: cassandraLogo,
    [DATAHUB_URN]: datahubLogo,
};

7 changes: 7 additions & 0 deletions datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -310,5 +310,12 @@
"description": "Import Spaces, Sources, Tables and statistics from Dremio.",
"docsUrl": "https://datahubproject.io/docs/metadata-ingestion/",
"recipe": "source:\n type: dremio\n config:\n # Coordinates\n hostname: null\n port: null\n #true if https, otherwise false\n tls: true\n\n #For cloud instance\n #is_dremio_cloud: True\n #dremio_cloud_project_id: <project_id>\n\n #Credentials with personal access token\n authentication_method: PAT\n password: pass\n\n #Or Credentials with basic auth\n #authentication_method: password\n #username: null\n #password: null\n\n stateful_ingestion:\n enabled: true"
},
{
"urn": "urn:li:dataPlatform:cassandra",
"name": "cassandra",
"displayName": "CassandraDB",
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra",
"recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]"
}
]
Binary file added datahub-web-react/src/images/cassandralogo.png
27 changes: 27 additions & 0 deletions docs/how/search.md
@@ -359,6 +359,33 @@ queryConfigurations:
      boost_mode: multiply
```
A similar example boosts entities tagged with both `primary` AND `gold`, instead of the previous example's OR condition:

```yaml
queryConfigurations:
  - queryRegex: .*
    simpleQuery: true
    prefixMatchQuery: true
    exactMatchQuery: true
    functionScore:
      functions:
        - filter:
            bool:
              filter:
                - term:
                    tags.keyword: urn:li:tag:primary
                - term:
                    tags.keyword: urn:li:tag:gold
          weight: 3.0
      score_mode: multiply
      boost_mode: multiply
```

##### Example 2: Preferred Data Platform

Boost the `urn:li:dataPlatform:hive` platform.
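A sketch of a matching configuration, following the pattern of the tag examples above (matching the platform URN via a `platform` term, as shown, is an assumption):

```yaml
queryConfigurations:
  - queryRegex: .*
    simpleQuery: true
    prefixMatchQuery: true
    exactMatchQuery: true
    functionScore:
      functions:
        - filter:
            term:
              platform: urn:li:dataPlatform:hive
          weight: 3.0
      score_mode: multiply
      boost_mode: multiply
```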
40 changes: 40 additions & 0 deletions metadata-ingestion/docs/sources/cassandra/cassandra_pre.md
@@ -0,0 +1,40 @@
### Setup

This integration pulls metadata directly from Cassandra databases, including both **DataStax Astra DB** and **Cassandra Enterprise Edition (EE)**.

You'll need a Cassandra instance or an Astra DB setup with appropriate access permissions.

#### Steps to Get the Required Information

1. **Set Up User Credentials**:

   - **For Astra DB**:
     - Log in to your Astra DB Console.
     - Navigate to **Organization Settings** > **Token Management**.
     - Generate an **Application Token** with the required permissions for read access.
     - Download the **Secure Connect Bundle** from the Astra DB Console.
   - **For Cassandra EE**:
     - Ensure you have a **username** and **password** with read access to the necessary keyspaces.

2. **Permissions**:

   - The user or token must have `SELECT` permissions that allow it to:
     - Access metadata in system keyspaces (e.g., `system_schema`) to retrieve information about keyspaces, tables, columns, and views.
     - Perform `SELECT` operations on the data tables if data profiling is enabled.
   - See the sketch of example CQL grants after this list.

3. **Verify Database Access**:
   - For Astra DB: Ensure the **Secure Connect Bundle** is used and configured correctly.
   - For open-source Cassandra: Ensure the **contact point** and **port** are accessible.
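For a self-hosted cluster with authorization enabled, the grants might look like the following CQL sketch (the `datahub` role name, its password, and `my_keyspace` are placeholders; whether `system_schema` needs an explicit grant varies by setup):

```sql
-- Hypothetical role used by the DataHub ingestion recipe.
CREATE ROLE IF NOT EXISTS datahub WITH PASSWORD = 'password' AND LOGIN = true;

-- Metadata access to keyspaces, tables, columns, and views.
GRANT SELECT ON KEYSPACE system_schema TO datahub;

-- Data access, needed only when profiling is enabled.
GRANT SELECT ON KEYSPACE my_keyspace TO datahub;
```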


:::caution

When enabling profiling, make sure to set a limit on the number of rows to sample. Profiling large tables without a limit may lead to excessive resource consumption and slow performance.

:::

:::note

For cloud configuration with Astra DB, the recipe must reference a local path to the Secure Connect Bundle; for that reason, use the CLI to ingest metadata into DataHub.

:::
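A minimal sketch of such a CLI run, assuming the ingestion recipe is saved locally as `cassandra_recipe.yml`:

```shell
datahub ingest -c cassandra_recipe.yml
```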
30 changes: 30 additions & 0 deletions metadata-ingestion/docs/sources/cassandra/cassandra_recipe.yml
@@ -0,0 +1,30 @@
source:
  type: "cassandra"
  config:
    # Credentials for on prem cassandra
    contact_point: "localhost"
    port: 9042
    username: "admin"
    password: "password"

    # Or
    # Credentials Astra Cloud
    #cloud_config:
    #  secure_connect_bundle: "Path to Secure Connect Bundle (.zip)"
    #  token: "Application Token"

    # Optional Allow / Deny extraction of particular keyspaces.
    keyspace_pattern:
      allow: [".*"]

    # Optional Allow / Deny extraction of particular tables.
    table_pattern:
      allow: [".*"]

    # Optional
    profiling:
      enabled: true
      profile_table_level_only: true

sink:
  # config sinks
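The recipe above leaves the sink section open; as one common sketch, assuming a locally running DataHub GMS on its default port:

```yaml
sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```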
7 changes: 7 additions & 0 deletions metadata-ingestion/docs/transformer/intro.md
@@ -26,16 +26,23 @@ DataHub provided transformers for dataset are:
- [Simple Add Dataset ownership](./dataset_transformer.md#simple-add-dataset-ownership)
- [Pattern Add Dataset ownership](./dataset_transformer.md#pattern-add-dataset-ownership)
- [Simple Remove Dataset ownership](./dataset_transformer.md#simple-remove-dataset-ownership)
- [Extract Ownership from Tags](./dataset_transformer.md#extract-ownership-from-tags)
- [Clean suffix prefix from Ownership](./dataset_transformer.md#clean-suffix-prefix-from-ownership)
- [Mark Dataset Status](./dataset_transformer.md#mark-dataset-status)
- [Simple Add Dataset globalTags](./dataset_transformer.md#simple-add-dataset-globaltags)
- [Pattern Add Dataset globalTags](./dataset_transformer.md#pattern-add-dataset-globaltags)
- [Add Dataset globalTags](./dataset_transformer.md#add-dataset-globaltags)
- [Set Dataset browsePath](./dataset_transformer.md#set-dataset-browsepath)
- [Simple Add Dataset glossaryTerms](./dataset_transformer.md#simple-add-dataset-glossaryterms)
- [Pattern Add Dataset glossaryTerms](./dataset_transformer.md#pattern-add-dataset-glossaryterms)
- [Add Dataset globalTags](./dataset_transformer.md#add-dataset-globaltags)
- [Pattern Add Dataset Schema Field glossaryTerms](./dataset_transformer.md#pattern-add-dataset-schema-field-glossaryterms)
- [Pattern Add Dataset Schema Field globalTags](./dataset_transformer.md#pattern-add-dataset-schema-field-globaltags)
- [Simple Add Dataset datasetProperties](./dataset_transformer.md#simple-add-dataset-datasetproperties)
- [Add Dataset datasetProperties](./dataset_transformer.md#add-dataset-datasetproperties)
- [Simple Add Dataset domains](./dataset_transformer.md#simple-add-dataset-domains)
- [Pattern Add Dataset domains](./dataset_transformer.md#pattern-add-dataset-domains)
- [Domain Mapping Based on Tags](./dataset_transformer.md#domain-mapping-based-on-tags)
- [Simple Add Dataset dataProduct](./dataset_transformer.md#simple-add-dataset-dataproduct)
- [Pattern Add Dataset dataProduct](./dataset_transformer.md#pattern-add-dataset-dataproduct)
- [Add Dataset dataProduct](./dataset_transformer.md#add-dataset-dataproduct)
9 changes: 9 additions & 0 deletions metadata-ingestion/setup.py
@@ -404,6 +404,13 @@
    # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/release-notes.html#rn-7-14-0
    # https://github.com/elastic/elasticsearch-py/issues/1639#issuecomment-883587433
    "elasticsearch": {"elasticsearch==7.13.4"},
    "cassandra": {
        "cassandra-driver>=3.28.0",
        # We were seeing an error like this `numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject`
        # with numpy 2.0. This likely indicates a mismatch between scikit-learn and numpy versions.
        # https://stackoverflow.com/questions/40845304/runtimewarning-numpy-dtype-size-changed-may-indicate-binary-incompatibility
        "numpy<2",
    },
    "feast": {
        "feast>=0.34.0,<1",
        "flask-openid>=1.3.0",
@@ -660,6 +667,7 @@
"qlik-sense",
"sigma",
"sac",
"cassandra",
]
if plugin
for dependency in plugins[plugin]
@@ -778,6 +786,7 @@
"qlik-sense = datahub.ingestion.source.qlik_sense.qlik_sense:QlikSenseSource",
"sigma = datahub.ingestion.source.sigma.sigma:SigmaSource",
"sac = datahub.ingestion.source.sac.sac:SACSource",
"cassandra = datahub.ingestion.source.cassandra.cassandra:CassandraSource",
],
"datahub.ingestion.transformer.plugins": [
"pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership",
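With the extra and the entry point registered above, the new source should be installable like the other plugins (a sketch, using the published package name):

```shell
pip install 'acryl-datahub[cassandra]'
```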
