Merge branch 'datahub-project:master' into master
anshbansal authored Jan 8, 2024
2 parents a593883 + ade7b61 commit ea16284
Showing 28 changed files with 460 additions and 70 deletions.
1 change: 1 addition & 0 deletions .github/workflows/airflow-plugin.yml
@@ -76,6 +76,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Upload coverage to Codecov
         if: always()
         uses: codecov/codecov-action@v3
23 changes: 8 additions & 15 deletions .github/workflows/build-and-test.yml
@@ -29,11 +29,15 @@ jobs:
           "except_metadata_ingestion",
           "frontend",
         ]
-        timezone: ["UTC", "America/New_York"]
+        timezone: ["UTC"]
+        include:
+          # We only need the timezone variation for frontend tests.
+          - command: "frontend"
+            timezone: "America/New_York"
     runs-on: ubuntu-latest
     timeout-minutes: 60
     steps:
-      - uses: szenius/set-timezone@v1.0
+      - uses: szenius/set-timezone@v1.2
         with:
           timezoneLinux: ${{ matrix.timezone }}
       - uses: hsheth2/sane-checkout-action@v1
@@ -48,8 +52,7 @@ jobs:
           python-version: "3.10"
           cache: pip
       - name: Gradle build (and test) for metadata ingestion
-        # we only need the timezone runs for frontend tests
-        if: ${{ matrix.command == 'except_metadata_ingestion' && matrix.timezone == 'America/New_York' }}
+        if: ${{ matrix.command == 'except_metadata_ingestion' }}
         run: |
           ./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x :metadata-integration:java:spark-lineage:test -x :metadata-io:test -x :metadata-ingestion-modules:airflow-plugin:build -x :metadata-ingestion-modules:airflow-plugin:check -x :datahub-frontend:build -x :datahub-web-react:build --parallel
       - name: Gradle build (and test) for frontend
@@ -66,15 +69,9 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Ensure codegen is updated
         uses: ./.github/actions/ensure-codegen-updated
-      - name: Slack failure notification
-        if: failure() && github.event_name == 'push'
-        uses: kpritam/slack-job-status-action@v1
-        with:
-          job-status: ${{ job.status }}
-          slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
-          channel: github-activities

   quickstart-compose-validation:
     runs-on: ubuntu-latest
@@ -83,10 +80,6 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: "3.10"
-      - name: Download YQ
-        uses: chrisdickinson/[email protected]
-        with:
-          yq-version: v4.28.2
       - name: Quickstart Compose Validation
         run: ./docker/quickstart/generate_and_compare.sh

8 changes: 1 addition & 7 deletions .github/workflows/docker-unified.yml
@@ -911,13 +911,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
-      - name: Slack failure notification
-        if: failure() && github.event_name == 'push'
-        uses: kpritam/slack-job-status-action@v1
-        with:
-          job-status: ${{ job.status }}
-          slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
-          channel: github-activities
+            !**/binary/**
   deploy_datahub_head:
     name: Deploy to Datahub HEAD
     runs-on: ubuntu-latest
1 change: 1 addition & 0 deletions .github/workflows/metadata-ingestion.yml
@@ -79,6 +79,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Upload coverage to Codecov
         if: ${{ always() && matrix.python-version == '3.10' }}
         uses: codecov/codecov-action@v3
1 change: 1 addition & 0 deletions .github/workflows/metadata-io.yml
@@ -50,6 +50,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Ensure codegen is updated
         uses: ./.github/actions/ensure-codegen-updated

32 changes: 32 additions & 0 deletions .github/workflows/pr-labeler.yml
@@ -19,6 +19,38 @@ jobs:
if:
${{
!contains(
fromJson('[
"anshbansal",
"asikowitz",
"chriscollins3456",
"david-leifker",
"shirshanka",
"sid-acryl",
"swaroopjagadish",
"treff7es",
"yoonhyejin",
"eboneil",
"ethan-cartwright",
"gabe-lyons",
"hsheth2",
"jjoyce0510",
"maggiehays",
"mrjefflewis",
"pedro93",
"RyanHolstien"
]'),
github.actor
)
}}
with:
github_token: ${{ github.token }}
labels: |
community-contribution
- uses: actions-ecosystem/[email protected]
# only add names of champions here. Confirm with DevRel Team
if:
${{
contains(
fromJson('[
"skrydal",
"siladitya2",
8 changes: 1 addition & 7 deletions .github/workflows/spark-smoke-test.yml
@@ -68,10 +68,4 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
-      - name: Slack failure notification
-        if: failure() && github.event_name == 'push'
-        uses: kpritam/slack-job-status-action@v1
-        with:
-          job-status: ${{ job.status }}
-          slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
-          channel: github-activities
+            !**/binary/**
@@ -21,7 +21,7 @@ type Props = {
     setShowDownloadAsCsvModal: (showDownloadAsCsvModal: boolean) => any;
 };

-const SEARCH_PAGE_SIZE_FOR_DOWNLOAD = 500;
+const SEARCH_PAGE_SIZE_FOR_DOWNLOAD = 200;

 export default function DownloadAsCsvModal({
     downloadSearchResults,
6 changes: 3 additions & 3 deletions datahub-web-react/yarn.lock
@@ -3,9 +3,9 @@


 "@adobe/css-tools@^4.3.1":
-  version "4.3.1"
-  resolved "https://registry.yarnpkg.com/@adobe/css-tools/-/css-tools-4.3.1.tgz#abfccb8ca78075a2b6187345c26243c1a0842f28"
-  integrity sha512-/62yikz7NLScCGAAST5SHdnjaDJQBDq0M2muyRTpf2VQhw6StBg2ALiu73zSJQ4fMVLA+0uBhBHAle7Wg+2kSg==
+  version "4.3.2"
+  resolved "https://registry.yarnpkg.com/@adobe/css-tools/-/css-tools-4.3.2.tgz#a6abc715fb6884851fca9dad37fc34739a04fd11"
+  integrity sha512-DA5a1C0gD/pLOvhv33YMrbf2FK3oUzwNl9oOJqE4XVjuEtt6XIakRcsd7eLiOSPkp1kTRQGICTA8cKra/vFbjw==

 "@ampproject/remapping@^2.2.0":
   version "2.2.1"
2 changes: 1 addition & 1 deletion docker/README.md
@@ -64,7 +64,7 @@ successful release on Github will automatically publish the images.
 To build the full images (that we are going to publish), you need to run the following:

 ```
-COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub build
+COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub build
 ```

 This is because we're relying on BuildKit for multistage builds. It does not hurt to also set `DATAHUB_VERSION` to
6 changes: 3 additions & 3 deletions docker/airflow/local_airflow.md
@@ -54,7 +54,7 @@ curl -L 'https://raw.githubusercontent.com/datahub-project/datahub/master/metada
 First you need to initialize airflow in order to create initial database tables and the initial airflow user.

 ```
-docker-compose up airflow-init
+docker compose up airflow-init
 ```

 You should see the following final initialization message
@@ -66,10 +66,10 @@ airflow_install_airflow-init_1 exited with code 0
 ```

-Afterwards you need to start the airflow docker-compose
+Afterwards you need to start the airflow docker compose

 ```
-docker-compose up
+docker compose up
 ```

 You should see a host of messages as Airflow starts up.
4 changes: 2 additions & 2 deletions docker/dev-with-cassandra.sh
@@ -23,13 +23,13 @@ fi
 # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS.
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 cd $DIR && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose \
     -f docker-compose-with-cassandra.yml \
     -f docker-compose.dev.yml \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE \
     pull \
 && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f docker-compose-with-cassandra.yml \
     -f docker-compose.dev.yml \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE \
4 changes: 2 additions & 2 deletions docker/dev-without-neo4j.sh
@@ -23,13 +23,13 @@ fi
 # Launches dev instances of DataHub images. See documentation for more details.
 # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS.
 cd "${DIR}/../.." && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose \
     -f "${DIR}/docker-compose-without-neo4j.yml" \
     -f "${DIR}/docker-compose-without-neo4j.override.yml" \
     -f "${DIR}/docker-compose.dev.yml" \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE pull \
 && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f "${DIR}/docker-compose-without-neo4j.yml" \
     -f "${DIR}/docker-compose-without-neo4j.override.yml" \
     -f "${DIR}/docker-compose.dev.yml" \
4 changes: 2 additions & 2 deletions docker/dev.sh
@@ -23,13 +23,13 @@ fi
 # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS.
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 cd $DIR && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose \
     -f docker-compose.yml \
     -f docker-compose.override.yml \
     -f docker-compose.dev.yml \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE pull \
 && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f docker-compose.yml \
     -f docker-compose.override.yml \
     -f docker-compose.dev.yml \
2 changes: 1 addition & 1 deletion docker/ingestion/ingestion.sh
@@ -2,4 +2,4 @@

 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 export DATAHUB_VERSION=${DATAHUB_VERSION:-head}
-cd $DIR && docker-compose pull && docker-compose -p datahub up
+cd $DIR && docker compose pull && docker compose -p datahub up
6 changes: 3 additions & 3 deletions docker/nuke.sh
@@ -4,8 +4,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 cd $DIR

 # Tear down and clean up all DataHub-related containers, volumes, and network
-docker-compose -p datahub down -v
-docker-compose rm -f -v
+docker compose -p datahub down -v
+docker compose rm -f -v

 # Tear down ingestion container
-(cd ingestion && docker-compose -p datahub down -v)
+(cd ingestion && docker compose -p datahub down -v)
4 changes: 2 additions & 2 deletions docker/quickstart.sh
@@ -33,11 +33,11 @@ echo "Quickstarting DataHub: version ${DATAHUB_VERSION}"
 if docker volume ls | grep -c -q datahub_neo4jdata
 then
   echo "Datahub Neo4j volume found, starting with neo4j as graph service"
-  cd $DIR && docker-compose pull && docker-compose -p datahub up
+  cd $DIR && docker compose pull && docker compose -p datahub up
 else
   echo "No Datahub Neo4j volume found, starting with elasticsearch as graph service"
   cd $DIR && \
-  DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f quickstart/docker-compose-without-neo4j.quickstart.yml \
     $MONITORING_COMPOSE $CONSUMERS_COMPOSE $M1_COMPOSE up $@
 fi
4 changes: 2 additions & 2 deletions docs/developers.md
@@ -101,13 +101,13 @@ Replace whatever container you want in the existing deployment.
 i.e., replacing datahub's backend (GMS):

 ```shell
-(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-gms)
+(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-gms)
 ```

 Running the local version of the frontend:

 ```shell
-(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-frontend-react)
+(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-frontend-react)
 ```

 ## IDE Support
2 changes: 1 addition & 1 deletion docs/quickstart.md
@@ -274,7 +274,7 @@ It is not intended for a production environment. This recommendation is based on

 #### Default Credentials

-`quickstart` uses docker-compose configuration which includes default credentials for both DataHub, and it's underlying
+`quickstart` uses docker compose configuration which includes default credentials for both DataHub, and it's underlying
 prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a
 design choice to make development easier and is not best practice for a production environment.
1 change: 1 addition & 0 deletions metadata-ingestion/build.gradle
@@ -70,6 +70,7 @@ task customPackageGenerate(type: Exec, dependsOn: [environmentSetup, installPack
   def package_version = project.findProperty('package_version')
   commandLine 'bash', '-c',
     "source ${venv_name}/bin/activate && " +
+    "pip install build && " +
     "./scripts/custom_package_codegen.sh '${package_name}' '${package_version}'"
 }

70 changes: 70 additions & 0 deletions metadata-ingestion/docs/transformer/dataset_transformer.md
@@ -14,6 +14,7 @@ The below table shows transformer which can transform aspects of entity [Dataset
 | `schemaMetadata` | - [Pattern Add Dataset Schema Field glossaryTerms](#pattern-add-dataset-schema-field-glossaryterms)<br/> - [Pattern Add Dataset Schema Field globalTags](#pattern-add-dataset-schema-field-globaltags) |
 | `datasetProperties` | - [Simple Add Dataset datasetProperties](#simple-add-dataset-datasetproperties)<br/> - [Add Dataset datasetProperties](#add-dataset-datasetproperties) |
 | `domains` | - [Simple Add Dataset domains](#simple-add-dataset-domains)<br/> - [Pattern Add Dataset domains](#pattern-add-dataset-domains) |
+| `dataProduct` | - [Simple Add Dataset dataProduct](#simple-add-dataset-dataproduct)<br/> - [Pattern Add Dataset dataProduct](#pattern-add-dataset-dataproduct)<br/> - [Add Dataset dataProduct](#add-dataset-dataproduct) |

 ## Extract Ownership from Tags
 ### Config Details
@@ -961,6 +962,75 @@ in both of the cases domain should be provisioned on DataHub GMS
'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.n.*': ["hr"]
'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.t.*': ["urn:li:domain:finance"]
```
## Simple Add Dataset dataProduct
### Config Details
| Field | Required | Type | Default | Description |
|-------------------------------|----------|-----------------|---------------|----------------------------------------------------------------------------------------|
| `dataset_to_data_product_urns` | ✅ | Dict[str, str] | | Dataset entity urn as key and dataproduct urn as value; the dataproduct is created with the dataset as an asset. |

Let’s suppose we’d like to add a set of dataproducts, each with specific datasets as its assets. To do so, we can use the `simple_add_dataset_dataproduct` transformer that’s included in the ingestion framework.

The config, which we’d append to our ingestion recipe YAML, would look like this:

```yaml
transformers:
- type: "simple_add_dataset_dataproduct"
config:
dataset_to_data_product_urns:
"urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)": "urn:li:dataProduct:first"
"urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)": "urn:li:dataProduct:second"
```

## Pattern Add Dataset dataProduct
### Config Details
| Field | Required | Type | Default | Description |
|---------------------------------------|----------|----------------------|-------------|---------------------------------------------------------------------------------------------|
| `dataset_to_data_product_urns_pattern` | ✅ | map[regex, urn] | | Regular expression matched against dataset entity urns, mapped to the dataproduct urn to apply to matching datasets. |

Let’s suppose we’d like to append a series of dataproducts with specific datasets as their assets. To do so, we can use the `pattern_add_dataset_dataproduct` module that’s included in the ingestion framework. This will match the regex pattern against the `urn` of each dataset and create the data product entity with the given urn, adding the matched datasets as its assets.

The config, which we’d append to our ingestion recipe YAML, would look like this:

```yaml
transformers:
- type: "pattern_add_dataset_dataproduct"
config:
dataset_to_data_product_urns_pattern:
rules:
".*example1.*": "urn:li:dataProduct:first"
".*example2.*": "urn:li:dataProduct:second"
```

## Add Dataset dataProduct
### Config Details
| Field | Required | Type | Default | Description |
|-----------------------------|----------|-----------------------------------|---------------|------------------------------------------------------------------------------------------|
| `get_data_product_to_add` | ✅ | callable[[str], Optional[str]] | | A function which takes a dataset entity urn as input and returns the dataproduct urn to create. |

If you'd like to add more complex logic for creating dataproducts, you can use the more generic `add_dataset_dataproduct` transformer, which calls a user-provided function to determine the dataproduct to create with the specified datasets as its assets.

```yaml
transformers:
- type: "add_dataset_dataproduct"
config:
get_data_product_to_add: "<your_module>.<your_function>"
```

Then define your function to return a dataproduct entity urn, for example:

```python
from typing import Optional

import datahub.emitter.mce_builder as builder


def custom_dataproducts(dataset_urn: str) -> Optional[str]:
    """Compute the dataproduct urn for a given dataset urn."""
    dataset_to_data_product_map = {
        builder.make_dataset_urn("bigquery", "example1"): "urn:li:dataProduct:first"
    }
    return dataset_to_data_product_map.get(dataset_urn)
```
Finally, you can install and use your custom transformer as [shown here](#installing-the-package).

## Relationship Between replace_existing and semantics
The transformer behaviour described here is in the context of `simple_add_dataset_ownership`; however, it applies to all dataset transformers that support the `replace_existing`
and `semantics` configuration attributes. For example, `simple_add_dataset_tags` will add or remove tags following the same behaviour.
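
As an illustrative sketch only (the transformer choice and tag urn are placeholders, and the exact config keys should be checked against the transformer's own documentation), a recipe fragment combining these two attributes might look like:

```yaml
transformers:
  - type: "simple_add_dataset_tags"
    config:
      # Assumed flag: drop the tags already present on the aspect before adding new ones.
      replace_existing: true
      # OVERWRITE replaces the server-side state; PATCH merges with it.
      semantics: OVERWRITE
      tag_urns:
        - "urn:li:tag:NeedsDocumentation"  # hypothetical tag urn
```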