Merge branch 'datahub-project:master' into master
anshbansal authored Jan 8, 2024
2 parents a593883 + ade7b61 commit ea16284
Showing 28 changed files with 460 additions and 70 deletions.
1 change: 1 addition & 0 deletions .github/workflows/airflow-plugin.yml
@@ -76,6 +76,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Upload coverage to Codecov
         if: always()
         uses: codecov/codecov-action@v3
23 changes: 8 additions & 15 deletions .github/workflows/build-and-test.yml
@@ -29,11 +29,15 @@ jobs:
           "except_metadata_ingestion",
           "frontend",
         ]
-        timezone: ["UTC", "America/New_York"]
+        timezone: ["UTC"]
+        include:
+          # We only need the timezone variation for frontend tests.
+          - command: "frontend"
+            timezone: "America/New_York"
     runs-on: ubuntu-latest
     timeout-minutes: 60
     steps:
-      - uses: szenius/set-timezone@v1.0
+      - uses: szenius/set-timezone@v1.2
         with:
           timezoneLinux: ${{ matrix.timezone }}
       - uses: hsheth2/sane-checkout-action@v1
@@ -48,8 +52,7 @@ jobs:
           python-version: "3.10"
           cache: pip
       - name: Gradle build (and test) for metadata ingestion
-        # we only need the timezone runs for frontend tests
-        if: ${{ matrix.command == 'except_metadata_ingestion' && matrix.timezone == 'America/New_York' }}
+        if: ${{ matrix.command == 'except_metadata_ingestion' }}
         run: |
           ./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x :metadata-integration:java:spark-lineage:test -x :metadata-io:test -x :metadata-ingestion-modules:airflow-plugin:build -x :metadata-ingestion-modules:airflow-plugin:check -x :datahub-frontend:build -x :datahub-web-react:build --parallel
       - name: Gradle build (and test) for frontend
@@ -66,15 +69,9 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Ensure codegen is updated
         uses: ./.github/actions/ensure-codegen-updated
-      - name: Slack failure notification
-        if: failure() && github.event_name == 'push'
-        uses: kpritam/slack-job-status-action@v1
-        with:
-          job-status: ${{ job.status }}
-          slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
-          channel: github-activities

   quickstart-compose-validation:
     runs-on: ubuntu-latest
@@ -83,10 +80,6 @@ jobs:
       - uses: actions/setup-python@v4
         with:
           python-version: "3.10"
-      - name: Download YQ
-        uses: chrisdickinson/[email protected]
-        with:
-          yq-version: v4.28.2
       - name: Quickstart Compose Validation
         run: ./docker/quickstart/generate_and_compare.sh

8 changes: 1 addition & 7 deletions .github/workflows/docker-unified.yml
@@ -911,13 +911,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
-      - name: Slack failure notification
-        if: failure() && github.event_name == 'push'
-        uses: kpritam/slack-job-status-action@v1
-        with:
-          job-status: ${{ job.status }}
-          slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
-          channel: github-activities
+            !**/binary/**
   deploy_datahub_head:
     name: Deploy to Datahub HEAD
     runs-on: ubuntu-latest
1 change: 1 addition & 0 deletions .github/workflows/metadata-ingestion.yml
@@ -79,6 +79,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Upload coverage to Codecov
         if: ${{ always() && matrix.python-version == '3.10' }}
         uses: codecov/codecov-action@v3
1 change: 1 addition & 0 deletions .github/workflows/metadata-io.yml
@@ -50,6 +50,7 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
+            !**/binary/**
       - name: Ensure codegen is updated
         uses: ./.github/actions/ensure-codegen-updated

32 changes: 32 additions & 0 deletions .github/workflows/pr-labeler.yml
@@ -19,6 +19,38 @@ jobs:
if:
${{
!contains(
fromJson('[
"anshbansal",
"asikowitz",
"chriscollins3456",
"david-leifker",
"shirshanka",
"sid-acryl",
"swaroopjagadish",
"treff7es",
"yoonhyejin",
"eboneil",
"ethan-cartwright",
"gabe-lyons",
"hsheth2",
"jjoyce0510",
"maggiehays",
"mrjefflewis",
"pedro93",
"RyanHolstien"
]'),
github.actor
)
}}
with:
github_token: ${{ github.token }}
labels: |
community-contribution
- uses: actions-ecosystem/[email protected]
# only add names of champions here. Confirm with DevRel Team
if:
${{
contains(
fromJson('[
"skrydal",
"siladitya2",
8 changes: 1 addition & 7 deletions .github/workflows/spark-smoke-test.yml
@@ -68,10 +68,4 @@ jobs:
             **/build/reports/tests/test/**
             **/build/test-results/test/**
             **/junit.*.xml
-      - name: Slack failure notification
-        if: failure() && github.event_name == 'push'
-        uses: kpritam/slack-job-status-action@v1
-        with:
-          job-status: ${{ job.status }}
-          slack-bot-token: ${{ secrets.SLACK_BOT_TOKEN }}
-          channel: github-activities
+            !**/binary/**
@@ -21,7 +21,7 @@ type Props = {
     setShowDownloadAsCsvModal: (showDownloadAsCsvModal: boolean) => any;
 };

-const SEARCH_PAGE_SIZE_FOR_DOWNLOAD = 500;
+const SEARCH_PAGE_SIZE_FOR_DOWNLOAD = 200;

 export default function DownloadAsCsvModal({
     downloadSearchResults,
6 changes: 3 additions & 3 deletions datahub-web-react/yarn.lock
@@ -3,9 +3,9 @@


 "@adobe/css-tools@^4.3.1":
-  version "4.3.1"
-  resolved "https://registry.yarnpkg.com/@adobe/css-tools/-/css-tools-4.3.1.tgz#abfccb8ca78075a2b6187345c26243c1a0842f28"
-  integrity sha512-/62yikz7NLScCGAAST5SHdnjaDJQBDq0M2muyRTpf2VQhw6StBg2ALiu73zSJQ4fMVLA+0uBhBHAle7Wg+2kSg==
+  version "4.3.2"
+  resolved "https://registry.yarnpkg.com/@adobe/css-tools/-/css-tools-4.3.2.tgz#a6abc715fb6884851fca9dad37fc34739a04fd11"
+  integrity sha512-DA5a1C0gD/pLOvhv33YMrbf2FK3oUzwNl9oOJqE4XVjuEtt6XIakRcsd7eLiOSPkp1kTRQGICTA8cKra/vFbjw==

 "@ampproject/remapping@^2.2.0":
   version "2.2.1"
2 changes: 1 addition & 1 deletion docker/README.md
@@ -64,7 +64,7 @@ successful release on Github will automatically publish the images.
 To build the full images (that we are going to publish), you need to run the following:

 ```
-COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub build
+COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub build
 ```

 This is because we're relying on BuildKit for multistage builds. It does not hurt to also set `DATAHUB_VERSION` to
6 changes: 3 additions & 3 deletions docker/airflow/local_airflow.md
@@ -54,7 +54,7 @@ curl -L 'https://raw.githubusercontent.com/datahub-project/datahub/master/metada
 First you need to initialize airflow in order to create initial database tables and the initial airflow user.

 ```
-docker-compose up airflow-init
+docker compose up airflow-init
 ```

 You should see the following final initialization message
@@ -66,10 +66,10 @@ airflow_install_airflow-init_1 exited with code 0
 ```

-Afterwards you need to start the airflow docker-compose
+Afterwards you need to start the airflow docker compose

 ```
-docker-compose up
+docker compose up
 ```

 You should see a host of messages as Airflow starts up.
4 changes: 2 additions & 2 deletions docker/dev-with-cassandra.sh
@@ -23,13 +23,13 @@ fi
 # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS.
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 cd $DIR && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose \
     -f docker-compose-with-cassandra.yml \
     -f docker-compose.dev.yml \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE \
     pull \
 && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f docker-compose-with-cassandra.yml \
     -f docker-compose.dev.yml \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE \
4 changes: 2 additions & 2 deletions docker/dev-without-neo4j.sh
@@ -23,13 +23,13 @@ fi
 # Launches dev instances of DataHub images. See documentation for more details.
 # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS.
 cd "${DIR}/../.." && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose \
     -f "${DIR}/docker-compose-without-neo4j.yml" \
     -f "${DIR}/docker-compose-without-neo4j.override.yml" \
     -f "${DIR}/docker-compose.dev.yml" \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE pull \
 && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f "${DIR}/docker-compose-without-neo4j.yml" \
     -f "${DIR}/docker-compose-without-neo4j.override.yml" \
     -f "${DIR}/docker-compose.dev.yml" \
4 changes: 2 additions & 2 deletions docker/dev.sh
@@ -23,13 +23,13 @@ fi
 # YOU MUST BUILD VIA GRADLE BEFORE RUNNING THIS.
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 cd $DIR && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose \
     -f docker-compose.yml \
     -f docker-compose.override.yml \
     -f docker-compose.dev.yml \
     $CONSUMERS_COMPOSE $MONITORING_COMPOSE $M1_COMPOSE pull \
 && \
-  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f docker-compose.yml \
     -f docker-compose.override.yml \
     -f docker-compose.dev.yml \
2 changes: 1 addition & 1 deletion docker/ingestion/ingestion.sh
@@ -2,4 +2,4 @@

 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 export DATAHUB_VERSION=${DATAHUB_VERSION:-head}
-cd $DIR && docker-compose pull && docker-compose -p datahub up
+cd $DIR && docker compose pull && docker compose -p datahub up
6 changes: 3 additions & 3 deletions docker/nuke.sh
@@ -4,8 +4,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 cd $DIR

 # Tear down and clean up all DataHub-related containers, volumes, and network
-docker-compose -p datahub down -v
-docker-compose rm -f -v
+docker compose -p datahub down -v
+docker compose rm -f -v

 # Tear down ingestion container
-(cd ingestion && docker-compose -p datahub down -v)
+(cd ingestion && docker compose -p datahub down -v)
4 changes: 2 additions & 2 deletions docker/quickstart.sh
@@ -33,11 +33,11 @@ echo "Quickstarting DataHub: version ${DATAHUB_VERSION}"
 if docker volume ls | grep -c -q datahub_neo4jdata
 then
   echo "Datahub Neo4j volume found, starting with neo4j as graph service"
-  cd $DIR && docker-compose pull && docker-compose -p datahub up
+  cd $DIR && docker compose pull && docker compose -p datahub up
 else
   echo "No Datahub Neo4j volume found, starting with elasticsearch as graph service"
   cd $DIR && \
-  DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker-compose -p datahub \
+  DOCKER_DEFAULT_PLATFORM="$(uname -m)" docker compose -p datahub \
     -f quickstart/docker-compose-without-neo4j.quickstart.yml \
     $MONITORING_COMPOSE $CONSUMERS_COMPOSE $M1_COMPOSE up $@
 fi
4 changes: 2 additions & 2 deletions docs/developers.md
@@ -101,13 +101,13 @@ Replace whatever container you want in the existing deployment.
 i.e., replacing datahub's backend (GMS):

 ```shell
-(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-gms)
+(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-gms)
 ```

 Running the local version of the frontend:

 ```shell
-(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker-compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-frontend-react)
+(cd docker && COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 docker compose -p datahub -f docker-compose-without-neo4j.yml -f docker-compose-without-neo4j.override.yml -f docker-compose.dev.yml up -d --no-deps --force-recreate --build datahub-frontend-react)
 ```

 ## IDE Support
2 changes: 1 addition & 1 deletion docs/quickstart.md
@@ -274,7 +274,7 @@ It is not intended for a production environment. This recommendation is based on

 #### Default Credentials

-`quickstart` uses docker-compose configuration which includes default credentials for both DataHub, and it's underlying
+`quickstart` uses docker compose configuration which includes default credentials for both DataHub, and it's underlying
 prerequisite data stores, such as MySQL. Additionally, other components are unauthenticated out of the box. This is a
 design choice to make development easier and is not best practice for a production environment.
1 change: 1 addition & 0 deletions metadata-ingestion/build.gradle
@@ -70,6 +70,7 @@ task customPackageGenerate(type: Exec, dependsOn: [environmentSetup, installPack
   def package_version = project.findProperty('package_version')
   commandLine 'bash', '-c',
     "source ${venv_name}/bin/activate && " +
+    "pip install build && " +
     "./scripts/custom_package_codegen.sh '${package_name}' '${package_version}'"
 }

70 changes: 70 additions & 0 deletions metadata-ingestion/docs/transformer/dataset_transformer.md
@@ -14,6 +14,7 @@ The below table shows transformer which can transform aspects of entity [Dataset
 | `schemaMetadata` | - [Pattern Add Dataset Schema Field glossaryTerms](#pattern-add-dataset-schema-field-glossaryterms)<br/> - [Pattern Add Dataset Schema Field globalTags](#pattern-add-dataset-schema-field-globaltags) |
 | `datasetProperties` | - [Simple Add Dataset datasetProperties](#simple-add-dataset-datasetproperties)<br/> - [Add Dataset datasetProperties](#add-dataset-datasetproperties) |
 | `domains` | - [Simple Add Dataset domains](#simple-add-dataset-domains)<br/> - [Pattern Add Dataset domains](#pattern-add-dataset-domains) |
+| `dataProduct` | - [Simple Add Dataset dataProduct](#simple-add-dataset-dataproduct)<br/> - [Pattern Add Dataset dataProduct](#pattern-add-dataset-dataproduct)<br/> - [Add Dataset dataProduct](#add-dataset-dataproduct) |

 ## Extract Ownership from Tags
 ### Config Details
@@ -961,6 +962,75 @@ in both of the cases domain should be provisioned on DataHub GMS
'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.n.*': ["hr"]
'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.t.*': ["urn:li:domain:finance"]
```
## Simple Add Dataset dataProduct
### Config Details
| Field | Required | Type | Default | Description |
|-------------------------------|----------|-----------------|---------------|----------------------------------------------------------------------------------------|
| `dataset_to_data_product_urns` | ✅ | Dict[str, str] | | Dataset entity urn as key and dataproduct urn as value; the dataproduct is created with the dataset as an asset. |

Let’s suppose we’d like to add a set of dataproducts, each with specific datasets as its assets. To do so, we can use the `simple_add_dataset_dataproduct` transformer that’s included in the ingestion framework.

The config, which we’d append to our ingestion recipe YAML, would look like this:

```yaml
transformers:
- type: "simple_add_dataset_dataproduct"
config:
dataset_to_data_product_urns:
"urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)": "urn:li:dataProduct:first"
"urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)": "urn:li:dataProduct:second"
```

## Pattern Add Dataset dataProduct
### Config Details
| Field | Required | Type | Default | Description |
|---------------------------------------|----------|----------------------|-------------|---------------------------------------------------------------------------------------------|
| `dataset_to_data_product_urns_pattern` | ✅ | map[regex, urn] | | Regular expression matched against dataset entity urns, mapped to the dataproduct urn to apply to matching datasets. |

Let’s suppose we’d like to append a series of dataproducts with specific datasets as their assets. To do so, we can use the `pattern_add_dataset_dataproduct` module that’s included in the ingestion framework. This will match the regex pattern against the `urn` of each dataset and create the data product entity with the given urn, adding the matched datasets as its assets.

The config, which we’d append to our ingestion recipe YAML, would look like this:

```yaml
transformers:
- type: "pattern_add_dataset_dataproduct"
config:
dataset_to_data_product_urns_pattern:
rules:
".*example1.*": "urn:li:dataProduct:first"
".*example2.*": "urn:li:dataProduct:second"
```

## Add Dataset dataProduct
### Config Details
| Field | Required | Type | Default | Description |
|-----------------------------|----------|-----------------------------------|---------------|------------------------------------------------------------------------------------------|
| `get_data_product_to_add` | ✅ | callable[[str], Optional[str]] | | A function which takes a dataset entity urn as input and returns the dataproduct urn to create. |

If you'd like to add more complex logic for creating dataproducts, you can use the more generic `add_dataset_dataproduct` transformer, which calls a user-provided function to determine the dataproduct to create with the specified datasets as its assets.

```yaml
transformers:
- type: "add_dataset_dataproduct"
config:
get_data_product_to_add: "<your_module>.<your_function>"
```

Then define your function to return a dataproduct entity urn, for example:

```python
from typing import Optional

import datahub.emitter.mce_builder as builder


def custom_dataproducts(dataset_urn: str) -> Optional[str]:
    """Compute the dataproduct urn for a given dataset urn."""
    dataset_to_data_product_map = {
        builder.make_dataset_urn("bigquery", "example1"): "urn:li:dataProduct:first"
    }
    return dataset_to_data_product_map.get(dataset_urn)
```
Finally, you can install and use your custom transformer as [shown here](#installing-the-package).

## Relationship Between replace_existing and semantics
The transformer behaviour described here is in the context of `simple_add_dataset_ownership`; however, it applies to all dataset transformers that support the `replace_existing`
and `semantics` configuration attributes. For example, `simple_add_dataset_tags` will add or remove tags following the same behaviour.
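
As an illustrative sketch only (the transformer choice and tag urn are placeholders, and the exact config keys should be checked against the transformer's own documentation), a recipe fragment combining these two attributes might look like:

```yaml
transformers:
  - type: "simple_add_dataset_tags"
    config:
      # Assumed flag: drop the tags already present on the aspect before adding new ones.
      replace_existing: true
      # OVERWRITE replaces the server-side state; PATCH merges with it.
      semantics: OVERWRITE
      tag_urns:
        - "urn:li:tag:NeedsDocumentation"  # hypothetical tag urn
```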