diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index a4fb895a4d068b..b6f734622e6d95 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -479,6 +479,39 @@ jobs: context: . file: ./docker/kafka-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + kafka_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan Kafka Setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, kafka_setup_build ] + if: ${{ needs.setup.outputs.kafka_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true') }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" mysql_setup_build: name: Build and Push DataHub MySQL Setup Docker Image @@ -500,6 +533,39 @@ jobs: context: . file: ./docker/mysql-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + mysql_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan MySQL Setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, mysql_setup_build ] + if: ${{ needs.setup.outputs.mysql_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true') }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" elasticsearch_setup_build: name: Build and Push DataHub Elasticsearch Setup Docker Image @@ -521,6 +587,39 @@ jobs: context: . file: ./docker/elasticsearch-setup/Dockerfile platforms: linux/amd64,linux/arm64/v8 + elasticsearch_setup_scan: + permissions: + contents: read # for actions/checkout to fetch code + security-events: write # for github/codeql-action/upload-sarif to upload SARIF results + actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status + name: "[Monitoring] Scan ElasticSearch setup images for vulnerabilities" + runs-on: ubuntu-latest + needs: [ setup, elasticsearch_setup_build ] + if: ${{ needs.setup.outputs.elasticsearch_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' ) }} + steps: + - name: Checkout # adding checkout step just to make trivy upload happy + uses: acryldata/sane-checkout-action@v3 + - name: Download image + uses: ishworkh/docker-image-artifact-download@v1 + if: ${{ needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' }} + with: + image: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@0.25.0 + env: + TRIVY_OFFLINE_SCAN: true + with: + image-ref: ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + severity: "CRITICAL,HIGH" + ignore-unfixed: true + vuln-type: "os,library" + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" datahub_ingestion_base_build: name: Build and Push DataHub Ingestion (Base) Docker Image diff --git a/docker/airflow/docker-compose.yaml b/docker/airflow/docker-compose.yaml index c0768f12bcb11c..43a01d000a1460 100644 --- a/docker/airflow/docker-compose.yaml +++ b/docker/airflow/docker-compose.yaml @@ -38,7 +38,6 @@ # # Feel free to modify this file to suit your needs. --- -version: '3' x-airflow-common: # In order to add custom dependencies or upgrade provider packages you can use your extended image. # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml diff --git a/docker/cassandra/docker-compose.cassandra.yml b/docker/cassandra/docker-compose.cassandra.yml index ae7d649ab3d239..3fde920161685d 100644 --- a/docker/cassandra/docker-compose.cassandra.yml +++ b/docker/cassandra/docker-compose.cassandra.yml @@ -1,6 +1,5 @@ # Override to use Cassandra as a backing store for datahub-gms. --- -version: '3.8' services: cassandra: hostname: cassandra diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 6709aee98d6979..55bca606c5ba36 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -4,7 +4,6 @@ # NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/docker-compose-without-neo4j.override.yml b/docker/docker-compose-without-neo4j.override.yml index 11d7cd0c0c87b1..adf610ec3a9ed7 100644 --- a/docker/docker-compose-without-neo4j.override.yml +++ b/docker/docker-compose-without-neo4j.override.yml @@ -1,5 +1,4 @@ --- -version: '3.9' services: datahub-gms: env_file: datahub-gms/env/docker-without-neo4j.env diff --git a/docker/docker-compose-without-neo4j.postgres.override.yml b/docker/docker-compose-without-neo4j.postgres.override.yml index b81fb6435c2973..2d1e0958dfb8bd 100644 --- a/docker/docker-compose-without-neo4j.postgres.override.yml +++ b/docker/docker-compose-without-neo4j.postgres.override.yml @@ -1,6 +1,5 @@ # Override to use PostgreSQL as a backing store for datahub-gms. --- -version: '3.9' services: datahub-gms: env_file: diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 53fcc77c6e8f31..4350322a17379e 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -4,7 +4,6 @@ # NOTE: This file will cannot build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/docker-compose.consumers-without-neo4j.yml b/docker/docker-compose.consumers-without-neo4j.yml index f1aa6b30cede09..01dff8c0d7b3d9 100644 --- a/docker/docker-compose.consumers-without-neo4j.yml +++ b/docker/docker-compose.consumers-without-neo4j.yml @@ -1,5 +1,4 @@ # Service definitions for standalone Kafka consumer containers. -version: '3.9' services: datahub-gms: environment: diff --git a/docker/docker-compose.consumers.dev.yml b/docker/docker-compose.consumers.dev.yml index 00f7b52df151f3..df180e22f55d04 100644 --- a/docker/docker-compose.consumers.dev.yml +++ b/docker/docker-compose.consumers.dev.yml @@ -1,4 +1,3 @@ -version: '3.9' services: datahub-mae-consumer: image: acryldata/datahub-mae-consumer:debug diff --git a/docker/docker-compose.consumers.yml b/docker/docker-compose.consumers.yml index 74b9adaeb99485..36fdb5451ebd62 100644 --- a/docker/docker-compose.consumers.yml +++ b/docker/docker-compose.consumers.yml @@ -1,5 +1,4 @@ # Service definitions for standalone Kafka consumer containers. -version: '3.9' services: datahub-gms: environment: diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index 2202f362abd992..c68a4c1f5a8fcf 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -8,7 +8,6 @@ # To make a JVM app debuggable via IntelliJ, go to its env file and add JVM debug flags, and then add the JVM debug # port to this file. --- -version: '3.9' services: datahub-frontend-react: image: acryldata/datahub-frontend-react:head diff --git a/docker/docker-compose.kafka-setup.yml b/docker/docker-compose.kafka-setup.yml index 59b3459bf4555d..67b7641f509b32 100644 --- a/docker/docker-compose.kafka-setup.yml +++ b/docker/docker-compose.kafka-setup.yml @@ -1,3 +1,2 @@ # Empty docker compose for kafka-setup as we have moved kafka-setup back into the main compose -version: '3.9' services: \ No newline at end of file diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml index 51fbe0060aa5f0..25abf247a5d045 100644 --- a/docker/docker-compose.override.yml +++ b/docker/docker-compose.override.yml @@ -1,6 +1,5 @@ # Default override to use MySQL as a backing store for datahub-gms (same as docker-compose.mysql.yml). --- -version: '3.9' services: datahub-gms: env_file: datahub-gms/env/docker.env diff --git a/docker/docker-compose.tools.yml b/docker/docker-compose.tools.yml index 8d2c30c64e6c8c..9f0e2639521a45 100644 --- a/docker/docker-compose.tools.yml +++ b/docker/docker-compose.tools.yml @@ -1,6 +1,5 @@ # Tools useful for operating & debugging DataHub. --- -version: '3.8' services: kafka-rest-proxy: image: confluentinc/cp-kafka-rest:7.4.0 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5430a8a6fcd5bd..e77b6d11ef2417 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -4,7 +4,6 @@ # NOTE: This file does not build! No dockerfiles are set. See the README.md in this directory. --- -version: '3.9' services: datahub-frontend-react: hostname: datahub-frontend-react diff --git a/docker/ingestion/docker-compose.yml b/docker/ingestion/docker-compose.yml index 06d4e47aa4a404..a474d9505285d5 100644 --- a/docker/ingestion/docker-compose.yml +++ b/docker/ingestion/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: '3.5' services: ingestion: build: diff --git a/docker/mariadb/docker-compose.mariadb.yml b/docker/mariadb/docker-compose.mariadb.yml index 0ee172fb1f1a52..63f5ed929711b3 100644 --- a/docker/mariadb/docker-compose.mariadb.yml +++ b/docker/mariadb/docker-compose.mariadb.yml @@ -1,6 +1,5 @@ # Override to use MariaDB as a backing store for datahub-gms. --- -version: '3.8' services: mariadb: hostname: mariadb diff --git a/docker/monitoring/docker-compose.consumers.monitoring.yml b/docker/monitoring/docker-compose.consumers.monitoring.yml index 254b0a58d0223e..147ddde37798f0 100644 --- a/docker/monitoring/docker-compose.consumers.monitoring.yml +++ b/docker/monitoring/docker-compose.consumers.monitoring.yml @@ -1,5 +1,4 @@ --- -version: '3.8' services: datahub-mae-consumer: environment: diff --git a/docker/monitoring/docker-compose.monitoring.yml b/docker/monitoring/docker-compose.monitoring.yml index c6fa019cf99fcd..039aa55bd38678 100644 --- a/docker/monitoring/docker-compose.monitoring.yml +++ b/docker/monitoring/docker-compose.monitoring.yml @@ -1,5 +1,4 @@ --- -version: '3.9' services: datahub-frontend-react: environment: diff --git a/docker/mysql/docker-compose.mysql.yml b/docker/mysql/docker-compose.mysql.yml index 7cc27b9f8b1547..ee00e51c09b1f0 100644 --- a/docker/mysql/docker-compose.mysql.yml +++ b/docker/mysql/docker-compose.mysql.yml @@ -1,6 +1,5 @@ # Override to use MySQL as a backing store for datahub-gms. --- -version: '3.8' services: mysql: hostname: mysql diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 046ab96cf30029..5f415d4000178d 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -291,7 +291,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 6295572aac98fd..c2a0ddb0391b9a 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -266,7 +266,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index ed5f203ff4d058..46ea765f45e1f0 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -266,7 +266,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index a4211acedcf102..8801d26eddbf4d 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -55,4 +55,3 @@ services: image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head} ports: - 9090:9090 -version: '3.9' diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index e7571e4baf8b4e..1daa969af82d88 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -69,4 +69,3 @@ services: image: ${DATAHUB_MCE_CONSUMER_IMAGE:-acryldata/datahub-mce-consumer}:${DATAHUB_VERSION:-head} ports: - 9090:9090 -version: '3.9' diff --git a/docker/quickstart/docker-compose.kafka-setup.quickstart.yml b/docker/quickstart/docker-compose.kafka-setup.quickstart.yml index d126caac6b5a88..ad189ddbec9052 100644 --- a/docker/quickstart/docker-compose.kafka-setup.quickstart.yml +++ b/docker/quickstart/docker-compose.kafka-setup.quickstart.yml @@ -1,2 +1 @@ services: {} -version: '3.9' diff --git a/docker/quickstart/docker-compose.monitoring.quickstart.yml b/docker/quickstart/docker-compose.monitoring.quickstart.yml index bddd82e393338f..f4afd397e785dc 100644 --- a/docker/quickstart/docker-compose.monitoring.quickstart.yml +++ b/docker/quickstart/docker-compose.monitoring.quickstart.yml @@ -41,6 +41,5 @@ services: - 9089:9090 volumes: - ../monitoring/prometheus.yaml:/etc/prometheus/prometheus.yml -version: '3.9' volumes: grafana-storage: null diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index 66616be98bec16..4d03b22598606c 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -291,7 +291,6 @@ services: volumes: - zkdata:/var/lib/zookeeper/data - zklogs:/var/lib/zookeeper/log -version: '3.9' volumes: broker: null esdata: null diff --git a/docker/quickstart/generate_docker_quickstart.py b/docker/quickstart/generate_docker_quickstart.py index 9846233b369d76..6a3b64155adea1 100644 --- a/docker/quickstart/generate_docker_quickstart.py +++ b/docker/quickstart/generate_docker_quickstart.py @@ -120,11 +120,6 @@ def modify_docker_config(base_path, docker_yaml_config): elif volumes[i].startswith("./"): volumes[i] = "." + volumes[i] - # 10. Set docker compose version to 3. - # We need at least this version, since we use features like start_period for - # healthchecks (with services dependencies based on them) and shell-like variable interpolation. - docker_yaml_config["version"] = "3.9" - def dedup_env_vars(merged_docker_config): for service in merged_docker_config["services"]: diff --git a/docs/how/search.md b/docs/how/search.md index 4df5e7c1984d59..2274fe7c09240d 100644 --- a/docs/how/search.md +++ b/docs/how/search.md @@ -105,6 +105,20 @@ If you want to: - ```/q customProperties: encoding*``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20customProperties%3A%20encoding%2A) - Dataset Properties are indexed in ElasticSearch the manner of key=value. Hence if you know the precise key-value pair, you can search using ```"key=value"```. However, if you only know the key, you can use wildcards to replace the value and that is what is being done here. +- Find an entity with an **unversioned** structured property + - ```/q structuredProperties.io_acryl_privacy_retentionTime01:60``` + - This will return results for an **unversioned** structured property's qualified name `io.acryl.private.retentionTime01` and value `60`. + - ```/q _exists_:structuredProperties.io_acryl_privacy_retentionTime01``` + - In this example, the query will return any entity which has any value for the **unversioned** structured property with qualified name `io.acryl.private.retentionTime01`. + +- Find an entity with a **versioned** structured property + - ```/q structuredProperties._versioned.io_acryl_privacy_retentionTime.20240614080000.number:365``` + - This query will return results for a **versioned** structured property with qualified name `io.acryl.privacy.retentionTime`, version `20240614080000`, type `number` and value `365`. + - ```/q _exists_:structuredProperties._versioned.io_acryl_privacy_retentionTime.20240614080000.number``` + - Returns results for a **versioned** structured property with qualified name `io.acryl.privacy.retentionTime`, version `20240614080000` and type `number`. + - ```/q structuredProperties._versioned.io_acryl_privacy_retentionTime.\*.\*:365``` + - Returns results for a **versioned** structured property with any version and type with a values of `365` + - Find a dataset with a column name, **latitude** - ```/q fieldPaths: latitude``` [Sample results](https://demo.datahubproject.io/search?page=1&query=%2Fq%20fieldPaths%3A%20latitude) - fieldPaths is the name of the attribute that holds the column name in Datasets. diff --git a/metadata-ingestion/docs/sources/datahub/datahub_pre.md b/metadata-ingestion/docs/sources/datahub/datahub_pre.md index cb1cc2c4d59036..b35eb5811e4c9b 100644 --- a/metadata-ingestion/docs/sources/datahub/datahub_pre.md +++ b/metadata-ingestion/docs/sources/datahub/datahub_pre.md @@ -71,3 +71,27 @@ and [mce-consumer](../../../../metadata-jobs/mce-consumer-job/README.md)) - Increase the number of gms pods to add redundancy and increase resilience to node evictions * If you are migrating large amounts of data, consider increasing elasticsearch's thread count via the `ELASTICSEARCH_THREAD_COUNT` environment variable. + +#### Exclusions +You will likely want to exclude some urn types from your ingestion, as they contain instance-specific +metadata, such as settings, roles, policies, ingestion sources, and ingestion runs. For example, you +will likely want to start with this: + +```yaml +source: + config: + urn_pattern: # URN pattern to ignore/include in the ingestion + deny: + # Ignores all datahub metadata where the urn matches the regex + - ^urn:li:role.* # Only exclude if you do not want to ingest roles + - ^urn:li:dataHubRole.* # Only exclude if you do not want to ingest roles + - ^urn:li:dataHubPolicy.* # Only exclude if you do not want to ingest policies + - ^urn:li:dataHubIngestionSource.* # Only exclude if you do not want to ingest ingestion sources + - ^urn:li:dataHubSecret.* + - ^urn:li:dataHubExecutionRequest.* + - ^urn:li:dataHubAccessToken.* + - ^urn:li:dataHubUpgrade.* + - ^urn:li:inviteToken.* + - ^urn:li:globalSettings.* + - ^urn:li:dataHubStepState.* +``` diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index b5cc67532a9dd0..584bfc1f7cda56 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -338,10 +338,18 @@ def by_filter( # TODO: add some validation on entity_type if not force and not soft and not dry_run: - click.confirm( - "This will permanently delete data from DataHub. Do you want to continue?", - abort=True, - ) + if only_soft_deleted: + click.confirm( + "This will permanently delete data from DataHub. Do you want to continue?", + abort=True, + ) + else: + click.confirm( + "Hard deletion will permanently delete data from DataHub and can be slow. " + "We generally recommend using soft deletes instead. " + "Do you want to continue?", + abort=True, + ) graph = get_default_graph() logger.info(f"Using {graph}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 6ea8f21e8b2916..11d06771d4e4f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -1,5 +1,6 @@ import logging import re +from base64 import b32decode from collections import defaultdict from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast @@ -89,12 +90,13 @@ HiveColumnToAvroConverter, get_schema_fields_for_hive_column, ) -from datahub.utilities.mapping import Constants from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.ratelimiter import RateLimiter from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor +ENCODED_TAG_PREFIX = "urn_li_encoded_tag_" + logger: logging.Logger = logging.getLogger(__name__) # Handle table snapshots # See https://cloud.google.com/bigquery/docs/table-snapshots-intro. @@ -194,6 +196,18 @@ def store_table_refs(self): or self.config.use_queries_v2 ) + def modified_base32decode(self, text_to_decode: str) -> str: + # When we sync from DataHub to BigQuery, we encode the tags as modified base32 strings. + # BiqQuery labels only support lowercase letters, international characters, numbers, or underscores. + # So we need to modify the base32 encoding to replace the padding character `=` with `_` and convert to lowercase. + if not text_to_decode.startswith("%s" % ENCODED_TAG_PREFIX): + return text_to_decode + text_to_decode = ( + text_to_decode.replace(ENCODED_TAG_PREFIX, "").upper().replace("_", "=") + ) + text = b32decode(text_to_decode.encode("utf-8")).decode("utf-8") + return text + def get_project_workunits( self, project: BigqueryProject ) -> Iterable[MetadataWorkUnit]: @@ -253,7 +267,7 @@ def gen_dataset_containers( tags_joined: Optional[List[str]] = None if tags and self.config.capture_dataset_label_as_tag: tags_joined = [ - f"{k}:{v}" + self.make_tag_from_label(k, v) for k, v in tags.items() if is_tag_allowed(self.config.capture_dataset_label_as_tag, k) ] @@ -662,6 +676,11 @@ def _process_snapshot( dataset_name=dataset_name, ) + def make_tag_from_label(self, key: str, value: str) -> str: + if not value.startswith(ENCODED_TAG_PREFIX): + return make_tag_urn(f"""{key}:{value}""") + return self.modified_base32decode(value) + def gen_table_dataset_workunits( self, table: BigqueryTable, @@ -707,7 +726,7 @@ def gen_table_dataset_workunits( tags_to_add = [] tags_to_add.extend( [ - make_tag_urn(f"""{k}:{v}""") + self.make_tag_from_label(k, v) for k, v in table.labels.items() if is_tag_allowed(self.config.capture_table_label_as_tag, k) ] @@ -733,7 +752,7 @@ def gen_view_dataset_workunits( tags_to_add = None if table.labels and self.config.capture_view_label_as_tag: tags_to_add = [ - make_tag_urn(f"{k}:{v}") + self.make_tag_from_label(k, v) for k, v in table.labels.items() if is_tag_allowed(self.config.capture_view_label_as_tag, k) ] @@ -922,11 +941,6 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: break else: tags = [] - if col.is_partition_column: - tags.append( - TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) - ) - if col.cluster_column_position is not None: tags.append( TagAssociationClass( @@ -944,6 +958,7 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: type=SchemaFieldDataType( self.BIGQUERY_FIELD_TYPE_MAPPINGS.get(col.data_type, NullType)() ), + isPartitioningKey=col.is_partition_column, nativeDataType=col.data_type, description=col.comment, nullable=col.is_nullable, diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index e3f9a150ad0001..e4829f8713cf76 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -129,6 +129,10 @@ class CSVEnricherSource(Source): If ownership_type_urn is set then ownership_type must be set to CUSTOM. + Note that you have the option in your recipe config to write as a PATCH or as an OVERRIDE. This choice will apply to + all metadata for the entity, not just a single aspect. So OVERRIDE will override all metadata, including performing + deletes if a metadata field is empty. The default is PATCH. + :::note This source will not work on very large csv files that do not fit in memory. ::: diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index e4dadaf602852c..e3fd8849ed8444 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -290,8 +290,8 @@ def __init__(self, ctx: PipelineContext, config: MongoDBConfig): # See https://pymongo.readthedocs.io/en/stable/examples/datetimes.html#handling-out-of-range-datetimes self.mongo_client = MongoClient( - self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options - ) # type: ignore + self.config.connect_uri, datetime_conversion="DATETIME_AUTO", **options # type: ignore + ) # This cheaply tests the connection. For details, see # https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py index e62c46888ef0e8..f0f0ab95ca8119 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/csv_enricher.py @@ -11,7 +11,7 @@ class CSVEnricherConfig(ConfigModel): ) write_semantics: str = pydantic.Field( default="PATCH", - description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE"', + description='Whether the new tags, terms and owners to be added will override the existing ones added only by this source or not. Value for this config can be "PATCH" or "OVERRIDE". NOTE: this will apply to all metadata for the entity, not just a single aspect.', ) delimiter: str = pydantic.Field( default=",", description="Delimiter to use when parsing CSV" diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index fcf65130df9757..02660f0fae08ed 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -269,7 +269,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -296,7 +297,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -328,6 +330,29 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:priority:high" + }, + { + "tag": "urn:li:tag:purchase" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", @@ -463,7 +488,8 @@ } ] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -479,7 +505,8 @@ "globalTags": { "tags": [] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -620,7 +647,8 @@ } ] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -636,7 +664,8 @@ "globalTags": { "tags": [] }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } @@ -1021,5 +1050,37 @@ "runId": "bigquery-2022_02_03-07_00_00", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:priority:high", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "priority:high" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:purchase", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "purchase" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json new file mode 100644 index 00000000000000..2a7336144f23d8 --- /dev/null +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_queries_golden.json @@ -0,0 +1,1031 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1" + }, + "name": "project-id-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "bigquery", + "env": "PROD", + "project_id": "project-id-1", + "dataset_id": "bigquery-dataset-1", + "location": "US" + }, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m4!1m3!3m2!1sproject-id-1!2sbigquery-dataset-1", + "name": "bigquery-dataset-1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Age" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "glossaryTerms": { + "terms": [ + { + "urn": "urn:li:glossaryTerm:Email_Address" + } + ], + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:datahub" + } + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3stable-1", + "name": "table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.view-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3sview-1", + "name": "view-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.view-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view `bigquery-dataset-1.view-1` as select email from `bigquery-dataset-1.table-1`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "age", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Test Policy Tag" + } + ] + }, + "isPartOfKey": false, + "isPartitioningKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "description": "comment", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "globalTags": { + "tags": [] + }, + "isPartOfKey": false, + "isPartitioningKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/bigquery?project=project-id-1&ws=!1m5!1m4!4m3!1sproject-id-1!2sbigquery-dataset-1!3ssnapshot-table-1", + "name": "snapshot-table-1", + "qualifiedName": "project-id-1.bigquery-dataset-1.snapshot-table-1", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Bigquery Table Snapshot" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "COPY" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),age)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD),email)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.snapshot-table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:068bd9323110994a40019fcf6cfc60d3", + "urn": "urn:li:container:068bd9323110994a40019fcf6cfc60d3" + }, + { + "id": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0", + "urn": "urn:li:container:8df46c5e3ded05a3122b0015822c0ef0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "type": "VIEW", + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + ], + "confidenceScore": 0.9, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW `bigquery-dataset-1.view-1` AS\nSELECT\n email\nFROM `bigquery-dataset-1.table-1`", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD),email)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.view-1,PROD),email)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Abigquery%2Cproject-id-1.bigquery-dataset-1.view-1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Age", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Age" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Email_Address", + "changeType": "UPSERT", + "aspectName": "glossaryTermKey", + "aspect": { + "json": { + "name": "Email_Address" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Test Policy Tag", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Test Policy Tag" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json index ab59b95a9f3889..a1e5c3fd18f239 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_project_label_mcp_golden.json @@ -268,7 +268,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false }, { "fieldPath": "email", @@ -295,7 +296,8 @@ "actor": "urn:li:corpuser:datahub" } }, - "isPartOfKey": false + "isPartOfKey": false, + "isPartitioningKey": false } ] } diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index f9481d1d83d8b2..0ac4e94a5a24f3 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -54,6 +54,7 @@ def recipe(mcp_output_path: str, source_config_override: dict = {}) -> dict: "include_usage_statistics": False, "include_table_lineage": True, "include_data_platform_instance": True, + "capture_table_label_as_tag": True, "classification": ClassificationConfig( enabled=True, classifiers=[ @@ -155,6 +156,10 @@ def test_bigquery_v2_ingest( last_altered=None, size_in_bytes=None, rows_count=None, + labels={ + "priority": "high", + "purchase": "urn_li_encoded_tag_ovzg4otmne5hiylhhjyhk4tdnbqxgzi_", + }, ) get_tables_for_dataset.return_value = iter([bigquery_table]) snapshot_table = BigqueryTableSnapshot( @@ -319,8 +324,8 @@ def test_bigquery_queries_v2_ingest( tmp_path, ): test_resources_dir = pytestconfig.rootpath / "tests/integration/bigquery_v2" - mcp_golden_path = f"{test_resources_dir}/bigquery_mcp_golden.json" - mcp_output_path = "{}/{}".format(tmp_path, "bigquery_mcp_output.json") + mcp_golden_path = f"{test_resources_dir}/bigquery_mcp_queries_golden.json" + mcp_output_path = "{}/{}".format(tmp_path, "bigquery_mcp_queries_output.json") dataset_name = "bigquery-dataset-1" get_datasets_for_project_id.return_value = [ diff --git a/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml b/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml index 865c5dee0f86e8..e2e0eba62f3d9d 100644 --- a/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml +++ b/metadata-integration/java/spark-lineage-legacy/spark-smoke-test/docker/spark-docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.6" services: spark-master: image: spark-master diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index cb02fb1c8b2f76..01a1e9cb159844 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -13,6 +13,8 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.linkedin.common.urn.Urn; +import com.linkedin.data.schema.DataSchema; +import com.linkedin.data.schema.MapDataSchema; import com.linkedin.data.template.DoubleMap; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; @@ -662,8 +664,48 @@ private static Map> buildSearchableF Collections.emptySet()))) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + List objectFieldSpec = + entitySpec.getSearchableFieldSpecs().stream() + .filter( + searchableFieldSpec -> + searchableFieldSpec.getSearchableAnnotation().getFieldType() + == SearchableAnnotation.FieldType.OBJECT) + .collect(Collectors.toList()); + + Map> objectFieldTypes = new HashMap<>(); + + objectFieldSpec.forEach( + fieldSpec -> { + String fieldName = fieldSpec.getSearchableAnnotation().getFieldName(); + DataSchema.Type dataType = + ((MapDataSchema) fieldSpec.getPegasusSchema()).getValues().getType(); + + Set fieldType; + + switch (dataType) { + case BOOLEAN: + fieldType = Set.of(SearchableAnnotation.FieldType.BOOLEAN); + break; + case INT: + fieldType = Set.of(SearchableAnnotation.FieldType.COUNT); + break; + case DOUBLE: + case LONG: + case FLOAT: + fieldType = Set.of(SearchableAnnotation.FieldType.DOUBLE); + break; + default: + fieldType = Set.of(SearchableAnnotation.FieldType.TEXT); + break; + } + objectFieldTypes.put(fieldName, fieldType); + annotationFieldTypes.remove(fieldName); + }); + return Stream.concat( - annotationFieldTypes.entrySet().stream(), + Stream.concat( + objectFieldTypes.entrySet().stream(), + annotationFieldTypes.entrySet().stream()), Stream.concat( mappingFieldTypes.entrySet().stream(), aliasFieldTypes.entrySet().stream())); }) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index e135f1941bfec9..f72b5fc1f6d227 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -759,7 +759,7 @@ private static Set getFieldTypes( StructuredPropertyUtils.toElasticsearchFieldType(fieldName, aspectRetriever); } else { Set fieldTypes = - searchableFields.getOrDefault(fieldName, Collections.emptySet()); + searchableFields.getOrDefault(fieldName.split("\\.")[0], Collections.emptySet()); finalFieldTypes = fieldTypes.stream().map(ESUtils::getElasticTypeForFieldType).collect(Collectors.toSet()); } @@ -785,6 +785,7 @@ private static RangeQueryBuilder buildRangeQueryFromCriterion( // Determine criterion value, range query only accepts single value so take first value in // values if multiple String criterionValueString = criterion.getValues().get(0).trim(); + Object criterionValue; String documentFieldName; if (fieldTypes.contains(BOOLEAN_FIELD_TYPE)) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraAspectMigrationsDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraAspectMigrationsDaoTest.java similarity index 92% rename from metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraAspectMigrationsDaoTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraAspectMigrationsDaoTest.java index d9077b72791602..11b00011ee6e04 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraAspectMigrationsDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraAspectMigrationsDaoTest.java @@ -1,12 +1,12 @@ -package com.linkedin.metadata.entity; +package com.linkedin.metadata.entity.cassandra; import static org.mockito.Mockito.*; import com.datastax.oss.driver.api.core.CqlSession; import com.linkedin.metadata.CassandraTestUtils; import com.linkedin.metadata.config.PreProcessHooks; -import com.linkedin.metadata.entity.cassandra.CassandraAspectDao; -import com.linkedin.metadata.entity.cassandra.CassandraRetentionService; +import com.linkedin.metadata.entity.AspectMigrationsDaoTest; +import com.linkedin.metadata.entity.EntityServiceImpl; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.service.UpdateIndicesService; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java similarity index 97% rename from metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraEntityServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java index ef6c9e56e132b3..5535866f87c99b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/CassandraEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.entity; +package com.linkedin.metadata.entity.cassandra; import static org.mockito.Mockito.*; import static org.testng.Assert.*; @@ -11,8 +11,10 @@ import com.linkedin.metadata.AspectIngestionUtils; import com.linkedin.metadata.CassandraTestUtils; import com.linkedin.metadata.config.PreProcessHooks; -import com.linkedin.metadata.entity.cassandra.CassandraAspectDao; -import com.linkedin.metadata.entity.cassandra.CassandraRetentionService; +import com.linkedin.metadata.entity.EntityServiceAspectRetriever; +import com.linkedin.metadata.entity.EntityServiceImpl; +import com.linkedin.metadata.entity.EntityServiceTest; +import com.linkedin.metadata.entity.ListResult; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.registry.EntityRegistryException; diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java index 109c9b5c44efb9..7d1fdd34026f9f 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/EbeanAspectDaoTest.java @@ -56,7 +56,8 @@ public void testGetLatestAspectsForUpdate() throws JsonProcessingException { testDao.runInTransactionWithRetryUnlocked( (txContext) -> { - testDao.getLatestAspects(Map.of("urn:li:corpuser:test", Set.of("status")), true); + testDao.getLatestAspects( + Map.of("urn:li:corpuser:testGetLatestAspectsForUpdate", Set.of("status")), true); return ""; }, mock(AspectsBatch.class), @@ -65,7 +66,7 @@ public void testGetLatestAspectsForUpdate() throws JsonProcessingException { // Get the captured SQL statements List sql = LoggedSql.stop().stream() - .filter(str -> str.contains("(t0.urn,t0.aspect,t0.version)")) + .filter(str -> str.contains("testGetLatestAspectsForUpdate")) .toList(); assertEquals( sql.size(), 1, String.format("Found: %s", new ObjectMapper().writeValueAsString(sql))); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java index 8d06594e415e08..6665faacae3373 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/utils/ESUtilsTest.java @@ -10,10 +10,13 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.SetMode; +import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.search.elasticsearch.query.filter.QueryFilterRewriteChain; @@ -38,7 +41,7 @@ public class ESUtilsTest { private static AspectRetriever aspectRetrieverV1; @BeforeClass - public void setup() throws RemoteInvocationException, URISyntaxException { + public static void setup() throws RemoteInvocationException, URISyntaxException { Urn abFghTenUrn = Urn.createFromString("urn:li:structuredProperty:ab.fgh.ten"); // legacy @@ -835,4 +838,61 @@ public void testGetQueryBuilderFromStructPropExistsV1() { + "}"; Assert.assertEquals(result.toString(), expected); } + + @Test + public void testGetQueryBuilderForObjectFields() { + final Criterion singleValueCriterion = + new Criterion() + .setField("testObjectField.numericField") + .setCondition(Condition.EQUAL) + .setValues(new StringArray(ImmutableList.of("10"))); + + Map> searchableFieldTypes = new HashMap<>(); + searchableFieldTypes.put("testObjectField", Set.of(SearchableAnnotation.FieldType.DOUBLE)); + + QueryBuilder result = + ESUtils.getQueryBuilderFromCriterion( + singleValueCriterion, + false, + searchableFieldTypes, + mock(OperationContext.class), + QueryFilterRewriteChain.EMPTY); + String expected = + "{\n" + + " \"terms\" : {\n" + + " \"testObjectField.numericField\" : [\n" + + " 10.0\n" + + " ],\n" + + " \"boost\" : 1.0,\n" + + " \"_name\" : \"testObjectField.numericField\"\n" + + " }\n" + + "}"; + Assert.assertEquals(result.toString(), expected); + + final Criterion multiValueCriterion = + new Criterion() + .setField("testObjectField.numericField") + .setCondition(Condition.EQUAL) + .setValues(new StringArray(ImmutableList.of("10", "20"))); + + result = + ESUtils.getQueryBuilderFromCriterion( + multiValueCriterion, + false, + searchableFieldTypes, + mock(OperationContext.class), + QueryFilterRewriteChain.EMPTY); + expected = + "{\n" + + " \"terms\" : {\n" + + " \"testObjectField.numericField\" : [\n" + + " 10.0,\n" + + " 20.0\n" + + " ],\n" + + " \"boost\" : 1.0,\n" + + " \"_name\" : \"testObjectField.numericField\"\n" + + " }\n" + + "}"; + Assert.assertEquals(result.toString(), expected); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeline/CassandraTimelineServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeline/cassandra/CassandraTimelineServiceTest.java similarity index 94% rename from metadata-io/src/test/java/com/linkedin/metadata/timeline/CassandraTimelineServiceTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/timeline/cassandra/CassandraTimelineServiceTest.java index 4f515ba0144120..6dbeef32730f73 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeline/CassandraTimelineServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeline/cassandra/CassandraTimelineServiceTest.java @@ -1,4 +1,4 @@ -package com.linkedin.metadata.timeline; +package com.linkedin.metadata.timeline.cassandra; import static org.mockito.Mockito.mock; @@ -9,6 +9,8 @@ import com.linkedin.metadata.entity.cassandra.CassandraAspectDao; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.models.registry.EntityRegistryException; +import com.linkedin.metadata.timeline.TimelineServiceImpl; +import com.linkedin.metadata.timeline.TimelineServiceTest; import org.testcontainers.containers.CassandraContainer; import org.testng.Assert; import org.testng.annotations.AfterClass; diff --git a/metadata-io/src/test/resources/testng-cassandra.xml b/metadata-io/src/test/resources/testng-cassandra.xml new file mode 100644 index 00000000000000..ea3d0dae0ede6a --- /dev/null +++ b/metadata-io/src/test/resources/testng-cassandra.xml @@ -0,0 +1,10 @@ + + + + + + + + + + \ No newline at end of file diff --git a/metadata-io/src/test/resources/testng-other.xml b/metadata-io/src/test/resources/testng-other.xml index e214fdb8c1f61f..c591e8fcc4e43f 100644 --- a/metadata-io/src/test/resources/testng-other.xml +++ b/metadata-io/src/test/resources/testng-other.xml @@ -8,6 +8,8 @@ + + diff --git a/metadata-io/src/test/resources/testng.xml b/metadata-io/src/test/resources/testng.xml index fdd1c1a6c8921e..f004136dec9a49 100644 --- a/metadata-io/src/test/resources/testng.xml +++ b/metadata-io/src/test/resources/testng.xml @@ -9,6 +9,7 @@ parallel followed by everything else. + \ No newline at end of file diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml index 47f6deedda2c8f..e70ab1162a3816 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml @@ -30,5 +30,6 @@ extraArgs: {} debugMode: false executorId: default - source: {{source}}{{^source}}SYSTEM{{/source}} - headers: {} \ No newline at end of file + source: + type: {{source}}{{^source}}SYSTEM{{/source}} + headers: {} diff --git a/smoke-test/cypress-dev.sh b/smoke-test/cypress-dev.sh index bce2d794b18691..3db81b11c67fa1 100755 --- a/smoke-test/cypress-dev.sh +++ b/smoke-test/cypress-dev.sh @@ -8,7 +8,10 @@ if [ "${RUN_QUICKSTART:-true}" == "true" ]; then source ./run-quickstart.sh fi +set +x +echo "Activating virtual environment" source venv/bin/activate +set -x # set environment variables for the test source ./set-test-env-vars.sh diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index e83a116c670a4a..902dc1030660a0 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -5,7 +5,10 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" cd "$DIR" ../gradlew :smoke-test:installDev +set +x +echo "Activating virtual environment" source venv/bin/activate +set -x mkdir -p ~/.datahub/plugins/frontend/auth/ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props @@ -23,4 +26,4 @@ DATAHUB_SEARCH_IMAGE="$DATAHUB_SEARCH_IMAGE" DATAHUB_SEARCH_TAG="$DATAHUB_SEARCH XPACK_SECURITY_ENABLED="$XPACK_SECURITY_ENABLED" ELASTICSEARCH_USE_SSL="$ELASTICSEARCH_USE_SSL" \ USE_AWS_ELASTICSEARCH="$USE_AWS_ELASTICSEARCH" \ DATAHUB_VERSION=${DATAHUB_VERSION} \ -docker compose --project-directory ../docker/profiles --profile quickstart-consumers up -d --quiet-pull --wait --wait-timeout 900 +docker compose --project-directory ../docker/profiles --profile quickstart-consumers up -d --quiet-pull --wait --wait-timeout 900