diff --git a/README.md b/README.md
index 96b30ea49359c..17aab332a07da 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to
 - [Geotab](https://www.geotab.com)
 - [Grofers](https://grofers.com)
 - [hipages](https://hipages.com.au/)
+- [IOMED](https://iomed.health)
 - [Klarna](https://www.klarna.com)
 - [LinkedIn](http://linkedin.com)
 - [Peloton](https://www.onepeloton.com)
diff --git a/build.gradle b/build.gradle
index dc9ca7e85a050..3b6818b4b575a 100644
--- a/build.gradle
+++ b/build.gradle
@@ -88,7 +88,7 @@ project.ext.externalDependency = [
   // avro-serde includes dependencies for `kafka-avro-serializer` `kafka-schema-registry-client` and `avro`
   'kafkaAvroSerde': 'io.confluent:kafka-streams-avro-serde:5.5.1',
   'kafkaClients': 'org.apache.kafka:kafka-clients:2.3.0',
-  'logbackClassic': 'ch.qos.logback:logback-classic:1.2.3',
+  'logbackClassic': 'ch.qos.logback:logback-classic:1.2.9',
   'lombok': 'org.projectlombok:lombok:1.18.12',
   'mariadbConnector': 'org.mariadb.jdbc:mariadb-java-client:2.6.0',
   'mavenArtifact': "org.apache.maven:maven-artifact:$mavenVersion",
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java
index 1e4d1007de4de..35c6db088292e 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java
@@ -59,7 +59,7 @@ public CompletableFuture<List<TimeSeriesAspect>> get(DataFetchingEnvironment env
     // Step 1: Get aspects.
     List<EnvelopedAspect> aspects =
         _client.getTimeseriesAspectValues(urn, _entityName, _aspectName, maybeStartTimeMillis, maybeEndTimeMillis,
-            maybeLimit, context.getAuthentication());
+            maybeLimit, null, null, context.getAuthentication());
 
     // Step 2: Bind profiles into GraphQL strong types.
     return aspects.stream().map(_aspectMapper::apply).collect(Collectors.toList());
diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
index cff1e717852b4..e16cce8d45e8b 100644
--- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
@@ -71,7 +71,7 @@ services:
     environment:
     - discovery.type=single-node
    - xpack.security.enabled=false
-    - ES_JAVA_OPTS=-Xms256m -Xmx256m
+    - ES_JAVA_OPTS=-Xms256m -Xmx256m -Dlog4j2.formatMsgNoLookups=true
     healthcheck:
       retries: 4
       start_period: 2m
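The `-Dlog4j2.formatMsgNoLookups=true` flag added above is the standard runtime mitigation for Log4Shell (CVE-2021-44228) on Log4j 2.10+: it disables `${...}` message lookups, including the JNDI ones, without requiring a new Elasticsearch image. A minimal sketch of applying the same override in another compose file — the service name and image tag here are illustrative, not taken from this patch:

```yml
# Hypothetical docker-compose override; ES_JAVA_OPTS is the relevant line.
services:
  elasticsearch:
    image: elasticsearch:7.9.3
    environment:
      - discovery.type=single-node
      # Heap limits plus the message-lookup kill switch for CVE-2021-44228.
      - ES_JAVA_OPTS=-Xms256m -Xmx256m -Dlog4j2.formatMsgNoLookups=true
```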
diff --git a/metadata-dao-impl/kafka-producer/build.gradle b/metadata-dao-impl/kafka-producer/build.gradle
index 9c29164e6c134..1a73014acdda1 100644
--- a/metadata-dao-impl/kafka-producer/build.gradle
+++ b/metadata-dao-impl/kafka-producer/build.gradle
@@ -17,11 +17,11 @@ dependencies {
   testCompile externalDependency.mockito
 
   constraints {
-    implementation("org.apache.logging.log4j:log4j-core:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-core:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
-    implementation("org.apache.logging.log4j:log4j-api:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-api:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
   }
 }
\ No newline at end of file
diff --git a/metadata-events/mxe-registration/build.gradle b/metadata-events/mxe-registration/build.gradle
index c6d9bafc2372f..5c3e64b506607 100644
--- a/metadata-events/mxe-registration/build.gradle
+++ b/metadata-events/mxe-registration/build.gradle
@@ -15,11 +15,11 @@ dependencies {
   avroOriginal project(path: ':metadata-models', configuration: 'avroSchema')
 
   constraints {
-    implementation("org.apache.logging.log4j:log4j-core:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-core:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
-    implementation("org.apache.logging.log4j:log4j-api:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-api:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
   }
 }
diff --git a/metadata-events/mxe-utils-avro-1.7/build.gradle b/metadata-events/mxe-utils-avro-1.7/build.gradle
index 0352700b630f3..71148d7c4ea0d 100644
--- a/metadata-events/mxe-utils-avro-1.7/build.gradle
+++ b/metadata-events/mxe-utils-avro-1.7/build.gradle
@@ -9,11 +9,11 @@ dependencies {
   testCompile project(':metadata-testing:metadata-test-utils')
 
   constraints {
-    implementation("org.apache.logging.log4j:log4j-core:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-core:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
-    implementation("org.apache.logging.log4j:log4j-api:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-api:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
   }
 }
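These repeated `constraints` blocks force any transitively resolved `log4j-core` and `log4j-api` up to 2.17.0, which covers CVE-2021-45105 (denial of service through uncontrolled recursion in self-referential lookups) in addition to the earlier 2.15.0 bump for CVE-2021-44228. To confirm a constraint actually won dependency resolution, one rough sketch — the `deps.txt` path is an assumption, produced beforehand with something like `./gradlew dependencies > deps.txt`:

```python
# Hypothetical helper: flag log4j-core/api artifacts that still resolve below 2.17.0
# in a Gradle dependency report saved to deps.txt.
import re

MIN_SAFE = (2, 17, 0)

def vulnerable_log4j_lines(report_path):
    """Yield report lines whose log4j-core/api version is below MIN_SAFE."""
    pattern = re.compile(r"org\.apache\.logging\.log4j:log4j-(?:core|api):(\d+)\.(\d+)\.(\d+)")
    with open(report_path) as report:
        for line in report:
            match = pattern.search(line)
            if match and tuple(map(int, match.groups())) < MIN_SAFE:
                yield line.strip()

if __name__ == "__main__":
    for hit in vulnerable_log4j_lines("deps.txt"):
        print("needs upgrade:", hit)
```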
diff --git a/metadata-ingestion-examples/common/build.gradle b/metadata-ingestion-examples/common/build.gradle
index d31f75d607f8e..d2d3637f6892c 100644
--- a/metadata-ingestion-examples/common/build.gradle
+++ b/metadata-ingestion-examples/common/build.gradle
@@ -19,11 +19,11 @@ dependencies {
   runtime externalDependency.logbackClassic
 
   constraints {
-    implementation("org.apache.logging.log4j:log4j-core:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-core:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
-    implementation("org.apache.logging.log4j:log4j-api:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-api:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
   }
 }
diff --git a/metadata-ingestion-examples/kafka-etl/build.gradle b/metadata-ingestion-examples/kafka-etl/build.gradle
index c00cabb9fbf74..0ad4da77888a1 100644
--- a/metadata-ingestion-examples/kafka-etl/build.gradle
+++ b/metadata-ingestion-examples/kafka-etl/build.gradle
@@ -22,11 +22,11 @@ dependencies {
   runtime externalDependency.logbackClassic
 
   constraints {
-    implementation("org.apache.logging.log4j:log4j-core:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-core:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
-    implementation("org.apache.logging.log4j:log4j-api:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-api:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
   }
 }
diff --git a/metadata-ingestion-examples/mce-cli/build.gradle b/metadata-ingestion-examples/mce-cli/build.gradle
index f384afe747363..5d887ae799775 100644
--- a/metadata-ingestion-examples/mce-cli/build.gradle
+++ b/metadata-ingestion-examples/mce-cli/build.gradle
@@ -28,11 +28,11 @@ dependencies {
   annotationProcessor externalDependency.picocli
 
   constraints {
-    implementation("org.apache.logging.log4j:log4j-core:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-core:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
-    implementation("org.apache.logging.log4j:log4j-api:2.15.0") {
-      because("previous versions are vulnerable to CVE-2021-44228")
+    implementation("org.apache.logging.log4j:log4j-api:2.17.0") {
+      because("previous versions are vulnerable to CVE-2021-45105")
     }
   }
diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md
index d19ccac08abd5..aad490159905a 100644
--- a/metadata-ingestion/README.md
+++ b/metadata-ingestion/README.md
@@ -165,10 +165,6 @@ sink:
   server: "http://localhost:8080"
 ```
 
-We automatically expand environment variables in the config,
-similar to variable substitution in GNU bash or in docker-compose files. For details, see
-https://docs.docker.com/compose/compose-file/compose-file-v2/#variable-substitution.
-
 Running a recipe is quite easy.
 
 ```shell
@@ -177,6 +173,11 @@ datahub ingest -c ./examples/recipes/mssql_to_datahub.yml
 ```
 
 A number of recipes are included in the [examples/recipes](./examples/recipes) directory. For full info and context on each source and sink, see the pages described in the [table of plugins](#installing-plugins).
 
+### Handling sensitive information in recipes
+
+We automatically expand environment variables in the config (e.g. `${MSSQL_PASSWORD}`),
+similar to variable substitution in GNU bash or in docker-compose files. For details, see
+https://docs.docker.com/compose/compose-file/compose-file-v2/#variable-substitution. This environment variable substitution should be used to mask sensitive information in recipe files. As long as you can get env variables securely to the ingestion process there would not be any need to store sensitive information in recipes.
+
 ## Transformations
 
 If you'd like to modify data before it reaches the ingestion sinks – for instance, adding additional owners or tags – you can use a transformer to write your own module and integrate it with DataHub.
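To make the new README section concrete, here is a hypothetical recipe relying on the substitution it describes; all field values are illustrative, and `MSSQL_PASSWORD` is resolved from the environment at ingestion time, so the file itself never contains the secret:

```yml
# Sketch of an mssql_to_datahub.yml-style recipe with the password externalized.
source:
  type: mssql
  config:
    username: datahub_reader
    password: ${MSSQL_PASSWORD}  # expanded from the environment, never stored here
    database: DemoDatabase

sink:
  type: datahub-rest
  config:
    server: "http://localhost:8080"
```

The secret then only exists for the lifetime of the process, e.g. `MSSQL_PASSWORD='...' datahub ingest -c ./recipe.yml`.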
diff --git a/metadata-ingestion/examples/recipes/mode_to_datahub.yml b/metadata-ingestion/examples/recipes/mode_to_datahub.yml
index 82bdb7a1d54b4..f1503d9526d10 100644
--- a/metadata-ingestion/examples/recipes/mode_to_datahub.yml
+++ b/metadata-ingestion/examples/recipes/mode_to_datahub.yml
@@ -2,12 +2,16 @@
 source:
   type: "mode"
   config:
-    token: 9fa6a90fcd33
-    password: a03bcbc011d6f77c585f5682
+    token: token
+    password: password
     connect_uri: https://app.mode.com/
-    workspace: "petabloc"
+    workspace: "workspace"
     default_schema: "public"
     owner_username_instead_of_email: False
+    api_options:
+      retry_backoff_multiplier: 2
+      max_retry_interval: 10
+      max_attempts: 5
 
 # see https://datahubproject.io/docs/metadata-ingestion/sink_docs/datahub for complete documentation
 sink:
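Beyond redacting the leaked token, password, and workspace, this recipe gains an `api_options` block that appears to shape retry behavior against Mode's API. Assuming the obvious semantics — the wait grows by `retry_backoff_multiplier` per attempt, capped at `max_retry_interval` seconds, for at most `max_attempts` tries — the schedule would look like this sketch (the parameter meanings and the one-second initial delay are inferred, not documented in this patch):

```python
def retry_schedule(backoff_multiplier=2.0, max_interval=10.0, max_attempts=5):
    """Yield (attempt, delay) pairs under the inferred api_options semantics."""
    delay = 1.0  # assumed initial delay of one second
    for attempt in range(1, max_attempts + 1):
        yield attempt, min(delay, max_interval)
        delay *= backoff_multiplier

# With the recipe's values this prints delays of 1, 2, 4, 8, 10 seconds.
for attempt, delay in retry_schedule():
    print(f"attempt {attempt}: wait {delay:g}s before retrying")
```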
+ "mypy>=0.901,<0.920", "pytest>=6.2.2", "pytest-cov>=2.8.1", "pytest-docker>=0.10.3", @@ -303,7 +304,6 @@ def get_long_description(): "trino = datahub.ingestion.source.sql.trino:TrinoSource", "starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource", "nifi = datahub.ingestion.source.nifi:NifiSource", - ], "datahub.ingestion.sink.plugins": [ "file = datahub.ingestion.sink.file:FileSink", @@ -311,6 +311,9 @@ def get_long_description(): "datahub-kafka = datahub.ingestion.sink.datahub_kafka:DatahubKafkaSink", "datahub-rest = datahub.ingestion.sink.datahub_rest:DatahubRestSink", ], + "datahub.ingestion.state_provider.plugins": [ + "datahub = datahub.ingestion.source.state_provider.datahub_ingestion_state_provider:DatahubIngestionStateProvider", + ], "apache_airflow_provider": ["provider_info=datahub_provider:get_provider_info"], } diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 1d73a761a6c72..567fecca490c8 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -44,27 +44,32 @@ Note that a `.` is used to denote nested fields in the YAML recipe. As a SQL-based service, the Athena integration is also supported by our SQL profiler. See [here](./sql_profiles.md) for more details on configuration. -| Field | Required | Default | Description | -| --------------------------- | -------- | ------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `project_id` | | Autodetected | Project ID to ingest from. If not specified, will infer from environment. | -| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. | -| `options.