Merge branch 'datahub-project:master' into master
anshbansal authored Oct 31, 2023
2 parents f83ba67 + b565a65 commit 464cac2
Showing 34 changed files with 418 additions and 60 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/airflow-plugin.yml
@@ -12,7 +12,7 @@ on:
branches:
- "**"
paths:
- ".github/**"
- ".github/workflows/airflow-plugin.yml"
- "metadata-ingestion-modules/airflow-plugin/**"
- "metadata-ingestion/**"
- "metadata-models/**"
12 changes: 4 additions & 8 deletions .github/workflows/check-datahub-jars.yml
@@ -4,17 +4,13 @@ on:
push:
branches:
- master
paths-ignore:
- "docker/**"
- "docs/**"
- "**.md"
paths:
- "metadata-integration"
pull_request:
branches:
- "**"
paths-ignore:
- "docker/**"
- "docs/**"
- "**.md"
paths:
- "metadata-integration"
release:
types: [published]

8 changes: 8 additions & 0 deletions .github/workflows/documentation.yml
@@ -4,9 +4,17 @@ on:
pull_request:
branches:
- "**"
paths:
- "metadata-ingestion/**"
- "metadata-models/**"
- "docs-website/**"
push:
branches:
- master
paths:
- "metadata-ingestion/**"
- "metadata-models/**"
- "docs-website/**"
# release:
# types: [published, edited]

2 changes: 1 addition & 1 deletion .github/workflows/metadata-ingestion.yml
@@ -11,7 +11,7 @@ on:
branches:
- "**"
paths:
- ".github/**"
- ".github/workflows/metadata-ingestion.yml"
- "metadata-ingestion/**"
- "metadata-models/**"
release:
5 changes: 2 additions & 3 deletions .github/workflows/metadata-model.yml
@@ -3,9 +3,8 @@ on:
push:
branches:
- master
paths-ignore:
- "docs/**"
- "**.md"
paths:
- "metadata-models/**"
release:
types: [published]

6 changes: 5 additions & 1 deletion docker/docker-compose-with-cassandra.yml
@@ -200,7 +200,10 @@ services:
retries: 5
timeout: 5s
volumes:
- zkdata:/var/lib/zookeeper
# See https://stackoverflow.com/a/61008432 for why we need two volumes.
# See also: https://docs.confluent.io/platform/current/installation/docker/operations/external-volumes.html#data-volumes-for-kafka-and-zk
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
networks:
default:
name: datahub_network
@@ -210,3 +213,4 @@ volumes:
neo4jdata:
broker:
zkdata:
zklogs:
6 changes: 5 additions & 1 deletion docker/docker-compose-without-neo4j.yml
@@ -174,11 +174,15 @@ services:
retries: 3
timeout: 5s
volumes:
- zkdata:/var/lib/zookeeper
# See https://stackoverflow.com/a/61008432 for why we need two volumes.
# See also: https://docs.confluent.io/platform/current/installation/docker/operations/external-volumes.html#data-volumes-for-kafka-and-zk
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
networks:
default:
name: datahub_network
volumes:
esdata:
broker:
zkdata:
zklogs:
6 changes: 5 additions & 1 deletion docker/docker-compose.yml
@@ -195,7 +195,10 @@ services:
retries: 3
timeout: 5s
volumes:
- zkdata:/var/lib/zookeeper
# See https://stackoverflow.com/a/61008432 for why we need two volumes.
# See also: https://docs.confluent.io/platform/current/installation/docker/operations/external-volumes.html#data-volumes-for-kafka-and-zk
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
networks:
default:
name: datahub_network
@@ -204,3 +207,4 @@ volumes:
neo4jdata:
broker:
zkdata:
zklogs:
4 changes: 3 additions & 1 deletion docker/quickstart/docker-compose-m1.quickstart.yml
@@ -300,11 +300,13 @@ services:
ports:
- ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
volumes:
- zkdata:/var/lib/zookeeper
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
mysqldata: null
neo4jdata: null
zkdata: null
zklogs: null
4 changes: 3 additions & 1 deletion docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
@@ -274,10 +274,12 @@ services:
ports:
- ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
volumes:
- zkdata:/var/lib/zookeeper
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
mysqldata: null
zkdata: null
zklogs: null
4 changes: 3 additions & 1 deletion docker/quickstart/docker-compose-without-neo4j.quickstart.yml
@@ -274,10 +274,12 @@ services:
ports:
- ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
volumes:
- zkdata:/var/lib/zookeeper
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
mysqldata: null
zkdata: null
zklogs: null
4 changes: 3 additions & 1 deletion docker/quickstart/docker-compose.quickstart.yml
@@ -300,11 +300,13 @@ services:
ports:
- ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181
volumes:
- zkdata:/var/lib/zookeeper
- zkdata:/var/lib/zookeeper/data
- zklogs:/var/lib/zookeeper/log
version: '3.9'
volumes:
broker: null
esdata: null
mysqldata: null
neo4jdata: null
zkdata: null
zklogs: null
2 changes: 1 addition & 1 deletion docs/deploy/aws.md
@@ -15,7 +15,7 @@ This guide requires the following tools:
- [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage kubernetes resources
- [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. Note, we only support Helm
3.
- [eksctl](https://eksctl.io/introduction/#installation) to create and manage clusters on EKS
- [eksctl](https://eksctl.io/installation/) to create and manage clusters on EKS
- [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) to manage AWS resources

To use the above tools, you need to set up AWS credentials by following
12 changes: 11 additions & 1 deletion docs/how/updating-datahub.md
@@ -4,10 +4,20 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

## Next

- #9010 - In the Redshift source's config, `incremental_lineage` now defaults to off.
### Breaking Changes

### Potential Downtime

### Deprecations

### Other Notable Changes

## 0.12.0

### Breaking Changes

- #9044 - GraphQL APIs for adding ownership now expect either an `ownershipTypeUrn` referencing a custom ownership type or a (deprecated) `type`. Previously, adding ownership without a concrete type was allowed; this is no longer the case. For simplicity, you can use the `type` parameter, which will be translated to a custom ownership type internally if one exists for the type being added (see the sketch after this file's diff).
- #9010 - In the Redshift source's config, `incremental_lineage` now defaults to off.
- #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now.
- #8942 - Removed `urn:li:corpuser:datahub` owner for the `Measure`, `Dimension` and `Temporal` tags emitted
by Looker and LookML source connectors.
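For the #9044 ownership change above, a hedged sketch of what a compliant call looks like from Python. The mutation and input shape below are assumptions based on this release's GraphQL surface (the `addOwner` mutation, the `__system__technical_owner` urn, and the exact field names may differ); consult the GraphQL schema for the authoritative definition.

```python
# A hedged sketch, not the documented API: after #9044, pass an explicit
# ownershipTypeUrn (or the deprecated `type`) when adding ownership.
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

ADD_OWNER = """
mutation addOwner($input: AddOwnerInput!) {
  addOwner(input: $input)
}
"""

graph.execute_graphql(
    ADD_OWNER,
    variables={
        "input": {
            "ownerUrn": "urn:li:corpuser:jdoe",
            "resourceUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
            "ownerEntityType": "CORP_USER",
            # New requirement: reference a concrete ownership type; bare calls
            # without one are rejected after this release.
            "ownershipTypeUrn": "urn:li:ownershipType:__system__technical_owner",
        }
    },
)
```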
2 changes: 1 addition & 1 deletion docs/what-is-datahub/datahub-concepts.md
@@ -99,7 +99,7 @@ List of Data Platforms
- Tableau
- Vertica

Reference : [data_platforms.json](https://github.com/acryldata/datahub-fork/blob/acryl-main/metadata-service/war/src/main/resources/boot/data_platforms.json)
Reference : [data_platforms.json](https://github.com/datahub-project/datahub/blob/master/metadata-service/war/src/main/resources/boot/data_platforms.json)

</details>

6 changes: 4 additions & 2 deletions metadata-ingestion/setup.py
@@ -108,7 +108,7 @@
sqlglot_lib = {
# Using an Acryl fork of sqlglot.
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1
"acryl-sqlglot==18.5.2.dev45",
"acryl-sqlglot==18.17.1.dev16",
}

sql_common = (
@@ -173,7 +173,9 @@

clickhouse_common = {
# Clickhouse 0.2.0 adds support for SQLAlchemy 1.4.x
"clickhouse-sqlalchemy>=0.2.0",
# Disallow 0.2.5 because of https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/272.
# Note that there's also a known issue around nested map types: https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/269.
"clickhouse-sqlalchemy>=0.2.0,<0.2.5",
}

redshift_common = {
5 changes: 5 additions & 0 deletions metadata-ingestion/src/datahub/cli/docker_cli.py
@@ -5,6 +5,7 @@
import os
import pathlib
import platform
import signal
import subprocess
import sys
import tempfile
@@ -770,6 +771,10 @@ def quickstart( # noqa: C901
logger.debug("docker compose up still running, sending SIGKILL")
up_process.kill()
up_process.wait()
else:
# If the docker process got a keyboard interrupt, raise one here.
if up_process.returncode in {128 + signal.SIGINT, -signal.SIGINT}:
raise KeyboardInterrupt

# Check docker health every few seconds.
status = check_docker_quickstart()
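For context on the new `returncode` check in `quickstart`: POSIX shells report a child killed by a signal as `128 + signum`, while Python's `subprocess` reports it as the negative signal number, so both encodings of SIGINT have to be handled. A minimal, POSIX-only sketch (independent of the DataHub CLI):

```python
import signal
import subprocess

# A child that SIGINTs itself. Depending on how the intermediate shell
# propagates the signal, Python observes either the shell convention
# (128 + SIGINT == 130) or subprocess's convention (-SIGINT == -2);
# the CLI change above checks both before re-raising KeyboardInterrupt.
proc = subprocess.run(["bash", "-c", "kill -INT $$"])
assert proc.returncode in {128 + signal.SIGINT, -signal.SIGINT}
print(f"observed returncode: {proc.returncode}")
```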
13 changes: 12 additions & 1 deletion metadata-ingestion/src/datahub/emitter/mcp_builder.py
@@ -1,9 +1,10 @@
from typing import Dict, Iterable, List, Optional, TypeVar
from typing import Dict, Iterable, List, Optional, Type, TypeVar

from pydantic.fields import Field
from pydantic.main import BaseModel

from datahub.emitter.mce_builder import (
Aspect,
datahub_guid,
make_container_urn,
make_data_platform_urn,
@@ -18,6 +19,7 @@
)
from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
from datahub.metadata.schema_classes import (
KEY_ASPECTS,
ContainerClass,
DomainsClass,
EmbedClass,
@@ -306,3 +308,12 @@ def create_embed_mcp(urn: str, embed_url: str) -> MetadataChangeProposalWrapper:
entityUrn=urn,
aspect=EmbedClass(renderUrl=embed_url),
)


def entity_supports_aspect(entity_type: str, aspect_type: Type[Aspect]) -> bool:
entity_key_aspect = KEY_ASPECTS[entity_type]
aspect_name = aspect_type.get_aspect_name()

supported_aspects = entity_key_aspect.ASPECT_INFO["entityAspects"]

return aspect_name in supported_aspects
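
A hypothetical usage sketch for the `entity_supports_aspect` helper added above; the entity name and aspect class are illustrative choices, not something this commit prescribes:

```python
# Guard aspect emission by consulting the generated KEY_ASPECTS/ASPECT_INFO
# metadata via the new helper. "container" and StatusClass are illustrative.
from datahub.emitter.mcp_builder import entity_supports_aspect
from datahub.metadata.schema_classes import StatusClass

if entity_supports_aspect("container", StatusClass):
    ...  # safe to emit a Status aspect for container entities
else:
    ...  # skip or warn: the entity's key aspect does not list this aspect
```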
6 changes: 5 additions & 1 deletion metadata-ingestion/src/datahub/ingestion/api/workunit.py
@@ -22,7 +22,11 @@ class MetadataWorkUnit(WorkUnit):
metadata: Union[
MetadataChangeEvent, MetadataChangeProposal, MetadataChangeProposalWrapper
]
# A workunit creator can determine if this workunit is allowed to fail

# A workunit creator can determine if this workunit is allowed to fail.
# TODO: This flag was initially added during the rollout of the subType aspect
# to improve backwards compatibility, but is not really needed anymore and so
# should be removed.
treat_errors_as_warnings: bool = False

# When this is set to false, this MWU will be ignored by automatic helpers
metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
@@ -123,8 +123,12 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]:
transport_options=self.transport_options,
)
except SDKError as e:
logger.warning(f"Could not find user with id {id_}")
logger.warning(f"Failure was {e}")
if "Looker Not Found (404)" in str(e):
# User not found
logger.info(f"Could not find user with id {id_}: 404 error")
else:
logger.warning(f"Could not find user with id {id_}")
logger.warning(f"Failure was {e}")
# User not found
return None

metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
@@ -926,14 +926,7 @@ def process_metrics_dimensions_and_fields_for_dashboard(
mcps = chart_mcps
mcps.append(dashboard_mcp)

workunits = [
MetadataWorkUnit(
id=f"looker-{mcp.aspectName}-{mcp.entityUrn}",
mcp=mcp,
treat_errors_as_warnings=True,
)
for mcp in mcps
]
workunits = [mcp.as_workunit() for mcp in mcps]

return workunits

@@ -1320,10 +1313,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
id=f"looker-{event.proposedSnapshot.urn}", mce=event
)
elif isinstance(event, MetadataChangeProposalWrapper):
# We want to treat subtype aspects as optional, so allowing failures in this aspect to be treated as warnings rather than failures
yield event.as_workunit(
treat_errors_as_warnings=event.aspectName in ["subTypes"]
)
yield event.as_workunit()
else:
raise Exception(f"Unexpected type of event {event}")
self.reporter.report_stage_end("explore_metadata")
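The Looker changes above replace hand-built `MetadataWorkUnit(...)` construction (and the `treat_errors_as_warnings` special-casing) with plain `mcp.as_workunit()`. A hedged sketch of that call, with an illustrative urn and aspect:

```python
# as_workunit() derives the workunit id from the entity urn and aspect name,
# so callers no longer assemble ids or flags by hand. Urn/aspect illustrative.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    aspect=StatusClass(removed=False),
)
wu = mcp.as_workunit()  # a MetadataWorkUnit wrapping this MCP
```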
metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py
@@ -2171,10 +2171,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
for mcp in self._build_dataset_mcps(
maybe_looker_view
):
# We want to treat mcp aspects as optional, so allowing failures in this aspect to be treated as warnings rather than failures
yield mcp.as_workunit(
treat_errors_as_warnings=True
)
yield mcp.as_workunit()
else:
(
prev_model_name,
12 changes: 4 additions & 8 deletions metadata-ingestion/src/datahub/upgrade/upgrade.py
@@ -1,6 +1,5 @@
import asyncio
import contextlib
import functools
import logging
import sys
from datetime import datetime, timedelta, timezone
@@ -374,17 +373,14 @@ def check_upgrade(func: Callable[..., T]) -> Callable[..., T]:
@wraps(func)
def async_wrapper(*args: Any, **kwargs: Any) -> Any:
async def run_inner_func():
loop = asyncio.get_event_loop()
return await loop.run_in_executor(
None, functools.partial(func, *args, **kwargs)
)
return func(*args, **kwargs)

async def run_func_check_upgrade():
version_stats_future = asyncio.ensure_future(retrieve_version_stats())
the_one_future = asyncio.ensure_future(run_inner_func())
ret = await the_one_future
main_func_future = asyncio.ensure_future(run_inner_func())
ret = await main_func_future

# the one future has returned
# the main future has returned
# we check the other futures quickly
try:
version_stats = await asyncio.wait_for(version_stats_future, 0.5)
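The `upgrade.py` refactor drops the `run_in_executor` indirection but keeps the surrounding pattern: start the version-stats lookup, await the main work, then give the lookup a short grace period. A self-contained sketch of that pattern with stand-in names (not the actual DataHub helpers):

```python
import asyncio

async def fetch_version_stats():
    # Stand-in for retrieve_version_stats(): a slow network lookup.
    await asyncio.sleep(0.1)
    return {"server_version": "0.12.0"}

async def main_work():
    # Stand-in for the decorated CLI command body.
    return "command result"

async def run_with_upgrade_check():
    version_stats_future = asyncio.ensure_future(fetch_version_stats())
    ret = await main_work()
    try:
        # Main work is done; wait at most 0.5s more for the stats.
        stats = await asyncio.wait_for(version_stats_future, timeout=0.5)
        print("version stats:", stats)
    except Exception:
        pass  # never block or fail the command on the upgrade check
    return ret

print(asyncio.run(run_with_upgrade_check()))
```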