diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index cd1e159b7d53cc..70816e5f093d13 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -32,6 +32,7 @@ jobs:
     strategy:
       matrix:
         include:
+          # Note: this should be kept in sync with tox.ini.
           - python-version: "3.8"
            extra_pip_requirements: "apache-airflow~=2.1.4"
            extra_pip_extras: plugin-v1
@@ -39,13 +40,13 @@ jobs:
            extra_pip_requirements: "apache-airflow~=2.2.4"
            extra_pip_extras: plugin-v1
          - python-version: "3.10"
-           extra_pip_requirements: "apache-airflow~=2.4.0"
+           extra_pip_requirements: 'apache-airflow~=2.4.0 pluggy==1.0.0 "pendulum<3.0"'
            extra_pip_extras: plugin-v2
          - python-version: "3.10"
-           extra_pip_requirements: "apache-airflow~=2.6.0"
+           extra_pip_requirements: 'apache-airflow~=2.6.0 "pendulum<3.0"'
            extra_pip_extras: plugin-v2
          - python-version: "3.10"
-           extra_pip_requirements: "apache-airflow>=2.7.0"
+           extra_pip_requirements: "apache-airflow>=2.7.0 pydantic==2.4.2"
            extra_pip_extras: plugin-v2
      fail-fast: false
    steps:
diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 169a86000adccb..7cef38b1cd47ce 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -911,13 +911,13 @@ jobs:
       ]
     steps:
       - uses: aws-actions/configure-aws-credentials@v1
-        if: ${{ needs.setup.outputs.publish != 'false' }}
+        if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
        with:
          aws-access-key-id: ${{ secrets.AWS_SQS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SQS_ACCESS_KEY }}
          aws-region: us-west-2
      - uses: isbang/sqs-action@v0.2.0
-       if: ${{ needs.setup.outputs.publish != 'false' }}
+       if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }}
        with:
          sqs-url: ${{ secrets.DATAHUB_HEAD_SYNC_QUEUE }}
          message: '{ "command": "git-sync", "args" : {"repoName": "${{ needs.setup.outputs.repository_name }}", "repoOrg": "${{ github.repository_owner }}", "repoBranch": "${{ needs.setup.outputs.branch_name }}", "repoShaShort": "${{ needs.setup.outputs.short_sha }}" }}'
diff --git a/build.gradle b/build.gradle
index a7a85db0398e21..bb01a15a7db8d6 100644
--- a/build.gradle
+++ b/build.gradle
@@ -46,6 +46,7 @@ plugins {
   id 'com.gorylenko.gradle-git-properties' version '2.4.1'
   id 'com.github.johnrengelman.shadow' version '8.1.1' apply false
   id 'com.palantir.docker' version '0.35.0' apply false
+  id 'com.avast.gradle.docker-compose' version '0.17.5'
   id "com.diffplug.spotless" version "6.23.3"
   // https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/
   // TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0"
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java
index 3f635872747a57..6ba3c5090f1c40 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java
@@ -91,6 +91,7 @@ private List getProductAnalyticsCharts(Authentication authentica
     final List charts = new ArrayList<>();
     DateUtil dateUtil = new DateUtil();
     final DateTime startOfNextWeek = dateUtil.getStartOfNextWeek();
+    final DateTime startOfThisMonth = dateUtil.getStartOfThisMonth();
     final DateTime startOfNextMonth = dateUtil.getStartOfNextMonth();
     final DateRange trailingWeekDateRange = dateUtil.getTrailingWeekDateRange();
@@ -103,7 +104,7 @@ private List getProductAnalyticsCharts(Authentication authentica
     charts.add(
         getActiveUsersTimeSeriesChart(
             startOfNextMonth.minusMonths(12),
-            startOfNextMonth.minusMillis(1),
+            startOfThisMonth.minusMillis(1),
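+            // i.e. the last millisecond of the previous month, so the in-progress month is excluded from the monthly series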
             "Monthly Active Users",
             DateInterval.MONTH));
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
index 07bd1fba5d8a86..e74ed09849763c 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/featureflags/FeatureFlags.java
@@ -12,6 +12,7 @@ public class FeatureFlags {
   private boolean readOnlyModeEnabled = false;
   private boolean showSearchFiltersV2 = false;
   private boolean showBrowseV2 = false;
+  private boolean platformBrowseV2 = false;
   private PreProcessHooks preProcessHooks;
   private boolean showAcrylInfo = false;
   private boolean showAccessManagement = false;
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java
index 292d6108b7a044..da4a3a76dd7e0e 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/chart/BrowseV2Resolver.java
@@ -2,14 +2,16 @@
 import static com.linkedin.datahub.graphql.Constants.BROWSE_PATH_V2_DELIMITER;
 import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.bindArgument;
-import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.resolveView;
+import static com.linkedin.datahub.graphql.resolvers.search.SearchUtils.*;
 
+import com.google.common.collect.ImmutableList;
 import com.linkedin.common.urn.UrnUtils;
 import com.linkedin.datahub.graphql.QueryContext;
 import com.linkedin.datahub.graphql.generated.BrowseResultGroupV2;
 import com.linkedin.datahub.graphql.generated.BrowseResultMetadata;
 import com.linkedin.datahub.graphql.generated.BrowseResultsV2;
 import com.linkedin.datahub.graphql.generated.BrowseV2Input;
+import com.linkedin.datahub.graphql.generated.EntityType;
 import com.linkedin.datahub.graphql.resolvers.EntityTypeMapper;
 import com.linkedin.datahub.graphql.resolvers.ResolverUtils;
 import com.linkedin.datahub.graphql.resolvers.search.SearchUtils;
@@ -43,8 +45,8 @@ public class BrowseV2Resolver implements DataFetcher
   public CompletableFuture get(DataFetchingEnvironment environment) {
     final QueryContext context = environment.getContext();
     final BrowseV2Input input = bindArgument(environment.getArgument("input"), BrowseV2Input.class);
-    final String entityName = EntityTypeMapper.getName(input.getType());
+    final List<String> entityNames = getEntityNames(input);
     final int start = input.getStart() != null ? input.getStart() : DEFAULT_START;
     final int count = input.getCount() != null ? input.getCount() : DEFAULT_COUNT;
     final String query = input.getQuery() != null ? input.getQuery() : "*";
@@ -70,7 +72,7 @@ public CompletableFuture get(DataFetchingEnvironmen
           BrowseResultV2 browseResults =
               _entityClient.browseV2(
-                  entityName,
+                  entityNames,
                   pathStr,
                   maybeResolvedView != null
                       ? SearchUtils.combineFilters(
@@ -87,6 +89,18 @@ public CompletableFuture get(DataFetchingEnvironmen
         });
   }
 
+  public static List<String> getEntityNames(BrowseV2Input input) {
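+    // Prefer the explicit types list; fall back to the deprecated single type, then to all browsable entity types.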
+    List<EntityType> entityTypes;
+    if (input.getTypes() != null && input.getTypes().size() > 0) {
+      entityTypes = input.getTypes();
+    } else if (input.getType() != null) {
+      entityTypes = ImmutableList.of(input.getType());
+    } else {
+      entityTypes = BROWSE_ENTITY_TYPES;
+    }
+    return entityTypes.stream().map(EntityTypeMapper::getName).collect(Collectors.toList());
+  }
+
   private BrowseResultsV2 mapBrowseResults(BrowseResultV2 browseResults) {
     BrowseResultsV2 results = new BrowseResultsV2();
     results.setTotal(browseResults.getNumGroups());
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
index 34f7f133f6fb94..81b52991cde90c 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/config/AppConfigResolver.java
@@ -175,6 +175,7 @@ public CompletableFuture get(final DataFetchingEnvironment environmen
             .setShowAcrylInfo(_featureFlags.isShowAcrylInfo())
             .setShowAccessManagement(_featureFlags.isShowAccessManagement())
             .setNestedDomainsEnabled(_featureFlags.isNestedDomainsEnabled())
+            .setPlatformBrowseV2(_featureFlags.isPlatformBrowseV2())
             .build();
 
     appConfig.setFeatureFlags(featureFlagsConfig);
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
index d04cb57e1a860e..444ab4bcc3c3c9 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/search/SearchUtils.java
@@ -92,6 +92,20 @@ private SearchUtils() {}
           EntityType.NOTEBOOK,
           EntityType.DATA_PRODUCT);
 
+  /** Entities that are part of browse by default */
+  public static final List<EntityType> BROWSE_ENTITY_TYPES =
+      ImmutableList.of(
+          EntityType.DATASET,
+          EntityType.DASHBOARD,
+          EntityType.CHART,
+          EntityType.CONTAINER,
+          EntityType.MLMODEL,
+          EntityType.MLMODEL_GROUP,
+          EntityType.MLFEATURE_TABLE,
+          EntityType.DATA_FLOW,
+          EntityType.DATA_JOB,
+          EntityType.NOTEBOOK);
+
   /** A prioritized list of source filter types used to generate quick filters */
   public static final List PRIORITIZED_SOURCE_ENTITY_TYPES =
       Stream.of(
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java
index 4b837605d4e318..677ad8afbaca31 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/util/DateUtil.java
@@ -13,6 +13,10 @@ public DateTime getStartOfNextWeek() {
     return setTimeToZero(getNow().withDayOfWeek(DateTimeConstants.SUNDAY).plusDays(1));
   }
 
+  public DateTime getStartOfThisMonth() {
+    return setTimeToZero(getNow().withDayOfMonth(1));
+  }
+
   public DateTime getStartOfNextMonth() {
     return setTimeToZero(getNow().withDayOfMonth(1).plusMonths(1));
   }
diff --git a/datahub-graphql-core/src/main/resources/app.graphql b/datahub-graphql-core/src/main/resources/app.graphql
index 075a3b0fac43bc..52451e195ee841 100644
--- a/datahub-graphql-core/src/main/resources/app.graphql
+++ b/datahub-graphql-core/src/main/resources/app.graphql
@@ -437,6 +437,11 @@ type FeatureFlagsConfig {
   """
   showBrowseV2: Boolean!
 
+  """
+  Whether browse v2 is platform mode, which means that platforms are displayed instead of entity types at the root.
+  """
+  platformBrowseV2: Boolean!
+
   """
   Whether we should show CTAs in the UI related to moving to Managed DataHub by Acryl.
   """
diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql
index e0cde5a2db9f99..8f2377edb546e0 100644
--- a/datahub-graphql-core/src/main/resources/search.graphql
+++ b/datahub-graphql-core/src/main/resources/search.graphql
@@ -1176,9 +1176,14 @@ Input required for browse queries
 """
 input BrowseV2Input {
   """
-  The browse entity type
+  The browse entity type - deprecated, use types instead
   """
-  type: EntityType!
+  type: EntityType
+
+  """
+  The browse entity types. If not provided, all types will be used.
+  """
+  types: [EntityType!]
 
   """
   The browse path V2 - a list with each entry being part of the browse path V2
diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java
index bffc2b31af2b9a..433772d7e2cfe1 100644
--- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java
+++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/browse/BrowseV2ResolverTest.java
@@ -249,7 +249,7 @@ private static EntityClient initMockEntityClient(
     EntityClient client = Mockito.mock(EntityClient.class);
     Mockito.when(
             client.browseV2(
-                Mockito.eq(entityName),
+                Mockito.eq(ImmutableList.of(entityName)),
                 Mockito.eq(path),
                 Mockito.eq(filter),
                 Mockito.eq(query),
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java
index 24bcec5852b4fc..5ba5c8a90fd4ac 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeCleanupConfig.java
@@ -7,13 +7,16 @@
 import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
 import io.ebean.Database;
 import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
 import org.opensearch.client.RestHighLevelClient;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
 import org.springframework.context.ApplicationContext;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Configuration;
 import org.springframework.context.annotation.DependsOn;
 
+@Slf4j
 @Configuration
 public class NoCodeCleanupConfig {
 
@@ -26,6 +29,7 @@ public class NoCodeCleanupConfig {
         "elasticSearchRestHighLevelClient",
         INDEX_CONVENTION_BEAN
       })
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
   @Nonnull
   public NoCodeCleanupUpgrade createInstance() {
     final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -34,4 +38,12 @@ public NoCodeCleanupUpgrade createInstance() {
     final IndexConvention indexConvention = applicationContext.getBean(IndexConvention.class);
     return new NoCodeCleanupUpgrade(ebeanServer, graphClient, searchClient, indexConvention);
   }
+
+  @Bean(name = "noCodeCleanup")
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+  @Nonnull
+  public NoCodeCleanupUpgrade createNotImplInstance() {
+    log.warn("NoCode is not supported for cassandra!");
+    return new NoCodeCleanupUpgrade(null, null, null, null);
+  }
 }
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java
index 68009d7ed1718a..d968e8521867e8 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/NoCodeUpgradeConfig.java
@@ -6,12 +6,15 @@
 import com.linkedin.metadata.models.registry.EntityRegistry;
 import io.ebean.Database;
 import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
 import org.springframework.context.ApplicationContext;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Configuration;
 import org.springframework.context.annotation.DependsOn;
 
+@Slf4j
 @Configuration
 public class NoCodeUpgradeConfig {
 
@@ -19,6 +22,7 @@ public class NoCodeUpgradeConfig {
 
   @Bean(name = "noCodeUpgrade")
   @DependsOn({"ebeanServer", "entityService", "systemRestliEntityClient", "entityRegistry"})
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
   @Nonnull
   public NoCodeUpgrade createInstance() {
     final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -29,4 +33,12 @@ public NoCodeUpgrade createInstance() {
 
     return new NoCodeUpgrade(ebeanServer, entityService, entityRegistry, entityClient);
   }
+
+  @Bean(name = "noCodeUpgrade")
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+  @Nonnull
+  public NoCodeUpgrade createNotImplInstance() {
+    log.warn("NoCode is not supported for cassandra!");
+    return new NoCodeUpgrade(null, null, null, null);
+  }
 }
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java
index 743e4ffe84b0e4..116d62878f5c6e 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreBackupConfig.java
@@ -8,12 +8,15 @@
 import com.linkedin.metadata.search.EntitySearchService;
 import io.ebean.Database;
 import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
 import org.springframework.context.ApplicationContext;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Configuration;
 import org.springframework.context.annotation.DependsOn;
 
+@Slf4j
 @Configuration
 public class RestoreBackupConfig {
   @Autowired ApplicationContext applicationContext;
@@ -27,6 +30,7 @@ public class RestoreBackupConfig {
         "searchService",
         "entityRegistry"
       })
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
   @Nonnull
   public RestoreBackup createInstance() {
     final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -40,4 +44,12 @@ public RestoreBackup createInstance() {
     return new RestoreBackup(
         ebeanServer, entityService, entityRegistry, entityClient, graphClient, searchClient);
   }
+
+  @Bean(name = "restoreBackup")
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+  @Nonnull
+  public RestoreBackup createNotImplInstance() {
+    log.warn("restoreIndices is not supported for cassandra!");
+    return new RestoreBackup(null, null, null, null, null, null);
+  }
 }
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java
index d258c4a4d1a529..9d229f315d709d 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java
@@ -7,18 +7,22 @@
 import com.linkedin.metadata.search.EntitySearchService;
 import io.ebean.Database;
 import javax.annotation.Nonnull;
+import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
 import org.springframework.context.ApplicationContext;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Configuration;
 import org.springframework.context.annotation.DependsOn;
 
+@Slf4j
 @Configuration
 public class RestoreIndicesConfig {
   @Autowired ApplicationContext applicationContext;
 
   @Bean(name = "restoreIndices")
   @DependsOn({"ebeanServer", "entityService", "searchService", "graphService", "entityRegistry"})
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true)
   @Nonnull
   public RestoreIndices createInstance() {
     final Database ebeanServer = applicationContext.getBean(Database.class);
@@ -31,4 +35,12 @@ public RestoreIndices createInstance() {
     return new RestoreIndices(
         ebeanServer, entityService, entityRegistry, entitySearchService, graphService);
   }
+
+  @Bean(name = "restoreIndices")
+  @ConditionalOnProperty(name = "entityService.impl", havingValue = "cassandra")
+  @Nonnull
+  public RestoreIndices createNotImplInstance() {
+    log.warn("restoreIndices is not supported for cassandra!");
+    return new RestoreIndices(null, null, null, null, null);
+  }
 }
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java
index 6753d309b9f501..674efb2b8ba78c 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocode/NoCodeUpgrade.java
@@ -13,6 +13,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import javax.annotation.Nullable;
 
 public class NoCodeUpgrade implements Upgrade {
 
@@ -26,12 +27,17 @@ public class NoCodeUpgrade implements Upgrade {
 
   // Upgrade requires the Database.
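+  // With the cassandra configuration no Database bean is available; null is passed in and no upgrade steps are built.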
   public NoCodeUpgrade(
-      final Database server,
+      @Nullable final Database server,
       final EntityService entityService,
       final EntityRegistry entityRegistry,
       final SystemRestliEntityClient entityClient) {
-    _steps = buildUpgradeSteps(server, entityService, entityRegistry, entityClient);
-    _cleanupSteps = buildCleanupSteps();
+    if (server != null) {
+      _steps = buildUpgradeSteps(server, entityService, entityRegistry, entityClient);
+      _cleanupSteps = buildCleanupSteps();
+    } else {
+      _steps = List.of();
+      _cleanupSteps = List.of();
+    }
   }
 
   @Override
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java
index 8a267be6ad8086..6d3125423b4433 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/nocodecleanup/NoCodeCleanupUpgrade.java
@@ -9,6 +9,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import javax.annotation.Nullable;
 import org.opensearch.client.RestHighLevelClient;
 
 public class NoCodeCleanupUpgrade implements Upgrade {
@@ -18,12 +19,17 @@ public class NoCodeCleanupUpgrade implements Upgrade {
 
   // Upgrade requires the Database.
   public NoCodeCleanupUpgrade(
-      final Database server,
+      @Nullable final Database server,
       final GraphService graphClient,
       final RestHighLevelClient searchClient,
       final IndexConvention indexConvention) {
-    _steps = buildUpgradeSteps(server, graphClient, searchClient, indexConvention);
-    _cleanupSteps = buildCleanupSteps();
+    if (server != null) {
+      _steps = buildUpgradeSteps(server, graphClient, searchClient, indexConvention);
+      _cleanupSteps = buildCleanupSteps();
+    } else {
+      _steps = List.of();
+      _cleanupSteps = List.of();
+    }
   }
 
   @Override
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java
index b11abb2d6bc23a..4ac295b4fdfb75 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreBackup.java
@@ -16,20 +16,26 @@
 import io.ebean.Database;
 import java.util.ArrayList;
 import java.util.List;
+import javax.annotation.Nullable;
 
 public class RestoreBackup implements Upgrade {
 
   private final List _steps;
 
   public RestoreBackup(
-      final Database server,
+      @Nullable final Database server,
       final EntityService entityService,
       final EntityRegistry entityRegistry,
       final SystemRestliEntityClient entityClient,
       final GraphService graphClient,
       final EntitySearchService searchClient) {
-    _steps =
-        buildSteps(server, entityService, entityRegistry, entityClient, graphClient, searchClient);
+    if (server != null) {
+      _steps =
+          buildSteps(
+              server, entityService, entityRegistry, entityClient, graphClient, searchClient);
+    } else {
+      _steps = List.of();
+    }
   }
 
   @Override
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
index 8bb3b0073710a3..f46bb9b05624db 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/RestoreIndices.java
@@ -13,6 +13,7 @@
 import io.ebean.Database;
 import java.util.ArrayList;
 import java.util.List;
+import javax.annotation.Nullable;
 
 public class RestoreIndices implements Upgrade {
   public static final String BATCH_SIZE_ARG_NAME = "batchSize";
@@ -23,18 +24,23 @@ public class RestoreIndices implements Upgrade {
   public static final String WRITER_POOL_SIZE = "WRITER_POOL_SIZE";
   public static final String URN_ARG_NAME = "urn";
   public static final String URN_LIKE_ARG_NAME = "urnLike";
+  public static final String URN_BASED_PAGINATION_ARG_NAME = "urnBasedPagination";
   public static final String STARTING_OFFSET_ARG_NAME = "startingOffset";
 
   private final List _steps;
 
   public RestoreIndices(
-      final Database server,
+      @Nullable final Database server,
       final EntityService entityService,
       final EntityRegistry entityRegistry,
       final EntitySearchService entitySearchService,
       final GraphService graphService) {
-    _steps = buildSteps(server, entityService, entityRegistry, entitySearchService, graphService);
+    if (server != null) {
+      _steps = buildSteps(server, entityService, entityRegistry, entitySearchService, graphService);
+    } else {
+      _steps = List.of();
+    }
   }
 
   @Override
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java
index ce59cf2edb84e9..574b1f08b5f543 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restoreindices/SendMAEStep.java
@@ -31,6 +31,7 @@ public class SendMAEStep implements UpgradeStep {
 
   private static final int DEFAULT_STARTING_OFFSET = 0;
   private static final int DEFAULT_THREADS = 1;
+  private static final boolean DEFAULT_URN_BASED_PAGINATION = false;
 
   private final Database _server;
   private final EntityService _entityService;
@@ -89,6 +90,7 @@ private RestoreIndicesArgs getArgs(UpgradeContext context) {
     result.numThreads = getThreadCount(context.parsedArgs());
     result.batchDelayMs = getBatchDelayMs(context.parsedArgs());
     result.start = getStartingOffset(context.parsedArgs());
+    result.urnBasedPagination = getUrnBasedPagination(context.parsedArgs());
     if (containsKey(context.parsedArgs(), RestoreIndices.ASPECT_NAME_ARG_NAME)) {
       result.aspectName = context.parsedArgs().get(RestoreIndices.ASPECT_NAME_ARG_NAME).get();
     }
@@ -140,18 +142,49 @@ public Function executable() {
       List<Future<RestoreIndicesResult>> futures = new ArrayList<>();
       startTime = System.currentTimeMillis();
-      while (start < rowCount) {
-        args = args.clone();
-        args.start = start;
-        futures.add(executor.submit(new KafkaJob(context, args)));
-        start = start + args.batchSize;
-      }
-      while (futures.size() > 0) {
-        List<RestoreIndicesResult> tmpResults = iterateFutures(futures);
-        for (RestoreIndicesResult tmpResult : tmpResults) {
-          reportStats(context, finalJobResult, tmpResult, rowCount, startTime);
+      if (args.urnBasedPagination) {
+        RestoreIndicesResult previousResult = null;
+        int rowsProcessed = 1;
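+        // Each iteration resumes after the last (urn, aspect) pair of the previous batch and stops once a batch returns zero rows.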
+        while (rowsProcessed > 0) {
+          args = args.clone();
+          if (previousResult != null) {
+            args.lastUrn = previousResult.lastUrn;
+            args.lastAspect = previousResult.lastAspect;
+          }
+          args.start = start;
+          context
+              .report()
+              .addLine(
+                  String.format(
+                      "Getting next batch of urns + aspects, starting with %s - %s",
+                      args.lastUrn, args.lastAspect));
+          Future<RestoreIndicesResult> future = executor.submit(new KafkaJob(context, args));
+          try {
+            RestoreIndicesResult result = future.get();
+            reportStats(context, finalJobResult, result, rowCount, startTime);
+            previousResult = result;
+            rowsProcessed = result.rowsMigrated + result.ignored;
+            context.report().addLine(String.format("Rows processed this loop %d", rowsProcessed));
+            start += args.batchSize;
+          } catch (InterruptedException | ExecutionException e) {
+            return new DefaultUpgradeStepResult(id(), UpgradeStepResult.Result.FAILED);
+          }
+        }
+      } else {
+        while (start < rowCount) {
+          args = args.clone();
+          args.start = start;
+          futures.add(executor.submit(new KafkaJob(context, args)));
+          start = start + args.batchSize;
+        }
+        while (futures.size() > 0) {
+          List<RestoreIndicesResult> tmpResults = iterateFutures(futures);
+          for (RestoreIndicesResult tmpResult : tmpResults) {
+            reportStats(context, finalJobResult, tmpResult, rowCount, startTime);
+          }
         }
       }
+
       executor.shutdown();
       if (finalJobResult.rowsMigrated != rowCount) {
         float percentFailed = 0.0f;
@@ -233,6 +266,15 @@ private int getThreadCount(final Map> parsedArgs) {
     return getInt(parsedArgs, DEFAULT_THREADS, RestoreIndices.NUM_THREADS_ARG_NAME);
   }
 
+  private boolean getUrnBasedPagination(final Map<String, Optional<String>> parsedArgs) {
+    boolean urnBasedPagination = DEFAULT_URN_BASED_PAGINATION;
+    if (containsKey(parsedArgs, RestoreIndices.URN_BASED_PAGINATION_ARG_NAME)) {
+      urnBasedPagination =
+          Boolean.parseBoolean(parsedArgs.get(RestoreIndices.URN_BASED_PAGINATION_ARG_NAME).get());
+    }
+    return urnBasedPagination;
+  }
+
   private int getInt(
       final Map<String, Optional<String>> parsedArgs, int defaultVal, String argKey) {
     int result = defaultVal;
diff --git a/datahub-web-react/src/app/AppProviders.tsx b/datahub-web-react/src/app/AppProviders.tsx
index 81a8ddbfc9bace..00597e1cf76406 100644
--- a/datahub-web-react/src/app/AppProviders.tsx
+++ b/datahub-web-react/src/app/AppProviders.tsx
@@ -5,6 +5,7 @@ import UserContextProvider from './context/UserContextProvider';
 import QuickFiltersProvider from '../providers/QuickFiltersProvider';
 import SearchContextProvider from './search/context/SearchContextProvider';
 import EntityRegistryProvider from './EntityRegistryProvider';
+import { BrowserTitleProvider } from './shared/BrowserTabTitleContext';
 
 interface Props {
     children: React.ReactNode;
@@ -15,11 +16,13 @@ export default function AppProviders({ children }: Props) {
     return (
         <AppConfigProvider>
-            <UserContextProvider>
-                <QuickFiltersProvider>
-                    <SearchContextProvider>{children}</SearchContextProvider>
-                </QuickFiltersProvider>
-            </UserContextProvider>
+            <BrowserTitleProvider>
+                <UserContextProvider>
+                    <QuickFiltersProvider>
+                        <SearchContextProvider>{children}</SearchContextProvider>
+                    </QuickFiltersProvider>
+                </UserContextProvider>
+            </BrowserTitleProvider>
         </AppConfigProvider>
     );
diff --git a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
index 07885a4d0f6304..044b09dc185e53 100644
--- a/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
+++ b/datahub-web-react/src/app/entity/group/GroupInfoSideBar.tsx
@@ -21,6 +21,7 @@
 } from '../shared/SidebarStyledComponents';
 import GroupMembersSideBarSection from './GroupMembersSideBarSection';
 import { useUserContext } from '../../context/useUserContext';
+import { useBrowserTitle } from '../../shared/BrowserTabTitleContext';
 import StripMarkdownText, { removeMarkdown } from '../shared/components/styled/StripMarkdownText';
 import { Editor } from '../shared/tabs/Documentation/components/editor/Editor';
 import EditGroupDescriptionModal from './EditGroupDescriptionModal';
@@ -157,6 +158,22 @@ export default function GroupInfoSidebar({ sideBarData, refetch }: Props) {
     const { url } = useRouteMatch();
     const history = useHistory();
 
+    const { updateTitle } = useBrowserTitle();
+
+    useEffect(() => {
+        // Set the tab title when the component mounts or the group name changes
+        if (name) {
+            updateTitle(`Group | ${name}`);
+        }
+        // Clear the title when the component unmounts; the guard avoids clearing it on re-renders
+        return () => {
+            if (name) {
+                updateTitle('');
+            }
+        };
+    }, [name, updateTitle]);
+
     /* eslint-disable @typescript-eslint/no-unused-vars */
     const [editGroupModal, showEditGroupModal] = useState(false);
     const me = useUserContext();
diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
index 8d7f1cca9c1cbd..664a77a731d348 100644
--- a/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
+++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/EntityDropdown.tsx
@@ -180,6 +180,7 @@ function EntityDropdown(props: Props) {
                     )}
                     {menuItems.has(EntityMenuItems.ADD_TERM) && (
+
                             setIsCreateTermModalVisible(true)}
diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx
index 58693eca8af0e8..25e044259f240e 100644
--- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx
+++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityTabs.tsx
@@ -39,6 +39,7 @@ export const EntityTabs = ({ tabs, selectedTab }: Props) => {
     return (
+
diff --git a/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx b/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx
--- a/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx
+++ b/datahub-web-react/src/app/entity/user/UserInfoSideBar.tsx
+    useEffect(() => {
+        // Set the tab title when the component mounts or the user name changes
+        if (name) {
+            updateTitle(`User | ${name}`);
+        }
+        // Clear the title when the component unmounts; the guard avoids clearing it on re-renders
+        return () => {
+            if (name) {
+                updateTitle('');
+            }
+        };
+    }, [name, updateTitle]);
+
     const getEditModalData = {
         urn,
         name,
diff --git a/datahub-web-react/src/app/glossary/GlossaryBrowser/NodeItem.tsx b/datahub-web-react/src/app/glossary/GlossaryBrowser/NodeItem.tsx
index d517ab04a8bc90..cd6593e2d2f07e 100644
--- a/datahub-web-react/src/app/glossary/GlossaryBrowser/NodeItem.tsx
+++ b/datahub-web-react/src/app/glossary/GlossaryBrowser/NodeItem.tsx
@@ -166,7 +166,7 @@ function NodeItem(props: Props) {
                     ))}
                 {!hideTerms &&
                     (childTerms as GlossaryTerm[]).map((child) => (
-                        <TermItem term={child} isSelecting={isSelecting} selectTerm={selectTerm} />
+                        <TermItem term={child} isSelecting={isSelecting} selectTerm={selectTerm} includeActiveTabPath />
                     ))}
             )}
diff --git a/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx b/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx
index 6980c15a1c256a..56495b53eded35 100644
--- a/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx
+++ b/datahub-web-react/src/app/glossary/GlossaryBrowser/TermItem.tsx
@@ -5,6 +5,7 @@ import { useEntityRegistry } from '../../useEntityRegistry';
 import { ANTD_GRAY } from '../../entity/shared/constants';
 import { ChildGlossaryTermFragment } from '../../../graphql/glossaryNode.generated';
 import { useGlossaryEntityData } from '../../entity/shared/GlossaryEntityContext';
+import { useGlossaryActiveTabPath } from '../../entity/shared/containers/profile/utils';
 
 const TermWrapper = styled.div`
     font-weight: normal;
@@ -47,13 +48,15 @@ interface Props {
     term: ChildGlossaryTermFragment;
     isSelecting?: boolean;
     selectTerm?: (urn: string, displayName: string) => void;
+    includeActiveTabPath?: boolean;
 }
 
 function TermItem(props: Props) {
-    const { term, isSelecting, selectTerm } = props;
+    const { term, isSelecting, selectTerm, includeActiveTabPath } = props;
 
     const { entityData } = useGlossaryEntityData();
     const entityRegistry = useEntityRegistry();
+    const activeTabPath = useGlossaryActiveTabPath();
 
     function handleSelectTerm() {
         if (selectTerm) {
@@ -68,7 +71,9 @@ function TermItem(props: Props) {
             {!isSelecting && (
-                <TermLink to={entityRegistry.getEntityUrl(term.type, term.urn)}>
+                <TermLink
+                    to={`${entityRegistry.getEntityUrl(term.type, term.urn)}${includeActiveTabPath ? `/${activeTabPath}` : ''}`}
+                >
                     {entityRegistry.getDisplayName(term.type, isOnEntityPage ? entityData : term)}
diff --git a/datahub-web-react/src/app/identity/user/UserList.tsx b/datahub-web-react/src/app/identity/user/UserList.tsx
index dce3aa2c68a8dc..8e2bc21f0693f7 100644
--- a/datahub-web-react/src/app/identity/user/UserList.tsx
+++ b/datahub-web-react/src/app/identity/user/UserList.tsx
@@ -77,7 +77,7 @@ export const UserList = () => {
                 query: (query?.length && query) || undefined,
             },
         },
-        fetchPolicy: (query?.length || 0) > 0 ? 'no-cache' : 'cache-first',
+        fetchPolicy: 'no-cache',
     });
 
     const totalUsers = usersData?.listUsers?.total || 0;
diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
index 96dfc05e391532..0799f8af1173dc 100644
--- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
+++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
@@ -83,11 +83,11 @@ const ShowMoreButton = styled(Button)`
     padding: 0px;
 `;
 
-const LogsContainer = styled.div<LogsContainerProps>`
+const DetailsContainer = styled.div<DetailsContainerProps>`
     margin-bottom: -25px;
     ${(props) =>
-        props.areLogsExpandable &&
-        !props.showExpandedLogs &&
+        props.areDetailsExpandable &&
+        !props.showExpandedDetails &&
         `
     -webkit-mask-image: linear-gradient(to bottom, rgba(0,0,0,1) 50%, rgba(255,0,0,0.5) 60%, rgba(255,0,0,0) 90% );
     mask-image: linear-gradient(to bottom, rgba(0,0,0,1) 50%, rgba(255,0,0,0.5) 60%, rgba(255,0,0,0) 90%);
@@ -102,9 +102,9 @@ const modalBodyStyle = {
     padding: 0,
 };
 
-type LogsContainerProps = {
-    showExpandedLogs: boolean;
-    areLogsExpandable: boolean;
+type DetailsContainerProps = {
+    showExpandedDetails: boolean;
+    areDetailsExpandable: boolean;
 };
 
 type Props = {
@@ -124,7 +124,7 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
         downloadFile(output, `exec-${urn}.log`);
     };
 
-    const logs = (showExpandedLogs && output) || output.slice(0, 250);
+    const logs = (showExpandedLogs && output) || output?.split('\n').slice(0, 5).join('\n');
     const result = data?.executionRequest?.result?.status;
 
     useEffect(() => {
@@ -154,10 +154,10 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
     } catch (e) {
         recipeYaml = '';
     }
-    const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 1).join('\n');
+    const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 5).join('\n');
 
-    const areLogsExpandable = output.length > 250;
-    const isRecipeExpandable = recipeYaml?.includes('\n');
+    const areLogsExpandable = output?.split(/\r\n|\r|\n/)?.length > 5;
+    const isRecipeExpandable = recipeYaml?.split(/\r\n|\r|\n/)?.length > 5;
 
     return (
                     Download
                 </Button>
-                <LogsContainer showExpandedLogs={showExpandedLogs} areLogsExpandable={areLogsExpandable}>
+                <DetailsContainer showExpandedDetails={showExpandedLogs} areDetailsExpandable={areLogsExpandable}>
                     <pre>{`${logs}${!showExpandedLogs && areLogsExpandable ? '...' : ''}`}</pre>
-                </LogsContainer>
+                </DetailsContainer>
                 {areLogsExpandable && (
                     <ShowMoreButton type="link" onClick={() => setShowExpandedLogs(!showExpandedLogs)}>
                         {showExpandedLogs ? 'Hide' : 'Show More'}
                     </ShowMoreButton>
                 )}
@@ -216,9 +216,14 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
                     The recipe used for this ingestion run.
                 </SectionSubHeader>
-                <Typography.Paragraph ellipsis>
-                    <pre>{`${recipe}${!showExpandedRecipe && isRecipeExpandable ? '\n...' : ''}`}</pre>
-                </Typography.Paragraph>
+                <DetailsContainer
+                    showExpandedDetails={showExpandedRecipe}
+                    areDetailsExpandable={isRecipeExpandable}
+                >
+                    <Typography.Paragraph ellipsis>
+                        <pre>{`${recipe}${!showExpandedRecipe && isRecipeExpandable ? '...' : ''}`}</pre>
+                    </Typography.Paragraph>
+                </DetailsContainer>
                 {isRecipeExpandable && (
                     <ShowMoreButton type="link" onClick={() => setShowExpandedRecipe((v) => !v)}>
                         {showExpandedRecipe ? 'Hide' : 'Show More'}
                     </ShowMoreButton>
                 )}
diff --git a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
index 36c4c020e71317..a6d8422f827d58 100644
--- a/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
+++ b/datahub-web-react/src/app/preview/DefaultPreviewCard.tsx
@@ -114,6 +114,7 @@ const TagContainer = styled.div`
     margin-left: 0px;
     margin-top: 3px;
     flex-wrap: wrap;
+    margin-right: 8px;
 `;
 
 const TagSeparator = styled.div`
diff --git a/datahub-web-react/src/app/search/SearchablePage.tsx b/datahub-web-react/src/app/search/SearchablePage.tsx
index 9d02d85d3634c0..53dfc866b9b64b 100644
--- a/datahub-web-react/src/app/search/SearchablePage.tsx
+++ b/datahub-web-react/src/app/search/SearchablePage.tsx
@@ -3,6 +3,7 @@ import { useHistory, useLocation } from 'react-router';
 import { debounce } from 'lodash';
 import * as QueryString from 'query-string';
 import { useTheme } from 'styled-components';
+import { Helmet } from 'react-helmet-async';
 import { SearchHeader } from './SearchHeader';
 import { useEntityRegistry } from '../useEntityRegistry';
 import { EntityType, FacetFilterInput } from '../../types.generated';
@@ -19,6 +20,7 @@ import { useQuickFiltersContext } from '../../providers/QuickFiltersContext';
 import { useUserContext } from '../context/useUserContext';
 import { useSelectedSortOption } from './context/SearchContext';
 import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants';
+import { useBrowserTitle } from '../shared/BrowserTabTitleContext';
 
 const styles = {
     children: {
@@ -68,6 +70,28 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) =>
     const { user } = userContext;
     const viewUrn = userContext.localState?.selectedViewUrn;
 
+    const { title, updateTitle } = useBrowserTitle();
+
+    useEffect(() => {
+        // Update the title only if it's not already set and there is a valid pathname
+        if (!title && location.pathname) {
+            const formattedPath = location.pathname
+                .split('/')
+                .filter(word => word !== '')
+                .map(word => word.charAt(0).toUpperCase() + word.slice(1))
+                .join(' | ');
+
+            if (formattedPath) {
+                return updateTitle(formattedPath);
+            }
+        }
+
+        // Clean up the title when the component unmounts
+        return () => {
+            updateTitle('');
+        };
+    }, [location.pathname, title, updateTitle]);
+
     useEffect(() => {
         if (suggestionsData !== undefined) {
             setNewSuggestionData(suggestionsData);
@@ -140,6 +164,9 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) =>
                 authenticatedUserPictureLink={user?.editableProperties?.pictureLink}
                 entityRegistry={entityRegistry}
             />
+            <Helmet>
+                <title>{title}</title>
+            </Helmet>
             {children}
         );
diff --git a/datahub-web-react/src/app/shared/BrowserTabTitleContext.tsx b/datahub-web-react/src/app/shared/BrowserTabTitleContext.tsx
new file mode 100644
index 00000000000000..284e2771124c83
--- /dev/null
+++ b/datahub-web-react/src/app/shared/BrowserTabTitleContext.tsx
@@ -0,0 +1,30 @@
+import React, { createContext, ReactNode, useContext } from 'react';
+
+interface BrowserTitleContextProps {
+    title: string;
+    updateTitle: (newTitle: string) => void;
+}
+
+const BrowserTitleContext = createContext<BrowserTitleContextProps | undefined>(undefined);
+
+export const BrowserTitleProvider: React.FC<{ children: ReactNode }> = ({ children }) => {
+    const [title, setTitle] = React.useState('');
+
+    const updateTitle = (newTitle: string) => {
+        setTitle(newTitle);
+    };
+
+    return (
+        <BrowserTitleContext.Provider value={{ title, updateTitle }}>
+            {children}
+        </BrowserTitleContext.Provider>
+    );
+};
+
+export const useBrowserTitle = () => {
+    const context = useContext(BrowserTitleContext);
+    if (!context) {
+        throw new Error('useBrowserTitle must be used within a BrowserTitleProvider');
+    }
+    return context;
+};
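+
+// Example usage from any descendant of BrowserTitleProvider (hypothetical component):
+//   const { updateTitle } = useBrowserTitle();
+//   updateTitle('My Page | DataHub');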
diff --git a/datahub-web-react/src/appConfigContext.tsx b/datahub-web-react/src/appConfigContext.tsx
index 4087ad453687c8..8c1089b868e5ab 100644
--- a/datahub-web-react/src/appConfigContext.tsx
+++ b/datahub-web-react/src/appConfigContext.tsx
@@ -50,6 +50,7 @@ export const DEFAULT_APP_CONFIG = {
         showAcrylInfo: false,
         showAccessManagement: false,
         nestedDomainsEnabled: true,
+        platformBrowseV2: false,
     },
 };
diff --git a/datahub-web-react/src/graphql/app.graphql b/datahub-web-react/src/graphql/app.graphql
index 4e9bbb11d8c5aa..fe283403491479 100644
--- a/datahub-web-react/src/graphql/app.graphql
+++ b/datahub-web-react/src/graphql/app.graphql
@@ -65,6 +65,7 @@ query appConfig {
             showAcrylInfo
             showAccessManagement
             nestedDomainsEnabled
+            platformBrowseV2
         }
     }
 }
diff --git a/docker/build.gradle b/docker/build.gradle
index bc79be501b3952..190202620c382c 100644
--- a/docker/build.gradle
+++ b/docker/build.gradle
@@ -1,6 +1,9 @@
 plugins {
   id 'java' // required by versioning
+  id 'docker-compose'
 }
+import com.avast.gradle.dockercompose.tasks.ComposeUp
+import com.avast.gradle.dockercompose.tasks.ComposeDownForced
 
 apply from: "../gradle/versioning/versioning.gradle"
 
@@ -18,144 +21,107 @@ ext {
   debug_modules = quickstart_modules - [':metadata-jobs:mce-consumer-job',
                                         ':metadata-jobs:mae-consumer-job']
-  debug_compose_args = [
-    '-f', 'docker-compose-without-neo4j.yml',
-    '-f', 'docker-compose-without-neo4j.override.yml',
-    '-f', 'docker-compose-without-neo4j.m1.yml', // updates to mariadb
-    '-f', 'docker-compose.dev.yml'
-  ]
+  compose_args = ['-f', 'profiles/docker-compose.yml']
   debug_reloadable = [
-    'datahub-gms',
-    'datahub-frontend-react'
+    'datahub-gms-debug',
+    'system-update-debug',
+    'frontend-debug'
   ]
-  // Postgres
   pg_quickstart_modules = quickstart_modules - [':docker:mysql-setup'] + [':docker:postgres-setup']
-  pg_compose_args = [
-    '-f', 'docker-compose-without-neo4j.yml',
-    '-f', 'docker-compose-without-neo4j.postgres.override.yml'
-  ]
 }
 
-task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') {
-  dependsOn(quickstart_modules.collect { it + ':dockerTag' })
-  shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
-
-  environment "DATAHUB_TELEMETRY_ENABLED", "false"
-  environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
-  // environment "ACTIONS_VERSION", 'alpine3.18-slim'
-  // environment "DATAHUB_ACTIONS_IMAGE", 'nginx'
-
-  // Elastic
-  // environment "DATAHUB_SEARCH_IMAGE", 'elasticsearch'
-  // environment "DATAHUB_SEARCH_TAG", '7.10.1'
-
-  // OpenSearch
-  environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch'
-  environment "DATAHUB_SEARCH_TAG", '2.9.0'
-  environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true'
-  environment "USE_AWS_ELASTICSEARCH", 'true'
-
-  def cmd = [
-    'source ../metadata-ingestion/venv/bin/activate && ',
-    'datahub docker quickstart',
-    '--no-pull-images',
-    '--standalone_consumers',
-    '--version', "v${version}",
-    '--dump-logs-on-failure'
-  ]
+tasks.register('quickstart') {}
+tasks.register('quickstartSlim') {}
+tasks.register('quickstartDebug') {}
+tasks.register('quickstartPg') {}
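+// e.g. ./gradlew :docker:quickstart brings up the matching compose profile (wired to ComposeUp in the dockerCompose block below)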
"DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch' - environment "DATAHUB_SEARCH_TAG", '2.9.0' - environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true' - environment "USE_AWS_ELASTICSEARCH", 'true' - - def cmd = [ - 'source ../metadata-ingestion/venv/bin/activate && ', - 'datahub docker quickstart', - '--no-pull-images', - '--standalone_consumers', - '--version', "v${version}", - '--dump-logs-on-failure' - ] +tasks.register('quickstart') {} +tasks.register('quickstartSlim') {} +tasks.register('quickstartDebug') {} +tasks.register('quickstartPg') {} - commandLine 'bash', '-c', cmd.join(" ") +tasks.withType(ComposeDownForced) { + removeVolumes = true } - -task quickstartSlim(type: Exec, dependsOn: ':metadata-ingestion:install') { - dependsOn(([':docker:datahub-ingestion'] + quickstart_modules).collect { it + ':dockerTag' }) - shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' - - environment "DATAHUB_TELEMETRY_ENABLED", "false" - environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" - environment "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" - environment "ACTIONS_VERSION", "v${version}-slim" - environment "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' - environment "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' - - def cmd = [ - 'source ../metadata-ingestion/venv/bin/activate && ', - 'datahub docker quickstart', - '--no-pull-images', - '--standalone_consumers', - '--version', "v${version}", - '--dump-logs-on-failure' - ] - - commandLine 'bash', '-c', cmd.join(" ") +task quickstartNuke { + finalizedBy(tasks.withType(ComposeDownForced)) } -task quickstartNuke(type: Exec, dependsOn: ":metadata-ingestion:install") { - shouldRunAfter(':metadata-ingestion:clean') - - def cmd = [ - 'source ../metadata-ingestion/venv/bin/activate && ', - 'datahub docker nuke' - ] - commandLine 'bash', '-c', cmd.join(" ") +dockerCompose { + quickstart { + isRequiredBy(tasks.named('quickstart')) + composeAdditionalArgs = ['--profile', 'quickstart-consumers'] + + environment.put 'DATAHUB_VERSION', "v${version}" + + useComposeFiles = ['profiles/docker-compose.yml'] + projectName = 'datahub' + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + } + + quickstartPg { + isRequiredBy(tasks.named('quickstartPg')) + composeAdditionalArgs = ['--profile', 'quickstart-postgres'] + + environment.put 'DATAHUB_VERSION', "v${version}" + + useComposeFiles = ['profiles/docker-compose.yml'] + projectName = 'datahub' + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + } + + quickstartSlim { + isRequiredBy(tasks.named('quickstartSlim')) + composeAdditionalArgs = ['--profile', 'quickstart-consumers'] + + environment.put 'DATAHUB_VERSION', "v${version}" + environment.put "DATAHUB_ACTIONS_IMAGE", "acryldata/datahub-ingestion" + environment.put "ACTIONS_VERSION", "v${version}-slim" + environment.put "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions' + environment.put "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml' + + useComposeFiles = ['profiles/docker-compose.yml'] + projectName = 'datahub' + projectNamePrefix = '' + buildBeforeUp = false + buildBeforePull = false + stopContainers = false + removeVolumes = false + } + + quickstartDebug { + 
+    isRequiredBy(tasks.named('quickstartDebug'))
+    composeAdditionalArgs = ['--profile', 'debug']
+
+    useComposeFiles = ['profiles/docker-compose.yml']
+    projectName = 'datahub'
+    projectNamePrefix = ''
+    buildBeforeUp = false
+    buildBeforePull = false
+    stopContainers = false
+    removeVolumes = false
+  }
+}
 
-task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') {
-  dependsOn(debug_modules.collect { it + ':dockerTagDebug' })
-  shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
-
-  environment "DATAHUB_TELEMETRY_ENABLED", "false"
-  environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
-
-  // Elastic
-  // environment "DATAHUB_SEARCH_IMAGE", 'elasticsearch'
-  // environment "DATAHUB_SEARCH_TAG", '7.10.1'
-
-  // OpenSearch
-  environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch'
-  environment "DATAHUB_SEARCH_TAG", '2.9.0'
-  environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true'
-  environment "USE_AWS_ELASTICSEARCH", 'true'
-
-  def cmd = [
-    'source ../metadata-ingestion/venv/bin/activate && ',
-    'datahub docker quickstart',
-    '--no-pull-images',
-    '--version', "debug",
-    '--dump-logs-on-failure'
-  ] + debug_compose_args
-  commandLine 'bash', '-c', cmd.join(" ")
-}
+tasks.getByName('quickstartComposeUp').dependsOn(
+  quickstart_modules.collect { it + ':dockerTag' })
+tasks.getByName('quickstartPgComposeUp').dependsOn(
+  pg_quickstart_modules.collect { it + ':dockerTag' })
+tasks.getByName('quickstartSlimComposeUp').dependsOn(
+  ([':docker:datahub-ingestion'] + quickstart_modules)
+      .collect { it + ':dockerTag' })
+tasks.getByName('quickstartDebugComposeUp').dependsOn(
+  debug_modules.collect { it + ':dockerTagDebug' }
+)
+tasks.withType(ComposeUp).configureEach {
+  shouldRunAfter('quickstartNuke')
+}
 
 task debugReload(type: Exec) {
-  def cmd = ['docker compose -p datahub'] + debug_compose_args + ['restart'] + debug_reloadable
+  def cmd = ['docker compose -p datahub --profile debug'] + compose_args + ['restart'] + debug_reloadable
   commandLine 'bash', '-c', cmd.join(" ")
 }
-
-task quickstartPg(type: Exec, dependsOn: ':metadata-ingestion:install') {
-  dependsOn(pg_quickstart_modules.collect { it + ':dockerTag' })
-  shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke'
-
-  environment "DATAHUB_TELEMETRY_ENABLED", "false"
-  environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
-  environment "DATAHUB_POSTGRES_VERSION", "15.5"
-
-  // OpenSearch
-  environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch'
-  environment "DATAHUB_SEARCH_TAG", '2.9.0'
-  environment "XPACK_SECURITY_ENABLED", 'plugins.security.disabled=true'
-  environment "USE_AWS_ELASTICSEARCH", 'true'
-
-  def cmd = [
-    'source ../metadata-ingestion/venv/bin/activate && ',
-    'datahub docker quickstart',
-    '--no-pull-images',
-    '--standalone_consumers',
-    '--version', "v${version}",
-    '--dump-logs-on-failure'
-  ] + pg_compose_args
-
-  commandLine 'bash', '-c', cmd.join(" ")
-}
\ No newline at end of file
diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile
index e0f9fdc997071c..81fec61ea50733 100644
--- a/docker/datahub-ingestion-base/Dockerfile
+++ b/docker/datahub-ingestion-base/Dockerfile
@@ -4,7 +4,7 @@ ARG BASE_IMAGE=base
 
 # Defining custom repo urls for use in enterprise environments. Re-used between stages below.
 ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine
 ARG GITHUB_REPO_URL=https://github.com
-ARG DEBIAN_REPO_URL=http://deb.debian.org/debian
+ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
 ARG PIP_MIRROR_URL=null
 
 FROM golang:1-alpine3.18 AS dockerize-binary
diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile
index 9516c31a19e21b..2898a363a0a185 100644
--- a/docker/datahub-ingestion/Dockerfile
+++ b/docker/datahub-ingestion/Dockerfile
@@ -3,7 +3,7 @@ ARG APP_ENV=full
 ARG BASE_IMAGE=acryldata/datahub-ingestion-base
 ARG DOCKER_VERSION=head
 ARG PIP_MIRROR_URL=null
-ARG DEBIAN_REPO_URL=http://deb.debian.org/debian
+ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
 
 FROM $BASE_IMAGE:$DOCKER_VERSION as base
 USER 0
diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle
index 36444210f1938b..0b08f189e6b45a 100644
--- a/docker/datahub-ingestion/build.gradle
+++ b/docker/datahub-ingestion/build.gradle
@@ -33,7 +33,7 @@ docker {
             i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden())
   }
 
-  def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')]
+  def dockerBuildArgs = [DOCKER_VERSION: version, RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", ''), BASE_IMAGE: "${docker_registry}/datahub-ingestion-base"]
 
   // Add build args if they are defined (needed for some CI or enterprise environments)
   if (project.hasProperty('pipMirrorUrl')) {
diff --git a/docker/datahub-upgrade/README.md b/docker/datahub-upgrade/README.md
index 0d019971604d6b..9c96114cdb2dd9 100644
--- a/docker/datahub-upgrade/README.md
+++ b/docker/datahub-upgrade/README.md
@@ -15,8 +15,16 @@ to metadata_aspect_v2 table. Arguments:
 
 2. **NoCodeDataMigrationCleanup**: Cleanses graph index, search index, and key-value store of legacy DataHub data
    (metadata_aspect table) once the No Code Data Migration has completed successfully. No arguments.
 
-3. **RestoreIndices**: Restores indices by fetching the latest version of each aspect and producing MAE
-
+3. **RestoreIndices**: Restores indices by fetching the latest version of each aspect and producing MAE. Arguments:
+   - *batchSize* (Optional): The number of rows to migrate at a time. Defaults to 1000.
+   - *batchDelayMs* (Optional): The number of milliseconds of delay between migrated batches. Used for rate limiting. Defaults to 250.
+   - *numThreads* (Optional): The number of threads to use; defaults to 1. Note that this is not used if `urnBasedPagination` is true.
+   - *aspectName* (Optional): The aspect name for producing events.
+   - *urn* (Optional): The urn for producing events.
+   - *urnLike* (Optional): The urn pattern for producing events, using `%` as a wildcard.
+   - *urnBasedPagination* (Optional): Paginate the SQL results using the urn + aspect string instead of `OFFSET`. Defaults to false,
+     though it should improve performance for large amounts of data.
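+
+   For example, a hypothetical invocation enabling URN-based pagination (assuming the `datahub-upgrade.sh` wrapper in this directory and its `-u`/`-a` argument conventions; adjust for your deployment):
+
+   ```bash
+   ./datahub-upgrade.sh -u RestoreIndices -a batchSize=1000 -a urnBasedPagination=true
+   ```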
+
 4. **RestoreBackup**: Restores the storage stack from a backup of the local database
 
 ## Environment Variables
diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml
index 6191994eaa1ea5..0d58a1d91b70b1 100644
--- a/docker/docker-compose-without-neo4j.yml
+++ b/docker/docker-compose-without-neo4j.yml
@@ -43,6 +43,8 @@ services:
       context: ../
       dockerfile: docker/datahub-gms/Dockerfile
     env_file: datahub-gms/env/docker-without-neo4j.env
+    environment:
+      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
     healthcheck:
       test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
       start_period: 90s
diff --git a/docker/docker-compose.consumers-without-neo4j.yml b/docker/docker-compose.consumers-without-neo4j.yml
index 8228951d9385f8..f1be585232a1a8 100644
--- a/docker/docker-compose.consumers-without-neo4j.yml
+++ b/docker/docker-compose.consumers-without-neo4j.yml
@@ -15,6 +15,8 @@ services:
       context: ../
       dockerfile: docker/datahub-mae-consumer/Dockerfile
     env_file: datahub-mae-consumer/env/docker-without-neo4j.env
+    environment:
+      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
   datahub-mce-consumer:
     container_name: datahub-mce-consumer
     hostname: datahub-mce-consumer
@@ -28,3 +30,4 @@ services:
     environment:
       - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart}
       - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true}
+      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
diff --git a/docker/docker-compose.consumers.yml b/docker/docker-compose.consumers.yml
index 2d37094035859b..8d331cea2f0b95 100644
--- a/docker/docker-compose.consumers.yml
+++ b/docker/docker-compose.consumers.yml
@@ -15,6 +15,8 @@ services:
       context: ../
       dockerfile: docker/datahub-mae-consumer/Dockerfile
     env_file: datahub-mae-consumer/env/docker.env
+    environment:
+      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
     depends_on:
       neo4j:
         condition: service_healthy
@@ -36,6 +38,7 @@ services:
       - NEO4J_USERNAME=neo4j
       - NEO4J_PASSWORD=datahub
       - GRAPH_SERVICE_IMPL=neo4j
+      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
     depends_on:
       neo4j:
         condition: service_healthy
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index 774c4e17bee21f..7067b68fba3f9c 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -24,7 +24,7 @@ services:
       - JAVA_TOOL_OPTIONS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5002
       - DATAHUB_ANALYTICS_ENABLED=${DATAHUB_ANALYTICS_ENABLED:-true}
     volumes:
-      - ../datahub-frontend/build/stage/playBinary:/datahub-frontend
+      - ../datahub-frontend/build/stage/main:/datahub-frontend
   datahub-gms:
     image: linkedin/datahub-gms:debug
     ports:
@@ -45,6 +45,7 @@ services:
       - SEARCH_SERVICE_ENABLE_CACHE=false
       - LINEAGE_SEARCH_CACHE_ENABLED=false
       - SHOW_BROWSE_V2=true
+      - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true}
     volumes:
       - ./datahub-gms/start.sh:/datahub/datahub-gms/scripts/start.sh
       - ./datahub-gms/jetty.xml:/datahub/datahub-gms/scripts/jetty.xml
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 95f56fe47e3cca..146055830d04e5 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -36,6 +36,8 @@ services:
   datahub-gms:
     container_name: datahub-gms
    hostname: datahub-gms
${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:${DATAHUB_VERSION:-head} + environment: + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} ports: - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 build: diff --git a/docker/profiles/README.md b/docker/profiles/README.md new file mode 100644 index 00000000000000..df09f15cd85cee --- /dev/null +++ b/docker/profiles/README.md @@ -0,0 +1,104 @@ +# Docker Compose Profiles + +This directory contains a set of docker compose definitions which are designed to run several configurations +for quickstart use-cases as well as development use-cases. These configurations cover a few of the wide variety of +infrastructure configurations that DataHub can operate on. + +Requirements: +* Using the profiles requires a modern version of docker. +* If using the debug/development profiles, you will need to have built the `debug` docker images locally. See the Development Profiles section for more details. + +```bash +$ cd docker/profiles +$ docker compose --profile <profile name> up +``` + +Use Control-c (`^c`) to terminate the running system. This will automatically stop all running containers. + +To remove the containers use the following: + +```bash +docker compose --profile <profile name> rm +``` + +Please refer to docker's documentation for more details. + +The following sections detail a few of the profiles and their intended use-cases. For a complete list of profiles +and their configuration please see the table at the end of each section. + +## Quickstart Profiles + +Quickstart profiles are primarily a way to test drive DataHub features before committing to a production ready deployment. +A couple of these profiles are also used in our continuous integration (CI) tests. + +Note: Quickstart profiles use docker images with the `head` tag. These images are updated when changes are committed +to the DataHub GitHub repository. This can be overridden to use a stable release tag by prefixing the commands with +`DATAHUB_VERSION=v0.12.1`, for example; a concrete invocation is sketched just before the profile table below. + +### `quickstart` + +This is the default configuration: MySQL and OpenSearch for storage, and GMS running with integrated consumers. + +### `quickstart-consumers` + +This configuration is identical to `quickstart` except that it runs standalone consumers instead of consumers integrated with the GMS container. + +### `quickstart-postgres` + +Identical to `quickstart` with Postgres instead of MySQL. + +### `quickstart-cassandra` + +Uses Cassandra as the primary data store along with Neo4j as the graph database. + +### `quickstart-storage` + +Runs just the `quickstart` data stores without the DataHub components. This mode is useful for debugging when running the frontend and GMS components outside +of docker.
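As a concrete sketch of the commands above, the following launches the default `quickstart` profile pinned to the example release tag mentioned earlier:

```bash
cd docker/profiles
# Pin all DataHub images to a released tag instead of the default `head`.
DATAHUB_VERSION=v0.12.1 docker compose --profile quickstart up
```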
+ +### Quickstart Profiles Table | Profile Name | MySQL | Postgres | Cassandra | Neo4j | Frontend | GMS | Actions | SystemUpdate | MAE | MCE | Kafka | OpenSearch | |----------------------|-------|----------|-----------|-------|----------|-----|---------|--------------|-----|-----|-------|------------| | quickstart | X | | | | X | X | X | X | | | X | X | | quickstart-frontend | X | | | | X | | | X | | | X | X | | quickstart-backend | X | | | | | X | X | X | | | X | X | | quickstart-postgres | | X | | | X | X | X | X | | | X | X | | quickstart-cassandra | | | X | X | X | X | X | X | | | X | X | | quickstart-consumers | X | | | | X | X | X | X | X | X | X | X | | quickstart-storage | X | | | | | | | | | | X | X | ## Development Profiles * Runs `debug` tagged images * JVM Debug Mode Enabled * Exposes local jars and scripts to the containers * Can run non-default one-off configurations (neo4j, cassandra, elasticsearch) The docker images used are the `debug` tagged images, which are built locally by running the following gradle command. ```bash ./gradlew dockerTagDebug ``` For a complete list of profiles see the table at the end of this section. A sketch of the full development loop appears after the profile symlink listings below. ### `quickstart-backend` Runs everything except for the `frontend` component. Useful for running just a local (non-docker) frontend. ### `quickstart-frontend` Runs everything except for the GMS. Useful for running just a local (non-docker) GMS instance. ### Development Profiles Table | Profile Name | MySQL | Postgres | Cassandra | Neo4j | Frontend | GMS | Actions | SystemUpdate | MAE | MCE | Kafka | OpenSearch | Elasticsearch | |---------------------|-------|----------|-----------|-------|----------|-----|---------|--------------|-----|-----|-------|------------|---------------| | debug | X | | | | X | X | X | X | | | X | X | | | debug-frontend | X | | | | X | | | X | | | X | X | | | debug-backend | X | | | | | X | X | X | | | X | X | | | debug-postgres | | X | | | X | X | X | X | | | X | X | | | debug-cassandra | | | X | | X | X | X | X | | | X | X | | | debug-consumers | X | | | | X | X | X | X | X | X | X | X | | | debug-neo4j | X | | | X | X | X | X | X | | | X | X | | | debug-elasticsearch | X | | | | X | X | X | X | | | X | | X | \ No newline at end of file diff --git a/docker/profiles/cassandra b/docker/profiles/cassandra new file mode 120000 index 00000000000000..d9af9adbce5cad --- /dev/null +++ b/docker/profiles/cassandra @@ -0,0 +1 @@ +../cassandra \ No newline at end of file diff --git a/docker/profiles/datahub-actions b/docker/profiles/datahub-actions new file mode 120000 index 00000000000000..fea4275be45ffc --- /dev/null +++ b/docker/profiles/datahub-actions @@ -0,0 +1 @@ +../datahub-actions/ \ No newline at end of file diff --git a/docker/profiles/datahub-frontend b/docker/profiles/datahub-frontend new file mode 120000 index 00000000000000..74a18b81b7e3b8 --- /dev/null +++ b/docker/profiles/datahub-frontend @@ -0,0 +1 @@ +../datahub-frontend \ No newline at end of file diff --git a/docker/profiles/datahub-gms b/docker/profiles/datahub-gms new file mode 120000 index 00000000000000..de2f067e4c0e0d --- /dev/null +++ b/docker/profiles/datahub-gms @@ -0,0 +1 @@ +../datahub-gms \ No newline at end of file diff --git a/docker/profiles/datahub-mae-consumer b/docker/profiles/datahub-mae-consumer new file mode 120000 index 00000000000000..90974047792c50 --- /dev/null +++ b/docker/profiles/datahub-mae-consumer @@ -0,0 +1 @@ +../datahub-mae-consumer \ No newline at end of file
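As referenced in the Development Profiles section above, a minimal sketch of the development loop (build the `debug` images locally, then start a debug profile from this directory):

```bash
# Rebuild the locally tagged `debug` images.
./gradlew dockerTagDebug
# Start the all-in-one debug profile; substitute debug-postgres,
# debug-neo4j, etc. for the one-off configurations in the table above.
cd docker/profiles
docker compose --profile debug up
```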
diff --git a/docker/profiles/datahub-mce-consumer b/docker/profiles/datahub-mce-consumer new file mode 120000 index 00000000000000..288c9d91c28b3e --- /dev/null +++ b/docker/profiles/datahub-mce-consumer @@ -0,0 +1 @@ +../datahub-mce-consumer \ No newline at end of file diff --git a/docker/profiles/datahub-upgrade b/docker/profiles/datahub-upgrade new file mode 120000 index 00000000000000..8ff77fd5562e7f --- /dev/null +++ b/docker/profiles/datahub-upgrade @@ -0,0 +1 @@ +../datahub-upgrade \ No newline at end of file diff --git a/docker/profiles/docker-compose.actions.yml b/docker/profiles/docker-compose.actions.yml new file mode 100644 index 00000000000000..a509a6a67d2705 --- /dev/null +++ b/docker/profiles/docker-compose.actions.yml @@ -0,0 +1,45 @@ + +x-datahub-actions-service: &datahub-actions-service + hostname: actions + image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head} + env_file: datahub-actions/env/docker.env + environment: + ACTIONS_EXTRA_PACKAGES: ${ACTIONS_EXTRA_PACKAGES:-} + ACTIONS_CONFIG: ${ACTIONS_CONFIG:-} + KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092 + SCHEMA_REGISTRY_URL: http://datahub-gms:8080/schema-registry/api/ + +services: + datahub-actions-quickstart: + <<: *datahub-actions-service + container_name: actions + profiles: + - quickstart + - quickstart-backend + depends_on: + datahub-gms-quickstart: + condition: service_healthy + datahub-actions-quickstart-cassandra: + <<: *datahub-actions-service + container_name: actions + profiles: + - quickstart-cassandra + depends_on: + datahub-gms-quickstart-cassandra: + condition: service_healthy + datahub-actions-quickstart-postgres: + <<: *datahub-actions-service + container_name: actions + profiles: + - quickstart-postgres + depends_on: + datahub-gms-quickstart-postgres: + condition: service_healthy + datahub-actions-quickstart-consumers: + <<: *datahub-actions-service + container_name: actions + profiles: + - quickstart-consumers + depends_on: + datahub-gms-quickstart-consumers: + condition: service_healthy diff --git a/docker/profiles/docker-compose.frontend.yml b/docker/profiles/docker-compose.frontend.yml new file mode 100644 index 00000000000000..80cb4e7b4b596d --- /dev/null +++ b/docker/profiles/docker-compose.frontend.yml @@ -0,0 +1,119 @@ + +x-datahub-frontend-service: &datahub-frontend-service + hostname: datahub-frontend-react + image: ${DATAHUB_FRONTEND_IMAGE:-linkedin/datahub-frontend-react}:${DATAHUB_VERSION:-head} + ports: + - ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002 + env_file: datahub-frontend/env/docker.env + environment: &datahub-frontend-service-env + KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092 + volumes: + - ${HOME}/.datahub/plugins:/etc/datahub/plugins + +x-datahub-frontend-service-dev: &datahub-frontend-service-dev + <<: *datahub-frontend-service + image: linkedin/datahub-frontend-react:debug + ports: + - ${DATAHUB_MAPPED_FRONTEND_DEBUG_PORT:-5002}:5002 + - ${DATAHUB_MAPPED_FRONTEND_PORT:-9002}:9002 + environment: + <<: *datahub-frontend-service-env + JAVA_TOOL_OPTIONS: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5002 + DATAHUB_ANALYTICS_ENABLED: ${DATAHUB_ANALYTICS_ENABLED:-true} + volumes: + - ../../datahub-frontend/build/stage/main:/datahub-frontend + +services: + frontend-quickstart: + <<: *datahub-frontend-service + container_name: frontend + profiles: + - quickstart + - quickstart-frontend + depends_on: + system-update-quickstart: + condition: service_completed_successfully + frontend-quickstart-cassandra: + <<: *datahub-frontend-service + 
container_name: frontend + profiles: + - quickstart-cassandra + depends_on: + system-update-quickstart-cassandra: + condition: service_completed_successfully + frontend-quickstart-postgres: + <<: *datahub-frontend-service + container_name: frontend + profiles: + - quickstart-postgres + depends_on: + system-update-quickstart-postgres: + condition: service_completed_successfully + frontend-quickstart-consumers: + <<: *datahub-frontend-service + container_name: frontend + profiles: + - quickstart-consumers + depends_on: + system-update-quickstart: + condition: service_completed_successfully + frontend-debug: + <<: *datahub-frontend-service-dev + container_name: datahub-frontend-dev + profiles: + - debug + depends_on: + system-update-debug: + condition: service_completed_successfully + frontend-debug-frontend: + <<: *datahub-frontend-service-dev + container_name: datahub-frontend-dev + profiles: + - debug-frontend + depends_on: + mysql-setup-dev: + condition: service_completed_successfully + opensearch-setup-dev: + condition: service_completed_successfully + kafka-setup-dev: + condition: service_completed_successfully + frontend-debug-postgres: + <<: *datahub-frontend-service-dev + container_name: datahub-frontend-dev + profiles: + - debug-postgres + depends_on: + system-update-debug-postgres: + condition: service_completed_successfully + frontend-debug-cassandra: + <<: *datahub-frontend-service-dev + container_name: datahub-frontend-dev + profiles: + - debug-cassandra + depends_on: + system-update-debug-cassandra: + condition: service_completed_successfully + frontend-debug-consumers: + <<: *datahub-frontend-service-dev + container_name: datahub-frontend-dev + profiles: + - debug-consumers + depends_on: + system-update-debug: + condition: service_completed_successfully + frontend-debug-neo4j: + <<: *datahub-frontend-service-dev + container_name: datahub-frontend-dev + profiles: + - debug-neo4j + depends_on: + system-update-debug-neo4j: + condition: service_completed_successfully + frontend-debug-elasticsearch: + <<: *datahub-frontend-service-dev + container_name: datahub-frontend-dev + profiles: + - debug-elasticsearch + depends_on: + system-update-debug-elasticsearch: + condition: service_completed_successfully \ No newline at end of file diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml new file mode 100644 index 00000000000000..01602c8b906b91 --- /dev/null +++ b/docker/profiles/docker-compose.gms.yml @@ -0,0 +1,429 @@ +################################# +# Common Environment Variables +################################# +x-primary-datastore-mysql-env: &primary-datastore-mysql-env + EBEAN_DATASOURCE_HOST: mysql:3306 + EBEAN_DATASOURCE_URL: 'jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2' + EBEAN_DATASOURCE_DRIVER: com.mysql.jdbc.Driver + +x-primary-datastore-postgres-env: &primary-datastore-postgres-env + EBEAN_DATASOURCE_HOST: postgres:5432 + EBEAN_DATASOURCE_URL: 'jdbc:postgresql://postgres:5432/datahub' + EBEAN_DATASOURCE_DRIVER: org.postgresql.Driver + EBEAN_POSTGRES_USE_AWS_IAM_AUTH: ${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:-false} + +x-primary-datastore-cassandra-env: &primary-datastore-cassandra-env + CASSANDRA_DATASOURCE_USERNAME: cassandra + CASSANDRA_DATASOURCE_PASSWORD: cassandra + CASSANDRA_HOSTS: cassandra + CASSANDRA_PORT: 9042 + CASSANDRA_DATASOURCE_HOST: 'cassandra:9042' + ENTITY_SERVICE_IMPL: cassandra + +x-graph-datastore-neo4j-env: 
&graph-datastore-neo4j-env + GRAPH_SERVICE_IMPL: neo4j + NEO4J_HOST: 'http://neo4j:7474' + NEO4J_URI: 'bolt://neo4j' + NEO4J_USERNAME: neo4j + NEO4J_PASSWORD: datahub +x-graph-datastore-search-env: &graph-datastore-search-env + GRAPH_SERVICE_IMPL: elasticsearch + +x-search-datastore-elasticsearch-env: &search-datastore-env + ELASTICSEARCH_HOST: search + ELASTICSEARCH_PORT: 9200 + ELASTICSEARCH_PROTOCOL: http + ELASTICSEARCH_USE_SSL: ${ELASTICSEARCH_USE_SSL:-false} + +x-kafka-env: &kafka-env + KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092 + # KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + SCHEMA_REGISTRY_TYPE: INTERNAL + KAFKA_SCHEMAREGISTRY_URL: http://datahub-gms:8080/schema-registry/api/ + +x-datahub-quickstart-telemetry-env: &datahub-quickstart-telemetry-env + DATAHUB_SERVER_TYPE: ${DATAHUB_SERVER_TYPE:-quickstart} + DATAHUB_TELEMETRY_ENABLED: ${DATAHUB_TELEMETRY_ENABLED:-true} + +x-datahub-dev-telemetry-env: &datahub-dev-telemetry-env + DATAHUB_SERVER_TYPE: ${DATAHUB_SERVER_TYPE:-dev} + DATAHUB_TELEMETRY_ENABLED: ${DATAHUB_TELEMETRY_ENABLED:-true} + +################################# +# System Update +################################# +x-datahub-system-update-service: &datahub-system-update-service + hostname: datahub-system-update + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} + command: + - -u + - SystemUpdate + env_file: datahub-upgrade/env/docker.env + environment: &datahub-system-update-env + <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *kafka-env] + SCHEMA_REGISTRY_SYSTEM_UPDATE: ${SCHEMA_REGISTRY_SYSTEM_UPDATE:-true} + SPRING_KAFKA_PROPERTIES_AUTO_REGISTER_SCHEMAS: ${SPRING_KAFKA_PROPERTIES_AUTO_REGISTER_SCHEMAS:-true} + SPRING_KAFKA_PROPERTIES_USE_LATEST_VERSION: ${SPRING_KAFKA_PROPERTIES_USE_LATEST_VERSION:-true} + +x-datahub-system-update-service-dev: &datahub-system-update-service-dev + <<: *datahub-system-update-service + image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:debug + ports: + - ${DATAHUB_MAPPED_UPGRADE_DEBUG_PORT:-5003}:5003 + environment: &datahub-system-update-dev-env + <<: [*datahub-dev-telemetry-env, *datahub-system-update-env] + SKIP_ELASTICSEARCH_CHECK: false + REPROCESS_DEFAULT_BROWSE_PATHS_V2: ${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false} + JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5003' + volumes: + - ../../datahub-upgrade/build/libs/:/datahub/datahub-upgrade/bin/ + - ../../metadata-models/src/main/resources/:/datahub/datahub-gms/resources + - ${HOME}/.datahub/plugins:/etc/datahub/plugins + +################################# +# GMS +################################# +x-datahub-gms-service: &datahub-gms-service + hostname: datahub-gms + image: ${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:${DATAHUB_VERSION:-head} + ports: + - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 + env_file: datahub-gms/env/docker.env + environment: &datahub-gms-env + <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env] + healthcheck: + test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health + start_period: 90s + interval: 1s + retries: 3 + timeout: 5s + volumes: + - ${HOME}/.datahub/plugins:/etc/datahub/plugins + +x-datahub-gms-service-dev: &datahub-gms-service-dev + <<: *datahub-gms-service + image: ${DATAHUB_GMS_IMAGE:-linkedin/datahub-gms}:debug + ports: + - ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001 + - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080 + environment: 
&datahub-gms-dev-env + <<: [*datahub-dev-telemetry-env, *datahub-gms-env] + SKIP_ELASTICSEARCH_CHECK: false + METADATA_SERVICE_AUTH_ENABLED: false + JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001' + BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false + SEARCH_SERVICE_ENABLE_CACHE: false + LINEAGE_SEARCH_CACHE_ENABLED: false + SHOW_BROWSE_V2: true + volumes: + - ./datahub-gms/start.sh:/datahub/datahub-gms/scripts/start.sh + - ./datahub-gms/jetty.xml:/datahub/datahub-gms/scripts/jetty.xml + - ./monitoring/client-prometheus-config.yaml:/datahub/datahub-gms/scripts/prometheus-config.yaml + - ../../metadata-models/src/main/resources/:/datahub/datahub-gms/resources + - ../../metadata-service/war/build/libs/:/datahub/datahub-gms/bin + - ${HOME}/.datahub/plugins:/etc/datahub/plugins + +################################# +# MAE Consumer +################################# +x-datahub-mae-consumer-service: &datahub-mae-consumer-service + hostname: datahub-mae-consumer + image: ${DATAHUB_MAE_CONSUMER_IMAGE:-linkedin/datahub-mae-consumer}:${DATAHUB_VERSION:-head} + ports: + - 9091:9091 + env_file: datahub-mae-consumer/env/docker.env + environment: &datahub-mae-consumer-env + <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *kafka-env] + +x-datahub-mae-consumer-service-dev: &datahub-mae-consumer-service-dev + <<: *datahub-mae-consumer-service + image: ${DATAHUB_MAE_CONSUMER_IMAGE:-linkedin/datahub-mae-consumer}:debug + environment: + <<: [*datahub-dev-telemetry-env, *datahub-mae-consumer-env] + volumes: + - ./datahub-mae-consumer/start.sh:/datahub/datahub-mae-consumer/scripts/start.sh + - ../../metadata-models/src/main/resources/:/datahub/datahub-mae-consumer/resources + - ../../metadata-jobs/mae-consumer-job/build/libs/:/datahub/datahub-mae-consumer/bin/ + - ./monitoring/client-prometheus-config.yaml:/datahub/datahub-mae-consumer/scripts/prometheus-config.yaml + +################################# +# MCE Consumer +################################# +x-datahub-mce-consumer-service: &datahub-mce-consumer-service + hostname: datahub-mce-consumer + image: ${DATAHUB_MCE_CONSUMER_IMAGE:-linkedin/datahub-mce-consumer}:${DATAHUB_VERSION:-head} + ports: + - 9090:9090 + env_file: datahub-mce-consumer/env/docker.env + environment: &datahub-mce-consumer-env + <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env] + +x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev + <<: *datahub-mce-consumer-service + image: ${DATAHUB_MCE_CONSUMER_IMAGE:-linkedin/datahub-mce-consumer}:debug + environment: + <<: [*datahub-dev-telemetry-env, *datahub-mce-consumer-env] + volumes: + - ./datahub-mce-consumer/start.sh:/datahub/datahub-mce-consumer/scripts/start.sh + - ../../metadata-jobs/mce-consumer-job/build/libs/:/datahub/datahub-mce-consumer/bin + - ./monitoring/client-prometheus-config.yaml:/datahub/datahub-mce-consumer/scripts/prometheus-config.yaml + +services: + ################################# + # System Update + ################################# + system-update-quickstart: + <<: *datahub-system-update-service + container_name: system-update + profiles: + - quickstart + - quickstart-storage + - quickstart-consumers + - quickstart-frontend + - quickstart-backend + depends_on: + mysql-setup: + condition: service_completed_successfully + opensearch-setup: + condition: service_completed_successfully + kafka-setup: + condition: 
service_completed_successfully + system-update-quickstart-cassandra: + <<: *datahub-system-update-service + container_name: system-update + profiles: + - quickstart-cassandra + environment: + <<: [*primary-datastore-cassandra-env, *graph-datastore-neo4j-env, *datahub-system-update-env] + depends_on: + neo4j: + condition: service_healthy + cassandra-setup: + condition: service_completed_successfully + opensearch-setup: + condition: service_completed_successfully + kafka-setup: + condition: service_completed_successfully + system-update-quickstart-postgres: + <<: *datahub-system-update-service + container_name: system-update + profiles: + - quickstart-postgres + environment: + <<: [*primary-datastore-postgres-env, *datahub-system-update-env] + depends_on: + postgres-setup: + condition: service_completed_successfully + opensearch-setup: + condition: service_completed_successfully + kafka-setup: + condition: service_completed_successfully + system-update-debug: + <<: *datahub-system-update-service-dev + container_name: system-update-dev + profiles: + - debug + - debug-backend + - debug-consumers + depends_on: + mysql-setup-dev: + condition: service_completed_successfully + opensearch-setup-dev: + condition: service_completed_successfully + kafka-setup-dev: + condition: service_completed_successfully + system-update-debug-elasticsearch: + <<: *datahub-system-update-service-dev + container_name: system-update-dev + profiles: + - debug-elasticsearch + depends_on: + mysql-setup-dev: + condition: service_completed_successfully + elasticsearch-setup-dev: + condition: service_completed_successfully + kafka-setup-dev: + condition: service_completed_successfully + system-update-debug-postgres: + <<: *datahub-system-update-service-dev + container_name: system-update-dev + profiles: + - debug-postgres + environment: + <<: [*primary-datastore-postgres-env, *datahub-system-update-dev-env] + depends_on: + postgres-setup-dev: + condition: service_completed_successfully + opensearch-setup-dev: + condition: service_completed_successfully + kafka-setup-dev: + condition: service_completed_successfully + system-update-debug-cassandra: + <<: *datahub-system-update-service-dev + container_name: system-update-dev + profiles: + - debug-cassandra + environment: + <<: [*primary-datastore-cassandra-env, *datahub-system-update-dev-env] + depends_on: + cassandra-setup: + condition: service_completed_successfully + opensearch-setup-dev: + condition: service_completed_successfully + kafka-setup-dev: + condition: service_completed_successfully + system-update-debug-neo4j: + <<: *datahub-system-update-service-dev + container_name: system-update-dev + profiles: + - debug-neo4j + environment: + <<: [*graph-datastore-neo4j-env, *datahub-system-update-dev-env] + depends_on: + neo4j: + condition: service_healthy + opensearch-setup-dev: + condition: service_completed_successfully + kafka-setup-dev: + condition: service_completed_successfully + ################################# + # GMS + ################################# + datahub-gms-quickstart: + <<: *datahub-gms-service + profiles: + - quickstart + - quickstart-backend + container_name: datahub-gms + depends_on: + system-update-quickstart: + condition: service_completed_successfully + datahub-gms-quickstart-cassandra: + <<: *datahub-gms-service + profiles: + - quickstart-cassandra + container_name: datahub-gms + environment: + <<: [*primary-datastore-cassandra-env, *graph-datastore-neo4j-env, *datahub-gms-env] + depends_on: + system-update-quickstart-cassandra: + condition: 
service_completed_successfully + datahub-gms-quickstart-postgres: + <<: *datahub-gms-service + profiles: + - quickstart-postgres + container_name: datahub-gms + environment: + <<: [*primary-datastore-postgres-env, *datahub-gms-env] + depends_on: + system-update-quickstart-postgres: + condition: service_completed_successfully + datahub-gms-quickstart-consumers: + <<: *datahub-gms-service + profiles: + - quickstart-consumers + container_name: datahub-gms + environment: + <<: *datahub-gms-env + MAE_CONSUMER_ENABLED: false + MCE_CONSUMER_ENABLED: false + depends_on: + system-update-quickstart: + condition: service_completed_successfully + datahub-gms-debug: + <<: *datahub-gms-service-dev + profiles: + - debug + - debug-backend + container_name: datahub-gms-dev + depends_on: + system-update-debug: + condition: service_completed_successfully + datahub-gms-debug-postgres: + <<: *datahub-gms-service-dev + profiles: + - debug-postgres + environment: + <<: [*primary-datastore-postgres-env, *datahub-gms-dev-env] + container_name: datahub-gms-dev + depends_on: + system-update-debug-postgres: + condition: service_completed_successfully + datahub-gms-debug-cassandra: + <<: *datahub-gms-service-dev + profiles: + - debug-cassandra + environment: + <<: [*primary-datastore-cassandra-env, *datahub-gms-dev-env] + container_name: datahub-gms-dev + depends_on: + system-update-debug-cassandra: + condition: service_completed_successfully + datahub-gms-debug-consumers: + <<: *datahub-gms-service-dev + profiles: + - debug-consumers + environment: + <<: *datahub-gms-dev-env + MAE_CONSUMER_ENABLED: false + MCE_CONSUMER_ENABLED: false + container_name: datahub-gms-dev + depends_on: + system-update-debug: + condition: service_completed_successfully + datahub-gms-debug-neo4j: + <<: *datahub-gms-service-dev + profiles: + - debug-neo4j + environment: + <<: [*graph-datastore-neo4j-env, *datahub-gms-dev-env] + container_name: datahub-gms-dev + depends_on: + system-update-debug-neo4j: + condition: service_completed_successfully + datahub-gms-debug-elasticsearch: + <<: *datahub-gms-service-dev + profiles: + - debug-elasticsearch + container_name: datahub-gms-dev + depends_on: + system-update-debug-elasticsearch: + condition: service_completed_successfully + ################################# + # MAE Consumer + ################################# + datahub-mae-consumer-quickstart-consumers: + <<: *datahub-mae-consumer-service + profiles: + - quickstart-consumers + container_name: datahub-mae-consumer + depends_on: + datahub-gms-quickstart-consumers: + condition: service_healthy + datahub-mae-consumer-quickstart-consumers-dev: + <<: *datahub-mae-consumer-service-dev + profiles: + - debug-consumers + container_name: datahub-mae-consumer-dev + depends_on: + datahub-gms-debug-consumers: + condition: service_healthy + ################################# + # MCE Consumer + ################################# + datahub-mce-consumer-quickstart-consumers: + <<: *datahub-mce-consumer-service + profiles: + - quickstart-consumers + container_name: datahub-mce-consumer + depends_on: + datahub-gms-quickstart-consumers: + condition: service_healthy + datahub-mce-consumer-quickstart-consumers-dev: + <<: *datahub-mce-consumer-service-dev + profiles: + - debug-consumers + container_name: datahub-mce-consumer-dev + depends_on: + datahub-gms-debug-consumers: + condition: service_healthy \ No newline at end of file diff --git a/docker/profiles/docker-compose.prerequisites.yml b/docker/profiles/docker-compose.prerequisites.yml new file mode 100644 index 
00000000000000..d90d4a252f9935 --- /dev/null +++ b/docker/profiles/docker-compose.prerequisites.yml @@ -0,0 +1,387 @@ +# Common environment +x-search-datastore-search: &search-datastore-environment + ELASTICSEARCH_HOST: search + ELASTICSEARCH_PORT: 9200 + ELASTICSEARCH_PROTOCOL: http + ELASTICSEARCH_USE_SSL: ${ELASTICSEARCH_USE_SSL:-false} + +# Primary Storage Profiles +x-mysql-profiles-quickstart: &mysql-profiles-quickstart + - quickstart + - quickstart-backend + - quickstart-frontend + - quickstart-storage + - quickstart-consumers +x-mysql-profiles-dev: &mysql-profiles-dev + - debug + - debug-frontend + - debug-backend + - debug-consumers + - debug-neo4j + - debug-elasticsearch +x-mysql-profiles: &mysql-profiles + - quickstart + - quickstart-backend + - quickstart-frontend + - quickstart-storage + - quickstart-consumers + - debug + - debug-frontend + - debug-backend + - debug-consumers + - debug-neo4j + - debug-elasticsearch + +x-postgres-profiles-quickstart: &postgres-profiles-quickstart + - quickstart-postgres +x-postgres-profiles-dev: &postgres-profiles-dev + - debug-postgres +x-postgres-profiles: &postgres-profiles + - quickstart-postgres + - debug-postgres + +x-cassandra-profiles: &cassandra-profiles + - quickstart-cassandra + - debug-cassandra + +# Graph Storage Profiles +x-neo4j-profiles: &neo4j-profiles + - quickstart-cassandra + - debug-neo4j + +# Search Storage Profiles +x-elasticsearch-profiles: &elasticsearch-profiles + - debug-elasticsearch + +x-opensearch-profiles-quickstart: &opensearch-profiles-quickstart + - quickstart + - quickstart-backend + - quickstart-frontend + - quickstart-storage + - quickstart-cassandra + - quickstart-postgres + - quickstart-consumers +x-opensearch-profiles-dev: &opensearch-profiles-dev + - debug + - debug-frontend + - debug-backend + - debug-postgres + - debug-cassandra + - debug-consumers + - debug-neo4j +x-opensearch-profiles: &opensearch-profiles + - quickstart + - quickstart-backend + - quickstart-frontend + - quickstart-storage + - quickstart-cassandra + - quickstart-postgres + - quickstart-consumers + - debug + - debug-frontend + - debug-backend + - debug-postgres + - debug-cassandra + - debug-consumers + - debug-neo4j + +# Debug vs Quickstart Profiles +x-profiles-quickstart: &profiles-quickstart + - quickstart + - quickstart-backend + - quickstart-frontend + - quickstart-storage + - quickstart-cassandra + - quickstart-postgres + - quickstart-consumers +x-profiles-dev: &profiles-dev + - debug + - debug-frontend + - debug-backend + - debug-postgres + - debug-cassandra + - debug-consumers + - debug-neo4j + - debug-elasticsearch + +services: + mysql: + container_name: mysql + profiles: *mysql-profiles + hostname: mysql + image: mysql:${DATAHUB_MYSQL_VERSION:-8.2} + command: --character-set-server=utf8mb4 --collation-server=utf8mb4_bin --default-authentication-plugin=caching_sha2_password + ports: + - ${DATAHUB_MAPPED_MYSQL_PORT:-3306}:3306 + env_file: mysql/env/docker.env + restart: on-failure + healthcheck: + test: mysqladmin ping -h mysql -u $$MYSQL_USER --password=$$MYSQL_PASSWORD + start_period: 10s + interval: 1s + retries: 3 + timeout: 5s + volumes: + - ./mysql/init.sql:/docker-entrypoint-initdb.d/init.sql + - mysqldata:/var/lib/mysql + mysql-setup: &mysql-setup + container_name: mysql-setup + profiles: *mysql-profiles-quickstart + hostname: mysql-setup + image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:${DATAHUB_VERSION:-head} + env_file: mysql-setup/env/docker.env + depends_on: + mysql: + condition: service_healthy 
+ labels: + datahub_setup_job: true + mysql-setup-dev: + <<: *mysql-setup + container_name: mysql-setup-dev + profiles: *mysql-profiles-dev + image: ${DATAHUB_MYSQL_SETUP_IMAGE:-acryldata/datahub-mysql-setup}:debug + postgres: + container_name: postgres + profiles: *postgres-profiles + hostname: postgres + image: postgres:${DATAHUB_POSTGRES_VERSION:-15.5} + env_file: postgres/env/docker.env + ports: + - '5432:5432' + restart: on-failure + healthcheck: + test: [ "CMD-SHELL", "pg_isready" ] + start_period: 20s + interval: 2s + timeout: 10s + retries: 5 + volumes: + - ./postgres/init.sql:/docker-entrypoint-initdb.d/init.sql + - postgresdata:/var/lib/postgresql/data + postgres-setup: &postgres-setup + container_name: postgres-setup + profiles: *postgres-profiles-quickstart + hostname: postgres-setup + image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-acryldata/datahub-postgres-setup}:${DATAHUB_VERSION:-head} + env_file: postgres-setup/env/docker.env + depends_on: + postgres: + condition: service_healthy + labels: + datahub_setup_job: true + postgres-setup-dev: + <<: *postgres-setup + container_name: postgres-setup-dev + profiles: *postgres-profiles-dev + image: ${DATAHUB_POSTGRES_SETUP_IMAGE:-acryldata/datahub-postgres-setup}:debug + cassandra: + container_name: cassandra + profiles: *cassandra-profiles + hostname: cassandra + image: cassandra:4.1 + ports: + - 9042:9042 + healthcheck: + test: cqlsh -u cassandra -p cassandra -e 'describe keyspaces' + interval: 15s + timeout: 10s + retries: 10 + volumes: + - cassandradata:/var/lib/cassandra + cassandra-setup: + container_name: cassandra-setup + profiles: *cassandra-profiles + hostname: cassandra-setup + image: cassandra:4.1 + command: /bin/bash -c "cqlsh cassandra -f /init.cql" + depends_on: + cassandra: + condition: service_healthy + volumes: + - ./cassandra/init.cql:/init.cql + labels: + datahub_setup_job: true + neo4j: + container_name: neo4j + profiles: *neo4j-profiles + hostname: neo4j + image: neo4j:4.4.28-community + ports: + - ${DATAHUB_MAPPED_NEO4J_HTTP_PORT:-7474}:7474 + - ${DATAHUB_MAPPED_NEO4J_BOLT_PORT:-7687}:7687 + env_file: neo4j/env/docker.env + healthcheck: + test: wget http://neo4j:$${DATAHUB_NEO4J_HTTP_PORT:-7474} + start_period: 5s + interval: 1s + retries: 5 + timeout: 5s + volumes: + - neo4jdata:/data + kafka-broker: + container_name: kafka-broker + hostname: kafka-broker + image: confluentinc/cp-kafka:7.4.0 + command: + - /bin/bash + - -c + - | + # Generate KRaft clusterID + file_path="/var/lib/kafka/data/clusterID" + + if [ ! -f "$$file_path" ]; then + /bin/kafka-storage random-uuid > $$file_path + echo "Cluster id has been created..." 
+ # KRaft required step: Format the storage directory with a new cluster ID + kafka-storage format --ignore-formatted -t $$(cat "$$file_path") -c /etc/kafka/kafka.properties + fi + + export CLUSTER_ID=$$(cat "$$file_path") + echo "CLUSTER_ID=$$CLUSTER_ID" + + /etc/confluent/docker/run + ports: + - ${DATAHUB_MAPPED_KAFKA_BROKER_PORT:-9092}:9092 + env_file: kafka-broker/env/docker.env + environment: + KAFKA_NODE_ID: 1 + KAFKA_ADVERTISED_LISTENERS: BROKER://kafka-broker:29092,EXTERNAL://kafka-broker:9092 + KAFKA_LISTENERS: BROKER://kafka-broker:29092,EXTERNAL://kafka-broker:9092,CONTROLLER://kafka-broker:39092 + KAFKA_INTER_BROKER_LISTENER_NAME: BROKER + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: CONTROLLER:PLAINTEXT,BROKER:PLAINTEXT,EXTERNAL:PLAINTEXT + KAFKA_PROCESS_ROLES: controller, broker + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka-broker:39092 + # https://github.com/confluentinc/cp-all-in-one/issues/120 + KAFKA_LOG4J_LOGGERS: 'org.apache.kafka.image.loader.MetadataLoader=WARN' + KAFKA_ZOOKEEPER_CONNECT: null + healthcheck: + test: nc -z kafka-broker $${DATAHUB_KAFKA_BROKER_PORT:-9092} + start_period: 60s + interval: 1s + retries: 5 + timeout: 5s + volumes: + - broker:/var/lib/kafka/data/ + kafka-setup: &kafka-setup + container_name: kafka-setup + profiles: *profiles-quickstart + hostname: kafka-setup + image: ${DATAHUB_KAFKA_SETUP_IMAGE:-linkedin/datahub-kafka-setup}:${DATAHUB_VERSION:-head} + env_file: kafka-setup/env/docker.env + environment: &kafka-setup-env + DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-false} + KAFKA_BOOTSTRAP_SERVER: kafka-broker:29092 + USE_CONFLUENT_SCHEMA_REGISTRY: false + depends_on: + kafka-broker: + condition: service_healthy + labels: + datahub_setup_job: true + kafka-setup-dev: + <<: *kafka-setup + container_name: kafka-setup-dev + profiles: *profiles-dev + environment: + <<: *kafka-setup-env + DATAHUB_PRECREATE_TOPICS: ${DATAHUB_PRECREATE_TOPICS:-true} + image: ${DATAHUB_KAFKA_SETUP_IMAGE:-linkedin/datahub-kafka-setup}:debug + elasticsearch: + container_name: elasticsearch + profiles: *elasticsearch-profiles + hostname: search + image: ${DATAHUB_SEARCH_IMAGE:-elasticsearch}:${DATAHUB_SEARCH_TAG:-7.10.1} + ports: + - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 + env_file: elasticsearch/env/docker.env + environment: + - discovery.type=single-node + - ${XPACK_SECURITY_ENABLED:-xpack.security.enabled=false} + deploy: + resources: + limits: + memory: 1G + healthcheck: + test: curl -sS --fail http://search:$${DATAHUB_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s + start_period: 20s + interval: 1s + retries: 3 + timeout: 5s + volumes: + - esdata:/usr/share/elasticsearch/data + elasticsearch-setup-dev: &elasticsearch-setup-dev + container_name: elasticsearch-setup-dev + image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:debug + profiles: *elasticsearch-profiles + hostname: elasticsearch-setup + env_file: elasticsearch-setup/env/docker.env + environment: + <<: *search-datastore-environment + USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-false} + depends_on: + elasticsearch: + condition: service_healthy + labels: + datahub_setup_job: true + opensearch: + container_name: opensearch + profiles: *opensearch-profiles + hostname: search + image: ${DATAHUB_SEARCH_IMAGE:-opensearchproject/opensearch}:${DATAHUB_SEARCH_TAG:-2.9.0} + ports: + - ${DATAHUB_MAPPED_ELASTIC_PORT:-9200}:9200 + env_file: elasticsearch/env/docker.env + environment: + - discovery.type=single-node + - 
${XPACK_SECURITY_ENABLED:-plugins.security.disabled=true} + deploy: + resources: + limits: + memory: 1G + healthcheck: + test: curl -sS --fail http://search:$${DATAHUB_ELASTIC_PORT:-9200}/_cluster/health?wait_for_status=yellow&timeout=0s + start_period: 20s + interval: 1s + retries: 3 + timeout: 5s + volumes: + - osdata:/usr/share/elasticsearch/data + opensearch-setup: &opensearch-setup + <<: *elasticsearch-setup-dev + container_name: opensearch-setup + profiles: *opensearch-profiles-quickstart + hostname: opensearch-setup + image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:${DATAHUB_VERSION:-head} + environment: + <<: *search-datastore-environment + USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true} + depends_on: + opensearch: + condition: service_healthy + labels: + datahub_setup_job: true + opensearch-setup-dev: + <<: *opensearch-setup + container_name: opensearch-setup-dev + profiles: *opensearch-profiles-dev + hostname: opensearch-setup-dev + image: ${DATAHUB_ELASTIC_SETUP_IMAGE:-linkedin/datahub-elasticsearch-setup}:debug + environment: + <<: *search-datastore-environment + USE_AWS_ELASTICSEARCH: ${USE_AWS_ELASTICSEARCH:-true} + depends_on: + opensearch: + condition: service_healthy + +networks: + default: + name: datahub_network + +volumes: + neo4jdata: + esdata: + osdata: + broker: + mysqldata: + cassandradata: + postgresdata: diff --git a/docker/profiles/docker-compose.yml b/docker/profiles/docker-compose.yml new file mode 100644 index 00000000000000..534ca9702e2d79 --- /dev/null +++ b/docker/profiles/docker-compose.yml @@ -0,0 +1,13 @@ +--- +version: '3.9' +name: datahub + +include: + # Contains storage layers: i.e. mysql, kafka, elasticsearch + - docker-compose.prerequisites.yml + # Actions pod + - docker-compose.actions.yml + # Frontend + - docker-compose.frontend.yml + # Remaining components: i.e. 
gms, system-update, consumers + - docker-compose.gms.yml diff --git a/docker/profiles/elasticsearch b/docker/profiles/elasticsearch new file mode 120000 index 00000000000000..7712783b3e8d64 --- /dev/null +++ b/docker/profiles/elasticsearch @@ -0,0 +1 @@ +../elasticsearch \ No newline at end of file diff --git a/docker/profiles/elasticsearch-setup b/docker/profiles/elasticsearch-setup new file mode 120000 index 00000000000000..670a10e8c37865 --- /dev/null +++ b/docker/profiles/elasticsearch-setup @@ -0,0 +1 @@ +../elasticsearch-setup \ No newline at end of file diff --git a/docker/profiles/kafka-broker b/docker/profiles/kafka-broker new file mode 120000 index 00000000000000..23b248a4e0bbd4 --- /dev/null +++ b/docker/profiles/kafka-broker @@ -0,0 +1 @@ +../broker \ No newline at end of file diff --git a/docker/profiles/kafka-setup b/docker/profiles/kafka-setup new file mode 120000 index 00000000000000..35b9c167ac26e9 --- /dev/null +++ b/docker/profiles/kafka-setup @@ -0,0 +1 @@ +../kafka-setup \ No newline at end of file diff --git a/docker/profiles/monitoring b/docker/profiles/monitoring new file mode 120000 index 00000000000000..1371b42ae4593c --- /dev/null +++ b/docker/profiles/monitoring @@ -0,0 +1 @@ +../monitoring \ No newline at end of file diff --git a/docker/profiles/mysql b/docker/profiles/mysql new file mode 120000 index 00000000000000..057b59f7601652 --- /dev/null +++ b/docker/profiles/mysql @@ -0,0 +1 @@ +../mysql \ No newline at end of file diff --git a/docker/profiles/mysql-setup b/docker/profiles/mysql-setup new file mode 120000 index 00000000000000..f9199ec3fc58f0 --- /dev/null +++ b/docker/profiles/mysql-setup @@ -0,0 +1 @@ +../mysql-setup \ No newline at end of file diff --git a/docker/profiles/neo4j b/docker/profiles/neo4j new file mode 120000 index 00000000000000..0d4849d989d436 --- /dev/null +++ b/docker/profiles/neo4j @@ -0,0 +1 @@ +../neo4j \ No newline at end of file diff --git a/docker/profiles/postgres b/docker/profiles/postgres new file mode 120000 index 00000000000000..be56a57bd0ab8f --- /dev/null +++ b/docker/profiles/postgres @@ -0,0 +1 @@ +../postgres \ No newline at end of file diff --git a/docker/profiles/postgres-setup b/docker/profiles/postgres-setup new file mode 120000 index 00000000000000..38f51721feacb9 --- /dev/null +++ b/docker/profiles/postgres-setup @@ -0,0 +1 @@ +../postgres-setup/ \ No newline at end of file diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 7b7ca4052f3245..8b870019152834 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -97,6 +97,7 @@ services: - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} - JAVA_OPTS=-Xms1g -Xmx1g - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 53dacaf6ef63b0..5373e93da6bcb7 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -97,6 +97,7 @@ services: - GRAPH_SERVICE_IMPL=elasticsearch - JAVA_OPTS=-Xms1g -Xmx1g - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - 
KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 1ca91aa19206da..51a40395e3459f 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -97,6 +97,7 @@ services: - GRAPH_SERVICE_IMPL=elasticsearch - JAVA_OPTS=-Xms1g -Xmx1g - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index d05933df96a433..4ed57dca1f080a 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -6,6 +6,7 @@ services: datahub-mae-consumer: container_name: datahub-mae-consumer environment: + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-mcl - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 @@ -44,6 +45,7 @@ services: - GRAPH_SERVICE_IMPL=elasticsearch - JAVA_OPTS=-Xms1g -Xmx1g - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=false - MCE_CONSUMER_ENABLED=true diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index f0bd3a0f927c80..ba8432d8a89afe 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -9,6 +9,7 @@ services: neo4j: condition: service_healthy environment: + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-mcl - DATAHUB_GMS_HOST=datahub-gms - DATAHUB_GMS_PORT=8080 @@ -54,6 +55,7 @@ services: - GRAPH_SERVICE_IMPL=neo4j - JAVA_OPTS=-Xms1g -Xmx1g - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=false - MCE_CONSUMER_ENABLED=true diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index c77b4418b6f366..56071cfe1e9e60 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -97,6 +97,7 @@ services: - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} - JAVA_OPTS=-Xms1g -Xmx1g - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR=${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:-true} - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true diff --git 
a/docker/quickstart/quickstart_version_mapping.yaml b/docker/quickstart/quickstart_version_mapping.yaml index 9948bd55fdc0b6..b08cfda175aa9f 100644 --- a/docker/quickstart/quickstart_version_mapping.yaml +++ b/docker/quickstart/quickstart_version_mapping.yaml @@ -23,7 +23,7 @@ quickstart_version_map: default: composefile_git_ref: master docker_tag: head - mysql_tag: 5.7 + mysql_tag: "5.7" # default: # Use this to pin default to a specific version. # composefile_git_ref: fd1bd51541a132017a648f4a2f037eec8f70ba26 # v0.10.0 + quickstart compose file fixes # docker_tag: v0.10.0 @@ -31,19 +31,19 @@ quickstart_version_map: head: composefile_git_ref: master docker_tag: head - mysql_tag: 5.7 + mysql_tag: "5.7" # v0.13.0 we upgraded MySQL image for EOL v0.13.0: composefile_git_ref: master docker_tag: head - mysql_tag: 8.2 + mysql_tag: "8.2" # v0.9.6 images contain security vulnerabilities v0.9.6: composefile_git_ref: v0.9.6.1 docker_tag: v0.9.6.1 - mysql_tag: 5.7 + mysql_tag: "5.7" # If stable is not defined the latest released version will be used. # stable: diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py index 53ee9cf1e63ef5..7493210ffa2a5f 100644 --- a/docs-website/download_historical_versions.py +++ b/docs-website/download_historical_versions.py @@ -37,9 +37,9 @@ def fetch_urls( except Exception as e: if attempt < max_retries: print(f"Attempt {attempt + 1}/{max_retries}: {e}") - time.sleep(retry_delay) + time.sleep(retry_delay * 2**attempt) else: - print(f"Max retries reached. Unable to fetch data.") + print("Max retries reached. Unable to fetch data.") raise diff --git a/docs/developers.md b/docs/developers.md index 60d31f5e4523f7..fe007a56ddc68f 100644 --- a/docs/developers.md +++ b/docs/developers.md @@ -17,10 +17,8 @@ title: "Local Development" On macOS, these can be installed using [Homebrew](https://brew.sh/). ```shell -# Install Java 8 and 11 -brew tap homebrew/cask-versions -brew install java11 -brew install --cask zulu8 +# Install Java +brew install openjdk@17 # Install Python brew install python@3.10 # you may need to add this to your PATH diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index 1010bd2933e452..27ae2ce65ba658 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -10,6 +10,7 @@ envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py31 use_develop = true extras = dev,integration-tests,plugin-v1 deps = + # This should be kept in sync with the Github Actions matrix. -e ../../metadata-ingestion/ # Airflow version airflow21: apache-airflow~=2.1.0 @@ -20,7 +21,15 @@ deps = # See https://github.com/datahub-project/datahub/pull/9365 airflow24: apache-airflow~=2.4.0,pluggy==1.0.0 airflow26: apache-airflow~=2.6.0 + # Respect the constraints file on pendulum. + # See https://github.com/apache/airflow/issues/36274 + airflow24,airflow26: pendulum>=2.0,<3.0 + # The Airflow 2.7 constraints file points at pydantic v2, so we match that here. + # https://raw.githubusercontent.com/apache/airflow/constraints-2.7.3/constraints-3.10.txt + # Note that Airflow is actually compatible with both pydantic v1 and v2, and the + # constraints file is overly restrictive. 
airflow27: apache-airflow~=2.7.0 + airflow27: pydantic==2.4.2 commands = pytest --cov-append {posargs} diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 1bc1bc5100b08d..0dcac7a7fc1b41 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -14,9 +14,10 @@ "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", + # pydantic 1.8.2 is incompatible with mypy 0.910. + # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. # pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885 - # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887 - "pydantic>=1.5.1,!=1.10.3,<2", + "pydantic>=1.10.0,!=1.10.3", "mixpanel>=4.9.0", "sentry-sdk", } @@ -53,6 +54,18 @@ "ruamel.yaml", } +pydantic_no_v2 = { + # pydantic 2 makes major, backwards-incompatible changes - https://github.com/pydantic/pydantic/issues/4887 + # Tags sources that are not yet compatible with the pydantic v2 API. + "pydantic<2", +} + +plugin_common = { + # While pydantic v2 support is experimental, require that all plugins + # continue to use v1. This will ensure that no ingestion recipes break. + *pydantic_no_v2, +} + rest_common = {"requests", "requests_file"} kafka_common = { @@ -118,6 +131,7 @@ "sqlalchemy>=1.4.39, <2", # Required for SQL profiling. "great-expectations>=0.15.12, <=0.15.50", + *pydantic_no_v2, # because of great-expectations # scipy version restricted to reduce backtracking, used by great-expectations, "scipy>=1.7.2", # GE added handling for higher version of jinja2 @@ -229,6 +243,7 @@ iceberg_common = { # Iceberg Python SDK "pyiceberg", + *pydantic_no_v2, # because of pyiceberg "pyarrow>=9.0.0, <13.0.0", } @@ -259,7 +274,9 @@ databricks = { # 0.1.11 appears to have authentication issues with azure databricks - "databricks-sdk>=0.9.0", + # 0.16.0 added py.typed support which caused mypy to fail. The databricks sdk is pinned until we resolve mypy issues. + # https://github.com/databricks/databricks-sdk-py/pull/483 + "databricks-sdk>=0.9.0,<0.16.0", "pyspark~=3.3.0", "requests", # Version 2.4.0 includes sqlalchemy dialect, 2.8.0 includes some bug fixes @@ -354,7 +371,11 @@ "mlflow": {"mlflow-skinny>=2.3.0"}, "mode": {"requests", "tenacity>=8.0.1"} | sqllineage_lib, "mongodb": {"pymongo[srv]>=3.11", "packaging"}, - "mssql": sql_common | {"sqlalchemy-pytds>=0.3", "pyOpenSSL"}, + "mssql": sql_common + | { + "sqlalchemy-pytds>=0.3", + "pyOpenSSL", + }, "mssql-odbc": sql_common | {"pyodbc"}, "mysql": mysql, # mariadb should have same dependency as mysql @@ -473,9 +494,6 @@ "flake8-bugbear==23.3.12", "isort>=5.7.0", "mypy==1.0.0", - # pydantic 1.8.2 is incompatible with mypy 0.910. - # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910.
- "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", @@ -559,7 +577,7 @@ "kafka-connect", "ldap", "mongodb", - "mssql", + "mssql" if sys.version_info >= (3, 8) else None, "mysql", "mariadb", "redash", @@ -736,7 +754,22 @@ extras_require={ "base": list(framework_common), **{ - plugin: list(framework_common | dependencies) + plugin: list( + framework_common + | ( + plugin_common + if plugin + not in { + "airflow", + "datahub-rest", + "datahub-kafka", + "sync-file-emitter", + "sql-parser", + } + else set() + ) + | dependencies + ) for (plugin, dependencies) in plugins.items() }, "all": list( diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py index c45d4ddc924580..89ac528efe81a1 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py @@ -1,7 +1,7 @@ from typing import Optional -from datahub.configuration import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel -class BaseAssertion(ConfigModel): +class BaseAssertion(v1_ConfigModel): description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py index a41b0f7aafd9f2..dc0c97d1c74e56 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py @@ -2,7 +2,7 @@ from typing_extensions import Literal, Protocol -from datahub.configuration import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel from datahub.metadata.schema_classes import ( AssertionStdOperatorClass, AssertionStdParameterClass, @@ -58,7 +58,7 @@ def _generate_assertion_std_parameters( ) -class EqualToOperator(ConfigModel): +class EqualToOperator(v1_ConfigModel): type: Literal["equal_to"] value: Union[str, int, float] @@ -71,7 +71,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class BetweenOperator(ConfigModel): +class BetweenOperator(v1_ConfigModel): type: Literal["between"] min: Union[int, float] max: Union[int, float] @@ -87,7 +87,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: ) -class LessThanOperator(ConfigModel): +class LessThanOperator(v1_ConfigModel): type: Literal["less_than"] value: Union[int, float] @@ -100,7 +100,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class GreaterThanOperator(ConfigModel): +class GreaterThanOperator(v1_ConfigModel): type: Literal["greater_than"] value: Union[int, float] @@ -113,7 +113,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class LessThanOrEqualToOperator(ConfigModel): +class LessThanOrEqualToOperator(v1_ConfigModel): type: Literal["less_than_or_equal_to"] value: Union[int, float] @@ -126,7 +126,7 @@ def generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class GreaterThanOrEqualToOperator(ConfigModel): +class GreaterThanOrEqualToOperator(v1_ConfigModel): type: Literal["greater_than_or_equal_to"] value: Union[int, float] @@ -139,7 +139,7 @@ def 
generate_parameters(self) -> AssertionStdParametersClass: return _generate_assertion_std_parameters(value=self.value) -class NotNullOperator(ConfigModel): +class NotNullOperator(v1_ConfigModel): type: Literal["not_null"] operator: str = AssertionStdOperatorClass.NOT_NULL diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py index 6a3944ba36baf0..975aa359bd2031 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py @@ -1,12 +1,11 @@ from typing import List, Optional, Union -import pydantic from typing_extensions import Literal import datahub.emitter.mce_builder as builder from datahub.api.entities.datacontract.assertion import BaseAssertion from datahub.api.entities.datacontract.assertion_operator import Operators -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AssertionInfoClass, @@ -25,7 +24,7 @@ class IdConfigMixin(BaseAssertion): - id_raw: Optional[str] = pydantic.Field( + id_raw: Optional[str] = v1_Field( default=None, alias="id", description="The id of the assertion. If not provided, one will be generated using the type.", @@ -38,7 +37,7 @@ def generate_default_id(self) -> str: class CustomSQLAssertion(IdConfigMixin, BaseAssertion): type: Literal["custom_sql"] sql: str - operator: Operators = pydantic.Field(discriminator="type") + operator: Operators = v1_Field(discriminator="type") def generate_default_id(self) -> str: return f"{self.type}-{self.sql}-{self.operator.id()}" @@ -89,11 +88,11 @@ def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: ) -class DataQualityAssertion(ConfigModel): +class DataQualityAssertion(v1_ConfigModel): __root__: Union[ CustomSQLAssertion, ColumnUniqueAssertion, - ] = pydantic.Field(discriminator="type") + ] = v1_Field(discriminator="type") @property def id(self) -> str: diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py index f3c6be55e5fea9..e0ef85d5fd66c0 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py @@ -1,7 +1,6 @@ import collections from typing import Iterable, List, Optional, Tuple -import pydantic from ruamel.yaml import YAML from typing_extensions import Literal @@ -11,7 +10,11 @@ ) from datahub.api.entities.datacontract.freshness_assertion import FreshnessAssertion from datahub.api.entities.datacontract.schema_assertion import SchemaAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import ( + v1_ConfigModel, + v1_Field, + v1_validator, +) from datahub.emitter.mce_builder import datahub_guid, make_assertion_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -26,7 +29,7 @@ from datahub.utilities.urns.urn import guess_entity_type -class DataContract(ConfigModel): +class DataContract(v1_ConfigModel): """A yml representation of a Data Contract. 
This model is used as a simpler, Python-native representation of a DataHub data contract. @@ -36,29 +39,27 @@ class DataContract(ConfigModel): version: Literal[1] - id: Optional[str] = pydantic.Field( + id: Optional[str] = v1_Field( default=None, alias="urn", description="The data contract urn. If not provided, one will be generated.", ) - entity: str = pydantic.Field( + entity: str = v1_Field( description="The entity urn that the Data Contract is associated with" ) # TODO: add support for properties # properties: Optional[Dict[str, str]] = None - schema_field: Optional[SchemaAssertion] = pydantic.Field( - default=None, alias="schema" - ) + schema_field: Optional[SchemaAssertion] = v1_Field(default=None, alias="schema") - freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None) + freshness: Optional[FreshnessAssertion] = v1_Field(default=None) # TODO: Add a validator to ensure that ids are unique - data_quality: Optional[List[DataQualityAssertion]] = pydantic.Field(default=None) + data_quality: Optional[List[DataQualityAssertion]] = v1_Field(default=None) _original_yaml_dict: Optional[dict] = None - @pydantic.validator("data_quality") + @v1_validator("data_quality") # type: ignore def validate_data_quality( cls, data_quality: Optional[List[DataQualityAssertion]] ) -> Optional[List[DataQualityAssertion]]: diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py index 71741d76b22fc4..86942766889676 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py @@ -3,11 +3,10 @@ from datetime import timedelta from typing import List, Union -import pydantic from typing_extensions import Literal from datahub.api.entities.datacontract.assertion import BaseAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AssertionInfoClass, @@ -25,10 +24,10 @@ class CronFreshnessAssertion(BaseAssertion): type: Literal["cron"] - cron: str = pydantic.Field( + cron: str = v1_Field( description="The cron expression to use. See https://crontab.guru/ for help." ) - timezone: str = pydantic.Field( + timezone: str = v1_Field( "UTC", description="The timezone to use for the cron schedule. 
Defaults to UTC.", ) @@ -58,10 +57,10 @@ def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleCla ) -class FreshnessAssertion(ConfigModel): - __root__: Union[ - CronFreshnessAssertion, FixedIntervalFreshnessAssertion - ] = pydantic.Field(discriminator="type") +class FreshnessAssertion(v1_ConfigModel): + __root__: Union[CronFreshnessAssertion, FixedIntervalFreshnessAssertion] = v1_Field( + discriminator="type" + ) @property def id(self): diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py index b62f94e0592fce..39297d1a98d026 100644 --- a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py @@ -3,11 +3,10 @@ import json from typing import List, Union -import pydantic from typing_extensions import Literal from datahub.api.entities.datacontract.assertion import BaseAssertion -from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import v1_ConfigModel, v1_Field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.extractor.json_schema_util import get_schema_metadata from datahub.metadata.schema_classes import ( @@ -23,7 +22,7 @@ class JsonSchemaContract(BaseAssertion): type: Literal["json-schema"] - json_schema: dict = pydantic.Field(alias="json-schema") + json_schema: dict = v1_Field(alias="json-schema") _schema_metadata: SchemaMetadataClass @@ -37,7 +36,10 @@ def _init_private_attributes(self) -> None: ) -class FieldListSchemaContract(BaseAssertion, arbitrary_types_allowed=True): +class FieldListSchemaContract(BaseAssertion): + class Config: + arbitrary_types_allowed = True + type: Literal["field-list"] fields: List[SchemaFieldClass] @@ -56,8 +58,8 @@ def _init_private_attributes(self) -> None: ) -class SchemaAssertion(ConfigModel): - __root__: Union[JsonSchemaContract, FieldListSchemaContract] = pydantic.Field( +class SchemaAssertion(v1_ConfigModel): + __root__: Union[JsonSchemaContract, FieldListSchemaContract] = v1_Field( discriminator="type" ) diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index f7996900f7a7ad..2732a72aea5399 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -126,10 +126,21 @@ def metadata_diff( default=False, help="Include extra information for each plugin.", ) +@click.option( + "--source", + type=str, + default=None, +) @telemetry.with_telemetry() -def plugins(verbose: bool) -> None: +def plugins(source: Optional[str], verbose: bool) -> None: """List the enabled ingestion plugins.""" + if source: + # Quick helper for one-off checks with full stack traces. 
+ source_registry.get(source) + click.echo(f"Source {source} is enabled.") + return + click.secho("Sources:", bold=True) click.echo(source_registry.summary(verbose=verbose, col_width=25)) click.echo() diff --git a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py index be7439f330dfb6..1c3ce93c1f7887 100644 --- a/metadata-ingestion/src/datahub/cli/quickstart_versioning.py +++ b/metadata-ingestion/src/datahub/cli/quickstart_versioning.py @@ -94,7 +94,7 @@ def fetch_quickstart_config(cls) -> "QuickstartVersionMappingConfig": try: release = cls._fetch_latest_version() config.quickstart_version_map["stable"] = QuickstartExecutionPlan( - composefile_git_ref=release, docker_tag=release, mysql_tag=release + composefile_git_ref=release, docker_tag=release, mysql_tag="5.7" ) except Exception: click.echo( @@ -123,7 +123,7 @@ def get_quickstart_execution_plan( QuickstartExecutionPlan( composefile_git_ref=composefile_git_ref, docker_tag=docker_tag, - mysql_tag=mysql_tag, + mysql_tag=str(mysql_tag), ), ) # new CLI version is downloading the composefile corresponding to the requested version diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index f225856ca43ce4..0030332bcfd541 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -99,8 +99,20 @@ def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: @classmethod def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf: - with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): - return cls.parse_obj(obj) + if PYDANTIC_VERSION_2: + try: + with unittest.mock.patch.dict( + cls.model_config, # type: ignore + {"extra": "allow"}, + clear=False, + ): + cls.model_rebuild(force=True) # type: ignore + return cls.parse_obj(obj) + finally: + cls.model_rebuild(force=True) # type: ignore + else: + with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): + return cls.parse_obj(obj) class PermissiveConfigModel(ConfigModel): diff --git a/metadata-ingestion/src/datahub/configuration/datetimes.py b/metadata-ingestion/src/datahub/configuration/datetimes.py index 41af7565593d9b..1520462fa9bf8c 100644 --- a/metadata-ingestion/src/datahub/configuration/datetimes.py +++ b/metadata-ingestion/src/datahub/configuration/datetimes.py @@ -65,6 +65,8 @@ def parse_absolute_time(input: str) -> datetime: def parse_relative_timespan(input: str) -> timedelta: + raw_input = input + neg = False input = input.strip() @@ -79,7 +81,7 @@ def parse_relative_timespan(input: str) -> timedelta: if neg: delta = -delta - logger.debug(f'Parsed "{input}" as {delta}.') + logger.debug(f'Parsed "{raw_input}" as {delta}.') return delta diff --git a/metadata-ingestion/src/datahub/configuration/git.py b/metadata-ingestion/src/datahub/configuration/git.py index a5f88744661a4a..3c76c8da0d5717 100644 --- a/metadata-ingestion/src/datahub/configuration/git.py +++ b/metadata-ingestion/src/datahub/configuration/git.py @@ -1,4 +1,3 @@ -import os import pathlib from typing import Any, Dict, Optional, Union @@ -6,6 +5,7 @@ from datahub.configuration.common import ConfigModel from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.configuration.validate_multiline_string import pydantic_multiline_string _GITHUB_PREFIX = "https://github.com/" _GITLAB_PREFIX = "https://gitlab.com/" @@ 
-92,15 +92,7 @@ class GitInfo(GitReference): description="The url to call `git clone` on. We infer this for github and gitlab repos, but it is required for other hosts.", ) - @validator("deploy_key_file") - def deploy_key_file_should_be_readable( - cls, v: Optional[FilePath] - ) -> Optional[FilePath]: - if v is not None: - # pydantic does existence checks, we just need to check if we can read it - if not os.access(v, os.R_OK): - raise ValueError(f"Unable to read deploy key file {v}") - return v + _fix_deploy_key_newlines = pydantic_multiline_string("deploy_key") @validator("deploy_key", pre=True, always=True) def deploy_key_filled_from_deploy_key_file( diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py index f1876b500598ba..bd931abe2e84d1 100644 --- a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py +++ b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py @@ -19,12 +19,41 @@ class PydanticDeprecatedSince20(Warning): # type: ignore if PYDANTIC_VERSION_2: from pydantic import BaseModel as GenericModel + from pydantic.v1 import ( # type: ignore + BaseModel as v1_BaseModel, + Extra as v1_Extra, + Field as v1_Field, + root_validator as v1_root_validator, + validator as v1_validator, + ) else: + from pydantic import ( # type: ignore + BaseModel as v1_BaseModel, + Extra as v1_Extra, + Field as v1_Field, + root_validator as v1_root_validator, + validator as v1_validator, + ) from pydantic.generics import GenericModel # type: ignore +class v1_ConfigModel(v1_BaseModel): + """A simplified variant of our main ConfigModel class. + + This one only uses pydantic v1 features. + """ + + class Config: + extra = v1_Extra.forbid + underscore_attrs_are_private = True + + __all__ = [ "PYDANTIC_VERSION_2", "PydanticDeprecatedSince20", "GenericModel", + "v1_ConfigModel", + "v1_Field", + "v1_root_validator", + "v1_validator", ] diff --git a/metadata-ingestion/src/datahub/configuration/time_window_config.py b/metadata-ingestion/src/datahub/configuration/time_window_config.py index 15de7470e4d823..f20ab85be05855 100644 --- a/metadata-ingestion/src/datahub/configuration/time_window_config.py +++ b/metadata-ingestion/src/datahub/configuration/time_window_config.py @@ -68,6 +68,12 @@ def default_start_time( assert abs(delta) >= get_bucket_duration_delta( values["bucket_duration"] ), "Relative start time should be in terms of configured bucket duration. e.g '-2 days' or '-2 hours'." + + # The end_time's default value is not yet populated, in which case + # we can just manually generate it here. + if "end_time" not in values: + values["end_time"] = datetime.now(tz=timezone.utc) + return get_time_bucket( values["end_time"] + delta, values["bucket_duration"] ) @@ -80,9 +86,13 @@ def default_start_time( @pydantic.validator("start_time", "end_time") def ensure_timestamps_in_utc(cls, v: datetime) -> datetime: - assert ( - v.tzinfo == timezone.utc - ), 'timezone is not UTC; try adding a "Z" to the value e.g. "2021-07-20T00:00:00Z"' + if v.tzinfo is None: + raise ValueError( + "Timestamps must be in UTC. Try adding a 'Z' to the value e.g. '2021-07-20T00:00:00Z'" + ) + + # If the timestamp is timezone-aware but not in UTC, convert it to UTC. 
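The timestamp change in time_window_config.py just above swaps a hard `tzinfo == timezone.utc` assertion for normalization: naive datetimes are still rejected, but any timezone-aware value is converted to UTC instead of being refused. A self-contained sketch of that behavior on a toy model (not the real BaseTimeWindowConfig):

    from datetime import datetime, timezone

    import pydantic  # assumes the pydantic v1 validator API used in this codebase

    class TimeWindow(pydantic.BaseModel):
        start_time: datetime
        end_time: datetime

        @pydantic.validator("start_time", "end_time")
        def ensure_utc(cls, v: datetime) -> datetime:
            if v.tzinfo is None:
                raise ValueError(
                    "Timestamps must be timezone-aware, e.g. '2021-07-20T00:00:00Z'"
                )
            # Timezone-aware but non-UTC values are normalized rather than rejected.
            return v.astimezone(timezone.utc)

    w = TimeWindow(
        start_time="2021-07-20T05:30:00+05:30",  # a +05:30 offset is accepted...
        end_time="2021-07-21T00:00:00Z",
    )
    assert w.start_time == datetime(2021, 7, 20, tzinfo=timezone.utc)  # ...and stored as UTC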
+ v = v.astimezone(timezone.utc) return v diff --git a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py index bb01f2b787123a..de2a16e9bf247d 100644 --- a/metadata-ingestion/src/datahub/configuration/validate_field_rename.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_rename.py @@ -49,4 +49,6 @@ def _validate_field_rename(cls: Type, values: dict) -> dict: # validator with pre=True gets all the values that were passed in. # Given that a renamed field doesn't show up in the fields list, we can't use # the field-level validator, even with a different field name. - return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename) + return pydantic.root_validator(pre=True, skip_on_failure=True, allow_reuse=True)( + _validate_field_rename + ) diff --git a/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py new file mode 100644 index 00000000000000..0baaf4f0264b99 --- /dev/null +++ b/metadata-ingestion/src/datahub/configuration/validate_multiline_string.py @@ -0,0 +1,31 @@ +from typing import Optional, Type, Union + +import pydantic + + +def pydantic_multiline_string(field: str) -> classmethod: + """If the field is present and contains an escaped newline, replace it with a real newline. + + This makes the assumption that the field value is never supposed to have a + r"\n" in it, and instead should only have newline characters. This is generally + a safe assumption for SSH keys and similar. + + The purpose of this helper is to make us more forgiving of small formatting issues + in recipes, without sacrificing correctness across the board. + """ + + def _validate_field( + cls: Type, v: Union[None, str, pydantic.SecretStr] + ) -> Optional[str]: + if v is not None: + if isinstance(v, pydantic.SecretStr): + v = v.get_secret_value() + v = v.replace(r"\n", "\n") + + return v + + # Hack: Pydantic maintains a unique list of validators by referring to their __name__. + # https://github.com/pydantic/pydantic/blob/v1.10.9/pydantic/main.py#L264 + # This hack ensures that validators generated for multiple fields do not overwrite each other.
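The closing `__name__` trick matters because pydantic v1 keys a model's validators by function name: two reusable validators built from the same inner function would otherwise collide, and only the later one would survive. A standalone toy version of the helper, exercised on two fields (names below are illustrative):

    from typing import Optional, Type

    import pydantic  # pydantic v1 semantics assumed

    def multiline_fixer(field: str) -> classmethod:
        def _fix(cls: Type, v: Optional[str]) -> Optional[str]:
            return v.replace(r"\n", "\n") if v is not None else None

        # Without a unique __name__, a second multiline_fixer(...) on the same
        # model would shadow this validator in pydantic's registry.
        _fix.__name__ = f"_fix_{field}"
        return pydantic.validator(field, pre=True, allow_reuse=True)(_fix)

    class KeyConfig(pydantic.BaseModel):
        deploy_key: Optional[str] = None
        private_key: Optional[str] = None

        _fix_deploy_key = multiline_fixer("deploy_key")
        _fix_private_key = multiline_fixer("private_key")

    # Escaped "\n" sequences are rewritten to real newlines on both fields.
    assert KeyConfig(deploy_key="a\\nb").deploy_key == "a\nb"
    assert KeyConfig(private_key="c\\nd").private_key == "c\nd"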
+ _validate_field.__name__ = f"{_validate_field.__name__}_{field}" + return pydantic.validator(field, pre=True, allow_reuse=True)(_validate_field) diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py index 1f2b7f5689ea3c..42eb930c80f9d4 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py @@ -8,6 +8,7 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigModel +from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.ingestion.glossary.classifier import Classifier @@ -50,7 +51,10 @@ class ValuesFactorConfig(ConfigModel): class PredictionFactorsAndWeights(ConfigModel): class Config: - allow_population_by_field_name = True + if PYDANTIC_VERSION_2: + populate_by_name = True + else: + allow_population_by_field_name = True Name: float = Field(alias="name") Description: float = Field(alias="description") @@ -60,7 +64,10 @@ class Config: class InfoTypeConfig(ConfigModel): class Config: - allow_population_by_field_name = True + if PYDANTIC_VERSION_2: + populate_by_name = True + else: + allow_population_by_field_name = True Prediction_Factors_and_Weights: PredictionFactorsAndWeights = Field( description="Factors and their weights to consider when predicting info types", diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 25e17d692109a5..d7c70dbea0b141 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -528,6 +528,9 @@ def log_ingestion_stats(self) -> None: { "source_type": self.config.source.type, "sink_type": self.config.sink.type, + "transformer_types": [ + transformer.type for transformer in self.config.transformers or [] + ], "records_written": stats.discretize( self.sink.get_report().total_records_written ), diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py index 0fb211a5d7b162..421991a0966c3a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/aws_common.py @@ -167,7 +167,7 @@ def get_session(self) -> Session: return session - def get_credentials(self) -> Dict[str, str]: + def get_credentials(self) -> Dict[str, Optional[str]]: credentials = self.get_session().get_credentials() if credentials is not None: return { diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index cbe68a454ea436..c13b08a6d9656b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -284,7 +284,7 @@ def validate_bigquery_audit_metadata_datasets( return v - @root_validator(pre=False) + @root_validator(pre=False, skip_on_failure=True) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: project_id = values.get("project_id") project_id_pattern = values.get("project_id_pattern") diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 
eddd08c92b808d..b44b06feb95af2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -175,7 +175,7 @@ def make_lineage_edges_from_parsing_result( table_name = str( BigQueryTableRef.from_bigquery_table( BigqueryTableIdentifier.from_string_name( - DatasetUrn.create_from_string(table_urn).get_dataset_name() + DatasetUrn.from_string(table_urn).name ) ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py index f3616ca648a3e6..81a54d1327d05a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/delta_lake/config.py @@ -4,6 +4,7 @@ import pydantic from cached_property import cached_property from pydantic import Field +from typing_extensions import Literal from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import ( @@ -46,10 +47,9 @@ class DeltaLakeSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin): "'/' and URNs will be created using " "relative_path only.", ) - platform: str = Field( + platform: Literal["delta-lake"] = Field( default="delta-lake", description="The platform that this source connects to", - const=True, ) platform_instance: Optional[str] = Field( default=None, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index c0395b4e4e7963..12e362fa8a3e3f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -7,6 +7,7 @@ DataProcessInstance, InstanceRunResult, ) +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SourceCapability, @@ -248,13 +249,17 @@ def _get_connector_workunits( # Map Fivetran's connector entity with Datahub's datajob entity datajob = self._generate_datajob_from_connector(connector) - for mcp in datajob.generate_mcp(materialize_iolets=True): - if mcp.entityType == "dataset" and isinstance(mcp.aspect, StatusClass): - # While we "materialize" the referenced datasets, we don't want them - # to be tracked by stateful ingestion. - yield mcp.as_workunit(is_primary_source=False) - else: - yield mcp.as_workunit() + for mcp in datajob.generate_mcp(materialize_iolets=False): + yield mcp.as_workunit() + + # Materialize the upstream referenced datasets. + # We assume that the downstreams are materialized by other ingestion sources. + for iolet in datajob.inlets: + # We don't want these to be tracked by stateful ingestion. 
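The delta-lake change above also demonstrates the usual pydantic-v2-friendly replacement for `Field(const=True)`, which v2 removed: a `Literal` annotation pins the field to a single value under both major versions. A minimal sketch with a hypothetical model:

    import pydantic
    from typing_extensions import Literal

    class DeltaLakeishConfig(pydantic.BaseModel):
        # Literal replaces pydantic v1's Field(const=True): any value other
        # than "delta-lake" now fails validation on v1 and v2 alike.
        platform: Literal["delta-lake"] = "delta-lake"

    assert DeltaLakeishConfig().platform == "delta-lake"
    try:
        DeltaLakeishConfig(platform="redshift")
    except pydantic.ValidationError:
        pass  # expected: only the literal value is allowed
    else:
        raise AssertionError("platform should be constrained to 'delta-lake'")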
+ yield MetadataChangeProposalWrapper( + entityUrn=str(iolet), + aspect=StatusClass(removed=False), + ).as_workunit(is_primary_source=False) # Map Fivetran's job/sync history entity with Datahub's data process entity for job in connector.jobs: diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index b76bef49a7e6f0..33079f3fd9ac17 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -2060,10 +2060,9 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 ) logger.debug("Failed to process explore", exc_info=e) - processed_view_files = processed_view_map.get(model.connection) - if processed_view_files is None: - processed_view_map[model.connection] = set() - processed_view_files = processed_view_map[model.connection] + processed_view_files = processed_view_map.setdefault( + model.connection, set() + ) project_name = self.get_project_name(model_name) logger.debug(f"Model: {model_name}; Includes: {model.resolved_includes}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py index 409027a8805a0d..e40406b994c9b2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py @@ -85,15 +85,18 @@ sq.endtime AS endtime, 'insert' AS operation_type FROM - stl_insert si + (select userid, query, sum(rows) as rows, tbl + from stl_insert si + where si.rows > 0 + AND si.starttime >= '{start_time}' + AND si.starttime < '{end_time}' + group by userid, query, tbl + ) as si JOIN svv_table_info sti ON si.tbl = sti.table_id JOIN stl_query sq ON si.query = sq.query JOIN svl_user_info sui ON sq.userid = sui.usesysid WHERE - si.starttime >= '{start_time}' - AND si.starttime < '{end_time}' - AND si.rows > 0 - AND sq.aborted = 0) + sq.aborted = 0) UNION (SELECT DISTINCT sd.userid AS userid, @@ -109,15 +112,18 @@ sq.endtime AS endtime, 'delete' AS operation_type FROM - stl_delete sd + (select userid, query, sum(rows) as rows, tbl + from stl_delete sd + where sd.rows > 0 + AND sd.starttime >= '{start_time}' + AND sd.starttime < '{end_time}' + group by userid, query, tbl + ) as sd JOIN svv_table_info sti ON sd.tbl = sti.table_id JOIN stl_query sq ON sd.query = sq.query JOIN svl_user_info sui ON sq.userid = sui.usesysid WHERE - sd.starttime >= '{start_time}' - AND sd.starttime < '{end_time}' - AND sd.rows > 0 - AND sq.aborted = 0) + sq.aborted = 0) ORDER BY endtime DESC """.strip() diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 032bdef178fdf6..b896df1fa340e3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -176,7 +176,7 @@ def validate_include_column_lineage(cls, v, values): ) return v - @root_validator(pre=False) + @root_validator(pre=False, skip_on_failure=True) def validate_unsupported_configs(cls, values: Dict) -> Dict: value = values.get("include_read_operational_stats") if value is not None and value: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py index 5a451bf197d347..af8d8824a4b172 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_utils.py @@ -9,8 +9,8 @@ from datahub.configuration.pattern_utils import is_schema_allowed from datahub.ingestion.source.snowflake.constants import ( GENERIC_PERMISSION_ERROR_KEY, - SNOWFLAKE_DEFAULT_CLOUD, SNOWFLAKE_REGION_CLOUD_REGION_MAPPING, + SnowflakeCloudProvider, SnowflakeObjectDomain, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config @@ -72,6 +72,15 @@ def report_error(self, key: str, reason: str) -> None: class SnowflakeCommonMixin: platform = "snowflake" + CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX = [ + "us-west-2", + "us-east-1", + "eu-west-1", + "eu-central-1", + "ap-southeast-1", + "ap-southeast-2", + ] + @staticmethod def create_snowsight_base_url( account_locator: str, @@ -79,12 +88,23 @@ def create_snowsight_base_url( cloud: str, privatelink: bool = False, ) -> Optional[str]: + if cloud: + url_cloud_provider_suffix = f".{cloud}" + + if cloud == SnowflakeCloudProvider.AWS: + # Some AWS regions do not have a cloud suffix; see the list below: + # https://docs.snowflake.com/en/user-guide/admin-account-identifier#non-vps-account-locator-formats-by-cloud-platform-and-region + if ( + cloud_region_id + in SnowflakeCommonMixin.CLOUD_REGION_IDS_WITHOUT_CLOUD_SUFFIX + ): + url_cloud_provider_suffix = "" + else: + url_cloud_provider_suffix = f".{cloud}" if privatelink: url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.snowflakecomputing.com/" - elif cloud == SNOWFLAKE_DEFAULT_CLOUD: - url = f"https://app.snowflake.com/{cloud_region_id}/{account_locator}/" else: - url = f"https://app.snowflake.com/{cloud_region_id}.{cloud}/{account_locator}/" + url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/" return url @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 46bd24c7e1f4c3..e9db82ce75cd99 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -107,7 +107,7 @@ def validate_account_id(cls, account_id: str) -> str: return account_id @pydantic.validator("authentication_type", always=True) - def authenticator_type_is_valid(cls, v, values, field): + def authenticator_type_is_valid(cls, v, values): if v not in VALID_AUTH_TYPES.keys(): raise ValueError( f"unsupported authenticator type '{v}' was provided," diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py index 5eb9c83236e4f9..13abe73cc4e098 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py @@ -11,6 +11,7 @@ from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.source_common import EnvConfigMixin from datahub.configuration.validate_field_removal import pydantic_removed_field +from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.ingestion.source.usage.usage_common import BaseUsageConfig from
datahub.ingestion.source_config.bigquery import BigQueryBaseConfig @@ -44,6 +45,8 @@ class BigQueryCredential(ConfigModel): description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", ) + _fix_private_key_newlines = pydantic_multiline_string("private_key") + @pydantic.root_validator(skip_on_failure=True) def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: if values.get("client_x509_cert_url") is None: diff --git a/metadata-ingestion/src/datahub/utilities/logging_manager.py b/metadata-ingestion/src/datahub/utilities/logging_manager.py index a8eacb0a9938df..62aa1ca7ab7918 100644 --- a/metadata-ingestion/src/datahub/utilities/logging_manager.py +++ b/metadata-ingestion/src/datahub/utilities/logging_manager.py @@ -199,6 +199,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[N for handler in handlers: root_logger.removeHandler(handler) for lib in DATAHUB_PACKAGES: + lib_logger = logging.getLogger(lib) lib_logger.removeHandler(handler) lib_logger.propagate = True diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py index 4f228494f416b8..3389a6fb05ee89 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py @@ -150,7 +150,7 @@ def modify_urn(urn: str) -> str: if guess_entity_type(urn) == "dataset": return _lowercase_dataset_urn(urn) elif guess_entity_type(urn) == "schemaField": - cur_urn = Urn.create_from_string(urn) + cur_urn = Urn.from_string(urn) cur_urn._entity_ids[0] = _lowercase_dataset_urn(cur_urn._entity_ids[0]) return str(cur_urn) return urn diff --git a/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json index a72c960a722969..b8f05fa6e93aad 100644 --- a/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json +++ b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json @@ -178,38 +178,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "powerbi-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "powerbi-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index c7273fee5a2e58..ece54f00eeaa04 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -11,20 +11,20 @@ "env": "PROD", "database": "test_db" }, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/", + "externalUrl": 
"https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/", "name": "TEST_DB", "description": "Comment for TEST_DB", "created": { - "time": 1623110400000 + "time": 1623103200000 }, "lastModified": { - "time": 1623110400000 + "time": 1623103200000 } } }, "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00", + "lastObserved": 1615443388097, + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -144,20 +144,20 @@ "database": "test_db", "schema": "test_schema" }, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/", "name": "TEST_SCHEMA", "description": "comment for TEST_DB.TEST_SCHEMA", "created": { - "time": 1623110400000 + "time": 1623103200000 }, "lastModified": { - "time": 1623110400000 + "time": 1623103200000 } } }, "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00", + "lastObserved": 1615443388097, + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -489,22 +489,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", "name": "TABLE_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -788,22 +788,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", "name": "TABLE_2", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1087,22 +1087,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", "name": "TABLE_3", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1386,22 +1386,22 @@ "aspect": { "json": { "customProperties": {}, - 
"externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", "name": "TABLE_4", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1685,22 +1685,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_5/", "name": "TABLE_5", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_5", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -1984,22 +1984,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", "name": "TABLE_6", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -2283,22 +2283,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", "name": "TABLE_7", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -2582,22 +2582,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", "name": "TABLE_8", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { 
"lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -2881,22 +2881,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", "name": "TABLE_9", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -3180,22 +3180,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", "name": "TABLE_10", "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10", "description": "Comment for Table", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -3470,22 +3470,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/", "name": "VIEW_1", "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_1", "description": "Comment for View", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, @@ -3805,22 +3805,22 @@ "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.snowflake.com/ap-south-1/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_2/", + "externalUrl": "https://app.snowflake.com/ap-south-1.aws/abc12345/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_2/", "name": "VIEW_2", "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_2", "description": "Comment for View", "created": { - "time": 1623090600000 + "time": 1623103200000 }, "lastModified": { - "time": 1623090600000 + "time": 1623103200000 }, "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28", + "runId": "snowflake-2023_12_18-10_16_09", "lastRunId": "no-run-id-provided" } }, diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 1b58696e4014c9..39a62056a7e4ad 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -87,18 +87,18 @@ def 
test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): confidence_level_threshold=0.58, info_types_config={ "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 + prediction_factors_and_weights=PredictionFactorsAndWeights( + name=0, values=1, description=0, datatype=0 ) ), "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, + prediction_factors_and_weights=PredictionFactorsAndWeights( + name=0, + description=0, + datatype=0, + values=1, ), - Values=ValuesFactorConfig( + values=ValuesFactorConfig( prediction_type="regex", regex=[ r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index f439a322c26771..5ed672d527264a 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -1,5 +1,6 @@ import os import subprocess +import sys import time import pytest @@ -8,6 +9,10 @@ from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port +pytestmark = pytest.mark.skipif( + sys.version_info < (3, 8), reason="requires python 3.8 or higher" +) + @pytest.fixture(scope="module") def mssql_runner(docker_compose_runner, pytestconfig): diff --git a/metadata-ingestion/tests/unit/test_allow_deny.py b/metadata-ingestion/tests/unit/config/test_allow_deny.py similarity index 100% rename from metadata-ingestion/tests/unit/test_allow_deny.py rename to metadata-ingestion/tests/unit/config/test_allow_deny.py diff --git a/metadata-ingestion/tests/unit/test_config_clean.py b/metadata-ingestion/tests/unit/config/test_config_clean.py similarity index 100% rename from metadata-ingestion/tests/unit/test_config_clean.py rename to metadata-ingestion/tests/unit/config/test_config_clean.py diff --git a/metadata-ingestion/tests/unit/config/test_config_model.py b/metadata-ingestion/tests/unit/config/test_config_model.py index ffac5c465f5541..f53390a3deb18c 100644 --- a/metadata-ingestion/tests/unit/config/test_config_model.py +++ b/metadata-ingestion/tests/unit/config/test_config_model.py @@ -3,8 +3,11 @@ import pydantic import pytest -from datahub.configuration.common import ConfigModel, redact_raw_config -from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig +from datahub.configuration.common import ( + AllowDenyPattern, + ConfigModel, + redact_raw_config, +) def test_extras_not_allowed(): @@ -76,8 +79,15 @@ def test_config_redaction(): def test_shared_defaults(): - c1 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url") - c2 = UnityCatalogSourceConfig(token="s", workspace_url="https://workspace_url") + class SourceConfig(ConfigModel): + token: str + workspace_url: str + catalog_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + ) + + c1 = SourceConfig(token="s", workspace_url="https://workspace_url") + c2 = SourceConfig(token="s", workspace_url="https://workspace_url") assert c2.catalog_pattern.allow == [".*"] c1.catalog_pattern.allow += ["foo"] diff --git a/metadata-ingestion/tests/unit/test_pydantic_validators.py b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py similarity 
index 56% rename from metadata-ingestion/tests/unit/test_pydantic_validators.py rename to metadata-ingestion/tests/unit/config/test_pydantic_validators.py index 3e9ec6cbaf3579..f687a2776f6e2d 100644 --- a/metadata-ingestion/tests/unit/test_pydantic_validators.py +++ b/metadata-ingestion/tests/unit/config/test_pydantic_validators.py @@ -1,13 +1,18 @@ from typing import Optional +import pydantic import pytest from pydantic import ValidationError -from datahub.configuration.common import ConfigModel +from datahub.configuration.common import ConfigModel, ConfigurationWarning from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field -from datahub.utilities.global_warning_util import get_global_warnings +from datahub.configuration.validate_multiline_string import pydantic_multiline_string +from datahub.utilities.global_warning_util import ( + clear_global_warnings, + get_global_warnings, +) def test_field_rename(): @@ -19,8 +24,9 @@ class TestModel(ConfigModel): v = TestModel.parse_obj({"b": "original"}) assert v.b == "original" - v = TestModel.parse_obj({"a": "renamed"}) - assert v.b == "renamed" + with pytest.warns(ConfigurationWarning, match="a is deprecated"): + v = TestModel.parse_obj({"a": "renamed"}) + assert v.b == "renamed" with pytest.raises(ValidationError): TestModel.parse_obj({"a": "foo", "b": "bar"}) @@ -41,9 +47,10 @@ class TestModel(ConfigModel): assert v.b == "original" assert v.b1 == "original" - v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"}) - assert v.b == "renamed" - assert v.b1 == "renamed" + with pytest.warns(ConfigurationWarning, match=r"a.* is deprecated"): + v = TestModel.parse_obj({"a": "renamed", "a1": "renamed"}) + assert v.b == "renamed" + assert v.b1 == "renamed" with pytest.raises(ValidationError): TestModel.parse_obj({"a": "foo", "b": "bar", "b1": "ok"}) @@ -71,14 +78,17 @@ class TestModel(ConfigModel): v = TestModel.parse_obj({"b": "original"}) assert v.b == "original" - v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"}) - assert v.b == "original" + with pytest.warns(ConfigurationWarning, match=r"r\d was removed"): + v = TestModel.parse_obj({"b": "original", "r1": "removed", "r2": "removed"}) + assert v.b == "original" def test_field_deprecated(): + clear_global_warnings() + class TestModel(ConfigModel): - d1: Optional[str] - d2: Optional[str] + d1: Optional[str] = None + d2: Optional[str] = None b: str _validate_deprecated_d1 = pydantic_field_deprecated("d1") @@ -87,9 +97,38 @@ class TestModel(ConfigModel): v = TestModel.parse_obj({"b": "original"}) assert v.b == "original" - v = TestModel.parse_obj({"b": "original", "d1": "deprecated", "d2": "deprecated"}) + with pytest.warns(ConfigurationWarning, match=r"d\d.+ deprecated"): + v = TestModel.parse_obj( + {"b": "original", "d1": "deprecated", "d2": "deprecated"} + ) assert v.b == "original" assert v.d1 == "deprecated" assert v.d2 == "deprecated" assert any(["d1 is deprecated" in warning for warning in get_global_warnings()]) assert any(["d2 is deprecated" in warning for warning in get_global_warnings()]) + + clear_global_warnings() + + +def test_multiline_string_fixer(): + class TestModel(ConfigModel): + s: str + m: Optional[pydantic.SecretStr] = None + + _validate_s = pydantic_multiline_string("s") + _validate_m = pydantic_multiline_string("m") + + v = TestModel.parse_obj({"s": 
"foo\nbar"}) + assert v.s == "foo\nbar" + + v = TestModel.parse_obj({"s": "foo\\nbar"}) + assert v.s == "foo\nbar" + + v = TestModel.parse_obj({"s": "normal", "m": "foo\\nbar"}) + assert v.s == "normal" + assert v.m + assert v.m.get_secret_value() == "foo\nbar" + + v = TestModel.parse_obj({"s": "normal", "m": pydantic.SecretStr("foo\\nbar")}) + assert v.m + assert v.m.get_secret_value() == "foo\nbar" diff --git a/metadata-ingestion/tests/unit/test_time_window_config.py b/metadata-ingestion/tests/unit/config/test_time_window_config.py similarity index 100% rename from metadata-ingestion/tests/unit/test_time_window_config.py rename to metadata-ingestion/tests/unit/config/test_time_window_config.py diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 536c91ace4f5ed..69a7510692df1d 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -24,6 +24,7 @@ from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( SnowflakeObjectAccessEntry, ) +from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin from datahub.ingestion.source.snowflake.snowflake_v2 import SnowflakeV2Source from tests.test_helpers import test_connection_helpers @@ -584,3 +585,29 @@ def test_email_filter_query_generation_with_case_insensitive_filter(): filter_query == "AND (rlike(user_name, '.*@example.com','c')) AND NOT (rlike(user_name, '.*@example2.com','c'))" ) + + +def test_create_snowsight_base_url_us_west(): + ( + cloud, + cloud_region_id, + ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id("aws_us_west_2") + + result = SnowflakeCommonMixin.create_snowsight_base_url( + "account_locator", cloud_region_id, cloud, False + ) + assert result == "https://app.snowflake.com/us-west-2/account_locator/" + + +def test_create_snowsight_base_url_ap_northeast_1(): + ( + cloud, + cloud_region_id, + ) = SnowflakeCommonMixin.get_cloud_region_from_snowflake_region_id( + "aws_ap_northeast_1" + ) + + result = SnowflakeCommonMixin.create_snowsight_base_url( + "account_locator", cloud_region_id, cloud, False + ) + assert result == "https://app.snowflake.com/ap-northeast-1.aws/account_locator/" diff --git a/metadata-integration/java/datahub-protobuf/build.gradle b/metadata-integration/java/datahub-protobuf/build.gradle index 2cb36a14cb9c7d..c8082b875d3212 100644 --- a/metadata-integration/java/datahub-protobuf/build.gradle +++ b/metadata-integration/java/datahub-protobuf/build.gradle @@ -31,10 +31,10 @@ dependencies { implementation externalDependency.commonsCli implementation externalDependency.httpAsyncClient implementation externalDependency.slf4jApi + implementation externalDependency.jacksonCore compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok - testImplementation externalDependency.junitJupiterApi - testRuntimeOnly externalDependency.junitJupiterEngine + testImplementation externalDependency.testng } import java.nio.file.Paths @@ -61,10 +61,7 @@ jacocoTestReport { dependsOn test // tests are required to run before generating the report } -test { - useJUnit() - finalizedBy jacocoTestReport -} +test.finalizedBy jacocoTestReport task checkShadowJar(type: Exec) { diff --git a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java index dcc95222fabf23..429c6d6bfeba48 100644 
--- a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java +++ b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/Proto2DataHub.java @@ -67,6 +67,13 @@ public class Proto2DataHub { "[Optional if using --directory] The protobuf source file. Typically a .proto file.") .build(); + private static final Option OPTION_MESSAGE_NAME = + Option.builder() + .longOpt("message_name") + .hasArg() + .desc("[Optional] The protobuf message name to read from.") + .build(); + private static final Option OPTION_DIR = Option.builder() .longOpt("directory") @@ -166,6 +173,7 @@ static class AppConfig { private final String dataPlatform; private final String protoc; private final String inputFile; + private final String messageName; private final String inputDir; private final TransportOptions transport; private final String filename; @@ -191,6 +199,7 @@ static class AppConfig { dataPlatform = cli.getOptionValue(OPTION_DATAHUB_PLATFORM, "kafka").toLowerCase(Locale.ROOT); protoc = cli.getOptionValue(OPTION_DESCRIPTOR); inputFile = cli.getOptionValue(OPTION_FILE, null); + messageName = cli.getOptionValue(OPTION_MESSAGE_NAME, null); transport = TransportOptions.valueOf( cli.getOptionValue(OPTION_TRANSPORT, "rest").toUpperCase(Locale.ROOT)); @@ -250,6 +259,7 @@ public static void main(String[] args) throws Exception { .addOption(OPTION_DATAHUB_TOKEN) .addOption(OPTION_DESCRIPTOR) .addOption(OPTION_FILE) + .addOption(OPTION_MESSAGE_NAME) .addOption(OPTION_DIR) .addOption(OPTION_EXCLUDE_PATTERN) .addOption(OPTION_DATAHUB_USER) @@ -354,6 +364,7 @@ public static void main(String[] args) throws Exception { .setGithubOrganization(config.githubOrg) .setSlackTeamId(config.slackId) .setSubType(config.subType) + .setMessageName(config.messageName) .build(); dataset diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java index e96bb63220b04e..62f3b0453be097 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufDatasetTest.java @@ -1,8 +1,8 @@ package datahub.protobuf; import static datahub.protobuf.TestFixtures.*; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; import com.linkedin.common.FabricType; import com.linkedin.common.GlobalTags; @@ -34,7 +34,7 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.junit.jupiter.api.Test; +import org.testng.annotations.Test; public class ProtobufDatasetTest { diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java index e2599cb4c3f685..9bf649041e035f 100644 --- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java +++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/ProtobufUtilsTest.java @@ -2,13 +2,13 @@ import static datahub.protobuf.TestFixtures.getTestProtobufFileSet; import static datahub.protobuf.TestFixtures.getTestProtoc; -import static org.junit.jupiter.api.Assertions.*; 
+import static org.testng.Assert.*;
 
 import com.google.protobuf.DescriptorProtos;
 import com.google.protobuf.ExtensionRegistry;
 import datahub.protobuf.model.ProtobufGraph;
 import java.io.IOException;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class ProtobufUtilsTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java
index fed9f250b359fe..ae539a8e8fa4a1 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufEnumTest.java
@@ -1,6 +1,6 @@
 package datahub.protobuf.model;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
 
 import com.google.protobuf.DescriptorProtos.DescriptorProto;
 import com.google.protobuf.DescriptorProtos.EnumDescriptorProto;
@@ -11,7 +11,7 @@
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class ProtobufEnumTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java
index 6d4dc8bc4d5850..9508f4778e5c88 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufFieldTest.java
@@ -1,7 +1,7 @@
 package datahub.protobuf.model;
 
 import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
 
 import com.google.protobuf.DescriptorProtos.DescriptorProto;
 import com.google.protobuf.DescriptorProtos.FieldDescriptorProto;
@@ -22,7 +22,7 @@
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Set;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class ProtobufFieldTest {
   private static final DescriptorProto EXPECTED_MESSAGE_PROTO =
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java
index 488222b87766d8..6ca0c5b45cb5ec 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufGraphTest.java
@@ -2,14 +2,14 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufFileSet;
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
 
 import com.google.protobuf.DescriptorProtos.FileDescriptorSet;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.List;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class ProtobufGraphTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java
index 1d6b3907d76d9d..1126895aec57a6 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufMessageTest.java
@@ -1,6 +1,6 @@
 package datahub.protobuf.model;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
 
 import com.google.protobuf.DescriptorProtos.DescriptorProto;
 import com.google.protobuf.DescriptorProtos.FileDescriptorProto;
@@ -11,7 +11,7 @@
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class ProtobufMessageTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java
index c8bd8a322aad56..9db06f23a2bdf3 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/model/ProtobufOneOfFieldTest.java
@@ -1,6 +1,6 @@
 package datahub.protobuf.model;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.testng.Assert.*;
 
 import com.google.protobuf.DescriptorProtos.DescriptorProto;
 import com.google.protobuf.DescriptorProtos.FieldDescriptorProto;
@@ -12,7 +12,7 @@
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class ProtobufOneOfFieldTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java
index 2fc5f3834a749f..fe27af7461860b 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/VisitContextTest.java
@@ -2,7 +2,7 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufFileSet;
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
-import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.testng.Assert.assertNotEquals;
 
 import com.google.protobuf.DescriptorProtos.FileDescriptorSet;
 import datahub.protobuf.model.FieldTypeEdge;
@@ -13,7 +13,7 @@
 import java.util.Set;
 import java.util.stream.Collectors;
 import org.jgrapht.GraphPath;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class VisitContextTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java
index de9a0f5ec4abee..6e99599c852b43 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DatasetVisitorTest.java
@@ -1,7 +1,7 @@
 package datahub.protobuf.visitors.dataset;
 
 import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.common.urn.DatasetUrn;
 import com.linkedin.data.template.RecordTemplate;
@@ -14,7 +14,7 @@
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class DatasetVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java
index 679048fb48a53d..42d8f1ad4c83c6 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DescriptionVisitorTest.java
@@ -1,14 +1,14 @@
 package datahub.protobuf.visitors.dataset;
 
 import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import datahub.protobuf.model.ProtobufGraph;
 import java.io.IOException;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class DescriptionVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java
index c24fc30766f0ed..3330c09c494364 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/DomainVisitorTest.java
@@ -2,7 +2,7 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
 import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.common.urn.Urn;
 import datahub.protobuf.model.ProtobufGraph;
@@ -10,7 +10,7 @@
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class DomainVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java
index a57916441bfcb3..45be30fe96210b 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/InstitutionalMemoryVisitorTest.java
@@ -1,7 +1,7 @@
 package datahub.protobuf.visitors.dataset;
 
 import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.common.InstitutionalMemoryMetadata;
 import com.linkedin.common.url.Url;
@@ -9,7 +9,7 @@
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class InstitutionalMemoryVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java
index 5f8572cf6ddd83..2da53dad2c0be1 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/KafkaTopicPropertyVisitorTest.java
@@ -2,7 +2,7 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
 import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.data.template.StringMap;
 import com.linkedin.dataset.DatasetProperties;
@@ -11,7 +11,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class KafkaTopicPropertyVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java
index 1b0aff28eb5176..adc94487dab3cc 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/OwnershipVisitorTest.java
@@ -2,7 +2,7 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
 import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.common.Owner;
 import com.linkedin.common.OwnershipSource;
@@ -14,7 +14,7 @@
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class OwnershipVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java
index 13912100f28a5d..be653309540519 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/PropertyVisitorTest.java
@@ -3,7 +3,7 @@
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
 import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
 import static java.util.Map.entry;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.data.template.StringMap;
 import com.linkedin.dataset.DatasetProperties;
@@ -11,7 +11,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class PropertyVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java
index f734c00bb76e08..79e7075c652094 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/dataset/TermAssociationVisitorTest.java
@@ -2,7 +2,7 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
 import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.common.GlossaryTermAssociation;
 import com.linkedin.common.urn.GlossaryTermUrn;
@@ -10,7 +10,7 @@
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class TermAssociationVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java
index eec397011a4ce4..ff1aa643ac8df6 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/ProtobufExtensionFieldVisitorTest.java
@@ -1,7 +1,7 @@
 package datahub.protobuf.visitors.field;
 
 import static datahub.protobuf.TestFixtures.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.common.GlobalTags;
 import com.linkedin.common.GlossaryTermAssociation;
@@ -23,7 +23,7 @@
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class ProtobufExtensionFieldVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java
index af31a80d3b53ad..59d9e0ca6e5183 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/field/SchemaFieldVisitorTest.java
@@ -2,7 +2,7 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
 import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.schema.NumberType;
 import com.linkedin.schema.SchemaField;
@@ -15,7 +15,7 @@
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class SchemaFieldVisitorTest {
 
diff --git a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java
index 258d816d9d1da3..ab477e19aabe4d 100644
--- a/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java
+++ b/metadata-integration/java/datahub-protobuf/src/test/java/datahub/protobuf/visitors/tag/TagVisitorTest.java
@@ -2,7 +2,7 @@
 
 import static datahub.protobuf.TestFixtures.getTestProtobufGraph;
 import static datahub.protobuf.TestFixtures.getVisitContextBuilder;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.testng.Assert.assertEquals;
 
 import com.linkedin.tag.TagProperties;
 import datahub.event.MetadataChangeProposalWrapper;
@@ -11,7 +11,7 @@
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
-import org.junit.jupiter.api.Test;
+import org.testng.annotations.Test;
 
 public class TagVisitorTest {
 
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
index 53b974b560e2a6..e7ec4d313b5f58 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
@@ -235,6 +235,30 @@ public BrowseResultV2 browseV2(
     return _entitySearchService.browseV2(entityName, path, filter, input, start, count);
   }
 
+  /**
+   * Gets browse V2 snapshot of a given path
+   *
+   * @param entityNames entities being browsed
+   * @param path path being browsed
+   * @param filter browse filter
+   * @param input search query
+   * @param start start offset of first group
+   * @param count max number of results requested
+   * @throws RemoteInvocationException
+   */
+  @Nonnull
+  public BrowseResultV2 browseV2(
+      @Nonnull List<String> entityNames,
+      @Nonnull String path,
+      @Nullable Filter filter,
+      @Nonnull String input,
+      int start,
+      int count,
+      @Nonnull Authentication authentication) {
+    // TODO: cache browseV2 results
+    return _entitySearchService.browseV2(entityNames, path, filter, input, start, count);
+  }
+
   @SneakyThrows
   @Deprecated
   public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication)
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
index a3338394165567..7bd8e763cdc27a 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java
@@ -1161,6 +1161,7 @@ public RestoreIndicesResult restoreIndices(
       Urn urn;
       try {
         urn = Urn.createFromString(aspect.getKey().getUrn());
+        result.lastUrn = urn.toString();
       } catch (Exception e) {
         logger.accept(
             String.format(
@@ -1188,6 +1189,7 @@ public RestoreIndicesResult restoreIndices(
       result.timeEntityRegistryCheckMs += System.currentTimeMillis() - startTime;
       startTime = System.currentTimeMillis();
       final String aspectName = aspect.getKey().getAspect();
+      result.lastAspect = aspectName;
 
       // 3. Verify that the aspect is a valid aspect associated with the entity
       AspectSpec aspectSpec = entitySpec.getAspectSpec(aspectName);
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
index b2b47c1d5ba32f..26946890daa3b7 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java
@@ -477,11 +477,31 @@ public PagedList<EbeanAspectV2> getPagedAspects(final RestoreIndicesArgs args) {
     if (args.urnLike != null) {
       exp = exp.like(EbeanAspectV2.URN_COLUMN, args.urnLike);
     }
+
+    int start = args.start;
+    if (args.urnBasedPagination) {
+      start = 0;
+      if (args.lastUrn != null && !args.lastUrn.isEmpty()) {
+        exp = exp.where().ge(EbeanAspectV2.URN_COLUMN, args.lastUrn);
+
+        // To avoid processing the same aspect twice in a restore, compare against the
+        // last aspect whenever the urn matches the last urn
+        if (args.lastAspect != null && !args.lastAspect.isEmpty()) {
+          exp =
+              exp.where()
+                  .and()
+                  .or()
+                  .ne(EbeanAspectV2.URN_COLUMN, args.lastUrn)
+                  .gt(EbeanAspectV2.ASPECT_COLUMN, args.lastAspect);
+        }
+      }
+    }
+
     return exp.orderBy()
         .asc(EbeanAspectV2.URN_COLUMN)
         .orderBy()
         .asc(EbeanAspectV2.ASPECT_COLUMN)
-        .setFirstRow(args.start)
+        .setFirstRow(start)
        .setMaxRows(args.batchSize)
         .findPagedList();
   }
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java
index f40da59a149faa..fd7491fe32ea34 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java
@@ -210,6 +210,18 @@ public BrowseResultV2 browseV2(
     return esBrowseDAO.browseV2(entityName, path, filter, input, start, count);
   }
 
+  @Nonnull
+  @Override
+  public BrowseResultV2 browseV2(
+      @Nonnull List<String> entityNames,
+      @Nonnull String path,
+      @Nullable Filter filter,
+      @Nonnull String input,
+      int start,
+      int count) {
+    return esBrowseDAO.browseV2(entityNames, path, filter, input, start, count);
+  }
+
   @Nonnull
   @Override
   public List<String> getBrowsePaths(@Nonnull String entityName, @Nonnull Urn urn) {
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
index 5ea60b24a577a0..3c71a2dfd91809 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java
@@ -427,6 +427,44 @@ public BrowseResultV2 browseV2(
     }
   }
 
+  public BrowseResultV2 browseV2(
+      @Nonnull List<String> entities,
+      @Nonnull String path,
+      @Nullable Filter filter,
+      @Nonnull String input,
+      int start,
+      int count) {
+    try {
+      final SearchResponse groupsResponse;
+
+      try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esGroupSearch").time()) {
+        final String finalInput = input.isEmpty() ? "*" : input;
+        groupsResponse =
+            client.search(
+                constructGroupsSearchRequestBrowseAcrossEntities(
+                    entities, path, filter, finalInput),
+                RequestOptions.DEFAULT);
+      }
+
+      final BrowseGroupsResultV2 browseGroupsResult =
+          extractGroupsResponseV2(groupsResponse, path, start, count);
+      final int numGroups = browseGroupsResult.getTotalGroups();
+
+      return new BrowseResultV2()
+          .setMetadata(
+              new BrowseResultMetadata()
+                  .setTotalNumEntities(browseGroupsResult.getTotalNumEntities())
+                  .setPath(path))
+          .setGroups(new BrowseResultGroupV2Array(browseGroupsResult.getGroups()))
+          .setNumGroups(numGroups)
+          .setFrom(start)
+          .setPageSize(count);
+    } catch (Exception e) {
+      log.error("Browse Across Entities query failed: " + e.getMessage());
+      throw new ESQueryException("Browse Across Entities query failed: ", e);
+    }
+  }
+
   @Nonnull
   private SearchRequest constructGroupsSearchRequestV2(
       @Nonnull String entityName,
@@ -448,6 +486,33 @@ private SearchRequest constructGroupsSearchRequestV2(
     return searchRequest;
   }
 
+  @Nonnull
+  private SearchRequest constructGroupsSearchRequestBrowseAcrossEntities(
+      @Nonnull List<String> entities,
+      @Nonnull String path,
+      @Nullable Filter filter,
+      @Nonnull String input) {
+
+    List<EntitySpec> entitySpecs =
+        entities.stream().map(entityRegistry::getEntitySpec).collect(Collectors.toList());
+
+    String[] indexArray =
+        entities.stream().map(indexConvention::getEntityIndexName).toArray(String[]::new);
+
+    final SearchRequest searchRequest = new SearchRequest(indexArray);
+    final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
+    searchSourceBuilder.size(0);
+    searchSourceBuilder.query(
+        buildQueryStringBrowseAcrossEntities(
+            entitySpecs,
+            path,
+            SearchUtil.transformFilterForEntities(filter, indexConvention),
+            input));
+    searchSourceBuilder.aggregation(buildAggregationsV2(path));
+    searchRequest.source(searchSourceBuilder);
+    return searchRequest;
+  }
+
   /**
    * Extracts the name of group from path.
    *
@@ -494,6 +559,32 @@ private QueryBuilder buildQueryStringV2(
     return queryBuilder;
   }
 
+  @Nonnull
+  private QueryBuilder buildQueryStringBrowseAcrossEntities(
+      @Nonnull List<EntitySpec> entitySpecs,
+      @Nonnull String path,
+      @Nullable Filter filter,
+      @Nonnull String input) {
+    final int browseDepthVal = getPathDepthV2(path);
+
+    final BoolQueryBuilder queryBuilder = QueryBuilders.boolQuery();
+
+    QueryBuilder query =
+        SearchRequestHandler.getBuilder(entitySpecs, searchConfiguration, customSearchConfiguration)
+            .getQuery(input, false);
+    queryBuilder.must(query);
+
+    if (!path.isEmpty()) {
+      queryBuilder.filter(QueryBuilders.matchQuery(BROWSE_PATH_V2, path));
+    }
+
+    queryBuilder.filter(QueryBuilders.rangeQuery(BROWSE_PATH_V2_DEPTH).gt(browseDepthVal));
+
+    queryBuilder.filter(SearchRequestHandler.getFilterQuery(filter));
+
+    return queryBuilder;
+  }
+
   @Nonnull
   private AggregationBuilder buildAggregationsV2(@Nonnull String path) {
     final String currentLevel = ESUtils.escapeReservedCharacters(path) + "␟.*";
diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java
index b505674f2ed9c2..61b9d5c8167900 100644
--- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java
+++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java
@@ -6,4 +6,5 @@ public class ConsumerConfiguration {
 
   private int maxPartitionFetchBytes;
+  private boolean stopOnDeserializationError;
 }
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index a52b705cb8da63..36498f7c45fea1 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -236,6 +236,7 @@ kafka:
     maxRequestSize: ${KAFKA_PRODUCER_MAX_REQUEST_SIZE:5242880} # the max bytes sent by the producer, also see kafka-setup MAX_MESSAGE_BYTES for matching value
   consumer:
     maxPartitionFetchBytes: ${KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES:5242880} # the max bytes consumed per partition
+    stopOnDeserializationError: ${KAFKA_CONSUMER_STOP_ON_DESERIALIZATION_ERROR:true} # Stops kafka listener container on deserialization error, allows user to fix problems before moving past problematic offset. If false will log and move forward past the offset
   schemaRegistry:
     type: ${SCHEMA_REGISTRY_TYPE:KAFKA} # INTERNAL or KAFKA or AWS_GLUE
     url: ${KAFKA_SCHEMAREGISTRY_URL:http://localhost:8081}
@@ -317,6 +318,7 @@ featureFlags:
   showAccessManagement: ${SHOW_ACCESS_MANAGEMENT:false} #Whether we should show AccessManagement tab in the datahub UI.
   showSearchFiltersV2: ${SHOW_SEARCH_FILTERS_V2:true} # Enables showing the search filters V2 experience.
   showBrowseV2: ${SHOW_BROWSE_V2:true} # Enables showing the browse v2 sidebar experience.
+  platformBrowseV2: ${PLATFORM_BROWSE_V2:false} # Enables the platform browse experience, instead of the entity-oriented browse default.
   preProcessHooks:
     uiEnabled: ${PRE_PROCESS_HOOKS_UI_ENABLED:true} # Circumvents Kafka for processing index updates for UI changes sourced from GraphQL to avoid processing delays
   showAcrylInfo: ${SHOW_ACRYL_INFO:false} # Show different CTAs within DataHub around moving to Managed DataHub. Set to true for the demo site.
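
Taken together, the hunks above thread a multi-entity browseV2 overload from ESBrowseDAO up through ElasticSearchService and the entity clients, gated in the UI by the new platformBrowseV2 flag. A minimal caller sketch follows; the injected EntitySearchService, the entity names, and the browse path are illustrative assumptions, not part of this patch.

import com.linkedin.metadata.browse.BrowseResultV2;
import com.linkedin.metadata.search.EntitySearchService;
import java.util.List;

public final class PlatformBrowseExample {
  /** Browses datasets and charts together under one path (sketch only). */
  public static BrowseResultV2 browsePlatformNode(EntitySearchService searchService) {
    return searchService.browseV2(
        List.of("dataset", "chart"), // entityNames: browse across both entity types
        "/prod/kafka",               // browse path (illustrative)
        null,                        // filter: no extra facet filtering
        "",                          // input: empty query
        0,                           // start offset of the first group
        10);                         // max number of groups to return
  }
}

Note that an empty input is rewritten to "*" inside ESBrowseDAO.browseV2, so the sketch returns every group one level below the given path.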
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java
index 2a6338ac15e93d..4c0308546d857f 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java
@@ -21,6 +21,11 @@
 import org.springframework.kafka.config.ConcurrentKafkaListenerContainerFactory;
 import org.springframework.kafka.config.KafkaListenerContainerFactory;
 import org.springframework.kafka.core.DefaultKafkaConsumerFactory;
+import org.springframework.kafka.listener.CommonContainerStoppingErrorHandler;
+import org.springframework.kafka.listener.CommonDelegatingErrorHandler;
+import org.springframework.kafka.listener.DefaultErrorHandler;
+import org.springframework.kafka.support.serializer.DeserializationException;
+import org.springframework.kafka.support.serializer.ErrorHandlingDeserializer;
 
 @Slf4j
 @Configuration
@@ -66,8 +71,6 @@ private static Map<String, Object> buildCustomizedProperties(
       SchemaRegistryConfig schemaRegistryConfig) {
     KafkaProperties.Consumer consumerProps = baseKafkaProperties.getConsumer();
 
-    // Specify (de)serializers for record keys and for record values.
-    consumerProps.setKeyDeserializer(StringDeserializer.class);
     // Records will be flushed every 10 seconds.
     consumerProps.setEnableAutoCommit(true);
     consumerProps.setAutoCommitInterval(Duration.ofSeconds(10));
@@ -81,7 +84,13 @@ private static Map<String, Object> buildCustomizedProperties(
     Map<String, Object> customizedProperties = baseKafkaProperties.buildConsumerProperties();
     customizedProperties.put(
-        ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, schemaRegistryConfig.getDeserializer());
+        ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer.class);
+    customizedProperties.put(
+        ErrorHandlingDeserializer.KEY_DESERIALIZER_CLASS, StringDeserializer.class);
+    customizedProperties.put(
+        ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ErrorHandlingDeserializer.class);
+    customizedProperties.put(
+        ErrorHandlingDeserializer.VALUE_DESERIALIZER_CLASS, schemaRegistryConfig.getDeserializer());
 
     // Override KafkaProperties with SchemaRegistryConfig only for non-empty values
     schemaRegistryConfig.getProperties().entrySet().stream()
@@ -98,7 +107,8 @@ private static Map<String, Object> buildCustomizedProperties(
   @Bean(name = "kafkaEventConsumer")
   protected KafkaListenerContainerFactory<?> createInstance(
       @Qualifier("kafkaConsumerFactory")
-          DefaultKafkaConsumerFactory<String, GenericRecord> kafkaConsumerFactory) {
+          DefaultKafkaConsumerFactory<String, GenericRecord> kafkaConsumerFactory,
+      @Qualifier("configurationProvider") ConfigurationProvider configurationProvider) {
 
     ConcurrentKafkaListenerContainerFactory<String, GenericRecord> factory =
         new ConcurrentKafkaListenerContainerFactory<>();
@@ -106,6 +116,18 @@ protected KafkaListenerContainerFactory<?> createInstance(
     factory.setContainerCustomizer(new ThreadPoolContainerCustomizer());
     factory.setConcurrency(kafkaEventConsumerConcurrency);
 
+    /* Sets up a delegating error handler for deserialization errors. If disabled, the
+    DefaultErrorHandler is used instead (back-off retry, then log) rather than stopping the container. Stopping the container
Stopping the container + prevents lost messages until the error can be examined, disabling this will allow progress, but may lose data + */ + if (configurationProvider.getKafka().getConsumer().isStopOnDeserializationError()) { + CommonDelegatingErrorHandler delegatingErrorHandler = + new CommonDelegatingErrorHandler(new DefaultErrorHandler()); + delegatingErrorHandler.addDelegate( + DeserializationException.class, new CommonContainerStoppingErrorHandler()); + factory.setCommonErrorHandler(delegatingErrorHandler); + } + log.info( String.format( "Event-based KafkaListenerContainerFactory built successfully. Consumer concurrency = %s", diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java index 7bc50a8f3dc7e6..598c252b4f7664 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -153,6 +153,28 @@ public BrowseResultV2 browseV2( @Nonnull Authentication authentication) throws RemoteInvocationException; + /** + * Gets browse snapshot of a given path + * + * @param entityNames entities being browsed + * @param path path being browsed + * @param filter browse filter + * @param input search query + * @param start start offset of first group + * @param count max number of results requested + * @throws RemoteInvocationException + */ + @Nonnull + public BrowseResultV2 browseV2( + @Nonnull List entityNames, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count, + @Nonnull Authentication authentication) + throws RemoteInvocationException; + @Deprecated public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication) throws RemoteInvocationException; diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index c854cb9dd279ec..d68c472ea91709 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -381,6 +381,20 @@ public BrowseResultV2 browseV2( throw new NotImplementedException("BrowseV2 is not implemented in Restli yet"); } + @Nonnull + @Override + public BrowseResultV2 browseV2( + @Nonnull List entityNames, + @Nonnull String path, + @Nullable Filter filter, + @Nonnull String input, + int start, + int count, + @Nonnull Authentication authentication) + throws RemoteInvocationException { + throw new NotImplementedException("BrowseV2 is not implemented in Restli yet"); + } + public void update(@Nonnull final Entity entity, @Nonnull final Authentication authentication) throws RemoteInvocationException { EntitiesDoIngestRequestBuilder requestBuilder = diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java index d8fcbe0b7d44d3..e50b44b7f0eca3 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesArgs.java 
@@ -11,6 +11,9 @@ public class RestoreIndicesArgs implements Cloneable {
   public String aspectName;
   public String urn;
   public String urnLike;
+  public Boolean urnBasedPagination = false;
+  public String lastUrn = "";
+  public String lastAspect = "";
 
   @Override
   public RestoreIndicesArgs clone() {
@@ -51,4 +54,9 @@ public RestoreIndicesArgs setBatchSize(Integer batchSize) {
     }
     return this;
   }
+
+  public RestoreIndicesArgs setUrnBasedPagination(Boolean urnBasedPagination) {
+    this.urnBasedPagination = urnBasedPagination;
+    return this;
+  }
 }
diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java
index 8479338660db0b..a270cf4548bed5 100644
--- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java
+++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/restoreindices/RestoreIndicesResult.java
@@ -13,4 +13,6 @@ public class RestoreIndicesResult {
   public long aspectCheckMs = 0;
   public long createRecordMs = 0;
   public long sendMessageMs = 0;
+  public String lastUrn = "";
+  public String lastAspect = "";
 }
diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
index 09a63e769f0253..189ae09e1b9382 100644
--- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
+++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
@@ -207,6 +207,25 @@ public BrowseResultV2 browseV2(
       int start,
       int count);
 
+  /**
+   * Gets browse snapshot of a given path
+   *
+   * @param entityNames set of entities being browsed
+   * @param path path being browsed
+   * @param filter browse filter
+   * @param input search query
+   * @param start start offset of first group
+   * @param count max number of results requested
+   */
+  @Nonnull
+  public BrowseResultV2 browseV2(
+      @Nonnull List<String> entityNames,
+      @Nonnull String path,
+      @Nullable Filter filter,
+      @Nonnull String input,
+      int start,
+      int count);
+
   /**
    * Gets a list of paths for a given urn.
    *
diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
index 7ddf36aa87c2d1..dd3b0a567c75f8 100644
--- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
+++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js
@@ -1,4 +1,5 @@
 const glossaryTerm = "CypressGlosssaryNavigationTerm";
+const glossarySecondTerm = "CypressGlossarySecondTerm";
 const glossaryTermGroup = "CypressGlosssaryNavigationGroup";
 const glossaryParentGroup = "CypressNode";
 
@@ -30,6 +31,39 @@ describe("glossary sidebar navigation test", () => {
     cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click().wait(3000);
     cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTerm).should("be.visible");
 
+    // Create another term and move it to the same term group
+    cy.clickOptionWithText(glossaryTermGroup);
+    cy.openThreeDotDropdown();
+    cy.clickOptionWithTestId("entity-menu-add-term-button");
+
+    // Wait for the create term modal to be visible
+    cy.waitTextVisible("Create Glossary Term");
+    cy.enterTextInTestId("create-glossary-entity-modal-name", glossarySecondTerm);
+    cy.clickOptionWithTestId("glossary-entity-modal-create-button");
+
+    // Wait for the new term to be visible in the sidebar
+    cy.clickOptionWithText(glossarySecondTerm).wait(3000);
+
+    // Move the term to the created term group
+    cy.openThreeDotDropdown();
+    cy.clickOptionWithTestId("entity-menu-move-button");
+    cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).click({ force: true });
+    cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).should("be.visible");
+    cy.clickOptionWithTestId("glossary-entity-modal-move-button");
+    cy.waitTextVisible("Moved Glossary Term!");
+
+    // Ensure the new term is under the parent term group in the navigation sidebar
+    cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click();
+    cy.get('*[class^="GlossaryEntitiesList"]').contains(glossarySecondTerm).should("be.visible");
+
+    // Switch between terms and ensure the "Properties" tab is active
+    cy.clickOptionWithText(glossaryTerm);
+    cy.get('[data-testid="entity-tab-headers-test-id"]').contains("Properties").click({ force: true });
+    cy.get('[data-node-key="Properties"]').contains("Properties").should("have.attr", "aria-selected", "true");
+    cy.clickOptionWithText(glossarySecondTerm);
+    cy.get('[data-node-key="Properties"]').contains("Properties").should("have.attr", "aria-selected", "true");
+
     // Move a term group from the root level to be under a parent term group
     cy.goToGlossaryList();
     cy.clickOptionWithText(glossaryTermGroup);
@@ -52,6 +86,10 @@ describe("glossary sidebar navigation test", () => {
     cy.clickOptionWithText(glossaryTerm).wait(3000);
     cy.deleteFromDropdown();
     cy.waitTextVisible("Deleted Glossary Term!");
+    cy.clickOptionWithText(glossaryTermGroup);
+    cy.clickOptionWithText(glossarySecondTerm).wait(3000);
+    cy.deleteFromDropdown();
+    cy.waitTextVisible("Deleted Glossary Term!");
     cy.clickOptionWithText(glossaryParentGroup);
     cy.clickOptionWithText(glossaryTermGroup).wait(3000);
     cy.deleteFromDropdown();
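
The RestoreIndicesArgs and RestoreIndicesResult changes above add a resumable cursor (lastUrn, lastAspect) alongside the urnBasedPagination switch. A minimal driver sketch follows, assuming an EntityService exposing restoreIndices(args, logger) and the pre-existing rowsMigrated counter; both of those are assumptions beyond this patch's hunks.

import com.linkedin.metadata.entity.EntityService;
import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs;
import com.linkedin.metadata.entity.restoreindices.RestoreIndicesResult;

public final class RestoreIndicesDriver {
  /** Runs restoreIndices batch by batch, resuming from the new cursor fields (sketch only). */
  public static void restoreAll(EntityService entityService) {
    RestoreIndicesArgs args =
        new RestoreIndicesArgs().setBatchSize(500).setUrnBasedPagination(true);
    RestoreIndicesResult result;
    do {
      result = entityService.restoreIndices(args, System.out::println);
      // Resume after the last processed (urn, aspect) pair instead of a numeric offset,
      // matching the >= urn / > aspect predicates added to EbeanAspectDao.getPagedAspects.
      args.lastUrn = result.lastUrn;
      args.lastAspect = result.lastAspect;
    } while (result.rowsMigrated > 0); // simplified stop condition (assumed field)
  }
}

Compared with setFirstRow offset paging, resuming at the last (urn, aspect) pair keeps batches stable even if rows are written while the restore is running, which is presumably why the DAO zeroes the offset when urnBasedPagination is enabled.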