From 8c724dbf47dd76a4aefec0a93267e08ddeda7e58 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:45:38 -0600 Subject: [PATCH 01/17] feat(api): authorization extended for soft-delete and suspend (#12158) --- datahub-frontend/app/auth/AuthModule.java | 2 + .../upgrade/config/SystemUpdateConfig.java | 2 + .../restorebackup/RestoreStorageStep.java | 2 +- .../upgrade/system/AbstractMCLStep.java | 3 +- .../bootstrapmcps/BootstrapMCPUtil.java | 4 +- ...ateSchemaFieldsFromSchemaMetadataStep.java | 10 +- ...chemaFieldsFromSchemaMetadataStepTest.java | 3 +- .../aspect/CachingAspectRetriever.java | 36 +++- .../metadata/aspect/GraphRetriever.java | 23 +++ .../metadata/entity/SearchRetriever.java | 19 ++ .../metadata/aspect/MockAspectRetriever.java | 4 +- .../java/com/linkedin/metadata/Constants.java | 2 + .../ebean/batch/AspectsBatchImplTest.java | 8 +- .../aspect/utils/DefaultAspectsUtil.java | 2 +- .../client/EntityClientAspectRetriever.java | 7 +- .../metadata/client/JavaEntityClient.java | 21 ++- .../client/SystemJavaEntityClient.java | 2 +- .../entity/EntityServiceAspectRetriever.java | 10 +- .../metadata/entity/EntityServiceImpl.java | 67 +++---- .../linkedin/metadata/entity/EntityUtils.java | 2 +- .../cassandra/CassandraRetentionService.java | 2 +- .../entity/ebean/EbeanRetentionService.java | 2 +- .../query/filter/BaseQueryFilterRewriter.java | 2 +- .../SearchDocumentTransformer.java | 2 - .../BusinessAttributeUpdateHookService.java | 4 +- .../service/UpdateGraphIndicesService.java | 3 +- .../service/UpdateIndicesService.java | 5 +- .../metadata/AspectIngestionUtils.java | 12 +- .../hooks/IgnoreUnknownMutatorTest.java | 12 +- .../aspect/utils/DefaultAspectsUtilTest.java | 3 +- .../DataProductUnsetSideEffectTest.java | 8 +- .../entity/EbeanEntityServiceTest.java | 36 ++-- .../metadata/entity/EntityServiceTest.java | 118 ++++++------ .../cassandra/CassandraEntityServiceTest.java | 11 +- .../ebean/batch/ChangeItemImplTest.java | 4 +- .../RecommendationsServiceTest.java | 3 +- .../SchemaFieldSideEffectTest.java | 12 +- .../ContainerExpansionRewriterTest.java | 5 +- .../filter/DomainExpansionRewriterTest.java | 9 +- .../request/AggregationQueryBuilderTest.java | 9 +- .../request/SearchRequestHandlerTest.java | 1 + .../SearchDocumentTransformerTest.java | 12 ++ ...ropertyDefinitionDeleteSideEffectTest.java | 12 +- .../ShowPropertyAsBadgeValidatorTest.java | 2 +- .../io/datahubproject/test/DataGenerator.java | 5 +- .../MCLSpringCommonTestConfiguration.java | 3 +- .../hook/BusinessAttributeUpdateHookTest.java | 16 +- .../metadata/context/ActorContext.java | 48 +++++ .../metadata/context/OperationContext.java | 123 ++++++++----- .../metadata/context/RetrieverContext.java | 29 +++ .../exception/ActorAccessException.java | 7 + .../exception/OperationContextException.java | 9 + .../context/TestOperationContexts.java | 139 ++++++-------- .../context/OperationContextTest.java | 3 +- .../token/StatefulTokenService.java | 2 +- .../src/main/resources/application.yaml | 6 +- .../SystemOperationContextFactory.java | 14 +- .../IngestDataPlatformInstancesStep.java | 4 +- .../boot/steps/IngestPoliciesStep.java | 2 +- .../GlobalControllerExceptionHandler.java | 14 +- .../controller/GenericEntitiesController.java | 8 +- .../openapi/operations/test/IdController.java | 54 ++++++ .../openapi/util/MappingUtil.java | 2 +- .../v2/controller/EntityController.java | 4 +- .../v3/controller/EntityController.java | 4 +- 
...m.linkedin.entity.entitiesV2.restspec.json | 8 + ...m.linkedin.entity.entitiesV2.snapshot.json | 8 + .../linkedin/entity/client/EntityClient.java | 71 ++++++- .../entity/client/RestliEntityClient.java | 13 +- .../client/SystemRestliEntityClient.java | 2 +- .../resources/entity/AspectResource.java | 2 +- .../resources/entity/EntityV2Resource.java | 10 +- .../resources/restli/RestliConstants.java | 3 + .../resources/restli/RestliUtils.java | 8 + .../resources/entity/AspectResourceTest.java | 2 +- .../tokens/revokable_access_token_test.py | 44 +---- .../tests/tokens/session_access_token_test.py | 173 ++++++++++++++++++ smoke-test/tests/tokens/token_utils.py | 53 ++++++ 78 files changed, 980 insertions(+), 431 deletions(-) create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java create mode 100644 metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java rename metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/{ => config}/GlobalControllerExceptionHandler.java (81%) create mode 100644 metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java create mode 100644 smoke-test/tests/tokens/session_access_token_test.py create mode 100644 smoke-test/tests/tokens/token_utils.py diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java index 7fa99ab3cb262..b95515684f01f 100644 --- a/datahub-frontend/app/auth/AuthModule.java +++ b/datahub-frontend/app/auth/AuthModule.java @@ -27,6 +27,7 @@ import io.datahubproject.metadata.context.EntityRegistryContext; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; +import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.SearchContext; import io.datahubproject.metadata.context.ValidationContext; import java.nio.charset.StandardCharsets; @@ -195,6 +196,7 @@ protected OperationContext provideOperationContext( .searchContext(SearchContext.EMPTY) .entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY)) .validationContext(ValidationContext.builder().alternateValidation(false).build()) + .retrieverContext(RetrieverContext.EMPTY) .build(systemAuthentication); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java index 661717c6309cf..fdd84da6044f7 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/SystemUpdateConfig.java @@ -13,6 +13,7 @@ import com.linkedin.gms.factory.kafka.common.TopicConventionFactory; import com.linkedin.gms.factory.kafka.schemaregistry.InternalSchemaRegistryFactory; import com.linkedin.gms.factory.search.BaseElasticSearchComponentsFactory; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.kafka.KafkaConfiguration; import com.linkedin.metadata.dao.producer.KafkaEventProducer; import com.linkedin.metadata.dao.producer.KafkaHealthChecker; @@ -186,6 +187,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + 
.cachingAspectRetriever(CachingAspectRetriever.EMPTY) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java index 4d53b603c1eaf..1e5cd6cdb2417 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/restorebackup/RestoreStorageStep.java @@ -180,7 +180,7 @@ private void readerExecutable(ReaderWrapper reader, UpgradeContext context) { try { aspectRecord = EntityUtils.toSystemAspect( - context.opContext().getRetrieverContext().get(), aspect.toEntityAspect()) + context.opContext().getRetrieverContext(), aspect.toEntityAspect()) .get() .getRecordTemplate(); } catch (Exception e) { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java index cd7947ce3c11a..56feffd211bcd 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/AbstractMCLStep.java @@ -113,8 +113,7 @@ public Function executable() { List, SystemAspect>> futures; futures = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), - batch.collect(Collectors.toList())) + opContext.getRetrieverContext(), batch.collect(Collectors.toList())) .stream() .map( systemAspect -> { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java index 4cc3edff3eb52..5b807c6c450af 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/bootstrapmcps/BootstrapMCPUtil.java @@ -100,8 +100,8 @@ static AspectsBatch generateAspectBatch( .collect(Collectors.toList()); return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) - .retrieverContext(opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java index 55bc8edbf6a76..de03538907432 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/schemafield/GenerateSchemaFieldsFromSchemaMetadataStep.java @@ -168,13 +168,13 @@ public Function executable() { AspectsBatch aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( batch .flatMap( ebeanAspectV2 -> EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), Set.of(ebeanAspectV2)) .stream()) .map( @@ -189,11 +189,7 @@ 
public Function executable() { .auditStamp(systemAspect.getAuditStamp()) .systemMetadata( withAppSource(systemAspect.getSystemMetadata())) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); diff --git a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java index 3a2728b4e1d3d..04b1095e770e0 100644 --- a/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java +++ b/datahub-upgrade/src/test/java/com/linkedin/datahub/upgrade/schemafield/GenerateSchemaFieldsFromSchemaMetadataStepTest.java @@ -22,7 +22,6 @@ import com.linkedin.upgrade.DataHubUpgradeState; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RetrieverContext; -import java.util.Optional; import java.util.stream.Stream; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -48,7 +47,7 @@ public void setup() { step = new GenerateSchemaFieldsFromSchemaMetadataStep( mockOpContext, mockEntityService, mockAspectDao, 10, 100, 1000); - when(mockOpContext.getRetrieverContext()).thenReturn(Optional.of(mockRetrieverContext)); + when(mockOpContext.getRetrieverContext()).thenReturn(mockRetrieverContext); } /** Test to verify the correct step ID is returned. */ diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java index 77e799f752455..375dd8cf8911e 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/CachingAspectRetriever.java @@ -1,4 +1,38 @@ package com.linkedin.metadata.aspect; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.metadata.models.registry.EmptyEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nonnull; + /** Responses can be cached based on application.yaml caching configuration for the EntityClient */ -public interface CachingAspectRetriever extends AspectRetriever {} +public interface CachingAspectRetriever extends AspectRetriever { + + CachingAspectRetriever EMPTY = new EmptyAspectRetriever(); + + class EmptyAspectRetriever implements CachingAspectRetriever { + @Nonnull + @Override + public Map> getLatestAspectObjects( + Set urns, Set aspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public Map> getLatestSystemAspects( + Map> urnAspectNames) { + return Collections.emptyMap(); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return EmptyEntityRegistry.EMPTY; + } + } +} diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java index f6858e7da4ba6..30a2c1eb9df8c 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/GraphRetriever.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.query.filter.Filter; 
import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.query.filter.SortCriterion; +import java.util.Collections; import java.util.List; import java.util.function.Function; import javax.annotation.Nonnull; @@ -97,4 +98,26 @@ default void consumeRelatedEntities( } } } + + GraphRetriever EMPTY = new EmptyGraphRetriever(); + + class EmptyGraphRetriever implements GraphRetriever { + + @Nonnull + @Override + public RelatedEntitiesScrollResult scrollRelatedEntities( + @Nullable List sourceTypes, + @Nonnull Filter sourceEntityFilter, + @Nullable List destinationTypes, + @Nonnull Filter destinationEntityFilter, + @Nonnull List relationshipTypes, + @Nonnull RelationshipFilter relationshipFilter, + @Nonnull List sortCriterion, + @Nullable String scrollId, + int count, + @Nullable Long startTimeMillis, + @Nullable Long endTimeMillis) { + return new RelatedEntitiesScrollResult(0, 0, null, Collections.emptyList()); + } + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java index eaa106b8d1f63..d4894c97015f8 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/entity/SearchRetriever.java @@ -2,6 +2,7 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.search.ScrollResult; +import com.linkedin.metadata.search.SearchEntityArray; import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -21,4 +22,22 @@ ScrollResult scroll( @Nullable Filter filters, @Nullable String scrollId, int count); + + SearchRetriever EMPTY = new EmptySearchRetriever(); + + class EmptySearchRetriever implements SearchRetriever { + + @Override + public ScrollResult scroll( + @Nonnull List entities, + @Nullable Filter filters, + @Nullable String scrollId, + int count) { + ScrollResult empty = new ScrollResult(); + empty.setEntities(new SearchEntityArray()); + empty.setNumEntities(0); + empty.setPageSize(0); + return empty; + } + } } diff --git a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java index 65705f15022b6..98a6d59004a92 100644 --- a/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java +++ b/entity-registry/src/testFixtures/java/com/linkedin/test/metadata/aspect/MockAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.data.DataMap; import com.linkedin.data.template.RecordTemplate; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.mxe.SystemMetadata; @@ -22,7 +22,7 @@ import javax.annotation.Nonnull; import org.mockito.Mockito; -public class MockAspectRetriever implements AspectRetriever { +public class MockAspectRetriever implements CachingAspectRetriever { private final Map> data; private final Map> systemData = new HashMap<>(); diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index ff6a79108600a..09f873ebf7bc9 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ 
b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -409,6 +409,8 @@ public class Constants { /** User Status */ public static final String CORP_USER_STATUS_ACTIVE = "ACTIVE"; + public static final String CORP_USER_STATUS_SUSPENDED = "SUSPENDED"; + /** Task Runs */ public static final String DATA_PROCESS_INSTANCE_ENTITY_NAME = "dataProcessInstance"; diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java index 9f57d36f800de..a3099b9ee21ea 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -16,7 +16,7 @@ import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.patch.GenericJsonPatch; @@ -56,7 +56,7 @@ public class AspectsBatchImplTest { private EntityRegistry testRegistry; - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeTest @@ -75,12 +75,12 @@ public void beforeTest() throws EntityRegistryException { @BeforeMethod public void setup() { - this.mockAspectRetriever = mock(AspectRetriever.class); + this.mockAspectRetriever = mock(CachingAspectRetriever.class); when(this.mockAspectRetriever.getEntityRegistry()).thenReturn(testRegistry); this.retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(mock(GraphRetriever.class)) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java index 99eadd223acd1..82bc0ae1409c5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtil.java @@ -137,7 +137,7 @@ public static List getAdditionalChanges( getProposalFromAspectForDefault( entry.getKey(), entry.getValue(), entityKeyAspect, templateItem), templateItem.getAuditStamp(), - opContext.getAspectRetrieverOpt().get())) + opContext.getAspectRetriever())) .filter(Objects::nonNull); }) .collect(Collectors.toList()); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java index bba8324d0c561..669ec751f87c6 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/EntityClientAspectRetriever.java @@ -35,7 +35,7 @@ public EntityRegistry getEntityRegistry() { @Override public Aspect getLatestAspectObject(@Nonnull Urn urn, @Nonnull String aspectName) { try { - return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName); + 
return entityClient.getLatestAspectObject(systemOperationContext, urn, aspectName, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -49,7 +49,7 @@ public Map> getLatestAspectObjects( return Map.of(); } else { try { - return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames); + return entityClient.getLatestAspects(systemOperationContext, urns, aspectNames, false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } @@ -70,7 +70,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())); + .collect(Collectors.toSet()), + false); } catch (RemoteInvocationException | URISyntaxException e) { throw new RuntimeException(e); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index 29faa3955ea66..3d35f5956b0f4 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -106,11 +106,17 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? opContext.getEntityAspectNames(entityName) : aspectNames; - return entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return entityService.getEntityV2( + opContext, + entityName, + urn, + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } @Override @@ -126,7 +132,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull Set urns, - @Nullable Set aspectNames) + @Nullable Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final Set projectedAspects = aspectNames == null ? 
opContext.getEntityAspectNames(entityName) : aspectNames; @@ -139,7 +146,11 @@ public Map batchGetV2( try { responseMap.putAll( entityService.getEntitiesV2( - opContext, entityName, new HashSet<>(batch), projectedAspects)); + opContext, + entityName, + new HashSet<>(batch), + projectedAspects, + alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -772,7 +783,7 @@ public List batchIngestProposals( .mcps( batch, auditStamp, - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java index eda9b3a880228..1d2fd422d7f46 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/SystemJavaEntityClient.java @@ -89,6 +89,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java index 626a1f72f5fb7..50cf8af30d606 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceAspectRetriever.java @@ -5,7 +5,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.entity.Aspect; -import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.metadata.context.OperationContext; @@ -22,7 +22,7 @@ @Getter @Builder -public class EntityServiceAspectRetriever implements CachingAspectRetriever { +public class EntityServiceAspectRetriever implements AspectRetriever { @Setter private OperationContext systemOperationContext; private final EntityRegistry entityRegistry; @@ -46,7 +46,8 @@ public Map> getLatestAspectObjects( String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); try { return entityResponseToAspectMap( - entityService.getEntitiesV2(systemOperationContext, entityName, urns, aspectNames)); + entityService.getEntitiesV2( + systemOperationContext, entityName, urns, aspectNames, false)); } catch (URISyntaxException e) { throw new RuntimeException(e); } @@ -71,7 +72,8 @@ public Map> getLatestSystemAspects( urnAspectNames.keySet(), urnAspectNames.values().stream() .flatMap(Collection::stream) - .collect(Collectors.toSet())), + .collect(Collectors.toSet()), + false), entityRegistry); } catch (URISyntaxException e) { throw new RuntimeException(e); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 6de7784bfbc0e..8ae09111204ca 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -261,8 +261,7 @@ 
public Map> getLatestAspects( } List systemAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()); systemAspects.stream() // for now, don't add the key aspect here we have already added it above @@ -290,8 +289,7 @@ public Map getLatestAspectsForUrn( Map batchGetResults = getLatestAspect(opContext, new HashSet<>(Arrays.asList(urn)), aspectNames, forUpdate); - return EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), batchGetResults.values()) + return EntityUtils.toSystemAspects(opContext.getRetrieverContext(), batchGetResults.values()) .stream() .map( systemAspect -> Pair.of(systemAspect.getAspectName(), systemAspect.getRecordTemplate())) @@ -335,7 +333,7 @@ public Pair getAspectVersionPair( final Optional maybeAspect = Optional.ofNullable(aspectDao.getAspect(primaryKey)); return Pair.of( - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), maybeAspect.orElse(null)) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), maybeAspect.orElse(null)) .map(SystemAspect::getRecordTemplate) .orElse(null), version); @@ -721,7 +719,7 @@ public ListResult listLatestAspects( } return new ListResult<>( - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), entityAspects).stream() + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), entityAspects).stream() .map(SystemAspect::getRecordTemplate) .collect(Collectors.toList()), aspectMetadataList.getMetadata(), @@ -758,12 +756,12 @@ public List ingestAspects( .recordTemplate(pair.getValue()) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); return ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -815,13 +813,13 @@ private void processPostCommitMCLSideEffects( log.debug("Considering {} MCLs post commit side effects.", mcls.size()); List batch = mcls.stream() - .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetrieverOpt().get())) + .map(mcl -> MCLItemImpl.builder().build(mcl, opContext.getAspectRetriever())) .collect(Collectors.toList()); Iterable> iterable = () -> Iterators.partition( - AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext().get()) + AspectsBatch.applyPostMCPSideEffects(batch, opContext.getRetrieverContext()) .iterator(), MCP_SIDE_EFFECT_KAFKA_BATCH_SIZE); StreamSupport.stream(iterable.spliterator(), false) @@ -831,7 +829,7 @@ private void processPostCommitMCLSideEffects( ingestProposalAsync( AspectsBatchImpl.builder() .items(sideEffects) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build()) .count(); log.info("Generated {} MCP SideEffects for async processing", count); @@ -879,8 +877,7 @@ private List ingestAspectsToLocalDB( aspectDao.getLatestAspects(urnAspects, true); final Map> batchAspects = - EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), databaseAspects); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), databaseAspects); // read #2 (potentially) final Map> nextVersions = @@ -903,7 +900,7 @@ private List ingestAspectsToLocalDB( Map> newLatestAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), 
+ opContext.getRetrieverContext(), aspectDao.getLatestAspects(updatedItems.getFirst(), true)); // merge updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); @@ -941,7 +938,7 @@ private List ingestAspectsToLocalDB( // do final pre-commit checks with previous aspect value ValidationExceptionCollection exceptions = - AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext().get()); + AspectsBatch.validatePreCommit(changeMCPs, opContext.getRetrieverContext()); if (exceptions.hasFatalExceptions()) { // IF this is a client request/API request we fail the `transaction batch` @@ -1143,8 +1140,8 @@ public RecordTemplate ingestAspectIfNotPresent( .recordTemplate(newValue) .systemMetadata(systemMetadata) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()), - opContext.getRetrieverContext().get()) + .build(opContext.getAspectRetriever()), + opContext.getRetrieverContext()) .build(); List ingested = ingestAspects(opContext, aspectsBatch, true, false); @@ -1169,7 +1166,7 @@ public IngestResult ingestProposal( return ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, opContext.getRetrieverContext()) .build(), async) .stream() @@ -1246,7 +1243,7 @@ private Stream ingestTimeseriesProposal( .recordTemplate( EntityApiUtils.buildKeyAspect( opContext.getEntityRegistry(), item.getUrn())) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()); ingestProposalSync( @@ -1469,7 +1466,7 @@ public List restoreIndices( List systemAspects = EntityUtils.toSystemAspectFromEbeanAspects( - opContext.getRetrieverContext().get(), batch.collect(Collectors.toList())); + opContext.getRetrieverContext(), batch.collect(Collectors.toList())); RestoreIndicesResult result = restoreIndices(opContext, systemAspects, logger); result.timeSqlQueryMs = timeSqlQueryMs; @@ -1513,7 +1510,7 @@ public List restoreIndices( long startTime = System.currentTimeMillis(); List systemAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), getLatestAspect(opContext, entityBatch.getValue(), aspectNames, false).values()); long timeSqlQueryMs = System.currentTimeMillis() - startTime; @@ -1649,12 +1646,12 @@ private RestoreIndicesResult restoreIndices( .auditStamp(auditStamp) .systemMetadata(latestSystemMetadata) .recordTemplate(EntityApiUtils.buildKeyAspect(opContext.getEntityRegistry(), urn)) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); Stream defaultAspectsResult = ingestProposalSync( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(keyAspect) .build()); defaultAspectsCreated += defaultAspectsResult.count(); @@ -1966,7 +1963,7 @@ private void ingestSnapshotUnion( AspectsBatchImpl aspectsBatch = AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( aspectRecordsToIngest.stream() .map( @@ -1977,7 +1974,7 @@ private void ingestSnapshotUnion( .recordTemplate(pair.getValue()) .auditStamp(auditStamp) .systemMetadata(systemMetadata) - .build(opContext.getAspectRetrieverOpt().get())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList())) .build(); @@ -2128,7 +2125,7 @@ public RollbackRunResult 
deleteUrn(@Nonnull OperationContext opContext, Urn urn) } SystemMetadata latestKeySystemMetadata = - EntityUtils.toSystemAspect(opContext.getRetrieverContext().get(), latestKey) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), latestKey) .map(SystemAspect::getSystemMetadata) .get(); RollbackResult result = @@ -2253,11 +2250,11 @@ private RollbackResult deleteAspectWithoutMCL( .urn(entityUrn) .aspectName(aspectName) .auditStamp(auditStamp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); // Delete validation hooks ValidationExceptionCollection exceptions = - AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext().get()); + AspectsBatch.validateProposed(List.of(deleteItem), opContext.getRetrieverContext()); if (!exceptions.isEmpty()) { throw new ValidationException(collectMetrics(exceptions).toString()); } @@ -2271,7 +2268,7 @@ private RollbackResult deleteAspectWithoutMCL( final EntityAspect.EntitySystemAspect latest = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getLatestAspect(urn, aspectName, false)) .orElse(null); @@ -2299,7 +2296,7 @@ private RollbackResult deleteAspectWithoutMCL( EntityAspect.EntitySystemAspect candidateAspect = (EntityAspect.EntitySystemAspect) EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), + opContext.getRetrieverContext(), aspectDao.getAspect(urn, aspectName, maxVersion)) .orElse(null); SystemMetadata previousSysMetadata = @@ -2325,13 +2322,9 @@ private RollbackResult deleteAspectWithoutMCL( .urn(UrnUtils.getUrn(toDelete.getUrn())) .aspectName(toDelete.getAspect()) .auditStamp(auditStamp) - .build( - opContext - .getRetrieverContext() - .get() - .getAspectRetriever())) + .build(opContext.getAspectRetriever())) .collect(Collectors.toList()), - opContext.getRetrieverContext().get()); + opContext.getRetrieverContext()); if (!preCommitExceptions.isEmpty()) { throw new ValidationException(collectMetrics(preCommitExceptions).toString()); } @@ -2509,7 +2502,7 @@ private Map getEnvelopedAspects( final Map dbEntries = aspectDao.batchGet(dbKeys, false); List envelopedAspects = - EntityUtils.toSystemAspects(opContext.getRetrieverContext().get(), dbEntries.values()); + EntityUtils.toSystemAspects(opContext.getRetrieverContext(), dbEntries.values()); return envelopedAspects.stream() .collect( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java index 3c4109970e9d0..da48a2b76d6d5 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java @@ -72,7 +72,7 @@ public static void ingestChangeProposals( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext().get()) + .mcps(changes, getAuditStamp(actor), opContext.getRetrieverContext()) .build(), async); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java index ccc1910ba5cdb..c595e3e07b834 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/entity/cassandra/CassandraRetentionService.java @@ -64,7 +64,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java index 49fa555e006f6..74d0d8b0964de 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanRetentionService.java @@ -59,7 +59,7 @@ protected AspectsBatch buildAspectsBatch( List mcps, @Nonnull AuditStamp auditStamp) { return AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, opContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, opContext.getRetrieverContext()) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java index 367705d369c7c..6c5c6243d3362 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/filter/BaseQueryFilterRewriter.java @@ -143,7 +143,7 @@ private static QueryBuilder expandTerms( if (!queryUrns.isEmpty()) { scrollGraph( - opContext.getRetrieverContext().get().getGraphRetriever(), + opContext.getRetrieverContext().getGraphRetriever(), queryUrns, relationshipTypes, relationshipDirection, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java index 4bb8e0630de48..b4ad847cb7afc 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformer.java @@ -437,8 +437,6 @@ private void setStructuredPropertiesSearchValue( Map> definitions = opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( propertyMap.keySet(), Set.of(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME)); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java index ad2825ead3d0d..4a692e9534622 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/BusinessAttributeUpdateHookService.java @@ -112,7 +112,7 @@ private void fetchRelatedEntities( @Nullable String scrollId, int consumedEntityCount, int batchNumber) { - GraphRetriever graph = opContext.getRetrieverContext().get().getGraphRetriever(); + GraphRetriever graph = opContext.getRetrieverContext().getGraphRetriever(); final ArrayList> futureList = new ArrayList<>(); RelatedEntitiesScrollResult result = graph.scrollRelatedEntities( @@ -165,7 +165,7 @@ private Callable processBatch( return () -> { StopWatch stopWatch = new StopWatch(); stopWatch.start(); - 
AspectRetriever aspectRetriever = opContext.getAspectRetrieverOpt().get(); + AspectRetriever aspectRetriever = opContext.getAspectRetriever(); log.info("Batch {} for BA:{} started", batchNumber, entityKey); ExecutionResult executionResult = new ExecutionResult(); executionResult.setBatchNumber(batchNumber); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java index efe073fc00dfd..4b09bc00efb61 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateGraphIndicesService.java @@ -94,8 +94,7 @@ public UpdateGraphIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl mclItem = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl mclItem = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); if (UPDATE_CHANGE_TYPES.contains(event.getChangeType())) { handleUpdateChangeEvent(opContext, mclItem); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index 187ef3e8c6229..c5fc9ebdac9fa 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ -121,11 +121,10 @@ public UpdateIndicesService( public void handleChangeEvent( @Nonnull OperationContext opContext, @Nonnull final MetadataChangeLog event) { try { - MCLItemImpl batch = - MCLItemImpl.builder().build(event, opContext.getAspectRetrieverOpt().get()); + MCLItemImpl batch = MCLItemImpl.builder().build(event, opContext.getAspectRetriever()); Stream sideEffects = - AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext().get()); + AspectsBatch.applyMCLSideEffects(List.of(batch), opContext.getRetrieverContext()); for (MCLItem mclItem : Stream.concat(Stream.of(batch), sideEffects).collect(Collectors.toList())) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java index 12b12cf105196..fa6ab7932001b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/AspectIngestionUtils.java @@ -46,12 +46,12 @@ public static Map ingestCorpUserKeyAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -83,12 +83,12 @@ public static Map ingestCorpUserInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + 
.retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -121,12 +121,12 @@ public static Map ingestChartInfoAspects( .recordTemplate(aspect) .auditStamp(AspectGenerationUtils.createAuditStamp()) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); } entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java index 11a3153abcaee..19be1eb14667d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java @@ -16,7 +16,8 @@ import com.linkedin.data.template.StringMap; import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; import com.linkedin.metadata.entity.SearchRetriever; @@ -28,7 +29,6 @@ import com.linkedin.mxe.SystemMetadata; import com.linkedin.test.metadata.aspect.TestEntityRegistry; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.List; @@ -53,17 +53,17 @@ public class IgnoreUnknownMutatorTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java index 04aff4edf456d..e7ed267113159 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/utils/DefaultAspectsUtilTest.java @@ -56,8 +56,7 @@ public void testAdditionalChanges() { DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext().get()) + .mcps(List.of(proposal1), new AuditStamp(), opContext.getRetrieverContext()) .build() .getMCPItems(), entityServiceImpl, diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java index 976b165fea53d..215e1e2431efa 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java @@ -15,7 +15,7 @@ import com.linkedin.dataproduct.DataProductAssociationArray; import com.linkedin.dataproduct.DataProductProperties; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.aspect.batch.MCPItem; @@ -75,12 +75,12 @@ public class DataProductUnsetSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); GraphRetriever graphRetriever = mock(GraphRetriever.class); RelatedEntities relatedEntities = @@ -139,7 +139,7 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .graphRetriever(graphRetriever) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index 0386031cbcad8..88f84ee94c8ee 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -19,6 +19,7 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.Constants; import com.linkedin.metadata.EbeanTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.EbeanConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.ebean.EbeanAspectDao; @@ -98,12 +99,15 @@ public void setupTest() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - ((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } @@ -152,25 +156,25 @@ public void testIngestListLatestAspects() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) 
.aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -230,25 +234,25 @@ public void testIngestListUrns() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)), + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null))); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null))); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -310,11 +314,11 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item)) .build(), false, @@ -356,7 +360,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items( List.of( ChangeItemImpl.builder() @@ -365,7 +369,7 @@ public void testSystemMetadataDuplicateKey() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)))) + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)))) .build(), false, true); @@ -600,7 +604,7 @@ public void run() { auditStamp.setTime(System.currentTimeMillis()); AspectsBatchImpl batch = AspectsBatchImpl.builder() - .mcps(mcps, auditStamp, operationContext.getRetrieverContext().get()) + .mcps(mcps, auditStamp, operationContext.getRetrieverContext()) .build(); entityService.ingestProposal(operationContext, batch, false); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java 
b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 2d59632e6f3c6..c00632e5cf542 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -945,32 +945,32 @@ public void testRollbackAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1037,25 +1037,25 @@ public void testRollbackKey() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) .recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1130,39 +1130,39 @@ public void testRollbackUrn() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(keyAspectName) .recordTemplate(writeKey1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn2) .aspectName(aspectName) .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn3) .aspectName(aspectName) .recordTemplate(writeAspect3) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn1) .aspectName(aspectName) 
.recordTemplate(writeAspect1Overwrite) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1208,11 +1208,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1264,11 +1264,11 @@ public void testIngestGetLatestAspect() throws AssertionError { .recordTemplate(writeAspect2) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata2) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1320,11 +1320,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect1) .auditStamp(TEST_AUDIT_STAMP) .systemMetadata(metadata1) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1347,11 +1347,11 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1416,11 +1416,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(metadata1) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1472,11 +1472,11 @@ public void testIngestSameAspect() throws AssertionError { .recordTemplate(writeAspect2) .systemMetadata(metadata2) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1534,46 +1534,46 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - 
.build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName) .recordTemplate(writeAspect1b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2a) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2b) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1610,18 +1610,18 @@ public void testRetention() throws AssertionError { .recordTemplate(writeAspect1c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get()), + .build(opContext.getAspectRetriever()), ChangeItemImpl.builder() .urn(entityUrn) .aspectName(aspectName2) .recordTemplate(writeAspect2c) .systemMetadata(AspectGenerationUtils.createSystemMetadata()) .auditStamp(TEST_AUDIT_STAMP) - .build(opContext.getAspectRetrieverOpt().get())); + .build(opContext.getAspectRetriever())); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(items) .build(), true, @@ -1982,8 +1982,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( - opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -1995,7 +1994,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { SystemEntityClient mockSystemEntityClient = Mockito.mock(SystemEntityClient.class); Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(firstPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(firstPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(structuredPropertyDefinition.data())); // Add a value for that property @@ -2062,8 +2064,7 @@ public void testStructuredPropertyIngestProposal() throws Exception { stream .map( entityAspect -> - EntityUtils.toSystemAspect( 
- opContext.getRetrieverContext().get(), entityAspect) + EntityUtils.toSystemAspect(opContext.getRetrieverContext(), entityAspect) .get() .getAspect(StructuredPropertyDefinition.class)) .collect(Collectors.toSet()); @@ -2074,7 +2075,10 @@ public void testStructuredPropertyIngestProposal() throws Exception { Mockito.when( mockSystemEntityClient.getLatestAspectObject( - any(OperationContext.class), eq(secondPropertyUrn), eq("propertyDefinition"))) + any(OperationContext.class), + eq(secondPropertyUrn), + eq("propertyDefinition"), + anyBoolean())) .thenReturn(new com.linkedin.entity.Aspect(secondDefinition.data())); // Get existing value for first structured property @@ -2209,7 +2213,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -2217,11 +2221,11 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(item1, item2)) .build(), false, @@ -2269,7 +2273,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2311,7 +2315,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2320,7 +2324,7 @@ public void testBatchPatchWithTrailingNoOp() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchRemoveNonExistent)) .build(), false, @@ -2368,7 +2372,7 @@ public void testBatchPatchAdd() throws Exception { .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd3 = PatchItemImpl.builder() @@ -2428,7 +2432,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2437,7 +2441,7 @@ public void testBatchPatchAdd() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - 
.retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd3, patchAdd2, patchAdd1)) .build(), false, @@ -2491,7 +2495,7 @@ public void testBatchPatchAddDuplicate() throws Exception { .recordTemplate(new GlobalTags().setTags(new TagAssociationArray(initialTags))) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); PatchItemImpl patchAdd2 = PatchItemImpl.builder() @@ -2516,7 +2520,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(initialAspectTag1)) .build(), false, @@ -2525,7 +2529,7 @@ public void testBatchPatchAddDuplicate() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd2, patchAdd2)) // duplicate .build(), false, @@ -2581,7 +2585,7 @@ public void testPatchRemoveNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchRemove)) .build(), false, @@ -2638,7 +2642,7 @@ public void testPatchAddNonExistent() throws Exception { _entityServiceImpl.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(patchAdd)) .build(), false, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java index 550f55e6bfd0b..b4fbfecc9d60d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/cassandra/CassandraEntityServiceTest.java @@ -10,11 +10,13 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.AspectIngestionUtils; import com.linkedin.metadata.CassandraTestUtils; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.EntityServiceAspectRetriever; import com.linkedin.metadata.entity.EntityServiceImpl; import com.linkedin.metadata.entity.EntityServiceTest; import com.linkedin.metadata.entity.ListResult; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.registry.EntityRegistryException; @@ -93,12 +95,15 @@ private void configureComponents() { .entityService(_entityServiceImpl) .entityRegistry(_testEntityRegistry) .build()) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> _testEntityRegistry)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, opContext -> - 
((EntityServiceAspectRetriever) opContext.getAspectRetrieverOpt().get()) + ((EntityServiceAspectRetriever) opContext.getAspectRetriever()) .setSystemOperationContext(opContext), null); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java index 3f6b301e72aa5..0a867ae3c8f2e 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImplTest.java @@ -26,7 +26,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(true)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); ChangeItemImpl item2 = ChangeItemImpl.builder() .urn(entityUrn) @@ -34,7 +34,7 @@ public void testBatchDuplicate() throws Exception { .recordTemplate(new Status().setRemoved(false)) .systemMetadata(systemMetadata.copy()) .auditStamp(TEST_AUDIT_STAMP) - .build(TestOperationContexts.emptyAspectRetriever(null)); + .build(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); assertFalse(item1.isDatabaseDuplicateOf(item2)); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java index ca42f0327c86d..8f68f119cb0b7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/recommendation/RecommendationsServiceTest.java @@ -11,6 +11,7 @@ import com.linkedin.metadata.recommendation.ranker.SimpleRecommendationRanker; import io.datahubproject.test.metadata.context.TestOperationContexts; import java.net.URISyntaxException; +import java.nio.file.AccessDeniedException; import java.util.List; import java.util.stream.Collectors; import org.testng.annotations.Test; @@ -74,7 +75,7 @@ private List getContentFromUrns(List urns) { } @Test - public void testService() throws URISyntaxException { + public void testService() throws URISyntaxException, AccessDeniedException { // Test non-eligible and empty RecommendationsService service = new RecommendationsService(ImmutableList.of(nonEligibleSource, emptySource), ranker); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java index 1661f5f02ee59..fa895cb454011 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/schemafields/sideeffects/SchemaFieldSideEffectTest.java @@ -21,7 +21,8 @@ import com.linkedin.data.ByteString; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCLItem; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -46,7 +47,6 @@ import 
com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCP; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -87,18 +87,18 @@ public class SchemaFieldSideEffectTest { .build())) .build(); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); retrieverContext = RetrieverContext.builder() .searchRetriever(mock(SearchRetriever.class)) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java index fd768424e13c1..1825b65a18ab1 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/ContainerExpansionRewriterTest.java @@ -20,6 +20,7 @@ import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java index 8741e24b1bca5..de375271ed660 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/filter/DomainExpansionRewriterTest.java @@ -13,13 +13,14 @@ import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.RetrieverContext; import com.linkedin.metadata.aspect.models.graph.Edge; import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.config.search.QueryFilterRewriterConfiguration; +import 
com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; @@ -54,7 +55,7 @@ public class DomainExpansionRewriterTest @BeforeMethod public void init() { EntityRegistry entityRegistry = new TestEntityRegistry(); - AspectRetriever mockAspectRetriever = mock(AspectRetriever.class); + CachingAspectRetriever mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(entityRegistry); mockGraphRetriever = spy(GraphRetriever.class); @@ -71,8 +72,10 @@ public void init() { () -> io.datahubproject.metadata.context.RetrieverContext.builder() .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever(() -> entityRegistry)) .graphRetriever(mockGraphRetriever) - .searchRetriever(TestOperationContexts.emptySearchRetriever) + .searchRetriever(SearchRetriever.EMPTY) .build(), null, null, diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java index c68997e25bcff..d6f5f9c3eedbe 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/AggregationQueryBuilderTest.java @@ -18,6 +18,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation; @@ -49,8 +50,8 @@ public class AggregationQueryBuilderTest { - private static AspectRetriever aspectRetriever; - private static AspectRetriever aspectRetrieverV1; + private static CachingAspectRetriever aspectRetriever; + private static CachingAspectRetriever aspectRetrieverV1; private static String DEFAULT_FILTER = "_index"; @BeforeClass @@ -61,7 +62,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { Urn.createFromString("urn:li:structuredProperty:under.scores.and.dots_make_a_mess"); // legacy - aspectRetriever = mock(AspectRetriever.class); + aspectRetriever = mock(CachingAspectRetriever.class); when(aspectRetriever.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); @@ -106,7 +107,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { new Aspect(structPropUnderscoresAndDotsDefinition.data())))); // V1 - aspectRetrieverV1 = mock(AspectRetriever.class); + aspectRetrieverV1 = mock(CachingAspectRetriever.class); when(aspectRetrieverV1.getEntityRegistry()) .thenReturn(TestOperationContexts.defaultEntityRegistry()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index 393ca3ca5d4a6..e51511699e345 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -662,6 +662,7 @@ public void 
testInvalidStructuredProperty() { TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever(TestOperationContexts.emptyActiveUsersAspectRetriever(null)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java index 2c5bcd1294fa1..65b73b7425b74 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/transformer/SearchDocumentTransformerTest.java @@ -247,6 +247,9 @@ public void testSetSearchableRefValue() throws URISyntaxException, RemoteInvocat TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -301,6 +304,9 @@ public void testSetSearchableRefValue_RuntimeException() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -337,6 +343,9 @@ public void testSetSearchableRefValue_RuntimeException_URNExist() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); @@ -369,6 +378,9 @@ void testSetSearchableRefValue_WithInvalidURN() TestOperationContexts.systemContextNoSearchAuthorization( RetrieverContext.builder() .aspectRetriever(aspectRetriever) + .cachingAspectRetriever( + TestOperationContexts.emptyActiveUsersAspectRetriever( + () -> TEST_ENTITY_REGISTRY)) .graphRetriever(mock(GraphRetriever.class)) .searchRetriever(mock(SearchRetriever.class)) .build()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java index b1b716c560481..9a0a82c7f9f49 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/hooks/PropertyDefinitionDeleteSideEffectTest.java @@ -18,7 +18,8 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.entity.Aspect; import com.linkedin.events.metadata.ChangeType; -import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.batch.PatchMCP; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; @@ -36,7 +37,6 @@ import 
com.linkedin.test.metadata.aspect.TestEntityRegistry; import com.linkedin.test.metadata.aspect.batch.TestMCL; import io.datahubproject.metadata.context.RetrieverContext; -import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.json.Json; import jakarta.json.JsonPatch; import java.util.List; @@ -76,13 +76,13 @@ public class PropertyDefinitionDeleteSideEffectTest { private static final Urn TEST_DATASET_URN = UrnUtils.getUrn( "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); - private AspectRetriever mockAspectRetriever; + private CachingAspectRetriever mockAspectRetriever; private SearchRetriever mockSearchRetriever; private RetrieverContext retrieverContext; @BeforeMethod public void setup() { - mockAspectRetriever = mock(AspectRetriever.class); + mockAspectRetriever = mock(CachingAspectRetriever.class); when(mockAspectRetriever.getEntityRegistry()).thenReturn(TEST_REGISTRY); when(mockAspectRetriever.getLatestAspectObject( eq(TEST_PROPERTY_URN), eq(STRUCTURED_PROPERTY_DEFINITION_ASPECT_NAME))) @@ -101,8 +101,8 @@ public void setup() { retrieverContext = RetrieverContext.builder() .searchRetriever(mockSearchRetriever) - .aspectRetriever(mockAspectRetriever) - .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .cachingAspectRetriever(mockAspectRetriever) + .graphRetriever(GraphRetriever.EMPTY) .build(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java index 2503faa00f6e7..6e8886f495c95 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/ShowPropertyAsBadgeValidatorTest.java @@ -58,7 +58,7 @@ public void setup() { mockGraphRetriever = Mockito.mock(GraphRetriever.class); retrieverContext = io.datahubproject.metadata.context.RetrieverContext.builder() - .aspectRetriever(mockAspectRetriever) + .cachingAspectRetriever(mockAspectRetriever) .searchRetriever(mockSearchRetriever) .graphRetriever(mockGraphRetriever) .build(); diff --git a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java index 3acd2bf341357..02cd28eb202e9 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java +++ b/metadata-io/src/test/java/io/datahubproject/test/DataGenerator.java @@ -171,10 +171,7 @@ public Stream> generateMCPs( DefaultAspectsUtil.getAdditionalChanges( opContext, AspectsBatchImpl.builder() - .mcps( - List.of(mcp), - auditStamp, - opContext.getRetrieverContext().get()) + .mcps(List.of(mcp), auditStamp, opContext.getRetrieverContext()) .build() .getMCPItems(), entityService, diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index cf9d73dfa729b..f16c9dbd82e74 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -20,7 +20,6 @@ import 
com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.OperationContextConfig; -import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; @@ -95,7 +94,7 @@ public OperationContext operationContext( entityRegistry, mock(ServicesRegistryContext.class), indexConvention, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(() -> entityRegistry), mock(ValidationContext.class)); } diff --git a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java index 47740b02d6166..65ee6b8591f48 100644 --- a/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java +++ b/metadata-jobs/pe-consumer/src/test/java/com/datahub/event/hook/BusinessAttributeUpdateHookTest.java @@ -93,8 +93,6 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { new RelatedEntity(BUSINESS_ATTRIBUTE_OF, SCHEMA_FIELD_URN.toString()))); when(opContext - .getRetrieverContext() - .get() .getAspectRetriever() .getLatestAspectObjects( eq(Set.of(SCHEMA_FIELD_URN)), eq(Set.of(BUSINESS_ATTRIBUTE_ASPECT)))) @@ -108,7 +106,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { // verify // page 1 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -122,7 +120,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); // page 2 - Mockito.verify(opContext.getRetrieverContext().get().getGraphRetriever(), Mockito.times(1)) + Mockito.verify(opContext.getRetrieverContext().getGraphRetriever(), Mockito.times(1)) .scrollRelatedEntities( isNull(), any(Filter.class), @@ -136,7 +134,7 @@ public void testMCLOnBusinessAttributeUpdate() throws Exception { isNull(), isNull()); - Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); + Mockito.verifyNoMoreInteractions(opContext.getRetrieverContext().getGraphRetriever()); // 2 pages = 2 ingest proposals Mockito.verify(mockUpdateIndicesService, Mockito.times(2)) @@ -152,8 +150,8 @@ private void testMCLOnInvalidCategory() throws Exception { businessAttributeServiceHook.handleChangeEvent(opContext, platformEvent); // verify - Mockito.verifyNoInteractions(opContext.getRetrieverContext().get().getGraphRetriever()); - Mockito.verifyNoInteractions(opContext.getAspectRetrieverOpt().get()); + Mockito.verifyNoInteractions(opContext.getRetrieverContext().getGraphRetriever()); + Mockito.verifyNoInteractions(opContext.getAspectRetriever()); Mockito.verifyNoInteractions(mockUpdateIndicesService); } @@ -226,13 +224,15 @@ private OperationContext mockOperationContextWithGraph(List graph RetrieverContext mockRetrieverContext = mock(RetrieverContext.class); when(mockRetrieverContext.getAspectRetriever()).thenReturn(mock(AspectRetriever.class)); + when(mockRetrieverContext.getCachingAspectRetriever()) + .thenReturn(TestOperationContexts.emptyActiveUsersAspectRetriever(null)); 
when(mockRetrieverContext.getGraphRetriever()).thenReturn(graphRetriever); OperationContext opContext = TestOperationContexts.systemContextNoSearchAuthorization(mockRetrieverContext); // reset mock for test - reset(opContext.getAspectRetrieverOpt().get()); + reset(opContext.getAspectRetriever()); if (!graphEdges.isEmpty()) { diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java index e65bf22991736..c08b7fad4dee3 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/ActorContext.java @@ -1,12 +1,23 @@ package io.datahubproject.metadata.context; +import static com.linkedin.metadata.Constants.CORP_USER_KEY_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.CORP_USER_STATUS_SUSPENDED; +import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.SYSTEM_ACTOR; + import com.datahub.authentication.Authentication; +import com.linkedin.common.Status; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserStatus; +import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.policy.DataHubPolicyInfo; import java.util.Collection; import java.util.Collections; +import java.util.Map; import java.util.Optional; import java.util.Set; import lombok.Builder; @@ -48,6 +59,43 @@ public Urn getActorUrn() { return UrnUtils.getUrn(authentication.getActor().toUrnStr()); } + /** + * An actor is considered active if the user is not hard-deleted, not soft-deleted, and not suspended. + * + * @param aspectRetriever aspect retriever - ideally the SystemEntityClient-backed one for caching + * @return active status + */ + public boolean isActive(AspectRetriever aspectRetriever) { + // system cannot be disabled + if (SYSTEM_ACTOR.equals(authentication.getActor().toUrnStr())) { + return true; + } + + Urn selfUrn = UrnUtils.getUrn(authentication.getActor().toUrnStr()); + Map<Urn, Map<String, Aspect>> urnAspectMap = + aspectRetriever.getLatestAspectObjects( + Set.of(selfUrn), + Set.of(STATUS_ASPECT_NAME, CORP_USER_STATUS_ASPECT_NAME, CORP_USER_KEY_ASPECT_NAME)); + + Map<String, Aspect> aspectMap = urnAspectMap.getOrDefault(selfUrn, Map.of()); + + if (!aspectMap.containsKey(CORP_USER_KEY_ASPECT_NAME)) { + // user is hard deleted + return false; + } + + Status status = + Optional.ofNullable(aspectMap.get(STATUS_ASPECT_NAME)) + .map(a -> new Status(a.data())) + .orElse(new Status().setRemoved(false)); + CorpUserStatus corpUserStatus = + Optional.ofNullable(aspectMap.get(CORP_USER_STATUS_ASPECT_NAME)) + .map(a -> new CorpUserStatus(a.data())) + .orElse(new CorpUserStatus().setStatus("")); + + return !status.isRemoved() && !CORP_USER_STATUS_SUSPENDED.equals(corpUserStatus.getStatus()); + } + /** * The current implementation creates a cache entry unique for the set of policies.
* diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java index 9a058c526647c..9158129235b39 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/OperationContext.java @@ -16,6 +16,8 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; +import io.datahubproject.metadata.exception.ActorAccessException; +import io.datahubproject.metadata.exception.OperationContextException; import java.util.Collection; import java.util.Objects; import java.util.Optional; @@ -63,6 +65,24 @@ public static OperationContext asSession( @Nonnull Authorizer authorizer, @Nonnull Authentication sessionAuthentication, boolean allowSystemAuthentication) { + return OperationContext.asSession( + systemOperationContext, + requestContext, + authorizer, + sessionAuthentication, + allowSystemAuthentication, + false); + } + + @Nonnull + public static OperationContext asSession( + OperationContext systemOperationContext, + @Nonnull RequestContext requestContext, + @Nonnull Authorizer authorizer, + @Nonnull Authentication sessionAuthentication, + boolean allowSystemAuthentication, + boolean skipCache) + throws ActorAccessException { return systemOperationContext.toBuilder() .operationContextConfig( // update allowed system authentication @@ -72,7 +92,7 @@ public static OperationContext asSession( .authorizationContext(AuthorizationContext.builder().authorizer(authorizer).build()) .requestContext(requestContext) .validationContext(systemOperationContext.getValidationContext()) - .build(sessionAuthentication); + .build(sessionAuthentication, skipCache); } /** @@ -85,10 +105,14 @@ public static OperationContext asSession( public static OperationContext withSearchFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update search flags for the request's session - .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update search flags for the request's session + .searchContext(opContext.getSearchContext().withFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -101,10 +125,14 @@ public static OperationContext withSearchFlags( public static OperationContext withLineageFlags( OperationContext opContext, Function flagDefaults) { - return opContext.toBuilder() - // update lineage flags for the request's session - .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) - .build(opContext.getSessionActorContext()); + try { + return opContext.toBuilder() + // update lineage flags for the request's session + .searchContext(opContext.getSearchContext().withLineageFlagDefaults(flagDefaults)) + .build(opContext.getSessionActorContext(), false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } /** @@ -155,18 +183,22 @@ public static OperationContext asSystem( ? 
SearchContext.EMPTY : SearchContext.builder().indexConvention(indexConvention).build(); - return OperationContext.builder() - .operationContextConfig(systemConfig) - .systemActorContext(systemActorContext) - .searchContext(systemSearchContext) - .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) - .servicesRegistryContext(servicesRegistryContext) - // Authorizer.EMPTY doesn't actually apply to system auth - .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) - .retrieverContext(retrieverContext) - .objectMapperContext(objectMapperContext) - .validationContext(validationContext) - .build(systemAuthentication); + try { + return OperationContext.builder() + .operationContextConfig(systemConfig) + .systemActorContext(systemActorContext) + .searchContext(systemSearchContext) + .entityRegistryContext(EntityRegistryContext.builder().build(entityRegistry)) + .servicesRegistryContext(servicesRegistryContext) + // Authorizer.EMPTY doesn't actually apply to system auth + .authorizationContext(AuthorizationContext.builder().authorizer(Authorizer.EMPTY).build()) + .retrieverContext(retrieverContext) + .objectMapperContext(objectMapperContext) + .validationContext(validationContext) + .build(systemAuthentication, false); + } catch (OperationContextException e) { + throw new RuntimeException(e); + } } @Nonnull private final OperationContextConfig operationContextConfig; @@ -177,7 +209,7 @@ public static OperationContext asSystem( @Nonnull private final EntityRegistryContext entityRegistryContext; @Nullable private final ServicesRegistryContext servicesRegistryContext; @Nullable private final RequestContext requestContext; - @Nullable private final RetrieverContext retrieverContext; + @Nonnull private final RetrieverContext retrieverContext; @Nonnull private final ObjectMapperContext objectMapperContext; @Nonnull private final ValidationContext validationContext; @@ -194,13 +226,15 @@ public OperationContext withLineageFlags( public OperationContext asSession( @Nonnull RequestContext requestContext, @Nonnull Authorizer authorizer, - @Nonnull Authentication sessionAuthentication) { + @Nonnull Authentication sessionAuthentication) + throws ActorAccessException { return OperationContext.asSession( this, requestContext, authorizer, sessionAuthentication, - getOperationContextConfig().isAllowSystemAuthentication()); + getOperationContextConfig().isAllowSystemAuthentication(), + false); } @Nonnull @@ -284,17 +318,9 @@ public AuditStamp getAuditStamp() { return getAuditStamp(null); } - public Optional getRetrieverContext() { - return Optional.ofNullable(retrieverContext); - } - - @Nullable + @Nonnull public AspectRetriever getAspectRetriever() { - return getAspectRetrieverOpt().orElse(null); - } - - public Optional getAspectRetrieverOpt() { - return getRetrieverContext().map(RetrieverContext::getAspectRetriever); + return retrieverContext.getAspectRetriever(); } /** @@ -336,10 +362,7 @@ public String getGlobalContextId() { ? EmptyContext.EMPTY : getServicesRegistryContext()) .add(getRequestContext() == null ? EmptyContext.EMPTY : getRequestContext()) - .add( - getRetrieverContext().isPresent() - ? getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .add(getObjectMapperContext()) .build() .stream() @@ -364,10 +387,7 @@ public String getSearchContextId() { getServicesRegistryContext() == null ? EmptyContext.EMPTY : getServicesRegistryContext()) - .add( - getRetrieverContext().isPresent() - ? 
getRetrieverContext().get() - : EmptyContext.EMPTY) + .add(getRetrieverContext()) .build() .stream() .map(ContextInterface::getCacheKeyComponent) @@ -438,6 +458,12 @@ public static class OperationContextBuilder { @Nonnull public OperationContext build(@Nonnull Authentication sessionAuthentication) { + return build(sessionAuthentication, false); + } + + @Nonnull + public OperationContext build( + @Nonnull Authentication sessionAuthentication, boolean skipCache) { final Urn actorUrn = UrnUtils.getUrn(sessionAuthentication.getActor().toUrnStr()); final ActorContext sessionActor = ActorContext.builder() @@ -451,11 +477,20 @@ public OperationContext build(@Nonnull Authentication sessionAuthentication) { .policyInfoSet(this.authorizationContext.getAuthorizer().getActorPolicies(actorUrn)) .groupMembership(this.authorizationContext.getAuthorizer().getActorGroups(actorUrn)) .build(); - return build(sessionActor); + return build(sessionActor, skipCache); } @Nonnull - public OperationContext build(@Nonnull ActorContext sessionActor) { + public OperationContext build(@Nonnull ActorContext sessionActor, boolean skipCache) { + AspectRetriever retriever = + skipCache + ? this.retrieverContext.getAspectRetriever() + : this.retrieverContext.getCachingAspectRetriever(); + + if (!sessionActor.isActive(retriever)) { + throw new ActorAccessException("Actor is not active"); + } + return new OperationContext( this.operationContextConfig, sessionActor, diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java index 9337fbfe3bb00..9afc4138810bb 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RetrieverContext.java @@ -1,8 +1,10 @@ package io.datahubproject.metadata.context; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.entity.SearchRetriever; +import java.util.Objects; import java.util.Optional; import javax.annotation.Nonnull; import lombok.Builder; @@ -15,10 +17,37 @@ public class RetrieverContext @Nonnull private final GraphRetriever graphRetriever; @Nonnull private final AspectRetriever aspectRetriever; + @Nonnull private final CachingAspectRetriever cachingAspectRetriever; @Nonnull private final SearchRetriever searchRetriever; @Override public Optional getCacheKeyComponent() { return Optional.empty(); } + + public static class RetrieverContextBuilder { + public RetrieverContext build() { + if (this.aspectRetriever == null && this.cachingAspectRetriever != null) { + this.aspectRetriever = this.cachingAspectRetriever; + } + + if (this.cachingAspectRetriever == null + && this.aspectRetriever instanceof CachingAspectRetriever) { + this.cachingAspectRetriever = (CachingAspectRetriever) this.aspectRetriever; + } + + return new RetrieverContext( + this.graphRetriever, + Objects.requireNonNull(this.aspectRetriever), + Objects.requireNonNull(this.cachingAspectRetriever), + this.searchRetriever); + } + } + + public static final RetrieverContext EMPTY = + RetrieverContext.builder() + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .cachingAspectRetriever(CachingAspectRetriever.EMPTY) + .build(); } diff --git 
a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java new file mode 100644 index 0000000000000..bca2594b96430 --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/ActorAccessException.java @@ -0,0 +1,7 @@ +package io.datahubproject.metadata.exception; + +public class ActorAccessException extends OperationContextException { + public ActorAccessException(String string) { + super(string); + } +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java new file mode 100644 index 0000000000000..1aac8dc3e60ec --- /dev/null +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/exception/OperationContextException.java @@ -0,0 +1,9 @@ +package io.datahubproject.metadata.exception; + +public class OperationContextException extends RuntimeException { + public OperationContextException(String message) { + super(message); + } + + public OperationContextException() {} +} diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index 42de6b7398c61..4abfbb196f067 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -8,21 +8,17 @@ import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.entity.Aspect; +import com.linkedin.identity.CorpUserInfo; +import com.linkedin.metadata.Constants; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.CachingAspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; -import com.linkedin.metadata.aspect.SystemAspect; -import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; import com.linkedin.metadata.entity.SearchRetriever; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.models.registry.MergedEntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; -import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.query.filter.SortCriterion; -import com.linkedin.metadata.search.ScrollResult; -import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.snapshot.Snapshot; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; @@ -32,15 +28,14 @@ import io.datahubproject.metadata.context.RetrieverContext; import io.datahubproject.metadata.context.ServicesRegistryContext; import io.datahubproject.metadata.context.ValidationContext; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.function.Consumer; import java.util.function.Supplier; +import 
java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.Builder; /** * Useful for testing. If the defaults are not sufficient, try using the .toBuilder() and replacing @@ -81,26 +76,53 @@ public static EntityRegistry defaultEntityRegistry() { return defaultEntityRegistryInstance; } - public static AspectRetriever emptyAspectRetriever( + public static RetrieverContext emptyActiveUsersRetrieverContext( @Nullable Supplier<EntityRegistry> entityRegistrySupplier) { - return new EmptyAspectRetriever( - () -> - Optional.ofNullable(entityRegistrySupplier) - .map(Supplier::get) - .orElse(defaultEntityRegistry())); - } - public static GraphRetriever emptyGraphRetriever = new EmptyGraphRetriever(); - public static SearchRetriever emptySearchRetriever = new EmptySearchRetriever(); + return RetrieverContext.builder() + .cachingAspectRetriever(emptyActiveUsersAspectRetriever(entityRegistrySupplier)) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) + .build(); + } - public static RetrieverContext emptyRetrieverContext( + public static CachingAspectRetriever emptyActiveUsersAspectRetriever( @Nullable Supplier<EntityRegistry> entityRegistrySupplier) { - return RetrieverContext.builder() - .aspectRetriever(emptyAspectRetriever(entityRegistrySupplier)) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) - .build(); + return new CachingAspectRetriever.EmptyAspectRetriever() { + + @Nonnull + @Override + public Map<Urn, Map<String, Aspect>> getLatestAspectObjects( + Set<Urn> urns, Set<String> aspectNames) { + if (urns.stream().allMatch(urn -> urn.toString().startsWith("urn:li:corpuser:")) + && aspectNames.contains(Constants.CORP_USER_KEY_ASPECT_NAME)) { + return urns.stream() + .map( + urn -> + Map.entry( + urn, + Map.of( + Constants.CORP_USER_KEY_ASPECT_NAME, + new Aspect( + new CorpUserInfo() + .setActive(true) + .setEmail(urn.getId()) + .setDisplayName(urn.getId()) + .data())))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + return super.getLatestAspectObjects(urns, aspectNames); + } + + @Nonnull + @Override + public EntityRegistry getEntityRegistry() { + return Optional.ofNullable(entityRegistrySupplier) + .map(Supplier::get) + .orElse(defaultEntityRegistry()); + } + }; } public static OperationContext systemContextNoSearchAuthorization( @@ -140,8 +162,10 @@ public static OperationContext systemContextNoSearchAuthorization( RetrieverContext retrieverContext = RetrieverContext.builder() .aspectRetriever(aspectRetriever) - .graphRetriever(emptyGraphRetriever) - .searchRetriever(emptySearchRetriever) + .cachingAspectRetriever( + emptyActiveUsersAspectRetriever(() -> aspectRetriever.getEntityRegistry())) + .graphRetriever(GraphRetriever.EMPTY) + .searchRetriever(SearchRetriever.EMPTY) .build(); return systemContextNoSearchAuthorization( () -> retrieverContext.getAspectRetriever().getEntityRegistry(), @@ -208,7 +232,7 @@ public static OperationContext systemContext( RetrieverContext retrieverContext = Optional.ofNullable(retrieverContextSupplier) .map(Supplier::get) - .orElse(emptyRetrieverContext(entityRegistrySupplier)); + .orElse(emptyActiveUsersRetrieverContext(entityRegistrySupplier)); EntityRegistry entityRegistry = Optional.ofNullable(entityRegistrySupplier) @@ -298,66 +322,5 @@ public static OperationContext userContextNoSearchAuthorization( .asSession(requestContext, Authorizer.EMPTY, TEST_USER_AUTH); } - @Builder - public static class EmptyAspectRetriever implements AspectRetriever { - private final Supplier
entityRegistrySupplier; - - @Nonnull - @Override - public Map> getLatestAspectObjects( - Set urns, Set aspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public Map> getLatestSystemAspects( - Map> urnAspectNames) { - return Map.of(); - } - - @Nonnull - @Override - public EntityRegistry getEntityRegistry() { - return entityRegistrySupplier.get(); - } - } - - public static class EmptyGraphRetriever implements GraphRetriever { - - @Nonnull - @Override - public RelatedEntitiesScrollResult scrollRelatedEntities( - @Nullable List sourceTypes, - @Nonnull Filter sourceEntityFilter, - @Nullable List destinationTypes, - @Nonnull Filter destinationEntityFilter, - @Nonnull List relationshipTypes, - @Nonnull RelationshipFilter relationshipFilter, - @Nonnull List sortCriterion, - @Nullable String scrollId, - int count, - @Nullable Long startTimeMillis, - @Nullable Long endTimeMillis) { - return new RelatedEntitiesScrollResult(0, 0, null, List.of()); - } - } - - public static class EmptySearchRetriever implements SearchRetriever { - - @Override - public ScrollResult scroll( - @Nonnull List entities, - @Nullable Filter filters, - @Nullable String scrollId, - int count) { - ScrollResult empty = new ScrollResult(); - empty.setEntities(new SearchEntityArray()); - empty.setNumEntities(0); - empty.setPageSize(0); - return empty; - } - } - private TestOperationContexts() {} } diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java index 3e092e20127ee..f77b244d8f2d8 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/OperationContextTest.java @@ -8,6 +8,7 @@ import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.metadata.models.registry.EntityRegistry; +import io.datahubproject.test.metadata.context.TestOperationContexts; import org.testng.annotations.Test; public class OperationContextTest { @@ -25,7 +26,7 @@ public void testSystemPrivilegeEscalation() { mock(EntityRegistry.class), mock(ServicesRegistryContext.class), null, - mock(RetrieverContext.class), + TestOperationContexts.emptyActiveUsersRetrieverContext(null), mock(ValidationContext.class)); OperationContext opContext = diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java index 6724f35d840ad..a9871f1ed9948 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/token/StatefulTokenService.java @@ -145,7 +145,7 @@ public String generateAccessToken( _entityService.ingestProposal( systemOperationContext, AspectsBatchImpl.builder() - .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext().get()) + .mcps(List.of(proposal), auditStamp, systemOperationContext.getRetrieverContext()) .build(), false); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9348416606d0a..75b4c8e8b002f 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml 
+++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -522,12 +522,12 @@ cache: entityAspectTTLSeconds: # cache user aspects for 20s corpuser: - corpUserKey: 20 + corpUserKey: 300 # 5 min corpUserInfo: 20 corpUserEditableInfo: 20 - corpUserStatus: 20 + corpUserStatus: 300 # 5 min globalTags: 20 - status: 20 + status: 300 # 5 min corpUserCredentials: 20 corpUserSettings: 20 roleMembership: 20 diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java index f5235dc3682fc..3e2823591e168 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/context/SystemOperationContextFactory.java @@ -45,7 +45,8 @@ protected OperationContext javaSystemOperationContext( @Nonnull final SearchService searchService, @Qualifier("baseElasticSearchComponents") BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, - @Nonnull final ConfigurationProvider configurationProvider) { + @Nonnull final ConfigurationProvider configurationProvider, + @Qualifier("systemEntityClient") @Nonnull final SystemEntityClient systemEntityClient) { EntityServiceAspectRetriever entityServiceAspectRetriever = EntityServiceAspectRetriever.builder() @@ -53,6 +54,9 @@ protected OperationContext javaSystemOperationContext( .entityService(entityService) .build(); + EntityClientAspectRetriever entityClientAspectRetriever = + EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); + SystemGraphRetriever systemGraphRetriever = SystemGraphRetriever.builder().graphService(graphService).build(); @@ -68,6 +72,7 @@ protected OperationContext javaSystemOperationContext( components.getIndexConvention(), RetrieverContext.builder() .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -76,6 +81,7 @@ protected OperationContext javaSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); @@ -104,7 +110,7 @@ protected OperationContext restliSystemOperationContext( BaseElasticSearchComponentsFactory.BaseElasticSearchComponents components, @Nonnull final ConfigurationProvider configurationProvider) { - EntityClientAspectRetriever entityServiceAspectRetriever = + EntityClientAspectRetriever entityClientAspectRetriever = EntityClientAspectRetriever.builder().entityClient(systemEntityClient).build(); SystemGraphRetriever systemGraphRetriever = @@ -121,7 +127,7 @@ protected OperationContext restliSystemOperationContext( ServicesRegistryContext.builder().restrictedService(restrictedService).build(), components.getIndexConvention(), RetrieverContext.builder() - .aspectRetriever(entityServiceAspectRetriever) + .cachingAspectRetriever(entityClientAspectRetriever) .graphRetriever(systemGraphRetriever) .searchRetriever(searchServiceSearchRetriever) .build(), @@ -130,7 +136,7 @@ protected 
OperationContext restliSystemOperationContext( configurationProvider.getFeatureFlags().isAlternateMCPValidation()) .build()); - entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext); + entityClientAspectRetriever.setSystemOperationContext(systemOperationContext); systemGraphRetriever.setSystemOperationContext(systemOperationContext); searchServiceSearchRetriever.setSystemOperationContext(systemOperationContext); diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java index 22ce06a5984ea..c04dd25ccd4ac 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestDataPlatformInstancesStep.java @@ -84,14 +84,14 @@ public void execute(@Nonnull OperationContext systemOperationContext) throws Exc .aspectName(DATA_PLATFORM_INSTANCE_ASPECT_NAME) .recordTemplate(dataPlatformInstance.get()) .auditStamp(aspectAuditStamp) - .build(systemOperationContext.getAspectRetrieverOpt().get())); + .build(systemOperationContext.getAspectRetriever())); } } _entityService.ingestAspects( systemOperationContext, AspectsBatchImpl.builder() - .retrieverContext(systemOperationContext.getRetrieverContext().get()) + .retrieverContext(systemOperationContext.getRetrieverContext()) .items(items) .build(), true, diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java index eb6bfe17ac198..dac2879487469 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java @@ -225,7 +225,7 @@ private void ingestPolicy( new AuditStamp() .setActor(Urn.createFromString(Constants.SYSTEM_ACTOR)) .setTime(System.currentTimeMillis()), - systemOperationContext.getRetrieverContext().get()) + systemOperationContext.getRetrieverContext()) .build(), false); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java similarity index 81% rename from metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java index ba0a426fa20e8..c756827cad56b 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/GlobalControllerExceptionHandler.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/GlobalControllerExceptionHandler.java @@ -1,9 +1,11 @@ -package io.datahubproject.openapi; +package io.datahubproject.openapi.config; import com.linkedin.metadata.dao.throttle.APIThrottleException; +import io.datahubproject.metadata.exception.ActorAccessException; import io.datahubproject.openapi.exception.InvalidUrnException; import io.datahubproject.openapi.exception.UnauthorizedException; import java.util.Map; +import javax.annotation.PostConstruct; import lombok.extern.slf4j.Slf4j; import 
org.springframework.beans.ConversionNotSupportedException; import org.springframework.core.Ordered; @@ -19,6 +21,11 @@ @ControllerAdvice public class GlobalControllerExceptionHandler extends DefaultHandlerExceptionResolver { + @PostConstruct + public void init() { + log.info("GlobalControllerExceptionHandler initialized"); + } + public GlobalControllerExceptionHandler() { setOrder(Ordered.HIGHEST_PRECEDENCE); setWarnLogCategory(getClass().getName()); @@ -52,4 +59,9 @@ public static ResponseEntity> handleUnauthorizedException( UnauthorizedException e) { return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); } + + @ExceptionHandler(ActorAccessException.class) + public static ResponseEntity> actorAccessException(ActorAccessException e) { + return new ResponseEntity<>(Map.of("error", e.getMessage()), HttpStatus.FORBIDDEN); + } } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java index 579a62c084999..592d7bba4211f 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/controller/GenericEntitiesController.java @@ -637,7 +637,7 @@ public ResponseEntity createAspect( AspectSpec aspectSpec = lookupAspectSpec(entitySpec, aspectName).get(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), urn, aspectSpec, createIfEntityNotExists, @@ -649,7 +649,7 @@ public ResponseEntity createAspect( entityService.ingestProposal( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), async); @@ -725,7 +725,7 @@ public ResponseEntity patchAspect( .build(); ChangeMCP upsert = toUpsertItem( - opContext.getRetrieverContext().get().getAspectRetriever(), + opContext.getRetrieverContext().getAspectRetriever(), validatedUrn(entityUrn), aspectSpec, currentValue, @@ -736,7 +736,7 @@ public ResponseEntity patchAspect( entityService.ingestAspects( opContext, AspectsBatchImpl.builder() - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .items(List.of(upsert)) .build(), true, diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java new file mode 100644 index 0000000000000..99d3879ab9a32 --- /dev/null +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/operations/test/IdController.java @@ -0,0 +1,54 @@ +package io.datahubproject.openapi.operations.test; + +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationContext; +import com.datahub.authorization.AuthorizerChain; +import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.RequestContext; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import jakarta.servlet.http.HttpServletRequest; +import java.util.List; +import java.util.Map; +import lombok.extern.slf4j.Slf4j; +import 
org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +@RestController +@RequestMapping("/operations/identity") +@Slf4j +@Tag(name = "Identity", description = "An API for checking identity") +public class IdController { + private final AuthorizerChain authorizerChain; + private final OperationContext systemOperationContext; + + public IdController(OperationContext systemOperationContext, AuthorizerChain authorizerChain) { + this.systemOperationContext = systemOperationContext; + this.authorizerChain = authorizerChain; + } + + @Tag(name = "User") + @GetMapping(path = "/user/urn", produces = MediaType.APPLICATION_JSON_VALUE) + @Operation(summary = "User id") + public ResponseEntity> getUserId( + HttpServletRequest request, + @RequestParam(value = "skipCache", required = false, defaultValue = "false") + Boolean skipCache) { + Authentication authentication = AuthenticationContext.getAuthentication(); + String actorUrnStr = authentication.getActor().toUrnStr(); + + OperationContext.asSession( + systemOperationContext, + RequestContext.builder().buildOpenapi(actorUrnStr, request, "getUserIdentity", List.of()), + authorizerChain, + authentication, + true, + skipCache); + + return ResponseEntity.ok(Map.of("urn", actorUrnStr)); + } +} diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index c38f2db0eefbb..ca425810c87a0 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -491,7 +491,7 @@ public static List> ingestBatchProposal( try { AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext().get()) + .mcps(serviceProposals, auditStamp, opContext.getRetrieverContext()) .build(); Map> resultMap = diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java index 56a7955b9fe87..b1c5709ef0147 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v2/controller/EntityController.java @@ -203,7 +203,7 @@ protected AspectsBatch toMCPBatch( objectMapper.writeValueAsString(aspect.getValue().get("systemMetadata")))); } - items.add(builder.build(opContext.getAspectRetrieverOpt().get())); + items.add(builder.build(opContext.getAspectRetriever())); } } } @@ -211,7 +211,7 @@ protected AspectsBatch toMCPBatch( return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java index ce7fd73f99b9e..af13cd3aab051 100644 --- 
a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/v3/controller/EntityController.java @@ -554,14 +554,14 @@ protected AspectsBatch toMCPBatch( GenericRecordUtils.JSON, aspectSpec)); - items.add(builder.build(opContext.getRetrieverContext().get().getAspectRetriever())); + items.add(builder.build(opContext.getRetrieverContext().getAspectRetriever())); } } } } return AspectsBatchImpl.builder() .items(items) - .retrieverContext(opContext.getRetrieverContext().get()) + .retrieverContext(opContext.getRetrieverContext()) .build(); } diff --git a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json index 33cfba0f27802..27731af9ffaa7 100644 --- a/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json +++ b/metadata-service/restli-api/src/main/idl/com.linkedin.entity.entitiesV2.restspec.json @@ -19,6 +19,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -27,6 +31,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json index 9bf7f97b34be1..9c5f41281fcfb 100644 --- a/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json +++ b/metadata-service/restli-api/src/main/snapshot/com.linkedin.entity.entitiesV2.snapshot.json @@ -182,6 +182,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] }, { "method" : "batch_get", @@ -190,6 +194,10 @@ "name" : "aspects", "type" : "{ \"type\" : \"array\", \"items\" : \"string\" }", "optional" : true + }, { + "name" : "alwaysIncludeKeyAspect", + "type" : "boolean", + "optional" : true } ] } ], "entity" : { diff --git a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java index cf6e571cb8cbe..b85f22e781d0b 100644 --- a/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client-api/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -45,12 +45,34 @@ // Consider renaming this to datahub client. 
public interface EntityClient { + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urn urn id for the entity + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nullable - EntityResponse getV2( + default EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return getV2(opContext, entityName, urn, aspectNames, true); + } + + @Nullable + EntityResponse getV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Urn urn, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -58,12 +80,34 @@ EntityResponse getV2( Entity get(@Nonnull OperationContext opContext, @Nonnull final Urn urn) throws RemoteInvocationException; + /** + * This version follows the legacy behavior of returning key aspects regardless of whether they + * exist + * + * @param opContext operation context + * @param entityName entity type + * @param urns urn ids for the entities + * @param aspectNames set of aspects + * @return requested entity/aspects + */ + @Deprecated @Nonnull - Map batchGetV2( + default Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, @Nullable final Set aspectNames) + throws RemoteInvocationException, URISyntaxException { + return batchGetV2(opContext, entityName, urns, aspectNames, true); + } + + @Nonnull + Map batchGetV2( + @Nonnull OperationContext opContext, + @Nonnull String entityName, + @Nonnull final Set urns, + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException; @Nonnull @@ -589,27 +633,38 @@ void rollbackIngestion( @Nullable default Aspect getLatestAspectObject( - @Nonnull OperationContext opContext, @Nonnull Urn urn, @Nonnull String aspectName) + @Nonnull OperationContext opContext, + @Nonnull Urn urn, + @Nonnull String aspectName, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { - return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName)) + return getLatestAspects(opContext, Set.of(urn), Set.of(aspectName), alwaysIncludeKeyAspect) .getOrDefault(urn, Map.of()) .get(aspectName); } @Nonnull default Map> getLatestAspects( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = urns.stream().findFirst().map(Urn::getEntityType).get(); - return entityResponseToAspectMap(batchGetV2(opContext, entityName, urns, aspectNames)); + return entityResponseToAspectMap( + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect)); } @Nonnull default Map> getLatestSystemAspect( - @Nonnull OperationContext opContext, @Nonnull Set urns, @Nonnull Set aspectNames) + @Nonnull OperationContext opContext, + @Nonnull Set urns, + @Nonnull Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { String entityName = 
urns.stream().findFirst().map(Urn::getEntityType).get(); return entityResponseToSystemAspectMap( - batchGetV2(opContext, entityName, urns, aspectNames), opContext.getEntityRegistry()); + batchGetV2(opContext, entityName, urns, aspectNames, alwaysIncludeKeyAspect), + opContext.getEntityRegistry()); } } diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index 516902601f08a..8d4c5e9228a71 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -156,10 +156,15 @@ public EntityResponse getV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Urn urn, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { final EntitiesV2GetRequestBuilder requestBuilder = - ENTITIES_V2_REQUEST_BUILDERS.get().aspectsParam(aspectNames).id(urn.toString()); + ENTITIES_V2_REQUEST_BUILDERS + .get() + .aspectsParam(aspectNames) + .id(urn.toString()) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect); return sendClientRequest(requestBuilder, opContext.getSessionAuthentication()).getEntity(); } @@ -241,7 +246,8 @@ public Map batchGetV2( @Nonnull OperationContext opContext, @Nonnull String entityName, @Nonnull final Set urns, - @Nullable final Set aspectNames) + @Nullable final Set aspectNames, + @Nullable Boolean alwaysIncludeKeyAspect) throws RemoteInvocationException, URISyntaxException { Map responseMap = new HashMap<>(); @@ -260,6 +266,7 @@ public Map batchGetV2( ENTITIES_V2_REQUEST_BUILDERS .batchGet() .aspectsParam(aspectNames) + .alwaysIncludeKeyAspectParam(alwaysIncludeKeyAspect) .ids( batch.stream() .map(Urn::toString) diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java index 2637e2d067c6d..aa17f1951bc91 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/SystemRestliEntityClient.java @@ -59,6 +59,6 @@ public Map batchGetV2NoCache( @Nonnull Set urns, @Nullable Set aspectNames) throws RemoteInvocationException, URISyntaxException { - return super.batchGetV2(opContext, entityName, urns, aspectNames); + return super.batchGetV2(opContext, entityName, urns, aspectNames, false); } } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 6033ead36f10e..30b187da00e91 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -309,7 +309,7 @@ private Task ingestProposals( log.debug("Proposals: {}", metadataChangeProposals); try { final AspectsBatch batch = AspectsBatchImpl.builder() - .mcps(metadataChangeProposals, auditStamp, opContext.getRetrieverContext().get(), + .mcps(metadataChangeProposals, 
auditStamp, opContext.getRetrieverContext(), opContext.getValidationContext().isAlternateValidation()) .build(); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 20209ddf44d64..896d81d3cbecc 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -64,7 +64,8 @@ public class EntityV2Resource extends CollectionResourceTaskTemplate get( - @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @Nonnull String urnStr, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("GET V2 {}", urnStr); final Urn urn = Urn.createFromString(urnStr); @@ -90,7 +91,7 @@ public Task get( ? opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects); + return _entityService.getEntityV2(opContext, entityName, urn, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( @@ -106,7 +107,8 @@ public Task get( @WithSpan public Task> batchGet( @Nonnull Set urnStrs, - @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) + @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames, + @QueryParam(PARAM_ALWAYS_INCLUDE_KEY_ASPECT) @Optional @Nullable Boolean alwaysIncludeKeyAspect) throws URISyntaxException { log.debug("BATCH GET V2 {}", urnStrs.toString()); final Set urns = new HashSet<>(); @@ -138,7 +140,7 @@ public Task> batchGet( ? 
opContext.getEntityAspectNames(entityName) : new HashSet<>(Arrays.asList(aspectNames)); try { - return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects); + return _entityService.getEntitiesV2(opContext, entityName, urns, projectedAspects, alwaysIncludeKeyAspect == null || alwaysIncludeKeyAspect); } catch (Exception e) { throw new RuntimeException( String.format( diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java index ef79a404c2145..11df52ad66709 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliConstants.java @@ -1,5 +1,7 @@ package com.linkedin.metadata.resources.restli; +import javax.annotation.Nullable; + public final class RestliConstants { private RestliConstants() {} @@ -21,6 +23,7 @@ private RestliConstants() {} public static final String PARAM_INPUT = "input"; public static final String PARAM_MAX_HOPS = "maxHops"; public static final String PARAM_ASPECTS = "aspects"; + public static final String PARAM_ALWAYS_INCLUDE_KEY_ASPECT = "alwaysIncludeKeyAspect"; public static final String PARAM_FILTER = "filter"; public static final String PARAM_GROUP = "group"; public static final String PARAM_SORT = "sort"; diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index 185874fac1382..a2092405da3ff 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -8,6 +8,7 @@ import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.server.RestLiServiceException; +import io.datahubproject.metadata.exception.ActorAccessException; import java.util.Optional; import java.util.function.Supplier; import javax.annotation.Nonnull; @@ -38,6 +39,8 @@ public static Task toTask(@Nonnull Supplier supplier) { if (throwable instanceof IllegalArgumentException || throwable.getCause() instanceof IllegalArgumentException) { finalException = badRequestException(throwable.getMessage()); + } else if (throwable.getCause() instanceof ActorAccessException) { + finalException = forbidden(throwable.getCause().getMessage()); } else if (throwable instanceof APIThrottleException) { finalException = apiThrottled(throwable.getMessage()); } else if (throwable instanceof RestLiServiceException) { @@ -109,4 +112,9 @@ public static RestLiServiceException invalidArgumentsException(@Nullable String public static RestLiServiceException apiThrottled(@Nullable String message) { return new RestLiServiceException(HttpStatus.S_429_TOO_MANY_REQUESTS, message); } + + @Nonnull + public static RestLiServiceException forbidden(@Nullable String message) { + return new RestLiServiceException(HttpStatus.S_403_FORBIDDEN, message); + } } diff --git a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java 
index a39401c170a11..037b5b81fd4df 100644 --- a/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java +++ b/metadata-service/restli-servlet-impl/src/test/java/com/linkedin/metadata/resources/entity/AspectResourceTest.java @@ -100,7 +100,7 @@ public void testAsyncDefaultAspects() throws URISyntaxException { .recordTemplate(mcp.getAspect()) .auditStamp(new AuditStamp()) .metadataChangeProposal(mcp) - .build(opContext.getAspectRetrieverOpt().get()); + .build(opContext.getAspectRetriever()); when(aspectDao.runInTransactionWithRetry(any(), any(), anyInt())) .thenReturn( List.of(List.of( diff --git a/smoke-test/tests/tokens/revokable_access_token_test.py b/smoke-test/tests/tokens/revokable_access_token_test.py index af29437c051e1..006daae39333e 100644 --- a/smoke-test/tests/tokens/revokable_access_token_test.py +++ b/smoke-test/tests/tokens/revokable_access_token_test.py @@ -9,6 +9,8 @@ wait_for_writes_to_sync, ) +from .token_utils import listUsers, removeUser + pytestmark = pytest.mark.no_cypress_suite1 # Disable telemetry @@ -490,45 +492,3 @@ def getAccessTokenMetadata(session, token): response.raise_for_status() return response.json() - - -def removeUser(session, urn): - # Remove user - json = { - "query": """mutation removeUser($urn: String!) { - removeUser(urn: $urn) - }""", - "variables": {"urn": urn}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() - - -def listUsers(session): - input = { - "start": "0", - "count": "20", - } - - # list users - json = { - "query": """query listUsers($input: ListUsersInput!) { - listUsers(input: $input) { - start - count - total - users { - username - } - } - }""", - "variables": {"input": input}, - } - - response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) - - response.raise_for_status() - return response.json() diff --git a/smoke-test/tests/tokens/session_access_token_test.py b/smoke-test/tests/tokens/session_access_token_test.py new file mode 100644 index 0000000000000..a16abc4445303 --- /dev/null +++ b/smoke-test/tests/tokens/session_access_token_test.py @@ -0,0 +1,173 @@ +import os +import time + +import pytest +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import AuditStampClass, CorpUserStatusClass +from requests.exceptions import HTTPError + +from tests.utils import ( + get_admin_credentials, + get_frontend_url, + login_as, + wait_for_writes_to_sync, +) + +from .token_utils import getUserId, listUsers, removeUser + +pytestmark = pytest.mark.no_cypress_suite1 + +# Disable telemetry +os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" + +(admin_user, admin_pass) = get_admin_credentials() +user_urn = "urn:li:corpuser:sessionUser" + + +@pytest.fixture(scope="class") +def custom_user_session(): + """Fixture to execute setup before and tear down after all tests are run""" + admin_session = login_as(admin_user, admin_pass) + + res_data = removeUser(admin_session, user_urn) + assert res_data + assert "error" not in res_data + + # Test getting the invite token + get_invite_token_json = { + "query": """query getInviteToken($input: GetInviteTokenInput!) 
{ + getInviteToken(input: $input){ + inviteToken + } + }""", + "variables": {"input": {}}, + } + + get_invite_token_response = admin_session.post( + f"{get_frontend_url()}/api/v2/graphql", json=get_invite_token_json + ) + get_invite_token_response.raise_for_status() + get_invite_token_res_data = get_invite_token_response.json() + + assert get_invite_token_res_data + assert get_invite_token_res_data["data"] + invite_token = get_invite_token_res_data["data"]["getInviteToken"]["inviteToken"] + assert invite_token is not None + assert "error" not in invite_token + + # Pass the invite token when creating the user + sign_up_json = { + "fullName": "Test Session User", + "email": "sessionUser", + "password": "sessionUser", + "title": "Date Engineer", + "inviteToken": invite_token, + } + + sign_up_response = admin_session.post( + f"{get_frontend_url()}/signUp", json=sign_up_json + ) + sign_up_response.raise_for_status() + assert sign_up_response + assert "error" not in sign_up_response + # Sleep for eventual consistency + wait_for_writes_to_sync() + + # signUp will override the session cookie to the new user to be signed up. + admin_session.cookies.clear() + admin_session = login_as(admin_user, admin_pass) + + # Make user created user is there. + res_data = listUsers(admin_session) + assert res_data["data"] + assert res_data["data"]["listUsers"] + assert {"username": "sessionUser"} in res_data["data"]["listUsers"]["users"] + + yield login_as(sign_up_json["email"], sign_up_json["password"]) + + # Delete created user + res_data = removeUser(admin_session, user_urn) + assert res_data + assert res_data["data"] + assert res_data["data"]["removeUser"] is True + # Sleep for eventual consistency + wait_for_writes_to_sync() + + # Make user created user is not there. 
+ res_data = listUsers(admin_session) + assert res_data["data"] + assert res_data["data"]["listUsers"] + assert {"username": "sessionUser"} not in res_data["data"]["listUsers"]["users"] + + +@pytest.mark.dependency() +def test_soft_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.soft_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo soft delete + graph_client.set_soft_delete_status(urn=user_urn, delete=False) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_soft_delete"]) +def test_suspend(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="SUSPENDED", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) + + # undo suspend + graph_client.emit( + MetadataChangeProposalWrapper( + entityType="corpuser", + entityUrn=user_urn, + changeType="UPSERT", + aspectName="corpUserStatus", + aspect=CorpUserStatusClass( + status="ACTIVE", + lastModified=AuditStampClass( + time=int(time.time() * 1000.0), actor="urn:li:corpuser:unknown" + ), + ), + ) + ) + wait_for_writes_to_sync() + + +@pytest.mark.dependency(depends=["test_suspend"]) +def test_hard_delete(graph_client, custom_user_session): + # assert initial access + assert getUserId(custom_user_session) == {"urn": user_urn} + + graph_client.hard_delete_entity(urn=user_urn) + wait_for_writes_to_sync() + + with pytest.raises(HTTPError) as req_info: + getUserId(custom_user_session) + assert "403 Client Error: Forbidden" in str(req_info.value) diff --git a/smoke-test/tests/tokens/token_utils.py b/smoke-test/tests/tokens/token_utils.py new file mode 100644 index 0000000000000..10558e7085de7 --- /dev/null +++ b/smoke-test/tests/tokens/token_utils.py @@ -0,0 +1,53 @@ +from tests.utils import get_frontend_url + + +def getUserId(session): + response = session.get( + f"{get_frontend_url()}/openapi/operations/identity/user/urn", + params={"skipCache": "true"}, + ) + + response.raise_for_status() + return response.json() + + +def removeUser(session, urn): + # Remove user + json = { + "query": """mutation removeUser($urn: String!) { + removeUser(urn: $urn) + }""", + "variables": {"urn": urn}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json() + + +def listUsers(session): + input = { + "start": "0", + "count": "20", + } + + # list users + json = { + "query": """query listUsers($input: ListUsersInput!) 
{ + listUsers(input: $input) { + start + count + total + users { + username + } + } + }""", + "variables": {"input": input}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + + response.raise_for_status() + return response.json() From 83904b7f351c9ea8b9ac7737892b2b21caedb720 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 18 Dec 2024 17:02:16 -0500 Subject: [PATCH 02/17] fix(env) Fix forms hook env var default config (#12155) --- .../configuration/src/main/resources/application.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 75b4c8e8b002f..9010d77015f16 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -561,7 +561,7 @@ springdoc.api-docs.groups.enabled: true forms: hook: - enabled: { $FORMS_HOOK_ENABLED:true } + enabled: ${FORMS_HOOK_ENABLED:true} consumerGroupSuffix: ${FORMS_HOOK_CONSUMER_GROUP_SUFFIX:} businessAttribute: From da8f8221977444644596da40e676e15362bd7a2d Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 18 Dec 2024 14:36:10 -0800 Subject: [PATCH 03/17] feat(ingest/mlflow): Support configurable base_external_url (#12167) --- .../src/datahub/ingestion/source/mlflow.py | 35 ++++++++++++++++--- .../tests/unit/test_mlflow_source.py | 13 +++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py index cef6d2b1bb577..26d160acf330c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mlflow.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mlflow.py @@ -38,16 +38,30 @@ class MLflowConfig(EnvConfigMixin): tracking_uri: Optional[str] = Field( default=None, - description="Tracking server URI. If not set, an MLflow default tracking_uri is used (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)", + description=( + "Tracking server URI. If not set, an MLflow default tracking_uri is used" + " (local `mlruns/` directory or `MLFLOW_TRACKING_URI` environment variable)" + ), ) registry_uri: Optional[str] = Field( default=None, - description="Registry server URI. If not set, an MLflow default registry_uri is used (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)", + description=( + "Registry server URI. If not set, an MLflow default registry_uri is used" + " (value of tracking_uri or `MLFLOW_REGISTRY_URI` environment variable)" + ), ) model_name_separator: str = Field( default="_", description="A string which separates model name from its version (e.g. model_1 or model-1)", ) + base_external_url: Optional[str] = Field( + default=None, + description=( + "Base URL to use when constructing external URLs to MLflow." + " If not set, tracking_uri is used if it's an HTTP URL." + " If neither is set, external URLs are not generated." 
+ ), + ) @dataclass @@ -279,12 +293,23 @@ def _make_ml_model_urn(self, model_version: ModelVersion) -> str: ) return urn - def _make_external_url(self, model_version: ModelVersion) -> Union[None, str]: + def _get_base_external_url_from_tracking_uri(self) -> Optional[str]: + if isinstance( + self.client.tracking_uri, str + ) and self.client.tracking_uri.startswith("http"): + return self.client.tracking_uri + else: + return None + + def _make_external_url(self, model_version: ModelVersion) -> Optional[str]: """ Generate URL for a Model Version to MLflow UI. """ - base_uri = self.client.tracking_uri - if base_uri.startswith("http"): + base_uri = ( + self.config.base_external_url + or self._get_base_external_url_from_tracking_uri() + ) + if base_uri: return f"{base_uri.rstrip('/')}/#/models/{model_version.name}/versions/{model_version.version}" else: return None diff --git a/metadata-ingestion/tests/unit/test_mlflow_source.py b/metadata-ingestion/tests/unit/test_mlflow_source.py index d213dd92352e6..e882296b6f331 100644 --- a/metadata-ingestion/tests/unit/test_mlflow_source.py +++ b/metadata-ingestion/tests/unit/test_mlflow_source.py @@ -136,3 +136,16 @@ def test_make_external_link_remote(source, model_version): url = source._make_external_url(model_version) assert url == expected_url + + +def test_make_external_link_remote_via_config(source, model_version): + custom_base_url = "https://custom-server.org" + source.config.base_external_url = custom_base_url + source.client = MlflowClient( + tracking_uri="https://dummy-mlflow-tracking-server.org" + ) + expected_url = f"{custom_base_url}/#/models/{model_version.name}/versions/{model_version.version}" + + url = source._make_external_url(model_version) + + assert url == expected_url From 4392d72456faae5f0f59eb09756287182feec56b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 18 Dec 2024 20:29:34 -0500 Subject: [PATCH 04/17] fix(cli/properties): fix data type validation (#12170) --- .../structuredproperties.py | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py index e37281dea86e1..619f69b016262 100644 --- a/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py +++ b/metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py @@ -14,7 +14,7 @@ PropertyValueClass, StructuredPropertyDefinitionClass, ) -from datahub.metadata.urns import StructuredPropertyUrn, Urn +from datahub.metadata.urns import DataTypeUrn, StructuredPropertyUrn, Urn from datahub.utilities.urns._urn_base import URN_TYPES logging.basicConfig(level=logging.INFO) @@ -86,19 +86,31 @@ class StructuredProperties(ConfigModel): @validator("type") def validate_type(cls, v: str) -> str: - # Convert to lowercase if needed - if not v.islower(): + # This logic is somewhat hacky, since we need to deal with + # 1. fully qualified urns + # 2. raw data types, that need to get the datahub namespace prefix + # While keeping the user-facing interface and error messages clean. + + if not v.startswith("urn:li:") and not v.islower(): + # Convert to lowercase if needed + v = v.lower() logger.warning( - f"Structured property type should be lowercase. Updated to {v.lower()}" + f"Structured property type should be lowercase. 
Updated to {v}" ) - v = v.lower() + + urn = Urn.make_data_type_urn(v) # Check if type is allowed - if not AllowedTypes.check_allowed_type(v): + data_type_urn = DataTypeUrn.from_string(urn) + unqualified_data_type = data_type_urn.id + if unqualified_data_type.startswith("datahub."): + unqualified_data_type = unqualified_data_type[len("datahub.") :] + if not AllowedTypes.check_allowed_type(unqualified_data_type): raise ValueError( - f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}" + f"Type {unqualified_data_type} is not allowed. Allowed types are {AllowedTypes.values()}" ) - return v + + return urn @property def fqn(self) -> str: From 48f3cc578589c5c0379d5117756f01a0228669b4 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 18 Dec 2024 21:53:20 -0600 Subject: [PATCH 05/17] fix(pgsql): Postgres doesn't support UNION select with FOR UPDATE (#12169) --- .../metadata/entity/ebean/EbeanAspectDao.java | 87 ++++++++++++++++++- .../metadata/config/EbeanConfiguration.java | 1 + .../src/main/resources/application.yaml | 1 + 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java index bd6cc67561b88..ea580a97c5188 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/ebean/EbeanAspectDao.java @@ -93,8 +93,14 @@ public class EbeanAspectDao implements AspectDao, AspectMigrationsDao { */ private final LoadingCache locks; + private final String batchGetMethod; + public EbeanAspectDao(@Nonnull final Database server, EbeanConfiguration ebeanConfiguration) { _server = server; + this.batchGetMethod = + ebeanConfiguration.getBatchGetMethod() != null + ? ebeanConfiguration.getBatchGetMethod() + : "IN"; if (ebeanConfiguration.getLocking().isEnabled()) { this.locks = CacheBuilder.newBuilder() @@ -371,23 +377,37 @@ private List batchGet( final int totalPageCount = QueryUtils.getTotalPageCount(keys.size(), keysCount); final List finalResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); while (QueryUtils.hasMore(position, keysCount, totalPageCount)) { position += keysCount; final List oneStatementResult = - batchGetUnion(new ArrayList<>(keys), keysCount, position, forUpdate); + batchGetSelectString(new ArrayList<>(keys), keysCount, position, forUpdate); finalResult.addAll(oneStatementResult); } return finalResult; } + @Nonnull + private List batchGetSelectString( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + + if (batchGetMethod.equals("IN")) { + return batchGetIn(keys, keysCount, position, forUpdate); + } + + return batchGetUnion(keys, keysCount, position, forUpdate); + } + /** * Builds a single SELECT statement for batch get, which selects one entity, and then can be * UNION'd with other SELECT statements. 
*/ - private String batchGetSelect( + private String batchGetSelectString( final int selectId, @Nonnull final String urn, @Nonnull final String aspect, @@ -434,7 +454,7 @@ private List batchGetUnion( final Map params = new HashMap<>(); for (int index = position; index < end; index++) { sb.append( - batchGetSelect( + batchGetSelectString( index - position, keys.get(index).getUrn(), keys.get(index).getAspect(), @@ -467,6 +487,65 @@ private List batchGetUnion( return query.findList(); } + @Nonnull + private List batchGetIn( + @Nonnull final List keys, + final int keysCount, + final int position, + boolean forUpdate) { + validateConnection(); + + // Build a single SELECT with IN clause using composite key comparison + // Query will look like: + // SELECT * FROM metadata_aspect WHERE (urn, aspect, version) IN + // (('urn0', 'aspect0', 0), ('urn1', 'aspect1', 1)) + final StringBuilder sb = new StringBuilder(); + sb.append( + "SELECT urn, aspect, version, metadata, systemMetadata, createdOn, createdBy, createdFor "); + sb.append("FROM metadata_aspect_v2 WHERE (urn, aspect, version) IN ("); + + final int end = Math.min(keys.size(), position + keysCount); + final Map params = new HashMap<>(); + + for (int index = position; index < end; index++) { + int paramIndex = index - position; + String urnParam = "urn" + paramIndex; + String aspectParam = "aspect" + paramIndex; + String versionParam = "version" + paramIndex; + + params.put(urnParam, keys.get(index).getUrn()); + params.put(aspectParam, keys.get(index).getAspect()); + params.put(versionParam, keys.get(index).getVersion()); + + sb.append("(:" + urnParam + ", :" + aspectParam + ", :" + versionParam + ")"); + + if (index != end - 1) { + sb.append(","); + } + } + + sb.append(")"); + + if (forUpdate) { + sb.append(" FOR UPDATE"); + } + + final RawSql rawSql = + RawSqlBuilder.parse(sb.toString()) + .columnMapping(EbeanAspectV2.URN_COLUMN, "key.urn") + .columnMapping(EbeanAspectV2.ASPECT_COLUMN, "key.aspect") + .columnMapping(EbeanAspectV2.VERSION_COLUMN, "key.version") + .create(); + + final Query query = _server.find(EbeanAspectV2.class).setRawSql(rawSql); + + for (Map.Entry param : params.entrySet()) { + query.setParameter(param.getKey(), param.getValue()); + } + + return query.findList(); + } + @Override @Nonnull public ListResult listUrns( diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java index 47b406e695a3f..6eb31e14a2d3b 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/EbeanConfiguration.java @@ -23,6 +23,7 @@ public class EbeanConfiguration { private boolean autoCreateDdl; private boolean postgresUseIamAuth; private LockingConfiguration locking; + private String batchGetMethod; public static final EbeanConfiguration testDefault = EbeanConfiguration.builder().locking(LockingConfiguration.testDefault).build(); diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 9010d77015f16..b997bc108e4ba 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -164,6 +164,7 @@ ebean: waitTimeoutMillis: ${EBEAN_WAIT_TIMEOUT_MILLIS:1000} autoCreateDdl: 
${EBEAN_AUTOCREATE:false} postgresUseIamAuth: ${EBEAN_POSTGRES_USE_AWS_IAM_AUTH:false} + batchGetMethod: ${EBEAN_BATCH_GET_METHOD:IN} # Alternative UNION locking: enabled: ${EBEAN_LOCKING_ENABLED:false} durationSeconds: ${EBEAN_LOCKING_DURATION_SECONDS:60} From 953893cf2e72e71580b21bdfc12592fca572e13b Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:39:47 +0530 Subject: [PATCH 06/17] refactor(ingest/kafka-connect): define interface for new connector impl (#12149) --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/kafka/kafka_connect.py | 1468 ----------------- .../source/kafka_connect/__init__.py | 0 .../ingestion/source/kafka_connect/common.py | 202 +++ .../source/kafka_connect/kafka_connect.py | 367 +++++ .../source/kafka_connect/sink_connectors.py | 341 ++++ .../source/kafka_connect/source_connectors.py | 570 +++++++ 7 files changed, 1481 insertions(+), 1469 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 6334b3abbb8a0..c6994dd6d5aa6 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -741,7 +741,7 @@ "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", "kafka = datahub.ingestion.source.kafka.kafka:KafkaSource", - "kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource", + "kafka-connect = datahub.ingestion.source.kafka_connect.kafka_connect:KafkaConnectSource", "ldap = datahub.ingestion.source.ldap:LDAPSource", "looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource", "lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py deleted file mode 100644 index 23a99ccb310e1..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka_connect.py +++ /dev/null @@ -1,1468 +0,0 @@ -import logging -import re -from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple - -import jpype -import jpype.imports -import requests -from pydantic.fields import Field -from sqlalchemy.engine.url import make_url - -import datahub.emitter.mce_builder as builder -import datahub.metadata.schema_classes as models -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import ( - DatasetLineageProviderConfigBase, - PlatformInstanceConfigMixin, -) -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.decorators import ( - SourceCapability, - SupportStatus, - capability, - config_class, - platform_name, - support_status, -) -from datahub.ingestion.api.source import 
MetadataWorkUnitProcessor, Source -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( - get_platform_from_sqlalchemy_uri, -) -from datahub.ingestion.source.state.stale_entity_removal_handler import ( - StaleEntityRemovalHandler, - StaleEntityRemovalSourceReport, - StatefulStaleMetadataRemovalConfig, -) -from datahub.ingestion.source.state.stateful_ingestion_base import ( - StatefulIngestionConfigBase, - StatefulIngestionSourceBase, -) - -logger = logging.getLogger(__name__) - -KAFKA = "kafka" -SOURCE = "source" -SINK = "sink" -CONNECTOR_CLASS = "connector.class" - - -class ProvidedConfig(ConfigModel): - provider: str - path_key: str - value: str - - -class GenericConnectorConfig(ConfigModel): - connector_name: str - source_dataset: str - source_platform: str - - -class KafkaConnectSourceConfig( - PlatformInstanceConfigMixin, - DatasetLineageProviderConfigBase, - StatefulIngestionConfigBase, -): - # See the Connect REST Interface for details - # https://docs.confluent.io/platform/current/connect/references/restapi.html# - connect_uri: str = Field( - default="http://localhost:8083/", description="URI to connect to." - ) - username: Optional[str] = Field(default=None, description="Kafka Connect username.") - password: Optional[str] = Field(default=None, description="Kafka Connect password.") - cluster_name: Optional[str] = Field( - default="connect-cluster", description="Cluster to ingest from." - ) - # convert lineage dataset's urns to lowercase - convert_lineage_urns_to_lowercase: bool = Field( - default=False, - description="Whether to convert the urns of ingested lineage dataset to lowercase", - ) - connector_patterns: AllowDenyPattern = Field( - default=AllowDenyPattern.allow_all(), - description="regex patterns for connectors to filter for ingestion.", - ) - provided_configs: Optional[List[ProvidedConfig]] = Field( - default=None, description="Provided Configurations" - ) - connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( - default=None, - description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', - ) - platform_instance_map: Optional[Dict[str, str]] = Field( - default=None, - description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', - ) - generic_connectors: List[GenericConnectorConfig] = Field( - default=[], - description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", - ) - - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None - - -@dataclass -class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): - connectors_scanned: int = 0 - filtered: List[str] = field(default_factory=list) - - def report_connector_scanned(self, connector: str) -> None: - self.connectors_scanned += 1 - - def report_dropped(self, connector: str) -> None: - self.filtered.append(connector) - - -@dataclass -class KafkaConnectLineage: - """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" - - source_platform: str - target_dataset: str - target_platform: str - job_property_bag: Optional[Dict[str, str]] = None - source_dataset: Optional[str] = None - - -@dataclass -class ConnectorManifest: - """Each instance is potential DataFlow""" - - name: str - type: str - config: Dict - tasks: Dict - url: Optional[str] = None - flow_property_bag: Optional[Dict[str, str]] = None - lineages: List[KafkaConnectLineage] = field(default_factory=list) - topic_names: Iterable[str] = field(default_factory=list) - - -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - index = len(prefix) - return text[index:] - return text - - -def unquote( - string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None -) -> str: - """ - If string starts and ends with a quote, unquote it - """ - trailing_quote = trailing_quote if trailing_quote else leading_quote - if string.startswith(leading_quote) and string.endswith(trailing_quote): - string = string[1:-1] - return string - - -def get_dataset_name( - database_name: Optional[str], - source_table: str, -) -> str: - if database_name: - dataset_name = database_name + "." + source_table - else: - dataset_name = source_table - - return dataset_name - - -def get_platform_instance( - config: KafkaConnectSourceConfig, connector_name: str, platform: str -) -> Optional[str]: - instance_name = None - if ( - config.connect_to_platform_map - and config.connect_to_platform_map.get(connector_name) - and config.connect_to_platform_map[connector_name].get(platform) - ): - instance_name = config.connect_to_platform_map[connector_name][platform] - if config.platform_instance_map and config.platform_instance_map.get(platform): - logger.warning( - f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." - "Will prefer connector specific platform instance from connect_to_platform_map." 
- ) - elif config.platform_instance_map and config.platform_instance_map.get(platform): - instance_name = config.platform_instance_map[platform] - logger.info( - f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" - ) - return instance_name - - -@dataclass -class ConfluentJDBCSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" - KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] - # https://kafka.apache.org/documentation/#connect_included_transformation - KAFKA_NONTOPICROUTING_TRANSFORMS = [ - "InsertField", - "InsertField$Key", - "InsertField$Value", - "ReplaceField", - "ReplaceField$Key", - "ReplaceField$Value", - "MaskField", - "MaskField$Key", - "MaskField$Value", - "ValueToKey", - "ValueToKey$Key", - "ValueToKey$Value", - "HoistField", - "HoistField$Key", - "HoistField$Value", - "ExtractField", - "ExtractField$Key", - "ExtractField$Value", - "SetSchemaMetadata", - "SetSchemaMetadata$Key", - "SetSchemaMetadata$Value", - "Flatten", - "Flatten$Key", - "Flatten$Value", - "Cast", - "Cast$Key", - "Cast$Value", - "HeadersFrom", - "HeadersFrom$Key", - "HeadersFrom$Value", - "TimestampConverter", - "Filter", - "InsertHeader", - "DropHeaders", - ] - # https://docs.confluent.io/platform/current/connect/transforms/overview.html - CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ - "Drop", - "Drop$Key", - "Drop$Value", - "Filter", - "Filter$Key", - "Filter$Value", - "TombstoneHandler", - ] - KNOWN_NONTOPICROUTING_TRANSFORMS = ( - KAFKA_NONTOPICROUTING_TRANSFORMS - + [ - f"org.apache.kafka.connect.transforms.{t}" - for t in KAFKA_NONTOPICROUTING_TRANSFORMS - ] - + CONFLUENT_NONTOPICROUTING_TRANSFORMS - + [ - f"io.confluent.connect.transforms.{t}" - for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS - ] - ) - - @dataclass - class JdbcParser: - db_connection_url: str - source_platform: str - database_name: str - topic_prefix: str - query: str - transforms: list - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> JdbcParser: - url = remove_prefix( - str(connector_manifest.config.get("connection.url")), "jdbc:" - ) - url_instance = make_url(url) - source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) - database_name = url_instance.database - assert database_name - db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" - - topic_prefix = self.connector_manifest.config.get("topic.prefix", None) - - query = self.connector_manifest.config.get("query", None) - - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - return self.JdbcParser( - db_connection_url, - source_platform, - database_name, - topic_prefix, - query, - transforms, - ) - - def default_get_lineages( - self, - topic_prefix: str, - database_name: 
str, - source_platform: str, - topic_names: Optional[Iterable[str]] = None, - include_source_dataset: bool = True, - ) -> List[KafkaConnectLineage]: - lineages: List[KafkaConnectLineage] = [] - if not topic_names: - topic_names = self.connector_manifest.topic_names - table_name_tuples: List[Tuple] = self.get_table_names() - for topic in topic_names: - # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) - source_table: str = ( - remove_prefix(topic, topic_prefix) if topic_prefix else topic - ) - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform): - table_name_tuple: Tuple = next( - iter([t for t in table_name_tuples if t and t[-1] == source_table]), - (), - ) - if len(table_name_tuple) > 1: - source_table = f"{table_name_tuple[-2]}.{source_table}" - else: - include_source_dataset = False - self.report.warning( - "Could not find schema for table" - f"{self.connector_manifest.name} : {source_table}", - ) - dataset_name: str = get_dataset_name(database_name, source_table) - lineage = KafkaConnectLineage( - source_dataset=dataset_name if include_source_dataset else None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - return lineages - - def get_table_names(self) -> List[Tuple]: - sep: str = "." - leading_quote_char: str = '"' - trailing_quote_char: str = leading_quote_char - - table_ids: List[str] = [] - if self.connector_manifest.tasks: - table_ids = ( - ",".join( - [ - task["config"].get("tables") - for task in self.connector_manifest.tasks - ] - ) - ).split(",") - quote_method = self.connector_manifest.config.get( - "quote.sql.identifiers", "always" - ) - if ( - quote_method == "always" - and table_ids - and table_ids[0] - and table_ids[-1] - ): - leading_quote_char = table_ids[0][0] - trailing_quote_char = table_ids[-1][-1] - # This will only work for single character quotes - elif self.connector_manifest.config.get("table.whitelist"): - table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore - - # List of Tuple containing (schema, table) - tables: List[Tuple] = [ - ( - ( - unquote( - table_id.split(sep)[-2], leading_quote_char, trailing_quote_char - ) - if len(table_id.split(sep)) > 1 - else "" - ), - unquote( - table_id.split(sep)[-1], leading_quote_char, trailing_quote_char - ), - ) - for table_id in table_ids - ] - return tables - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - database_name = parser.database_name - query = parser.query - topic_prefix = parser.topic_prefix - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # Mask/Remove properties that may reveal credentials - self.connector_manifest.flow_property_bag[ - "connection.url" - ] = parser.db_connection_url - if "connection.password" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.password"] - if "connection.user" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["connection.user"] - - logging.debug( - f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " - ) - - if not self.connector_manifest.topic_names: - self.connector_manifest.lineages 
= lineages - return - - if query: - # Lineage source_table can be extracted by parsing query - for topic in self.connector_manifest.topic_names: - # default method - as per earlier implementation - dataset_name: str = get_dataset_name(database_name, topic) - - lineage = KafkaConnectLineage( - source_dataset=None, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.report.warning( - "Could not find input dataset, the connector has query configuration set", - self.connector_manifest.name, - ) - self.connector_manifest.lineages = lineages - return - - SINGLE_TRANSFORM = len(transforms) == 1 - NO_TRANSFORM = len(transforms) == 0 - UNKNOWN_TRANSFORM = any( - [ - transform["type"] - not in self.KNOWN_TOPICROUTING_TRANSFORMS - + self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - ALL_TRANSFORMS_NON_TOPICROUTING = all( - [ - transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS - for transform in transforms - ] - ) - - if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: - self.connector_manifest.lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - ) - return - - if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: - tables = self.get_table_names() - topic_names = list(self.connector_manifest.topic_names) - - from java.util.regex import Pattern - - for table in tables: - source_table: str = table[-1] - topic = topic_prefix + source_table if topic_prefix else source_table - - transform_regex = Pattern.compile(transforms[0]["regex"]) - transform_replacement = transforms[0]["replacement"] - - matcher = transform_regex.matcher(topic) - if matcher.matches(): - topic = str(matcher.replaceFirst(transform_replacement)) - - # Additional check to confirm that the topic present - # in connector topics - - if topic in self.connector_manifest.topic_names: - # include schema name for three-level hierarchies - if has_three_level_hierarchy(source_platform) and len(table) > 1: - source_table = f"{table[-2]}.{table[-1]}" - - dataset_name = get_dataset_name(database_name, source_table) - - lineage = KafkaConnectLineage( - source_dataset=dataset_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - topic_names.remove(topic) - lineages.append(lineage) - - if topic_names: - lineages.extend( - self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - topic_names=topic_names, - include_source_dataset=False, - ) - ) - self.report.warning( - "Could not find input dataset for connector topics", - f"{self.connector_manifest.name} : {topic_names}", - ) - self.connector_manifest.lineages = lineages - return - else: - include_source_dataset = True - if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has unknown transform", - f"{self.connector_manifest.name} : {transforms[0]['type']}", - ) - include_source_dataset = False - if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: - self.report.warning( - "Could not find input dataset, connector has one or more unknown transforms", - self.connector_manifest.name, - ) - include_source_dataset = False - lineages = self.default_get_lineages( - database_name=database_name, - source_platform=source_platform, - topic_prefix=topic_prefix, - include_source_dataset=include_source_dataset, - ) - self.connector_manifest.lineages = lineages - 
return - - -@dataclass -class MongoSourceConnector: - # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ - - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, config: KafkaConnectSourceConfig - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self._extract_lineages() - - @dataclass - class MongoSourceParser: - db_connection_url: Optional[str] - source_platform: str - database_name: Optional[str] - topic_prefix: Optional[str] - transforms: List[str] - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> MongoSourceParser: - parser = self.MongoSourceParser( - db_connection_url=connector_manifest.config.get("connection.uri"), - source_platform="mongodb", - database_name=connector_manifest.config.get("database"), - topic_prefix=connector_manifest.config.get("topic_prefix"), - transforms=( - connector_manifest.config["transforms"].split(",") - if "transforms" in connector_manifest.config - else [] - ), - ) - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(found.group(1), found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - - -@dataclass -class DebeziumSourceConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, - connector_manifest: ConnectorManifest, - config: KafkaConnectSourceConfig, - report: KafkaConnectSourceReport, - ) -> None: - self.connector_manifest = connector_manifest - self.config = config - self.report = report - self._extract_lineages() - - @dataclass - class DebeziumParser: - source_platform: str - server_name: Optional[str] - database_name: Optional[str] - - def get_server_name(self, connector_manifest: ConnectorManifest) -> str: - if "topic.prefix" in connector_manifest.config: - return connector_manifest.config["topic.prefix"] - else: - return connector_manifest.config.get("database.server.name", "") - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> DebeziumParser: - connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") - - if connector_class == "io.debezium.connector.mysql.MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "MySqlConnector": - parser = self.DebeziumParser( - source_platform="mysql", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": - parser = self.DebeziumParser( - source_platform="mongodb", - server_name=self.get_server_name(connector_manifest), - database_name=None, - ) - elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": - parser = self.DebeziumParser( - source_platform="postgres", - server_name=self.get_server_name(connector_manifest), - 
database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.oracle.OracleConnector": - parser = self.DebeziumParser( - source_platform="oracle", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": - database_name = connector_manifest.config.get( - "database.names" - ) or connector_manifest.config.get("database.dbname") - - if "," in str(database_name): - raise Exception( - f"Only one database is supported for Debezium's SQL Server connector. Found: {database_name}" - ) - - parser = self.DebeziumParser( - source_platform="mssql", - server_name=self.get_server_name(connector_manifest), - database_name=database_name, - ) - elif connector_class == "io.debezium.connector.db2.Db2Connector": - parser = self.DebeziumParser( - source_platform="db2", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("database.dbname"), - ) - elif connector_class == "io.debezium.connector.vitess.VitessConnector": - parser = self.DebeziumParser( - source_platform="vitess", - server_name=self.get_server_name(connector_manifest), - database_name=connector_manifest.config.get("vitess.keyspace"), - ) - else: - raise ValueError(f"Connector class '{connector_class}' is unknown.") - - return parser - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - - try: - parser = self.get_parser(self.connector_manifest) - source_platform = parser.source_platform - server_name = parser.server_name - database_name = parser.database_name - topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" - - if not self.connector_manifest.topic_names: - return lineages - - for topic in self.connector_manifest.topic_names: - found = re.search(re.compile(topic_naming_pattern), topic) - - if found: - table_name = get_dataset_name(database_name, found.group(2)) - - lineage = KafkaConnectLineage( - source_dataset=table_name, - source_platform=source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - lineages.append(lineage) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -@dataclass -class BigQuerySinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class BQParser: - project: str - target_platform: str - sanitizeTopics: str - transforms: list - topicsToTables: Optional[str] = None - datasets: Optional[str] = None - defaultDataset: Optional[str] = None - version: str = "v1" - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> BQParser: - project = connector_manifest.config["project"] - sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") - transform_names = ( - self.connector_manifest.config.get("transforms", "").split(",") - if self.connector_manifest.config.get("transforms") - else [] - ) - transforms = [] - for name in transform_names: - transform = {"name": name} - transforms.append(transform) - for key in self.connector_manifest.config.keys(): - if key.startswith(f"transforms.{name}."): - transform[ - 
key.replace(f"transforms.{name}.", "") - ] = self.connector_manifest.config[key] - - if "defaultDataset" in connector_manifest.config: - defaultDataset = connector_manifest.config["defaultDataset"] - return self.BQParser( - project=project, - defaultDataset=defaultDataset, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - version="v2", - transforms=transforms, - ) - else: - # version 1.6.x and similar configs supported - datasets = connector_manifest.config["datasets"] - topicsToTables = connector_manifest.config.get("topicsToTables") - - return self.BQParser( - project=project, - topicsToTables=topicsToTables, - datasets=datasets, - target_platform="bigquery", - sanitizeTopics=sanitizeTopics.lower() == "true", - transforms=transforms, - ) - - def get_list(self, property: str) -> Iterable[Tuple[str, str]]: - entries = property.split(",") - for entry in entries: - key, val = entry.rsplit("=") - yield (key.strip(), val.strip()) - - def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: - topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore - from java.util.regex import Pattern - - for pattern, dataset in topicregex_dataset_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - return dataset - return None - - def sanitize_table_name(self, table_name): - table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - - return table_name - - def get_dataset_table_for_topic( - self, topic: str, parser: BQParser - ) -> Optional[str]: - if parser.version == "v2": - dataset = parser.defaultDataset - parts = topic.split(":") - if len(parts) == 2: - dataset = parts[0] - table = parts[1] - else: - table = parts[0] - else: - dataset = self.get_dataset_for_topic_v1(topic, parser) - if dataset is None: - return None - - table = topic - if parser.topicsToTables: - topicregex_table_map: Dict[str, str] = dict( - self.get_list(parser.topicsToTables) # type: ignore - ) - from java.util.regex import Pattern - - for pattern, tbl in topicregex_table_map.items(): - patternMatcher = Pattern.compile(pattern).matcher(topic) - if patternMatcher.matches(): - table = tbl - break - - if parser.sanitizeTopics: - table = self.sanitize_table_name(table) - return f"{dataset}.{table}" - - def apply_transformations( - self, topic: str, transforms: List[Dict[str, str]] - ) -> str: - for transform in transforms: - if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": - regex = transform["regex"] - replacement = transform["replacement"] - pattern = re.compile(regex) - if pattern.match(topic): - topic = pattern.sub(replacement, topic, count=1) - return topic - - def _extract_lineages(self): - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - if not parser: - return lineages - target_platform = parser.target_platform - project = parser.project - transforms = parser.transforms - self.connector_manifest.flow_property_bag = self.connector_manifest.config - # Mask/Remove properties that may reveal credentials - if "keyfile" in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag["keyfile"] - - for topic in self.connector_manifest.topic_names: - transformed_topic = self.apply_transformations(topic, transforms) - dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) - if dataset_table is None: 
- self.report.warning( - "Could not find target dataset for topic, please check your connector configuration" - f"{self.connector_manifest.name} : {transformed_topic} ", - ) - continue - target_dataset = f"{project}.{dataset_table}" - - lineages.append( - KafkaConnectLineage( - source_dataset=transformed_topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform=target_platform, - ) - ) - self.connector_manifest.lineages = lineages - return - - -@dataclass -class SnowflakeSinkConnector: - connector_manifest: ConnectorManifest - report: KafkaConnectSourceReport - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class SnowflakeParser: - database_name: str - schema_name: str - topics_to_tables: Dict[str, str] - - def get_table_name_from_topic_name(self, topic_name: str) -> str: - """ - This function converts the topic name to a valid Snowflake table name using some rules. - Refer below link for more info - https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics - """ - table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) - if re.match("^[^a-zA-Z_].*", table_name): - table_name = "_" + table_name - # Connector may append original topic's hash code as suffix for conflict resolution - # if generated table names for 2 topics are similar. This corner case is not handled here. - # Note that Snowflake recommends to choose topic names that follow the rules for - # Snowflake identifier names so this case is not recommended by snowflake. - return table_name - - def get_parser( - self, - connector_manifest: ConnectorManifest, - ) -> SnowflakeParser: - database_name = connector_manifest.config["snowflake.database.name"] - schema_name = connector_manifest.config["snowflake.schema.name"] - - # Fetch user provided topic to table map - provided_topics_to_tables: Dict[str, str] = {} - if connector_manifest.config.get("snowflake.topic2table.map"): - for each in connector_manifest.config["snowflake.topic2table.map"].split( - "," - ): - topic, table = each.split(":") - provided_topics_to_tables[topic.strip()] = table.strip() - - topics_to_tables: Dict[str, str] = {} - # Extract lineage for only those topics whose data ingestion started - for topic in connector_manifest.topic_names: - if topic in provided_topics_to_tables: - # If user provided which table to get mapped with this topic - topics_to_tables[topic] = provided_topics_to_tables[topic] - else: - # Else connector converts topic name to a valid Snowflake table name. 
- topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) - - return self.SnowflakeParser( - database_name=database_name, - schema_name=schema_name, - topics_to_tables=topics_to_tables, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # For all snowflake sink connector properties, refer below link - # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector - # remove private keys, secrets from properties - secret_properties = [ - "snowflake.private.key", - "snowflake.private.key.passphrase", - "value.converter.basic.auth.user.info", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - lineages: List[KafkaConnectLineage] = list() - parser = self.get_parser(self.connector_manifest) - - for topic, table in parser.topics_to_tables.items(): - target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform=KAFKA, - target_dataset=target_dataset, - target_platform="snowflake", - ) - ) - - self.connector_manifest.lineages = lineages - return - - -@dataclass -class ConfluentS3SinkConnector: - connector_manifest: ConnectorManifest - - def __init__( - self, connector_manifest: ConnectorManifest, report: KafkaConnectSourceReport - ) -> None: - self.connector_manifest = connector_manifest - self.report = report - self._extract_lineages() - - @dataclass - class S3SinkParser: - target_platform: str - bucket: str - topics_dir: str - topics: Iterable[str] - - def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 - bucket = connector_manifest.config.get("s3.bucket.name") - if not bucket: - raise ValueError( - "Could not find 's3.bucket.name' in connector configuration" - ) - - # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage - topics_dir = connector_manifest.config.get("topics.dir", "topics") - - return self.S3SinkParser( - target_platform="s3", - bucket=bucket, - topics_dir=topics_dir, - topics=connector_manifest.topic_names, - ) - - def _extract_lineages(self): - self.connector_manifest.flow_property_bag = self.connector_manifest.config - - # remove keys, secrets from properties - secret_properties = [ - "aws.access.key.id", - "aws.secret.access.key", - "s3.sse.customer.key", - "s3.proxy.password", - ] - for k in secret_properties: - if k in self.connector_manifest.flow_property_bag: - del self.connector_manifest.flow_property_bag[k] - - try: - parser = self._get_parser(self.connector_manifest) - - lineages: List[KafkaConnectLineage] = list() - for topic in parser.topics: - target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" - - lineages.append( - KafkaConnectLineage( - source_dataset=topic, - source_platform="kafka", - target_dataset=target_dataset, - target_platform=parser.target_platform, - ) - ) - self.connector_manifest.lineages = lineages - except Exception as e: - self.report.warning( - "Error resolving lineage for connector", - self.connector_manifest.name, - exc=e, - ) - - return - - -def transform_connector_config( - connector_config: Dict, provided_configs: List[ProvidedConfig] -) -> None: - """This method will update provided configs in connector config values, if any""" - lookupsByProvider = {} - for pconfig in 
provided_configs: - lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value - for k, v in connector_config.items(): - for key, value in lookupsByProvider.items(): - if key in v: - connector_config[k] = connector_config[k].replace(key, value) - - -@platform_name("Kafka Connect") -@config_class(KafkaConnectSourceConfig) -@support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") -@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") -@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -class KafkaConnectSource(StatefulIngestionSourceBase): - config: KafkaConnectSourceConfig - report: KafkaConnectSourceReport - platform: str = "kafka-connect" - - def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): - super().__init__(config, ctx) - self.config = config - self.report = KafkaConnectSourceReport() - self.session = requests.Session() - self.session.headers.update( - { - "Accept": "application/json", - "Content-Type": "application/json", - } - ) - - # Test the connection - if self.config.username is not None and self.config.password is not None: - logger.info( - f"Connecting to {self.config.connect_uri} with Authentication..." - ) - self.session.auth = (self.config.username, self.config.password) - - test_response = self.session.get(f"{self.config.connect_uri}/connectors") - test_response.raise_for_status() - logger.info(f"Connection to {self.config.connect_uri} is ok") - if not jpype.isJVMStarted(): - jpype.startJVM() - - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: - config = KafkaConnectSourceConfig.parse_obj(config_dict) - return cls(config, ctx) - - def get_connectors_manifest(self) -> List[ConnectorManifest]: - """Get Kafka Connect connectors manifest using REST API. - Enrich with lineages metadata. 
- """ - connectors_manifest = list() - - connector_response = self.session.get( - f"{self.config.connect_uri}/connectors", - ) - - payload = connector_response.json() - - for connector_name in payload: - connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" - connector_manifest = self._get_connector_manifest( - connector_name, connector_url - ) - if ( - connector_manifest is None - or not self.config.connector_patterns.allowed(connector_manifest.name) - ): - self.report.report_dropped(connector_name) - continue - - if self.config.provided_configs: - transform_connector_config( - connector_manifest.config, self.config.provided_configs - ) - # Initialize connector lineages - connector_manifest.lineages = list() - connector_manifest.url = connector_url - - connector_manifest.topic_names = self._get_connector_topics(connector_name) - - # Populate Source Connector metadata - if connector_manifest.type == SOURCE: - connector_manifest.tasks = self._get_connector_tasks(connector_name) - - # JDBC source connector lineages - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "io.confluent.connect.jdbc.JdbcSourceConnector" - ): - connector_manifest = ConfluentJDBCSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif connector_manifest.config.get(CONNECTOR_CLASS, "").startswith( - "io.debezium.connector" - ): - connector_manifest = DebeziumSourceConnector( - connector_manifest=connector_manifest, - config=self.config, - report=self.report, - ).connector_manifest - elif ( - connector_manifest.config.get(CONNECTOR_CLASS, "") - == "com.mongodb.kafka.connect.MongoSourceConnector" - ): - connector_manifest = MongoSourceConnector( - connector_manifest=connector_manifest, config=self.config - ).connector_manifest - else: - # Find the target connector object in the list, or log an error if unknown. - target_connector = None - for connector in self.config.generic_connectors: - if connector.connector_name == connector_manifest.name: - target_connector = connector - break - if not target_connector: - logger.warning( - f"Detected undefined connector {connector_manifest.name}, which is not in the customized connector list. Please refer to Kafka Connect ingestion recipe to define this customized connector." - ) - continue - - for topic in connector_manifest.topic_names: - lineage = KafkaConnectLineage( - source_dataset=target_connector.source_dataset, - source_platform=target_connector.source_platform, - target_dataset=topic, - target_platform=KAFKA, - ) - - connector_manifest.lineages.append(lineage) - - if connector_manifest.type == SINK: - if connector_manifest.config.get(CONNECTOR_CLASS).__eq__( - "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" - ): - connector_manifest = BigQuerySinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "io.confluent.connect.s3.S3SinkConnector" - ): - connector_manifest = ConfluentS3SinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - elif connector_manifest.config.get("connector.class").__eq__( - "com.snowflake.kafka.connector.SnowflakeSinkConnector" - ): - connector_manifest = SnowflakeSinkConnector( - connector_manifest=connector_manifest, report=self.report - ).connector_manifest - else: - self.report.report_dropped(connector_manifest.name) - logger.warning( - f"Skipping connector {connector_manifest.name}. 
Lineage for Connector not yet implemented" - ) - pass - - connectors_manifest.append(connector_manifest) - - return connectors_manifest - - def _get_connector_manifest( - self, connector_name: str, connector_url: str - ) -> Optional[ConnectorManifest]: - try: - connector_response = self.session.get(connector_url) - connector_response.raise_for_status() - except Exception as e: - self.report.warning( - "Failed to get connector details", connector_name, exc=e - ) - return None - manifest = connector_response.json() - connector_manifest = ConnectorManifest(**manifest) - return connector_manifest - - def _get_connector_tasks(self, connector_name: str) -> dict: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/tasks", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector tasks", context=connector_name, exc=e - ) - return {} - - return response.json() - - def _get_connector_topics(self, connector_name: str) -> List[str]: - try: - response = self.session.get( - f"{self.config.connect_uri}/connectors/{connector_name}/topics", - ) - response.raise_for_status() - except Exception as e: - self.report.warning( - "Error getting connector topics", context=connector_name, exc=e - ) - return [] - - return response.json()[connector_name]["topics"] - - def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: - connector_name = connector.name - connector_type = connector.type - connector_class = connector.config.get(CONNECTOR_CLASS) - flow_property_bag = connector.flow_property_bag - # connector_url = connector.url # NOTE: this will expose connector credential when used - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - return MetadataChangeProposalWrapper( - entityUrn=flow_urn, - aspect=models.DataFlowInfoClass( - name=connector_name, - description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", - customProperties=flow_property_bag, - # externalUrl=connector_url, # NOTE: this will expose connector credential when used - ), - ).as_workunit() - - def construct_job_workunits( - self, connector: ConnectorManifest - ) -> Iterable[MetadataWorkUnit]: - connector_name = connector.name - flow_urn = builder.make_data_flow_urn( - self.platform, - connector_name, - self.config.env, - self.config.platform_instance, - ) - - lineages = connector.lineages - if lineages: - for lineage in lineages: - source_dataset = lineage.source_dataset - source_platform = lineage.source_platform - target_dataset = lineage.target_dataset - target_platform = lineage.target_platform - job_property_bag = lineage.job_property_bag - - source_platform_instance = get_platform_instance( - self.config, connector_name, source_platform - ) - target_platform_instance = get_platform_instance( - self.config, connector_name, target_platform - ) - - job_id = self.get_job_id(lineage, connector, self.config) - job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) - - inlets = ( - [ - self.make_lineage_dataset_urn( - source_platform, source_dataset, source_platform_instance - ) - ] - if source_dataset - else [] - ) - outlets = [ - self.make_lineage_dataset_urn( - target_platform, target_dataset, target_platform_instance - ) - ] - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInfoClass( - name=f"{connector_name}:{job_id}", - type="COMMAND", - 
customProperties=job_property_bag, - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=models.DataJobInputOutputClass( - inputDatasets=inlets, - outputDatasets=outlets, - ), - ).as_workunit() - - def get_job_id( - self, - lineage: KafkaConnectLineage, - connector: ConnectorManifest, - config: KafkaConnectSourceConfig, - ) -> str: - connector_class = connector.config.get(CONNECTOR_CLASS) - - # Note - This block is only to maintain backward compatibility of Job URN - if ( - connector_class - and connector.type == SOURCE - and ( - "JdbcSourceConnector" in connector_class - or connector_class.startswith("io.debezium.connector") - ) - and lineage.source_dataset - and config.connect_to_platform_map - and config.connect_to_platform_map.get(connector.name) - and config.connect_to_platform_map[connector.name].get( - lineage.source_platform - ) - ): - return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" - - return ( - lineage.source_dataset - if lineage.source_dataset - else f"unknown_source.{lineage.target_dataset}" - ) - - def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: - return [ - *super().get_workunit_processors(), - StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ).workunit_processor, - ] - - def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - connectors_manifest = self.get_connectors_manifest() - for connector in connectors_manifest: - name = connector.name - - yield self.construct_flow_workunit(connector) - yield from self.construct_job_workunits(connector) - self.report.report_connector_scanned(name) - - def get_report(self) -> KafkaConnectSourceReport: - return self.report - - def make_lineage_dataset_urn( - self, platform: str, name: str, platform_instance: Optional[str] - ) -> str: - if self.config.convert_lineage_urns_to_lowercase: - name = name.lower() - - return builder.make_dataset_urn_with_platform_instance( - platform, name, platform_instance, self.config.env - ) - - -# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. 
-def has_three_level_hierarchy(platform: str) -> bool: - return platform in ["postgres", "trino", "redshift", "snowflake"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py new file mode 100644 index 0000000000000..36f6a96c0d408 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/common.py @@ -0,0 +1,202 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Iterable, List, Optional + +from pydantic.fields import Field + +from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.source_common import ( + DatasetLineageProviderConfigBase, + PlatformInstanceConfigMixin, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, +) + +logger = logging.getLogger(__name__) + +KAFKA = "kafka" +SOURCE = "source" +SINK = "sink" +CONNECTOR_CLASS = "connector.class" + + +class ProvidedConfig(ConfigModel): + provider: str + path_key: str + value: str + + +class GenericConnectorConfig(ConfigModel): + connector_name: str + source_dataset: str + source_platform: str + + +class KafkaConnectSourceConfig( + PlatformInstanceConfigMixin, + DatasetLineageProviderConfigBase, + StatefulIngestionConfigBase, +): + # See the Connect REST Interface for details + # https://docs.confluent.io/platform/current/connect/references/restapi.html# + connect_uri: str = Field( + default="http://localhost:8083/", description="URI to connect to." + ) + username: Optional[str] = Field(default=None, description="Kafka Connect username.") + password: Optional[str] = Field(default=None, description="Kafka Connect password.") + cluster_name: Optional[str] = Field( + default="connect-cluster", description="Cluster to ingest from." + ) + # convert lineage dataset's urns to lowercase + convert_lineage_urns_to_lowercase: bool = Field( + default=False, + description="Whether to convert the urns of ingested lineage dataset to lowercase", + ) + connector_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="regex patterns for connectors to filter for ingestion.", + ) + provided_configs: Optional[List[ProvidedConfig]] = Field( + default=None, description="Provided Configurations" + ) + connect_to_platform_map: Optional[Dict[str, Dict[str, str]]] = Field( + default=None, + description='Platform instance mapping when multiple instances for a platform is available. Entry for a platform should be in either `platform_instance_map` or `connect_to_platform_map`. e.g.`connect_to_platform_map: { "postgres-connector-finance-db": "postgres": "core_finance_instance" }`', + ) + platform_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description='Platform instance mapping to use when constructing URNs. 
e.g.`platform_instance_map: { "hive": "warehouse" }`', + ) + generic_connectors: List[GenericConnectorConfig] = Field( + default=[], + description="Provide lineage graph for sources connectors other than Confluent JDBC Source Connector, Debezium Source Connector, and Mongo Source Connector", + ) + + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None + + +@dataclass +class KafkaConnectSourceReport(StaleEntityRemovalSourceReport): + connectors_scanned: int = 0 + filtered: List[str] = field(default_factory=list) + + def report_connector_scanned(self, connector: str) -> None: + self.connectors_scanned += 1 + + def report_dropped(self, connector: str) -> None: + self.filtered.append(connector) + + +@dataclass +class KafkaConnectLineage: + """Class to store Kafka Connect lineage mapping, Each instance is potential DataJob""" + + source_platform: str + target_dataset: str + target_platform: str + job_property_bag: Optional[Dict[str, str]] = None + source_dataset: Optional[str] = None + + +@dataclass +class ConnectorManifest: + """Each instance is potential DataFlow""" + + name: str + type: str + config: Dict + tasks: Dict + url: Optional[str] = None + flow_property_bag: Optional[Dict[str, str]] = None + lineages: List[KafkaConnectLineage] = field(default_factory=list) + topic_names: Iterable[str] = field(default_factory=list) + + +def remove_prefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + index = len(prefix) + return text[index:] + return text + + +def unquote( + string: str, leading_quote: str = '"', trailing_quote: Optional[str] = None +) -> str: + """ + If string starts and ends with a quote, unquote it + """ + trailing_quote = trailing_quote if trailing_quote else leading_quote + if string.startswith(leading_quote) and string.endswith(trailing_quote): + string = string[1:-1] + return string + + +def get_dataset_name( + database_name: Optional[str], + source_table: str, +) -> str: + if database_name: + dataset_name = database_name + "." + source_table + else: + dataset_name = source_table + + return dataset_name + + +def get_platform_instance( + config: KafkaConnectSourceConfig, connector_name: str, platform: str +) -> Optional[str]: + instance_name = None + if ( + config.connect_to_platform_map + and config.connect_to_platform_map.get(connector_name) + and config.connect_to_platform_map[connector_name].get(platform) + ): + instance_name = config.connect_to_platform_map[connector_name][platform] + if config.platform_instance_map and config.platform_instance_map.get(platform): + logger.warning( + f"Same source platform {platform} configured in both platform_instance_map and connect_to_platform_map." + "Will prefer connector specific platform instance from connect_to_platform_map." 
+ ) + elif config.platform_instance_map and config.platform_instance_map.get(platform): + instance_name = config.platform_instance_map[platform] + logger.info( + f"Instance name assigned is: {instance_name} for Connector Name {connector_name} and platform {platform}" + ) + return instance_name + + +def transform_connector_config( + connector_config: Dict, provided_configs: List[ProvidedConfig] +) -> None: + """This method will update provided configs in connector config values, if any""" + lookupsByProvider = {} + for pconfig in provided_configs: + lookupsByProvider[f"${{{pconfig.provider}:{pconfig.path_key}}}"] = pconfig.value + for k, v in connector_config.items(): + for key, value in lookupsByProvider.items(): + if key in v: + connector_config[k] = connector_config[k].replace(key, value) + + +# TODO: Find a more automated way to discover new platforms with 3 level naming hierarchy. +def has_three_level_hierarchy(platform: str) -> bool: + return platform in ["postgres", "trino", "redshift", "snowflake"] + + +@dataclass +class BaseConnector: + connector_manifest: ConnectorManifest + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + + def extract_lineages(self) -> List[KafkaConnectLineage]: + return [] + + def extract_flow_property_bag(self) -> Optional[Dict[str, str]]: + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py new file mode 100644 index 0000000000000..fa6b614c4b52a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/kafka_connect.py @@ -0,0 +1,367 @@ +import logging +from typing import Iterable, List, Optional, Type + +import jpype +import jpype.imports +import requests + +import datahub.emitter.mce_builder as builder +import datahub.metadata.schema_classes as models +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + SINK, + SOURCE, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + KafkaConnectSourceConfig, + KafkaConnectSourceReport, + get_platform_instance, + transform_connector_config, +) +from datahub.ingestion.source.kafka_connect.sink_connectors import ( + BIGQUERY_SINK_CONNECTOR_CLASS, + S3_SINK_CONNECTOR_CLASS, + SNOWFLAKE_SINK_CONNECTOR_CLASS, + BigQuerySinkConnector, + ConfluentS3SinkConnector, + SnowflakeSinkConnector, +) +from datahub.ingestion.source.kafka_connect.source_connectors import ( + DEBEZIUM_SOURCE_CONNECTOR_PREFIX, + JDBC_SOURCE_CONNECTOR_CLASS, + MONGO_SOURCE_CONNECTOR_CLASS, + ConfigDrivenSourceConnector, + ConfluentJDBCSourceConnector, + DebeziumSourceConnector, + MongoSourceConnector, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) + +logger = logging.getLogger(__name__) + + +@platform_name("Kafka Connect") +@config_class(KafkaConnectSourceConfig) +@support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") 
+@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +class KafkaConnectSource(StatefulIngestionSourceBase): + config: KafkaConnectSourceConfig + report: KafkaConnectSourceReport + platform: str = "kafka-connect" + + def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.config = config + self.report = KafkaConnectSourceReport() + self.session = requests.Session() + self.session.headers.update( + { + "Accept": "application/json", + "Content-Type": "application/json", + } + ) + + # Test the connection + if self.config.username is not None and self.config.password is not None: + logger.info( + f"Connecting to {self.config.connect_uri} with Authentication..." + ) + self.session.auth = (self.config.username, self.config.password) + + test_response = self.session.get(f"{self.config.connect_uri}/connectors") + test_response.raise_for_status() + logger.info(f"Connection to {self.config.connect_uri} is ok") + if not jpype.isJVMStarted(): + jpype.startJVM() + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: + config = KafkaConnectSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_connectors_manifest(self) -> Iterable[ConnectorManifest]: + """Get Kafka Connect connectors manifest using REST API. + Enrich with lineages metadata. + """ + + connector_response = self.session.get( + f"{self.config.connect_uri}/connectors", + ) + + payload = connector_response.json() + + for connector_name in payload: + connector_url = f"{self.config.connect_uri}/connectors/{connector_name}" + connector_manifest = self._get_connector_manifest( + connector_name, connector_url + ) + if ( + connector_manifest is None + or not self.config.connector_patterns.allowed(connector_manifest.name) + ): + self.report.report_dropped(connector_name) + continue + + if self.config.provided_configs: + transform_connector_config( + connector_manifest.config, self.config.provided_configs + ) + connector_manifest.url = connector_url + connector_manifest.topic_names = self._get_connector_topics(connector_name) + connector_class_value = connector_manifest.config.get(CONNECTOR_CLASS) or "" + + class_type: Type[BaseConnector] = BaseConnector + + # Populate Source Connector metadata + if connector_manifest.type == SOURCE: + connector_manifest.tasks = self._get_connector_tasks(connector_name) + + # JDBC source connector lineages + if connector_class_value == JDBC_SOURCE_CONNECTOR_CLASS: + class_type = ConfluentJDBCSourceConnector + elif connector_class_value.startswith(DEBEZIUM_SOURCE_CONNECTOR_PREFIX): + class_type = DebeziumSourceConnector + elif connector_class_value == MONGO_SOURCE_CONNECTOR_CLASS: + class_type = MongoSourceConnector + elif any( + [ + connector.connector_name == connector_manifest.name + for connector in self.config.generic_connectors + ] + ): + class_type = ConfigDrivenSourceConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Source Connector not supported. 
" + "Please refer to Kafka Connect docs to use `generic_connectors` config.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + continue + elif connector_manifest.type == SINK: + if connector_class_value == BIGQUERY_SINK_CONNECTOR_CLASS: + class_type = BigQuerySinkConnector + elif connector_class_value == S3_SINK_CONNECTOR_CLASS: + class_type = ConfluentS3SinkConnector + elif connector_class_value == SNOWFLAKE_SINK_CONNECTOR_CLASS: + class_type = SnowflakeSinkConnector + else: + self.report.report_dropped(connector_manifest.name) + self.report.warning( + "Lineage for Sink Connector not supported.", + context=f"{connector_manifest.name} of type {connector_class_value}", + ) + + connector_class = class_type(connector_manifest, self.config, self.report) + connector_manifest.lineages = connector_class.extract_lineages() + connector_manifest.flow_property_bag = ( + connector_class.extract_flow_property_bag() + ) + + yield connector_manifest + + def _get_connector_manifest( + self, connector_name: str, connector_url: str + ) -> Optional[ConnectorManifest]: + try: + connector_response = self.session.get(connector_url) + connector_response.raise_for_status() + except Exception as e: + self.report.warning( + "Failed to get connector details", connector_name, exc=e + ) + return None + manifest = connector_response.json() + connector_manifest = ConnectorManifest(**manifest) + return connector_manifest + + def _get_connector_tasks(self, connector_name: str) -> dict: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/tasks", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector tasks", context=connector_name, exc=e + ) + return {} + + return response.json() + + def _get_connector_topics(self, connector_name: str) -> List[str]: + try: + response = self.session.get( + f"{self.config.connect_uri}/connectors/{connector_name}/topics", + ) + response.raise_for_status() + except Exception as e: + self.report.warning( + "Error getting connector topics", context=connector_name, exc=e + ) + return [] + + return response.json()[connector_name]["topics"] + + def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit: + connector_name = connector.name + connector_type = connector.type + connector_class = connector.config.get(CONNECTOR_CLASS) + flow_property_bag = connector.flow_property_bag + # connector_url = connector.url # NOTE: this will expose connector credential when used + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + return MetadataChangeProposalWrapper( + entityUrn=flow_urn, + aspect=models.DataFlowInfoClass( + name=connector_name, + description=f"{connector_type.capitalize()} connector using `{connector_class}` plugin.", + customProperties=flow_property_bag, + # externalUrl=connector_url, # NOTE: this will expose connector credential when used + ), + ).as_workunit() + + def construct_job_workunits( + self, connector: ConnectorManifest + ) -> Iterable[MetadataWorkUnit]: + connector_name = connector.name + flow_urn = builder.make_data_flow_urn( + self.platform, + connector_name, + self.config.env, + self.config.platform_instance, + ) + + lineages = connector.lineages + if lineages: + for lineage in lineages: + source_dataset = lineage.source_dataset + source_platform = lineage.source_platform + target_dataset = lineage.target_dataset + target_platform = 
lineage.target_platform + job_property_bag = lineage.job_property_bag + + source_platform_instance = get_platform_instance( + self.config, connector_name, source_platform + ) + target_platform_instance = get_platform_instance( + self.config, connector_name, target_platform + ) + + job_id = self.get_job_id(lineage, connector, self.config) + job_urn = builder.make_data_job_urn_with_flow(flow_urn, job_id) + + inlets = ( + [ + self.make_lineage_dataset_urn( + source_platform, source_dataset, source_platform_instance + ) + ] + if source_dataset + else [] + ) + outlets = [ + self.make_lineage_dataset_urn( + target_platform, target_dataset, target_platform_instance + ) + ] + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInfoClass( + name=f"{connector_name}:{job_id}", + type="COMMAND", + customProperties=job_property_bag, + ), + ).as_workunit() + + yield MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=models.DataJobInputOutputClass( + inputDatasets=inlets, + outputDatasets=outlets, + ), + ).as_workunit() + + def get_job_id( + self, + lineage: KafkaConnectLineage, + connector: ConnectorManifest, + config: KafkaConnectSourceConfig, + ) -> str: + connector_class = connector.config.get(CONNECTOR_CLASS) + + # Note - This block is only to maintain backward compatibility of Job URN + if ( + connector_class + and connector.type == SOURCE + and ( + "JdbcSourceConnector" in connector_class + or connector_class.startswith("io.debezium.connector") + ) + and lineage.source_dataset + and config.connect_to_platform_map + and config.connect_to_platform_map.get(connector.name) + and config.connect_to_platform_map[connector.name].get( + lineage.source_platform + ) + ): + return f"{config.connect_to_platform_map[connector.name][lineage.source_platform]}.{lineage.source_dataset}" + + return ( + lineage.source_dataset + if lineage.source_dataset + else f"unknown_source.{lineage.target_dataset}" + ) + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + for connector in self.get_connectors_manifest(): + yield self.construct_flow_workunit(connector) + yield from self.construct_job_workunits(connector) + self.report.report_connector_scanned(connector.name) + + def get_report(self) -> KafkaConnectSourceReport: + return self.report + + def make_lineage_dataset_urn( + self, platform: str, name: str, platform_instance: Optional[str] + ) -> str: + if self.config.convert_lineage_urns_to_lowercase: + name = name.lower() + + return builder.make_dataset_urn_with_platform_instance( + platform, name, platform_instance, self.config.env + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py new file mode 100644 index 0000000000000..2790460c8e601 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/sink_connectors.py @@ -0,0 +1,341 @@ +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from datahub.ingestion.source.kafka_connect.common import ( + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, +) + + +@dataclass +class ConfluentS3SinkConnector(BaseConnector): + @dataclass + class S3SinkParser: + target_platform: str + 
bucket: str + topics_dir: str + topics: Iterable[str] + + def _get_parser(self, connector_manifest: ConnectorManifest) -> S3SinkParser: + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#s3 + bucket = connector_manifest.config.get("s3.bucket.name") + if not bucket: + raise ValueError( + "Could not find 's3.bucket.name' in connector configuration" + ) + + # https://docs.confluent.io/kafka-connectors/s3-sink/current/configuration_options.html#storage + topics_dir = connector_manifest.config.get("topics.dir", "topics") + + return self.S3SinkParser( + target_platform="s3", + bucket=bucket, + topics_dir=topics_dir, + topics=connector_manifest.topic_names, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "aws.access.key.id", + "aws.secret.access.key", + "s3.sse.customer.key", + "s3.proxy.password", + ] + } + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + try: + parser = self._get_parser(self.connector_manifest) + + lineages: List[KafkaConnectLineage] = list() + for topic in parser.topics: + target_dataset = f"{parser.bucket}/{parser.topics_dir}/{topic}" + + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform="kafka", + target_dataset=target_dataset, + target_platform=parser.target_platform, + ) + ) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class SnowflakeSinkConnector(BaseConnector): + @dataclass + class SnowflakeParser: + database_name: str + schema_name: str + topics_to_tables: Dict[str, str] + + def get_table_name_from_topic_name(self, topic_name: str) -> str: + """ + This function converts the topic name to a valid Snowflake table name using some rules. + Refer below link for more info + https://docs.snowflake.com/en/user-guide/kafka-connector-overview#target-tables-for-kafka-topics + """ + table_name = re.sub("[^a-zA-Z0-9_]", "_", topic_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + # Connector may append original topic's hash code as suffix for conflict resolution + # if generated table names for 2 topics are similar. This corner case is not handled here. + # Note that Snowflake recommends to choose topic names that follow the rules for + # Snowflake identifier names so this case is not recommended by snowflake. 
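+        # For example (illustrative values only): "web.orders-v1" becomes
+        # "web_orders_v1", and "2024_events" becomes "_2024_events" because the
+        # leading digit forces an underscore prefix.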
+ return table_name + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> SnowflakeParser: + database_name = connector_manifest.config["snowflake.database.name"] + schema_name = connector_manifest.config["snowflake.schema.name"] + + # Fetch user provided topic to table map + provided_topics_to_tables: Dict[str, str] = {} + if connector_manifest.config.get("snowflake.topic2table.map"): + for each in connector_manifest.config["snowflake.topic2table.map"].split( + "," + ): + topic, table = each.split(":") + provided_topics_to_tables[topic.strip()] = table.strip() + + topics_to_tables: Dict[str, str] = {} + # Extract lineage for only those topics whose data ingestion started + for topic in connector_manifest.topic_names: + if topic in provided_topics_to_tables: + # If user provided which table to get mapped with this topic + topics_to_tables[topic] = provided_topics_to_tables[topic] + else: + # Else connector converts topic name to a valid Snowflake table name. + topics_to_tables[topic] = self.get_table_name_from_topic_name(topic) + + return self.SnowflakeParser( + database_name=database_name, + schema_name=schema_name, + topics_to_tables=topics_to_tables, + ) + + def extract_flow_property_bag(self) -> Dict[str, str]: + # For all snowflake sink connector properties, refer below link + # https://docs.snowflake.com/en/user-guide/kafka-connector-install#configuring-the-kafka-connector + # remove private keys, secrets from properties + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k + not in [ + "snowflake.private.key", + "snowflake.private.key.passphrase", + "value.converter.basic.auth.user.info", + ] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + + for topic, table in parser.topics_to_tables.items(): + target_dataset = f"{parser.database_name}.{parser.schema_name}.{table}" + lineages.append( + KafkaConnectLineage( + source_dataset=topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform="snowflake", + ) + ) + + return lineages + + +@dataclass +class BigQuerySinkConnector(BaseConnector): + @dataclass + class BQParser: + project: str + target_platform: str + sanitizeTopics: str + transforms: list + topicsToTables: Optional[str] = None + datasets: Optional[str] = None + defaultDataset: Optional[str] = None + version: str = "v1" + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> BQParser: + project = connector_manifest.config["project"] + sanitizeTopics = connector_manifest.config.get("sanitizeTopics", "false") + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + transforms = [] + for name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + if "defaultDataset" in connector_manifest.config: + defaultDataset = connector_manifest.config["defaultDataset"] + return self.BQParser( + project=project, + defaultDataset=defaultDataset, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + version="v2", + transforms=transforms, + ) + else: + # version 1.6.x and similar configs supported + datasets = 
connector_manifest.config["datasets"] + topicsToTables = connector_manifest.config.get("topicsToTables") + + return self.BQParser( + project=project, + topicsToTables=topicsToTables, + datasets=datasets, + target_platform="bigquery", + sanitizeTopics=sanitizeTopics.lower() == "true", + transforms=transforms, + ) + + def get_list(self, property: str) -> Iterable[Tuple[str, str]]: + entries = property.split(",") + for entry in entries: + key, val = entry.rsplit("=") + yield (key.strip(), val.strip()) + + def get_dataset_for_topic_v1(self, topic: str, parser: BQParser) -> Optional[str]: + topicregex_dataset_map: Dict[str, str] = dict(self.get_list(parser.datasets)) # type: ignore + from java.util.regex import Pattern + + for pattern, dataset in topicregex_dataset_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + return dataset + return None + + def sanitize_table_name(self, table_name): + table_name = re.sub("[^a-zA-Z0-9_]", "_", table_name) + if re.match("^[^a-zA-Z_].*", table_name): + table_name = "_" + table_name + + return table_name + + def get_dataset_table_for_topic( + self, topic: str, parser: BQParser + ) -> Optional[str]: + if parser.version == "v2": + dataset = parser.defaultDataset + parts = topic.split(":") + if len(parts) == 2: + dataset = parts[0] + table = parts[1] + else: + table = parts[0] + else: + dataset = self.get_dataset_for_topic_v1(topic, parser) + if dataset is None: + return None + + table = topic + if parser.topicsToTables: + topicregex_table_map: Dict[str, str] = dict( + self.get_list(parser.topicsToTables) # type: ignore + ) + from java.util.regex import Pattern + + for pattern, tbl in topicregex_table_map.items(): + patternMatcher = Pattern.compile(pattern).matcher(topic) + if patternMatcher.matches(): + table = tbl + break + + if parser.sanitizeTopics: + table = self.sanitize_table_name(table) + return f"{dataset}.{table}" + + def apply_transformations( + self, topic: str, transforms: List[Dict[str, str]] + ) -> str: + for transform in transforms: + if transform["type"] == "org.apache.kafka.connect.transforms.RegexRouter": + regex = transform["regex"] + replacement = transform["replacement"] + pattern = re.compile(regex) + if pattern.match(topic): + topic = pattern.sub(replacement, topic, count=1) + return topic + + def extract_flow_property_bag(self) -> Dict[str, str]: + # Mask/Remove properties that may reveal credentials + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["keyfile"] + } + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + if not parser: + return lineages + target_platform = parser.target_platform + project = parser.project + transforms = parser.transforms + + for topic in self.connector_manifest.topic_names: + transformed_topic = self.apply_transformations(topic, transforms) + dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser) + if dataset_table is None: + self.report.warning( + "Could not find target dataset for topic, please check your connector configuration" + f"{self.connector_manifest.name} : {transformed_topic} ", + ) + continue + target_dataset = f"{project}.{dataset_table}" + + lineages.append( + KafkaConnectLineage( + source_dataset=transformed_topic, + source_platform=KAFKA, + target_dataset=target_dataset, + target_platform=target_platform, + ) + ) + return lineages + + 
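+# Illustrative example (assumed config values): with a v2-style config such as
+# {"project": "my-project", "defaultDataset": "analytics"},
+# get_dataset_table_for_topic("sales:daily_orders", parser) yields "sales.daily_orders",
+# while get_dataset_table_for_topic("daily_orders", parser) yields
+# "analytics.daily_orders"; the lineage target then becomes
+# f"{project}.{dataset_table}", e.g. "my-project.analytics.daily_orders".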
+BIGQUERY_SINK_CONNECTOR_CLASS = "com.wepay.kafka.connect.bigquery.BigQuerySinkConnector" +S3_SINK_CONNECTOR_CLASS = "io.confluent.connect.s3.S3SinkConnector" +SNOWFLAKE_SINK_CONNECTOR_CLASS = "com.snowflake.kafka.connector.SnowflakeSinkConnector" diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py new file mode 100644 index 0000000000000..7b3b6e551a0a1 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect/source_connectors.py @@ -0,0 +1,570 @@ +import logging +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from sqlalchemy.engine.url import make_url + +from datahub.ingestion.source.kafka_connect.common import ( + CONNECTOR_CLASS, + KAFKA, + BaseConnector, + ConnectorManifest, + KafkaConnectLineage, + get_dataset_name, + has_three_level_hierarchy, + remove_prefix, + unquote, +) +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) + + +@dataclass +class ConfluentJDBCSourceConnector(BaseConnector): + REGEXROUTER = "org.apache.kafka.connect.transforms.RegexRouter" + KNOWN_TOPICROUTING_TRANSFORMS = [REGEXROUTER] + # https://kafka.apache.org/documentation/#connect_included_transformation + KAFKA_NONTOPICROUTING_TRANSFORMS = [ + "InsertField", + "InsertField$Key", + "InsertField$Value", + "ReplaceField", + "ReplaceField$Key", + "ReplaceField$Value", + "MaskField", + "MaskField$Key", + "MaskField$Value", + "ValueToKey", + "ValueToKey$Key", + "ValueToKey$Value", + "HoistField", + "HoistField$Key", + "HoistField$Value", + "ExtractField", + "ExtractField$Key", + "ExtractField$Value", + "SetSchemaMetadata", + "SetSchemaMetadata$Key", + "SetSchemaMetadata$Value", + "Flatten", + "Flatten$Key", + "Flatten$Value", + "Cast", + "Cast$Key", + "Cast$Value", + "HeadersFrom", + "HeadersFrom$Key", + "HeadersFrom$Value", + "TimestampConverter", + "Filter", + "InsertHeader", + "DropHeaders", + ] + # https://docs.confluent.io/platform/current/connect/transforms/overview.html + CONFLUENT_NONTOPICROUTING_TRANSFORMS = [ + "Drop", + "Drop$Key", + "Drop$Value", + "Filter", + "Filter$Key", + "Filter$Value", + "TombstoneHandler", + ] + KNOWN_NONTOPICROUTING_TRANSFORMS = ( + KAFKA_NONTOPICROUTING_TRANSFORMS + + [ + f"org.apache.kafka.connect.transforms.{t}" + for t in KAFKA_NONTOPICROUTING_TRANSFORMS + ] + + CONFLUENT_NONTOPICROUTING_TRANSFORMS + + [ + f"io.confluent.connect.transforms.{t}" + for t in CONFLUENT_NONTOPICROUTING_TRANSFORMS + ] + ) + + @dataclass + class JdbcParser: + db_connection_url: str + source_platform: str + database_name: str + topic_prefix: str + query: str + transforms: list + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> JdbcParser: + url = remove_prefix( + str(connector_manifest.config.get("connection.url")), "jdbc:" + ) + url_instance = make_url(url) + source_platform = get_platform_from_sqlalchemy_uri(str(url_instance)) + database_name = url_instance.database + assert database_name + db_connection_url = f"{url_instance.drivername}://{url_instance.host}:{url_instance.port}/{database_name}" + + topic_prefix = self.connector_manifest.config.get("topic.prefix", None) + + query = self.connector_manifest.config.get("query", None) + + transform_names = ( + self.connector_manifest.config.get("transforms", "").split(",") + if self.connector_manifest.config.get("transforms") + else [] + ) + + transforms = [] + for 
name in transform_names: + transform = {"name": name} + transforms.append(transform) + for key in self.connector_manifest.config.keys(): + if key.startswith(f"transforms.{name}."): + transform[ + key.replace(f"transforms.{name}.", "") + ] = self.connector_manifest.config[key] + + return self.JdbcParser( + db_connection_url, + source_platform, + database_name, + topic_prefix, + query, + transforms, + ) + + def default_get_lineages( + self, + topic_prefix: str, + database_name: str, + source_platform: str, + topic_names: Optional[Iterable[str]] = None, + include_source_dataset: bool = True, + ) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = [] + if not topic_names: + topic_names = self.connector_manifest.topic_names + table_name_tuples: List[Tuple] = self.get_table_names() + for topic in topic_names: + # All good for NO_TRANSFORM or (SINGLE_TRANSFORM and KNOWN_NONTOPICROUTING_TRANSFORM) or (not SINGLE_TRANSFORM and all(KNOWN_NONTOPICROUTING_TRANSFORM)) + source_table: str = ( + remove_prefix(topic, topic_prefix) if topic_prefix else topic + ) + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform): + table_name_tuple: Tuple = next( + iter([t for t in table_name_tuples if t and t[-1] == source_table]), + (), + ) + if len(table_name_tuple) > 1: + source_table = f"{table_name_tuple[-2]}.{source_table}" + else: + include_source_dataset = False + self.report.warning( + "Could not find schema for table" + f"{self.connector_manifest.name} : {source_table}", + ) + dataset_name: str = get_dataset_name(database_name, source_table) + lineage = KafkaConnectLineage( + source_dataset=dataset_name if include_source_dataset else None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + def get_table_names(self) -> List[Tuple]: + sep: str = "." 
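+        # Illustrative: with the default `quote.sql.identifiers=always`, task configs
+        # typically list table ids such as '"public"."users"'; the quote characters
+        # detected below are stripped so this resolves to the tuple ("public", "users").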
+ leading_quote_char: str = '"' + trailing_quote_char: str = leading_quote_char + + table_ids: List[str] = [] + if self.connector_manifest.tasks: + table_ids = ( + ",".join( + [ + task["config"].get("tables") + for task in self.connector_manifest.tasks + ] + ) + ).split(",") + quote_method = self.connector_manifest.config.get( + "quote.sql.identifiers", "always" + ) + if ( + quote_method == "always" + and table_ids + and table_ids[0] + and table_ids[-1] + ): + leading_quote_char = table_ids[0][0] + trailing_quote_char = table_ids[-1][-1] + # This will only work for single character quotes + elif self.connector_manifest.config.get("table.whitelist"): + table_ids = self.connector_manifest.config.get("table.whitelist").split(",") # type: ignore + + # List of Tuple containing (schema, table) + tables: List[Tuple] = [ + ( + ( + unquote( + table_id.split(sep)[-2], leading_quote_char, trailing_quote_char + ) + if len(table_id.split(sep)) > 1 + else "" + ), + unquote( + table_id.split(sep)[-1], leading_quote_char, trailing_quote_char + ), + ) + for table_id in table_ids + ] + return tables + + def extract_flow_property_bag(self) -> Dict[str, str]: + flow_property_bag = { + k: v + for k, v in self.connector_manifest.config.items() + if k not in ["connection.password", "connection.user"] + } + + # Mask/Remove properties that may reveal credentials + flow_property_bag["connection.url"] = self.get_parser( + self.connector_manifest + ).db_connection_url + + return flow_property_bag + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + database_name = parser.database_name + query = parser.query + topic_prefix = parser.topic_prefix + transforms = parser.transforms + + logging.debug( + f"Extracting source platform: {source_platform} and database name: {database_name} from connection url " + ) + + if not self.connector_manifest.topic_names: + return lineages + + if query: + # Lineage source_table can be extracted by parsing query + for topic in self.connector_manifest.topic_names: + # default method - as per earlier implementation + dataset_name: str = get_dataset_name(database_name, topic) + + lineage = KafkaConnectLineage( + source_dataset=None, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + self.report.warning( + "Could not find input dataset, the connector has query configuration set", + self.connector_manifest.name, + ) + return lineages + + SINGLE_TRANSFORM = len(transforms) == 1 + NO_TRANSFORM = len(transforms) == 0 + UNKNOWN_TRANSFORM = any( + [ + transform["type"] + not in self.KNOWN_TOPICROUTING_TRANSFORMS + + self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + ALL_TRANSFORMS_NON_TOPICROUTING = all( + [ + transform["type"] in self.KNOWN_NONTOPICROUTING_TRANSFORMS + for transform in transforms + ] + ) + + if NO_TRANSFORM or ALL_TRANSFORMS_NON_TOPICROUTING: + return self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + ) + + if SINGLE_TRANSFORM and transforms[0]["type"] == self.REGEXROUTER: + tables = self.get_table_names() + topic_names = list(self.connector_manifest.topic_names) + + from java.util.regex import Pattern + + for table in tables: + source_table: str = table[-1] + topic = topic_prefix + source_table if topic_prefix else source_table + + transform_regex = 
Pattern.compile(transforms[0]["regex"]) + transform_replacement = transforms[0]["replacement"] + + matcher = transform_regex.matcher(topic) + if matcher.matches(): + topic = str(matcher.replaceFirst(transform_replacement)) + + # Additional check to confirm that the topic present + # in connector topics + + if topic in self.connector_manifest.topic_names: + # include schema name for three-level hierarchies + if has_three_level_hierarchy(source_platform) and len(table) > 1: + source_table = f"{table[-2]}.{table[-1]}" + + dataset_name = get_dataset_name(database_name, source_table) + + lineage = KafkaConnectLineage( + source_dataset=dataset_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + topic_names.remove(topic) + lineages.append(lineage) + + if topic_names: + lineages.extend( + self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + topic_names=topic_names, + include_source_dataset=False, + ) + ) + self.report.warning( + "Could not find input dataset for connector topics", + f"{self.connector_manifest.name} : {topic_names}", + ) + return lineages + else: + include_source_dataset = True + if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has unknown transform", + f"{self.connector_manifest.name} : {transforms[0]['type']}", + ) + include_source_dataset = False + if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM: + self.report.warning( + "Could not find input dataset, connector has one or more unknown transforms", + self.connector_manifest.name, + ) + include_source_dataset = False + lineages = self.default_get_lineages( + database_name=database_name, + source_platform=source_platform, + topic_prefix=topic_prefix, + include_source_dataset=include_source_dataset, + ) + return lineages + + +@dataclass +class MongoSourceConnector(BaseConnector): + # https://www.mongodb.com/docs/kafka-connector/current/source-connector/ + + @dataclass + class MongoSourceParser: + db_connection_url: Optional[str] + source_platform: str + database_name: Optional[str] + topic_prefix: Optional[str] + transforms: List[str] + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> MongoSourceParser: + parser = self.MongoSourceParser( + db_connection_url=connector_manifest.config.get("connection.uri"), + source_platform="mongodb", + database_name=connector_manifest.config.get("database"), + topic_prefix=connector_manifest.config.get("topic_prefix"), + transforms=( + connector_manifest.config["transforms"].split(",") + if "transforms" in connector_manifest.config + else [] + ), + ) + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(found.group(1), found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +@dataclass +class DebeziumSourceConnector(BaseConnector): + @dataclass + class DebeziumParser: + source_platform: str + server_name: 
Optional[str] + database_name: Optional[str] + + def get_server_name(self, connector_manifest: ConnectorManifest) -> str: + if "topic.prefix" in connector_manifest.config: + return connector_manifest.config["topic.prefix"] + else: + return connector_manifest.config.get("database.server.name", "") + + def get_parser( + self, + connector_manifest: ConnectorManifest, + ) -> DebeziumParser: + connector_class = connector_manifest.config.get(CONNECTOR_CLASS, "") + + if connector_class == "io.debezium.connector.mysql.MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "MySqlConnector": + parser = self.DebeziumParser( + source_platform="mysql", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.mongodb.MongoDbConnector": + parser = self.DebeziumParser( + source_platform="mongodb", + server_name=self.get_server_name(connector_manifest), + database_name=None, + ) + elif connector_class == "io.debezium.connector.postgresql.PostgresConnector": + parser = self.DebeziumParser( + source_platform="postgres", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.oracle.OracleConnector": + parser = self.DebeziumParser( + source_platform="oracle", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.sqlserver.SqlServerConnector": + database_name = connector_manifest.config.get( + "database.names" + ) or connector_manifest.config.get("database.dbname") + + if "," in str(database_name): + raise Exception( + f"Only one database is supported for Debezium's SQL Server connector. 
Found: {database_name}" + ) + + parser = self.DebeziumParser( + source_platform="mssql", + server_name=self.get_server_name(connector_manifest), + database_name=database_name, + ) + elif connector_class == "io.debezium.connector.db2.Db2Connector": + parser = self.DebeziumParser( + source_platform="db2", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("database.dbname"), + ) + elif connector_class == "io.debezium.connector.vitess.VitessConnector": + parser = self.DebeziumParser( + source_platform="vitess", + server_name=self.get_server_name(connector_manifest), + database_name=connector_manifest.config.get("vitess.keyspace"), + ) + else: + raise ValueError(f"Connector class '{connector_class}' is unknown.") + + return parser + + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages: List[KafkaConnectLineage] = list() + + try: + parser = self.get_parser(self.connector_manifest) + source_platform = parser.source_platform + server_name = parser.server_name + database_name = parser.database_name + topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)" + + if not self.connector_manifest.topic_names: + return lineages + + for topic in self.connector_manifest.topic_names: + found = re.search(re.compile(topic_naming_pattern), topic) + + if found: + table_name = get_dataset_name(database_name, found.group(2)) + + lineage = KafkaConnectLineage( + source_dataset=table_name, + source_platform=source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + except Exception as e: + self.report.warning( + "Error resolving lineage for connector", + self.connector_manifest.name, + exc=e, + ) + + return [] + + +@dataclass +class ConfigDrivenSourceConnector(BaseConnector): + def extract_lineages(self) -> List[KafkaConnectLineage]: + lineages = [] + for connector in self.config.generic_connectors: + if connector.connector_name == self.connector_manifest.name: + target_connector = connector + break + for topic in self.connector_manifest.topic_names: + lineage = KafkaConnectLineage( + source_dataset=target_connector.source_dataset, + source_platform=target_connector.source_platform, + target_dataset=topic, + target_platform=KAFKA, + ) + lineages.append(lineage) + return lineages + + +JDBC_SOURCE_CONNECTOR_CLASS = "io.confluent.connect.jdbc.JdbcSourceConnector" +DEBEZIUM_SOURCE_CONNECTOR_PREFIX = "io.debezium.connector" +MONGO_SOURCE_CONNECTOR_CLASS = "com.mongodb.kafka.connect.MongoSourceConnector" From 2e544614f12bf2ad8e758b2fd742ee14c6998825 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:41:40 +0530 Subject: [PATCH 07/17] feat(ingest): add looker meta extractor support in sql parsing (#12062) Co-authored-by: Mayuri N Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../datahub/configuration/source_common.py | 13 ++ .../ingestion/source/looker/looker_common.py | 56 +++++- .../source/looker/looker_lib_wrapper.py | 14 +- .../ingestion/source/looker/looker_source.py | 13 +- .../ingestion/source/powerbi/config.py | 15 +- .../powerbi/dataplatform_instance_resolver.py | 2 +- .../source/powerbi/m_query/pattern_handler.py | 2 +- .../source/snowflake/snowflake_v2.py | 1 + .../sql_parsing/sql_parsing_aggregator.py | 2 +- .../sql_parsing/tool_meta_extractor.py | 121 ++++++++++++- .../looker/golden_looker_mces.json | 56 ++++++ .../looker/golden_test_allow_ingest.json | 53 ++++++ 
...olden_test_external_project_view_mces.json | 53 ++++++ .../looker/golden_test_file_path_ingest.json | 53 ++++++ ...olden_test_folder_path_pattern_ingest.json | 53 ++++++ .../golden_test_independent_look_ingest.json | 170 +++++++++++++----- .../looker/golden_test_ingest.json | 54 ++++++ .../looker/golden_test_ingest_joins.json | 53 ++++++ .../golden_test_ingest_unaliased_joins.json | 53 ++++++ ...en_test_non_personal_independent_look.json | 71 ++++++++ .../looker_mces_golden_deleted_stateful.json | 68 ++++++- .../looker/looker_mces_usage_history.json | 53 ++++++ .../tests/integration/looker/test_looker.py | 20 +++ .../sql_parsing/test_tool_meta_extractor.py | 44 ++++- .../state/test_redundant_run_skip_handler.py | 6 +- .../platformresource/PlatformResourceType.pdl | 6 +- 26 files changed, 1026 insertions(+), 79 deletions(-) diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 44c737f1bd13d..8e41e9fb91787 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -63,3 +63,16 @@ class DatasetLineageProviderConfigBase(EnvConfigMixin): default=None, description="A holder for platform -> platform_instance mappings to generate correct dataset urns", ) + + +class PlatformDetail(ConfigModel): + platform_instance: Optional[str] = Field( + default=None, + description="DataHub platform instance name. To generate correct urn for upstream dataset, this should match " + "with platform instance name used in ingestion " + "recipe of other datahub sources.", + ) + env: str = Field( + default=DEFAULT_ENV, + description="The environment that all assets produced by DataHub platform ingestion source belong to", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 57a251ef2ed14..a66962f962255 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -31,6 +31,10 @@ from pydantic.class_validators import validator import datahub.emitter.mce_builder as builder +from datahub.api.entities.platformresource.platform_resource import ( + PlatformResource, + PlatformResourceKey, +) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ContainerKey, create_embed_mcp from datahub.ingestion.api.report import Report @@ -106,7 +110,7 @@ from datahub.utilities.url_util import remove_port_from_url CORPUSER_DATAHUB = "urn:li:corpuser:datahub" - +LOOKER = "looker" logger = logging.getLogger(__name__) @@ -1411,6 +1415,7 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport): resolved_user_ids: int = 0 email_ids_missing: int = 0 # resolved users with missing email addresses + looker_user_count: int = 0 _looker_api: Optional[LookerAPI] = None query_latency: Dict[str, datetime.timedelta] = dataclasses_field( @@ -1614,9 +1619,21 @@ def get_urn_dashboard_id(self): class LookerUserRegistry: looker_api_wrapper: LookerAPI fields: str = ",".join(["id", "email", "display_name", "first_name", "last_name"]) + _user_cache: Dict[str, LookerUser] = {} - def __init__(self, looker_api: LookerAPI): + def __init__(self, looker_api: LookerAPI, report: LookerDashboardSourceReport): self.looker_api_wrapper = looker_api + self.report = report + self._initialize_user_cache() + + def 
_initialize_user_cache(self) -> None: + raw_users: Sequence[User] = self.looker_api_wrapper.all_users( + user_fields=self.fields + ) + + for raw_user in raw_users: + looker_user = LookerUser.create_looker_user(raw_user) + self._user_cache[str(looker_user.id)] = looker_user def get_by_id(self, id_: str) -> Optional[LookerUser]: if not id_: @@ -1624,6 +1641,9 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: logger.debug(f"Will get user {id_}") + if str(id_) in self._user_cache: + return self._user_cache.get(str(id_)) + raw_user: Optional[User] = self.looker_api_wrapper.get_user( str(id_), user_fields=self.fields ) @@ -1632,3 +1652,35 @@ def get_by_id(self, id_: str) -> Optional[LookerUser]: looker_user = LookerUser.create_looker_user(raw_user) return looker_user + + def to_platform_resource( + self, platform_instance: Optional[str] + ) -> Iterable[MetadataChangeProposalWrapper]: + try: + platform_resource_key = PlatformResourceKey( + platform=LOOKER, + resource_type="USER_ID_MAPPING", + platform_instance=platform_instance, + primary_key="", + ) + + # Extract user email mappings + user_email_cache = { + user_id: user.email + for user_id, user in self._user_cache.items() + if user.email + } + + platform_resource = PlatformResource.create( + key=platform_resource_key, + value=user_email_cache, + ) + + self.report.looker_user_count = len(user_email_cache) + yield from platform_resource.to_mcps() + + except Exception as exc: + self.report.warning( + message="Failed to generate platform resource for looker id mappings", + exc=exc, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index ab55d4e15e5de..c3f2a110136c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -68,6 +68,7 @@ class LookerAPIStats(BaseModel): get_look_calls: int = 0 search_looks_calls: int = 0 search_dashboards_calls: int = 0 + all_user_calls: int = 0 class LookerAPI: @@ -135,7 +136,7 @@ def get_available_permissions(self) -> Set[str]: return permissions - @lru_cache(maxsize=1000) + @lru_cache(maxsize=5000) def get_user(self, id_: str, user_fields: str) -> Optional[User]: self.client_stats.user_calls += 1 try: @@ -154,6 +155,17 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]: # User not found return None + def all_users(self, user_fields: str) -> Sequence[User]: + self.client_stats.all_user_calls += 1 + try: + return self.client.all_users( + fields=cast(str, user_fields), + transport_options=self.transport_options, + ) + except SDKError as e: + logger.warning(f"Failure was {e}") + return [] + def execute_query(self, write_query: WriteQuery) -> List[Dict]: logger.debug(f"Executing query {write_query}") self.client_stats.query_calls += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index cd8ccb8217257..815c5dfb1c014 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -145,7 +145,9 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): self.source_config: LookerDashboardSourceConfig = config self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = 
LookerAPI(self.source_config) - self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.user_registry: LookerUserRegistry = LookerUserRegistry( + self.looker_api, self.reporter + ) self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) @@ -1673,5 +1675,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield usage_mcp.as_workunit() self.reporter.report_stage_end("usage_extraction") + # Dump looker user resource mappings. + logger.info("Ingesting looker user resource mapping workunits") + self.reporter.report_stage_start("user_resource_extraction") + yield from auto_workunit( + self.user_registry.to_platform_resource( + self.source_config.platform_instance + ) + ) + def get_report(self) -> SourceReport: return self.reporter diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index f7458c4eb4d5b..b49d40a0c7eb6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,7 +9,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.common.subtypes import BIAssetSubTypes from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -232,19 +232,6 @@ def default_for_dataset_type_mapping() -> Dict[str, str]: return dict_ -class PlatformDetail(ConfigModel): - platform_instance: Optional[str] = pydantic.Field( - default=None, - description="DataHub platform instance name. 
To generate correct urn for upstream dataset, this should match " - "with platform instance name used in ingestion " - "recipe of other datahub sources.", - ) - env: str = pydantic.Field( - default=builder.DEFAULT_ENV, - description="The environment that all assets produced by DataHub platform ingestion source belong to", - ) - - class DataBricksPlatformDetail(PlatformDetail): """ metastore is an additional field used in Databricks connector to generate the dataset urn diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index baaa8d5b85ae1..6d51e853a2fb0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -2,8 +2,8 @@ from abc import ABC, abstractmethod from typing import Union +from datahub.configuration.source_common import PlatformDetail from datahub.ingestion.source.powerbi.config import ( - PlatformDetail, PowerBiDashboardSourceConfig, PowerBIPlatformDetail, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py index ffaed79f4e42a..63520bd731de8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -5,13 +5,13 @@ from lark import Tree +from datahub.configuration.source_common import PlatformDetail from datahub.emitter import mce_builder as builder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( Constant, DataBricksPlatformDetail, DataPlatformPair, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, PowerBIPlatformDetail, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index c3a7912c40e8e..e5883dd0349a3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -540,6 +540,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=schema_resolver, discovered_tables=discovered_datasets, + graph=self.ctx.graph, ) # TODO: This is slightly suboptimal because we create two SqlParsingAggregator instances with different configs diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 79ea98d1c7f54..f81eb291e89e1 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -490,7 +490,7 @@ def __init__( self._exit_stack.push(self._query_usage_counts) # Tool Extractor - self._tool_meta_extractor = ToolMetaExtractor() + self._tool_meta_extractor = ToolMetaExtractor.create(graph) self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: diff --git a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py index 0d85002776e5e..5af9d9d4f0fff 100644 --- 
a/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py +++ b/metadata-ingestion/src/datahub/sql_parsing/tool_meta_extractor.py @@ -1,3 +1,4 @@ +import contextlib import json import logging from dataclasses import dataclass, field @@ -5,8 +6,15 @@ from typing_extensions import Protocol +from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + PlatformResource, + PlatformResourceSearchFields, +) from datahub.ingestion.api.report import Report +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn +from datahub.utilities.search_utils import LogicalOperator from datahub.utilities.stats_collections import int_top_k_dict UrnStr = str @@ -31,6 +39,7 @@ def _get_last_line(query: str) -> str: @dataclass class ToolMetaExtractorReport(Report): num_queries_meta_extracted: Dict[str, int] = field(default_factory=int_top_k_dict) + failures: List[str] = field(default_factory=list) class ToolMetaExtractor: @@ -42,14 +51,81 @@ class ToolMetaExtractor: by warehouse query logs. """ - def __init__(self) -> None: - self.report = ToolMetaExtractorReport() + def __init__( + self, + report: ToolMetaExtractorReport, + looker_user_mapping: Optional[Dict[str, str]] = None, + ) -> None: + self.report = report self.known_tool_extractors: List[Tuple[str, Callable[[QueryLog], bool]]] = [ ( "mode", self._extract_mode_query, - ) + ), + ( + "looker", + self._extract_looker_query, + ), ] + # maps user id (as string) to email address + self.looker_user_mapping = looker_user_mapping + + @classmethod + def create( + cls, + graph: Optional[DataHubGraph] = None, + ) -> "ToolMetaExtractor": + report = ToolMetaExtractorReport() + looker_user_mapping = None + if graph: + try: + looker_user_mapping = cls.extract_looker_user_mapping_from_graph( + graph, report + ) + except Exception as e: + report.failures.append( + f"Unexpected error during Looker user metadata extraction: {str(e)}" + ) + + return cls(report, looker_user_mapping) + + @classmethod + def extract_looker_user_mapping_from_graph( + cls, graph: DataHubGraph, report: ToolMetaExtractorReport + ) -> Optional[Dict[str, str]]: + looker_user_mapping = None + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PLATFORM, "looker") + .add_field_match( + PlatformResourceSearchFields.RESOURCE_TYPE, + "USER_ID_MAPPING", + ) + .end() + ) + platform_resources = list( + PlatformResource.search_by_filters(query=query, graph_client=graph) + ) + + if len(platform_resources) > 1: + report.failures.append( + "Looker user metadata extraction failed. Found more than one looker user id mappings." + ) + else: + platform_resource = platform_resources[0] + + if ( + platform_resource + and platform_resource.resource_info + and platform_resource.resource_info.value + ): + with contextlib.suppress(ValueError, AssertionError): + value = platform_resource.resource_info.value.as_raw_json() + if value: + looker_user_mapping = value + + return looker_user_mapping def _extract_mode_query(self, entry: QueryLog) -> bool: """ @@ -78,14 +154,49 @@ def _extract_mode_query(self, entry: QueryLog) -> bool: return True + def _extract_looker_query(self, entry: QueryLog) -> bool: + """ + Returns: + bool: whether QueryLog entry is that of looker and looker user info + is extracted into entry. 
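+
+        Based on the parsing below, a qualifying query ends with a trailing comment
+        of the form (values illustrative):
+            -- Looker Query Context '{"user_id": 42, ...}'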
+ """ + if not self.looker_user_mapping: + return False + + last_line = _get_last_line(entry.query_text) + + if not (last_line.startswith("--") and "Looker Query Context" in last_line): + return False + + start_quote_idx = last_line.index("'") + end_quote_idx = last_line.rindex("'") + if start_quote_idx == -1 or end_quote_idx == -1: + return False + + looker_json_raw = last_line[start_quote_idx + 1 : end_quote_idx] + looker_json = json.loads(looker_json_raw) + + user_id = str(looker_json["user_id"]) + email = self.looker_user_mapping.get(user_id) + if not email: + return False + + original_user = entry.user + + entry.user = email_to_user_urn(email) + entry.extra_info = entry.extra_info or {} + entry.extra_info["user_via"] = original_user + + return True + def extract_bi_metadata(self, entry: QueryLog) -> bool: for tool, meta_extractor in self.known_tool_extractors: try: if meta_extractor(entry): self.report.num_queries_meta_extracted[tool] += 1 return True - except Exception: - logger.debug("Tool metadata extraction failed with error : {e}") + except Exception as e: + logger.debug(f"Tool metadata extraction failed with error : {e}") return False diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index a9c445b5986ef..6ae772c134cb3 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -842,6 +842,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index af9c62a2a4180..d7620980a9ced 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -497,6 +497,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": 
"urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b89bc356b48fd..13963af55bfe5 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 810fefd8f6cb8..f11d060102851 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -735,6 +735,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": 
"platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 3d78397f54a23..f6e39dd5286cd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -828,6 +828,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 5a540e61e768d..203bed843155c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": 
"urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -708,6 +723,21 @@ "/Folders/Personal" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-2@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1108,12 +1138,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/data" ] } }, @@ -1126,12 +1156,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "sales_model", + "model": "data", "looker.explore.label": "My Explore View", - "looker.explore.name": "sales_explore", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1153,7 +1183,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1208,7 +1238,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1227,12 +1257,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1244,12 +1274,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1261,7 +1291,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1271,8 +1301,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1287,12 
+1317,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/order_model" ] } }, @@ -1305,12 +1335,12 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "data", + "model": "order_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "my_view", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1332,7 +1362,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1387,7 +1417,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1406,12 +1436,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1423,12 +1453,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1440,7 +1470,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1450,8 +1480,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } @@ -1466,12 +1496,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/sales_model" ] } }, @@ -1484,12 +1514,12 @@ 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "project": "lkml_samples", - "model": "order_model", + "model": "sales_model", "looker.explore.label": "My Explore View", - "looker.explore.name": "order_explore", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1511,7 +1541,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1566,7 +1596,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1585,12 +1615,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1602,12 +1632,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1619,7 +1649,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1629,8 +1659,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1705,6 +1735,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { 
+ "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index 9ac95b8482a47..87af50f95ed6b 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -793,6 +793,60 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:looker,ap-south-1)" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:8436a2a37c4a7e81fb08c9c8415d2e4b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 3a2c6359ea63c..b990ce7c67dab 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -759,6 +759,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 
1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 007eee348aeaf..391192b3d16f3 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -513,6 +513,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 859b9163d7aad..4909a6af73a22 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -464,6 +464,21 @@ "/Folders/Shared" ] } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:test-1@looker.com", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] } @@ -1185,6 +1200,62 @@ "pipelineName": "execution-1" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + 
"pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "execution-1" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 8256c984afb27..ddeb5428b1d72 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -762,6 +762,62 @@ "pipelineName": "stateful-looker-pipeline" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided", + "pipelineName": "stateful-looker-pipeline" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", @@ -814,8 +870,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -831,8 +887,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -865,8 +921,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)", + 
"entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 0b3530f9c2462..594983c8fb0f2 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -678,6 +678,59 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "platformResourceInfo", + "aspect": { + "json": { + "resourceType": "USER_ID_MAPPING", + "primaryKey": "", + "value": { + "blob": "{\"1\": \"test-1@looker.com\", \"2\": \"test-2@looker.com\", \"3\": \"test-3@looker.com\"}", + "contentType": "JSON" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:1cec84235c544a141e63dd2077da2562", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "looker-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(looker,dashboard_elements.2)", diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 8bbf14709ff9f..a39de8384efb2 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -83,6 +83,7 @@ def test_looker_ingest(pytestconfig, tmp_path, mock_time): with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) + mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -319,6 +320,7 @@ def setup_mock_look(mocked_client): mocked_client.all_looks.return_value = [ Look( id="1", + user_id="1", title="Outer Look", description="I am not part of any Dashboard", query_id="1", @@ -327,6 +329,7 @@ def setup_mock_look(mocked_client): Look( id="2", title="Personal Look", + user_id="2", description="I am not part of any Dashboard and in personal folder", query_id="2", folder=FolderBase( @@ -561,6 +564,20 @@ def get_user( mocked_client.user.side_effect = get_user +def setup_mock_all_user(mocked_client): + def all_users( + fields: Optional[str] = None, + transport_options: Optional[transport.TransportOptions] = None, + ) -> List[User]: + return [ + User(id="1", email="test-1@looker.com"), + User(id="2", email="test-2@looker.com"), + User(id="3", email="test-3@looker.com"), + ] + + mocked_client.all_users.side_effect = all_users + + def side_effect_query_inline( result_format: str, body: WriteQuery, 
transport_options: Optional[TransportOptions] ) -> str: @@ -714,6 +731,7 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" @@ -946,6 +964,8 @@ def ingest_independent_looks( mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore(mocked_client) + setup_mock_user(mocked_client) + setup_mock_all_user(mocked_client) setup_mock_look(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py index 6f590b5307146..f6566f007f5e6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_tool_meta_extractor.py @@ -1,11 +1,14 @@ from datahub.configuration.datetimes import parse_absolute_time from datahub.metadata.urns import CorpUserUrn from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery -from datahub.sql_parsing.tool_meta_extractor import ToolMetaExtractor +from datahub.sql_parsing.tool_meta_extractor import ( + ToolMetaExtractor, + ToolMetaExtractorReport, +) def test_extract_mode_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -30,8 +33,42 @@ def test_extract_mode_metadata() -> None: assert extractor.report.num_queries_meta_extracted["mode"] == 1 +def test_extract_looker_metadata() -> None: + extractor = ToolMetaExtractor( + report=ToolMetaExtractorReport(), looker_user_mapping={"7": "john.doe@xyz.com"} + ) + looker_query = """\ +SELECT + all_entities_extended_sibling."ENTITY" AS "all_entities_extended_sibling.entity_type", + COUNT(DISTINCT ( all_entities_extended_sibling."URN" )) AS "all_entities_extended_sibling.distinct_count" +FROM "PUBLIC"."ALL_ENTITIES" + AS all_entities_extended_sibling +GROUP BY + 1 +ORDER BY + 1 +FETCH NEXT 50 ROWS ONLY +-- Looker Query Context '{"user_id":7,"history_slug":"264797031bc403cf382cbefbe3700849","instance_slug":"32654f2ffadf10b1949d4009e52fc6a4"}' +""" + + entry = PreparsedQuery( + query_id=None, + query_text=looker_query, + upstreams=[], + downstream=None, + column_lineage=None, + column_usage=None, + inferred_schema=None, + user=CorpUserUrn("mode"), + timestamp=parse_absolute_time("2021-08-01T01:02:03Z"), + ) + assert extractor.extract_bi_metadata(entry) + assert entry.user == CorpUserUrn("john.doe") + assert extractor.report.num_queries_meta_extracted["looker"] == 1 + + def test_extract_no_metadata() -> None: - extractor = ToolMetaExtractor() + extractor = ToolMetaExtractor(report=ToolMetaExtractorReport()) query = """\ select * from LONG_TAIL_COMPANIONS.ADOPTION.PET_PROFILES LIMIT 100 @@ -53,3 +90,4 @@ def test_extract_no_metadata() -> None: assert not extractor.extract_bi_metadata(entry) assert extractor.report.num_queries_meta_extracted["mode"] == 0 + assert extractor.report.num_queries_meta_extracted["looker"] == 0 diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index 
85c86f8d205d9..5631ad2c69f94 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -37,7 +37,11 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou ), ) - with mock.patch("snowflake.connector.connect"): + with mock.patch( + "datahub.sql_parsing.sql_parsing_aggregator.ToolMetaExtractor.create", + ) as mock_checkpoint, mock.patch("snowflake.connector.connect"): + mock_checkpoint.return_value = mock.MagicMock() + yield SnowflakeV2Source(ctx=ctx, config=config) diff --git a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl index 2f36eda9141ab..1a1dbea4359fb 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/platformresource/PlatformResourceType.pdl @@ -9,9 +9,13 @@ enum PlatformResourceType { /** * e.g. a Slack member resource, Looker user resource, etc. */ - USER_INFO, + USER_INFO, /** * e.g. a Slack channel */ CONVERSATION + /** + * e.g. Looker mapping of all user ids + */ + USER_ID_MAPPING } From e45f548910834dc5f2a61d0cd2168b69ec1172b2 Mon Sep 17 00:00:00 2001 From: skrydal Date: Thu, 19 Dec 2024 16:25:59 +0100 Subject: [PATCH 08/17] feat(ingest/iceberg): Improve iceberg connector (#12163) --- .../ingestion/source/iceberg/iceberg.py | 28 ++- .../source/iceberg/iceberg_common.py | 4 + metadata-ingestion/tests/unit/test_iceberg.py | 168 ++++++++++++++++-- 3 files changed, 189 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 5931873f54236..76f24bfd63d47 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -10,6 +10,7 @@ NoSuchNamespaceError, NoSuchPropertyException, NoSuchTableError, + ServerError, ) from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit from pyiceberg.table import Table @@ -145,6 +146,13 @@ def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]: self.report.report_no_listed_namespaces(len(namespaces)) tables_count = 0 for namespace in namespaces: + namespace_repr = ".".join(namespace) + if not self.config.namespace_pattern.allowed(namespace_repr): + LOGGER.info( + f"Namespace {namespace_repr} is not allowed by config pattern, skipping" + ) + self.report.report_dropped(f"{namespace_repr}.*") + continue try: tables = catalog.list_tables(namespace) tables_count += len(tables) @@ -181,6 +189,9 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: if not self.config.table_pattern.allowed(dataset_name): # Dataset name is rejected by pattern, report as dropped. 
self.report.report_dropped(dataset_name) + LOGGER.debug( + f"Skipping table {dataset_name} due to not being allowed by the config pattern" + ) return try: if not hasattr(thread_local, "local_catalog"): @@ -219,6 +230,22 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: LOGGER.warning( f"NoSuchTableError while processing table {dataset_path}, skipping it.", ) + except FileNotFoundError as e: + self.report.report_warning( + "file-not-found", + f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}", + ) + LOGGER.warning( + f"FileNotFoundError while processing table {dataset_path}, skipping it." + ) + except ServerError as e: + self.report.report_warning( + "iceberg-rest-server-error", + f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}", + ) + LOGGER.warning( + f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it." + ) except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( @@ -269,7 +296,6 @@ def _create_iceberg_workunit( ] = table.current_snapshot().manifest_list dataset_properties = DatasetPropertiesClass( name=table.name()[-1], - tags=[], description=table.metadata.properties.get("comment", None), customProperties=custom_properties, ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py index 98ad9e552d35c..4a7f6bf4d60c1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg_common.py @@ -68,6 +68,10 @@ class IcebergSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin) default=AllowDenyPattern.allow_all(), description="Regex patterns for tables to filter in ingestion.", ) + namespace_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for namespaces to filter in ingestion.", + ) user_ownership_property: Optional[str] = Field( default="owner", description="Iceberg table property to look for a `CorpUser` owner. Can only hold a single user value. 
If property has no value, no owner information will be emitted.", diff --git a/metadata-ingestion/tests/unit/test_iceberg.py b/metadata-ingestion/tests/unit/test_iceberg.py index b8a136586a2bf..3afa26b35dfe9 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -10,6 +10,8 @@ NoSuchIcebergTableError, NoSuchNamespaceError, NoSuchPropertyException, + NoSuchTableError, + ServerError, ) from pyiceberg.io.pyarrow import PyArrowFileIO from pyiceberg.partitioning import PartitionSpec @@ -39,6 +41,7 @@ UUIDType, ) +from datahub.configuration.common import AllowDenyPattern from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.iceberg.iceberg import ( @@ -62,12 +65,12 @@ ) -def with_iceberg_source(processing_threads: int = 1) -> IcebergSource: +def with_iceberg_source(processing_threads: int = 1, **kwargs: Any) -> IcebergSource: catalog = {"test": {"type": "rest"}} return IcebergSource( ctx=PipelineContext(run_id="iceberg-source-test"), config=IcebergSourceConfig( - catalog=catalog, processing_threads=processing_threads + catalog=catalog, processing_threads=processing_threads, **kwargs ), ) @@ -542,11 +545,11 @@ def __init__(self, tables: Dict[str, Dict[str, Callable[[], Table]]]): """ self.tables = tables - def list_namespaces(self) -> Iterable[str]: - return [*self.tables.keys()] + def list_namespaces(self) -> Iterable[Tuple[str]]: + return [*[(key,) for key in self.tables.keys()]] def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]: - return [(namespace, table) for table in self.tables[namespace].keys()] + return [(namespace[0], table) for table in self.tables[namespace[0]].keys()] def load_table(self, dataset_path: Tuple[str, str]) -> Table: return self.tables[dataset_path[0]][dataset_path[1]]() @@ -554,15 +557,15 @@ def load_table(self, dataset_path: Tuple[str, str]) -> Table: class MockCatalogExceptionListingTables(MockCatalog): def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]: - if namespace == "no_such_namespace": + if namespace == ("no_such_namespace",): raise NoSuchNamespaceError() - if namespace == "generic_exception": + if namespace == ("generic_exception",): raise Exception() return super().list_tables(namespace) class MockCatalogExceptionListingNamespaces(MockCatalog): - def list_namespaces(self) -> Iterable[str]: + def list_namespaces(self) -> Iterable[Tuple[str]]: raise Exception() @@ -814,15 +817,157 @@ def test_proper_run_with_multiple_namespaces() -> None: ) +def test_filtering() -> None: + source = with_iceberg_source( + processing_threads=1, + table_pattern=AllowDenyPattern(deny=[".*abcd.*"]), + namespace_pattern=AllowDenyPattern(allow=["namespace1"]), + ) + mock_catalog = MockCatalog( + { + "namespace1": { + "table_xyz": lambda: Table( + identifier=("namespace1", "table_xyz"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/table_xyz", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/table_xyz", + io=PyArrowFileIO(), + catalog=None, + ), + "JKLtable": lambda: Table( + identifier=("namespace1", "JKLtable"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/JKLtable", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/JKLtable", + io=PyArrowFileIO(), + catalog=None, + ), + 
"table_abcd": lambda: Table( + identifier=("namespace1", "table_abcd"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/table_abcd", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/table_abcd", + io=PyArrowFileIO(), + catalog=None, + ), + "aaabcd": lambda: Table( + identifier=("namespace1", "aaabcd"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace1/aaabcd", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace1/aaabcd", + io=PyArrowFileIO(), + catalog=None, + ), + }, + "namespace2": { + "foo": lambda: Table( + identifier=("namespace2", "foo"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace2/foo", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace2/foo", + io=PyArrowFileIO(), + catalog=None, + ), + "bar": lambda: Table( + identifier=("namespace2", "bar"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace2/bar", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace2/bar", + io=PyArrowFileIO(), + catalog=None, + ), + }, + "namespace3": { + "sales": lambda: Table( + identifier=("namespace3", "sales"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace3/sales", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace3/sales", + io=PyArrowFileIO(), + catalog=None, + ), + "products": lambda: Table( + identifier=("namespace2", "bar"), + metadata=TableMetadataV2( + partition_specs=[PartitionSpec(spec_id=0)], + location="s3://abcdefg/namespace3/products", + last_column_id=0, + schemas=[Schema(schema_id=0)], + ), + metadata_location="s3://abcdefg/namespace3/products", + io=PyArrowFileIO(), + catalog=None, + ), + }, + } + ) + with patch( + "datahub.ingestion.source.iceberg.iceberg.IcebergSourceConfig.get_catalog" + ) as get_catalog: + get_catalog.return_value = mock_catalog + wu: List[MetadataWorkUnit] = [*source.get_workunits_internal()] + assert len(wu) == 2 + urns = [] + for unit in wu: + assert isinstance(unit.metadata, MetadataChangeEvent) + assert isinstance(unit.metadata.proposedSnapshot, DatasetSnapshotClass) + urns.append(unit.metadata.proposedSnapshot.urn) + TestCase().assertCountEqual( + urns, + [ + "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.table_xyz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespace1.JKLtable,PROD)", + ], + ) + assert source.report.tables_scanned == 2 + + def test_handle_expected_exceptions() -> None: source = with_iceberg_source(processing_threads=3) def _raise_no_such_property_exception(): raise NoSuchPropertyException() - def _raise_no_such_table_exception(): + def _raise_no_such_iceberg_table_exception(): raise NoSuchIcebergTableError() + def _raise_file_not_found_error(): + raise FileNotFoundError() + + def _raise_no_such_table_exception(): + raise NoSuchTableError() + + def _raise_server_error(): + raise ServerError() + mock_catalog = MockCatalog( { "namespaceA": { @@ -876,6 +1021,9 @@ def _raise_no_such_table_exception(): ), "table5": _raise_no_such_property_exception, "table6": _raise_no_such_table_exception, + "table7": _raise_file_not_found_error, + "table8": 
_raise_no_such_iceberg_table_exception, + "table9": _raise_server_error, } } ) @@ -899,7 +1047,7 @@ def _raise_no_such_table_exception(): "urn:li:dataset:(urn:li:dataPlatform:iceberg,namespaceA.table4,PROD)", ], ) - assert source.report.warnings.total_elements == 2 + assert source.report.warnings.total_elements == 5 assert source.report.failures.total_elements == 0 assert source.report.tables_scanned == 4 From 08605a95a78df3f2a47c42a1e595b01f52dcc5e5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 19 Dec 2024 11:02:37 -0500 Subject: [PATCH 09/17] feat(python): split out temp wheel builds (#12157) --- .github/workflows/airflow-plugin.yml | 5 +- .github/workflows/dagster-plugin.yml | 8 +- .github/workflows/gx-plugin.yml | 8 +- .github/workflows/metadata-ingestion.yml | 9 +- .github/workflows/prefect-plugin.yml | 17 +-- .github/workflows/python-build-pages.yml | 64 ++++++++++ docs-website/build.gradle | 6 +- docs-website/generateDocsDir.ts | 24 ++-- metadata-ingestion/build.gradle | 4 +- python-build/.gitignore | 3 + python-build/build.gradle | 27 ++++ python-build/build_site.py | 150 +++++++++++++++++++++++ python-build/copy_wheels.py | 27 ++++ settings.gradle | 1 + 14 files changed, 304 insertions(+), 49 deletions(-) create mode 100644 .github/workflows/python-build-pages.yml create mode 100644 python-build/.gitignore create mode 100644 python-build/build.gradle create mode 100644 python-build/build_site.py create mode 100644 python-build/copy_wheels.py diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index eefa02be4f1af..26fcceb8aeab7 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -27,7 +27,6 @@ jobs: airflow-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -69,7 +68,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }} with: name: Test Results (Airflow Plugin ${{ matrix.python-version}}) @@ -93,7 +92,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index bee1ec95e7774..d8a9cd7bfd6a3 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -27,7 +27,6 @@ jobs: dagster-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -44,7 +43,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -56,7 +56,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} with: name: Test Results (dagster Plugin ${{ matrix.python-version}}) @@ -79,7 +79,7 @@ jobs: runs-on: ubuntu-latest 
steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml index 595438bd6e4a9..2fd814a076485 100644 --- a/.github/workflows/gx-plugin.yml +++ b/.github/workflows/gx-plugin.yml @@ -27,7 +27,6 @@ jobs: gx-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: @@ -48,7 +47,8 @@ jobs: with: distribution: "zulu" java-version: 17 - - uses: actions/checkout@v4 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -60,7 +60,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/gx-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.11' && matrix.extraPythonRequirement == 'great-expectations~=0.17.0' }} with: name: Test Results (GX Plugin ${{ matrix.python-version}}) @@ -83,7 +83,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 49def2a863c56..ad00c6d1551d1 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -28,7 +28,6 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 40 env: - SPARK_VERSION: 3.3.2 DATAHUB_TELEMETRY_ENABLED: false # TODO: Enable this once the test is fixed. # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} @@ -84,9 +83,9 @@ jobs: df -hl docker image ls docker system df - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: Test Results (metadata ingestion ${{ matrix.python-version }}) + name: Test Results (metadata ingestion ${{ matrix.python-version }} ${{ matrix.command }}) path: | **/build/reports/tests/test/** **/build/test-results/test/** @@ -100,14 +99,14 @@ jobs: directory: ./build/coverage-reports/ fail_ci_if_error: false flags: pytest-${{ matrix.command }} - name: pytest-${{ matrix.command }} + name: pytest-${{ matrix.python-version }}-${{ matrix.command }} verbose: true event-file: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/prefect-plugin.yml b/.github/workflows/prefect-plugin.yml index 3c75e8fe9a62f..e4a70426f3a61 100644 --- a/.github/workflows/prefect-plugin.yml +++ b/.github/workflows/prefect-plugin.yml @@ -27,25 +27,20 @@ jobs: prefect-plugin: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 DATAHUB_TELEMETRY_ENABLED: false strategy: matrix: python-version: ["3.8", "3.9", "3.10"] - include: - - python-version: "3.8" - - python-version: "3.9" - - python-version: "3.10" fail-fast: false steps: - name: Set up JDK 17 - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: "zulu" java-version: 17 - uses: gradle/actions/setup-gradle@v3 - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" @@ -56,7 +51,7 @@ jobs: - 
name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/prefect-plugin/venv/bin/activate && uv pip freeze - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: ${{ always() && matrix.python-version == '3.10'}} with: name: Test Results (Prefect Plugin ${{ matrix.python-version}}) @@ -72,7 +67,7 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} directory: ./build/coverage-reports/ fail_ci_if_error: false - flags: prefect,prefect-${{ matrix.extra_pip_extras }} + flags: prefect,prefect-${{ matrix.python-version }} name: pytest-prefect-${{ matrix.python-version }} verbose: true @@ -80,7 +75,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/python-build-pages.yml b/.github/workflows/python-build-pages.yml new file mode 100644 index 0000000000000..8971722c374fb --- /dev/null +++ b/.github/workflows/python-build-pages.yml @@ -0,0 +1,64 @@ +name: Python Build +on: + push: + branches: + - master + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + pull_request: + branches: + - "**" + paths: + - ".github/workflows/python-build-pages.yml" + - "metadata-ingestion/**" + - "metadata-ingestion-modules/**" + - "metadata-models/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + deploy-pages: + runs-on: ubuntu-latest + if: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME != '' }} + + name: Python Wheels + permissions: + contents: read + pull-requests: read + deployments: write + steps: + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + distribution: "zulu" + java-version: 17 + - uses: gradle/actions/setup-gradle@v3 + - uses: acryldata/sane-checkout-action@v3 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: "pip" + - uses: actions/cache@v4 + with: + path: | + ~/.cache/uv + key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }} + - name: Build Python wheel site + run: | + ./gradlew :python-build:buildSite + env: + GITHUB_TOKEN: ${{ github.token }} + - name: Publish + uses: cloudflare/pages-action@v1 + with: + apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} + accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} + projectName: ${{ vars.CLOUDFLARE_WHEELS_PROJECT_NAME }} + workingDirectory: python-build + directory: site + gitHubToken: ${{ github.token }} diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 1860b4a49ae23..797863d2019fb 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -83,11 +83,7 @@ task yarnInstall(type: YarnTask) { task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLSchema, generateJsonSchema, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen', - ':metadata-ingestion:buildWheel', - ':metadata-ingestion-modules:airflow-plugin:buildWheel', - ':metadata-ingestion-modules:dagster-plugin:buildWheel', - ':metadata-ingestion-modules:prefect-plugin:buildWheel', - ':metadata-ingestion-modules:gx-plugin:buildWheel', + ':python-build:buildWheels', ]) { inputs.files(projectMdFiles) outputs.cacheIf { true } diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 0f7e347da64eb..ad82a85f9e567 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ 
-573,26 +573,20 @@ function write_markdown_file( function copy_python_wheels(): void { // Copy the built wheel files to the static directory. - const wheel_dirs = [ - "../metadata-ingestion/dist", - "../metadata-ingestion-modules/airflow-plugin/dist", - "../metadata-ingestion-modules/dagster-plugin/dist", - "../metadata-ingestion-modules/prefect-plugin/dist", - "../metadata-ingestion-modules/gx-plugin/dist", - ]; + // Everything is copied to the python-build directory first, so + // we just need to copy from there. + const wheel_dir = "../python-build/wheels"; const wheel_output_directory = path.join(STATIC_DIRECTORY, "wheels"); fs.mkdirSync(wheel_output_directory, { recursive: true }); - for (const wheel_dir of wheel_dirs) { - const wheel_files = fs.readdirSync(wheel_dir); - for (const wheel_file of wheel_files) { - const src = path.join(wheel_dir, wheel_file); - const dest = path.join(wheel_output_directory, wheel_file); + const wheel_files = fs.readdirSync(wheel_dir); + for (const wheel_file of wheel_files) { + const src = path.join(wheel_dir, wheel_file); + const dest = path.join(wheel_output_directory, wheel_file); - // console.log(`Copying artifact ${src} to ${dest}...`); - fs.copyFileSync(src, dest); - } + // console.log(`Copying artifact ${src} to ${dest}...`); + fs.copyFileSync(src, dest); } } diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 2c5d8e6c9646a..fc1409fbed74e 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -23,8 +23,8 @@ task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { inputs.file file('setup.py') outputs.file(sentinel_file) commandLine 'bash', '-c', - "${python_executable} -m venv ${venv_name} && " + - "${venv_name}/bin/python -m pip install --upgrade pip uv wheel 'setuptools>=63.0.0' && " + + "${python_executable} -m venv ${venv_name} && set -x && " + + "${venv_name}/bin/python -m pip install --upgrade uv && " + "touch ${sentinel_file}" } diff --git a/python-build/.gitignore b/python-build/.gitignore new file mode 100644 index 0000000000000..d2de6dec25809 --- /dev/null +++ b/python-build/.gitignore @@ -0,0 +1,3 @@ + +/wheels +/site diff --git a/python-build/build.gradle b/python-build/build.gradle new file mode 100644 index 0000000000000..e90bffd46828c --- /dev/null +++ b/python-build/build.gradle @@ -0,0 +1,27 @@ +plugins { + id 'base' +} + +ext { + python_executable = 'python3' +} + +task checkPythonVersion(type: Exec) { + commandLine python_executable, '-c', + 'import sys; sys.version_info >= (3, 8), f"Python version {sys.version_info} is too old"' +} + +task buildWheels(type: Exec, dependsOn: [ + checkPythonVersion, + ':metadata-ingestion:buildWheel', + ':metadata-ingestion-modules:airflow-plugin:buildWheel', + ':metadata-ingestion-modules:dagster-plugin:buildWheel', + ':metadata-ingestion-modules:prefect-plugin:buildWheel', + ':metadata-ingestion-modules:gx-plugin:buildWheel', +]) { + commandLine python_executable, "copy_wheels.py" +} + +task buildSite(type: Exec, dependsOn: [buildWheels]) { + commandLine python_executable, "build_site.py" +} diff --git a/python-build/build_site.py b/python-build/build_site.py new file mode 100644 index 0000000000000..73941eca9968c --- /dev/null +++ b/python-build/build_site.py @@ -0,0 +1,150 @@ +import contextlib +import json +import os +import pathlib +import shutil +import subprocess +from datetime import datetime, timezone + +PYTHON_BUILD_DIR = pathlib.Path(__file__).parent +WHEEL_DIR = PYTHON_BUILD_DIR / "wheels" +SITE_OUTPUT_DIR = 
PYTHON_BUILD_DIR / "site" + +shutil.rmtree(SITE_OUTPUT_DIR, ignore_errors=True) +SITE_OUTPUT_DIR.mkdir(parents=True) + +SITE_ARTIFACT_WHEEL_DIR = SITE_OUTPUT_DIR / "artifacts" / "wheels" +SITE_ARTIFACT_WHEEL_DIR.mkdir(parents=True) +for wheel_file in WHEEL_DIR.glob("*"): + shutil.copy(wheel_file, SITE_ARTIFACT_WHEEL_DIR) + + +def package_name(wheel_file: pathlib.Path) -> str: + return wheel_file.name.split("-")[0].replace("_", "-") + + +# Get some extra context about the build +ts = datetime.now(timezone.utc).isoformat() +context_info: dict = { + "timestamp": ts, +} + +# Get branch info. +with contextlib.suppress(Exception): + if branch_info := os.getenv("GITHUB_HEAD_REF"): + pass + else: + branch_info = subprocess.check_output( + ["git", "branch", "--show-current"], text=True + ) + context_info["branch"] = branch_info.strip() + +# Get commit info. +with contextlib.suppress(Exception): + commit_info = subprocess.check_output( + ["git", "log", "-1", "--pretty=%H%n%B"], text=True + ) + commit_hash, commit_msg = commit_info.strip().split("\n", 1) + context_info["commit"] = { + "hash": commit_hash, + "message": commit_msg.strip(), + } + +# Get PR info. +with contextlib.suppress(Exception): + pr_info = "unknown" + if github_ref := os.getenv("GITHUB_REF"): + # e.g. GITHUB_REF=refs/pull/12157/merge + parts = github_ref.split("/") + if parts[1] == "pull": + pull_number = parts[2] + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", pull_number, "--json", "title,number,url"], + text=True, + ) + ) + else: + # The `gh` CLI might be able to figure it out. + pr_info = json.loads( + subprocess.check_output( + ["gh", "pr", "view", "--json", "title,number,url"], text=True + ) + ) + context_info["pr"] = pr_info + + +newline = "\n" +(SITE_OUTPUT_DIR / "index.html").write_text( + f""" + + + DataHub Python Builds + + + + + + + + + + + +
+ <h1>DataHub Python Builds</h1> + <p> + These prebuilt wheel files can be used to install our Python packages as of a specific commit. + </p> + + <h2>Build context</h2> + <p> + Built at {ts}. + </p> + <pre>{json.dumps(context_info, indent=2)}</pre> + + <h2>Usage</h2> + <p> + Current base URL: unknown + </p> + + <table> + <thead> + <tr> + <th>Package</th> + <th>Size</th> + <th>Install command</th> + </tr> + </thead> + <tbody> + { + newline.join( + f''' + <tr> + <td>{package_name(wheel_file)}</td> + <td>{wheel_file.stat().st_size / 1024 / 1024:.3f} MB</td> + <td>uv pip install '{package_name(wheel_file)} @ <base-url>/artifacts/wheels/{wheel_file.name}'</td> + </tr> + ''' + for wheel_file in sorted(WHEEL_DIR.glob("*.whl")) + ) + } + </tbody> + </table>
+ + + +""" +) + +print("DataHub Python wheel site built in", SITE_OUTPUT_DIR) diff --git a/python-build/copy_wheels.py b/python-build/copy_wheels.py new file mode 100644 index 0000000000000..b66662cbfe991 --- /dev/null +++ b/python-build/copy_wheels.py @@ -0,0 +1,27 @@ +import pathlib +import shutil + +PYTHON_BUILD_DIR = pathlib.Path(__file__).parent +ROOT_DIR = PYTHON_BUILD_DIR.parent +WHEEL_OUTPUT_DIR = PYTHON_BUILD_DIR / "wheels" + +# These should line up with the build.gradle file. +wheel_dirs = [ + ROOT_DIR / "metadata-ingestion/dist", + ROOT_DIR / "metadata-ingestion-modules/airflow-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/dagster-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/prefect-plugin/dist", + ROOT_DIR / "metadata-ingestion-modules/gx-plugin/dist", +] + +# Delete and recreate the output directory. +if WHEEL_OUTPUT_DIR.exists(): + shutil.rmtree(WHEEL_OUTPUT_DIR) +WHEEL_OUTPUT_DIR.mkdir(parents=True) + +# Copy things over. +for wheel_dir in wheel_dirs: + for wheel_file in wheel_dir.glob("*"): + shutil.copy(wheel_file, WHEEL_OUTPUT_DIR) + +print("Copied wheels to", WHEEL_OUTPUT_DIR) diff --git a/settings.gradle b/settings.gradle index 8756df31c1ac6..b0c2c707d566c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -64,6 +64,7 @@ include 'metadata-ingestion-modules:airflow-plugin' include 'metadata-ingestion-modules:gx-plugin' include 'metadata-ingestion-modules:dagster-plugin' include 'metadata-ingestion-modules:prefect-plugin' +include 'python-build' include 'smoke-test' include 'metadata-auth:auth-api' include 'metadata-service:schema-registry-api' From 89acda66d0d56d01a2645d9c8cced7c593b65e99 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:18:30 -0600 Subject: [PATCH 10/17] docs(release): v0.3.7.7 (#12091) --- docs/managed-datahub/release-notes/v_0_3_7.md | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md index be3a2d97514ef..75f5ac21224c2 100644 --- a/docs/managed-datahub/release-notes/v_0_3_7.md +++ b/docs/managed-datahub/release-notes/v_0_3_7.md @@ -13,12 +13,43 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies ## Known Issues +### v0.3.7.7 + * Postgres regression, non-functional when using postgres + ### v0.3.7.3 * Search page fails to render when filters are applied with a query which returns zero results. 
## Release Changelog --- +### v0.3.7.8 + +- [Postgres] Fix regression from MySQL fix in v0.3.7.7 + +### v0.3.7.7 + +- [UI] Fix bug showing upstream lineage dbt source leaves +- [UI] Show column-level lineage through transformational home node +- [UI] Browse nodes titles expand to full width of panel +- [UI] Data product preview cards display correctly +- [UI] Fix elasticsearch usage sort field names +- [UI] Add structured property display settings feature +- [Executor] Fix false errors on cli ingestions +- [Search] Schema field boost reduced +- [Search] Search usage ranking null_fill fix +- [Search] Single term with underscores by default no longer considered quoted +- [Metadata Tests] Metadata Test shutdown actions flush +- [Metadata Tests] Add deduplicate logic for MCP batches +- [Metadata Tests] Prevent mutation of systemMetadata in patch batches +- [MAE Consumer] Fix graph edge on container delete exception +- [Notifications] Filter out system ingestion source notifications +- [MySQL] Fix index gap lock deadlock +- [API] DataJobInputOutput finegrained lineage fix + +### v0.3.7.6 + +- [UI] fix(automations): white screen automations with dbt sync + ### v0.3.7.5 - [GMS] Fix upstream lineage patching when path contained encoded slash From 9031b49b2345f79db5504f80432af1cd8a77a5e5 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Thu, 19 Dec 2024 09:07:59 -0800 Subject: [PATCH 11/17] fix(docs): Add improvements in examples for PATCH documentation (#12165) Co-authored-by: John Joyce Co-authored-by: John Joyce --- docs/advanced/patch.md | 110 +++++++++++++----- docs/api/tutorials/custom-properties.md | 4 +- .../dataset_add_custom_properties_patch.py | 19 +++ .../dataset_add_glossary_term_patch.py | 22 ++++ .../library/dataset_add_owner_patch.py | 24 ++++ .../library/dataset_add_properties.py | 44 ------- ...aset_add_remove_custom_properties_patch.py | 19 +++ .../library/dataset_add_remove_properties.py | 46 -------- .../dataset_add_structured_properties.py | 24 ---- ...dataset_add_structured_properties_patch.py | 23 ++++ .../examples/library/dataset_add_tag_patch.py | 22 ++++ .../dataset_add_upstream_lineage_patch.py | 62 ++++++++++ .../dataset_field_add_glossary_term_patch.py | 26 +++++ .../library/dataset_field_add_tag_patch.py | 24 ++++ 14 files changed, 321 insertions(+), 148 deletions(-) create mode 100644 metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_owner_patch.py delete mode 100644 metadata-ingestion/examples/library/dataset_add_properties.py create mode 100644 metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py delete mode 100644 metadata-ingestion/examples/library/dataset_add_remove_properties.py delete mode 100644 metadata-ingestion/examples/library/dataset_add_structured_properties.py create mode 100644 metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_tag_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py create mode 100644 metadata-ingestion/examples/library/dataset_field_add_tag_patch.py diff --git a/docs/advanced/patch.md b/docs/advanced/patch.md index 601d055659313..24e8c68a9168d 100644 --- a/docs/advanced/patch.md +++ 
b/docs/advanced/patch.md @@ -1,69 +1,120 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# But First, Semantics: Upsert versus Patch +# Emitting Patch Updates to DataHub ## Why Would You Use Patch -By default, most of the SDK tutorials and API-s involve applying full upserts at the aspect level. This means that typically, when you want to change one field within an aspect without modifying others, you need to do a read-modify-write to not overwrite existing fields. -To support these scenarios, DataHub supports PATCH based operations so that targeted changes to single fields or values within arrays of fields are possible without impacting other existing metadata. +By default, most of the SDK tutorials and APIs involve applying full upserts at the aspect level, e.g. replacing the aspect entirely. +This means that when you want to change even a single field within an aspect without modifying others, you need to do a read-modify-write to avoid overwriting existing fields. +To support these scenarios, DataHub supports `PATCH` operations to perform targeted changes for individual fields or values within arrays of fields are possible without impacting other existing metadata. :::note -Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of Aspects that are supported are maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). In the near future, we do have plans to automatically support PATCH semantics for aspects by default. +Currently, PATCH support is only available for a selected set of aspects, so before pinning your hopes on using PATCH as a way to make modifications to aspect values, confirm whether your aspect supports PATCH semantics. The complete list of Aspects that are supported are maintained [here](https://github.com/datahub-project/datahub/blob/9588440549f3d99965085e97b214a7dabc181ed2/entity-registry/src/main/java/com/linkedin/metadata/models/registry/template/AspectTemplateEngine.java#L24). ::: -## How To Use Patch +## How To Use Patches -Examples for using Patch are sprinkled throughout the API guides. Here's how to find the appropriate classes for the language for your choice. - - + -The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. +The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module and located in the `datahub.specific` module. 
+Patch builder helper classes exist for -Here are a few illustrative examples using the Java Patch builders: +- [Datasets](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataset.py) +- [Charts](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/chart.py) +- [Dashboards](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dashboard.py) +- [Data Jobs (Tasks)](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/datajob.py) +- [Data Products](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/specific/dataproduct.py) +And we are gladly accepting contributions for Containers, Data Flows (Pipelines), Tags, Glossary Terms, Domains, and ML Models. -### Add Custom Properties +### Add & Remove Owners for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAdd.java show_path_as_comment }} +To add & remove specific owners for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_owner_patch.py show_path_as_comment }} ``` -### Add and Remove Custom Properties +### Add & Remove Tags for Dataset -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +To add & remove specific tags for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_tag_patch.py show_path_as_comment }} ``` -### Add Data Job Lineage +And for a specific schema field within the Dataset: -```java -{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_tag_patch.py show_path_as_comment }} ``` - - +### Add & Remove Glossary Terms for Dataset + +To add & remove specific glossary terms for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py show_path_as_comment }} +``` + +And for a specific schema field within the Dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py show_path_as_comment }} +``` + +### Add & Remove Structured Properties for Dataset -The Python Patch builders are entity-oriented and located in the [metadata-ingestion](https://github.com/datahub-project/datahub/tree/9588440549f3d99965085e97b214a7dabc181ed2/metadata-ingestion/src/datahub/specific) module and located in the `datahub.specific` module. 
+To add & remove structured properties for a dataset: -Here are a few illustrative examples using the Python Patch builders: +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py show_path_as_comment }} +``` -### Add Properties to Dataset +### Add & Remove Upstream Lineage for Dataset + +To add & remove a lineage edge connecting a dataset to it's upstream or input at both the dataset and schema field level: ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py show_path_as_comment }} +``` + +### Add & Remove Read-Only Custom Properties for Dataset + +To add & remove specific custom properties for a dataset: + +```python +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} +``` + + + + +The Java Patch builders are aspect-oriented and located in the [datahub-client](https://github.com/datahub-project/datahub/tree/master/metadata-integration/java/datahub-client/src/main/java/datahub/client/patch) module under the `datahub.client.patch` namespace. + +### Add & Remove Read-Only Custom Properties + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetCustomPropertiesAddRemove.java show_path_as_comment }} +``` + +### Add Data Job Lineage + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DataJobLineageAdd.java show_path_as_comment }} ``` -## How Patch works +## Advanced: How Patch works To understand how patching works, it's important to understand a bit about our [models](../what/aspect.md). Entities are comprised of Aspects which can be reasoned about as JSON representations of the object models. To be able to patch these we utilize [JsonPatch](https://jsonpatch.com/). The components of a JSON Patch are the path, operation, and value. @@ -73,9 +124,6 @@ which can be reasoned about as JSON representations of the object models. To be The JSON path refers to a value within the schema. This can be a single field or can be an entire object reference depending on what the path is. For our patches we are primarily targeting single fields or even single array elements within a field. To be able to target array elements by id, we go through a translation process of the schema to transform arrays into maps. This allows a path to reference a particular array element by key rather than by index, for example a specific tag urn being added to a dataset. -This is important to note that for some fields in our schema that are arrays which do not necessarily restrict uniqueness, this puts a uniqueness constraint on the key. -The key for objects stored in arrays is determined manually by examining the schema and a long term goal is to make these keys annotation driven to reduce the amount of code needed to support -additional aspects to be patched. There is a generic patch endpoint, but it requires any array field keys to be specified at request time, putting a lot of burden on the API user. 
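For instance, the complete JSON Patch document produced when a single (hypothetical) tag `urn:li:tag:Legacy` is added to a dataset's `globalTags` aspect might look roughly like this sketch; the exact value shape depends on the aspect's patch template:

```json
[
  {
    "op": "add",
    "path": "/tags/urn:li:tag:Legacy",
    "value": { "tag": "urn:li:tag:Legacy" }
  }
]
```

Each operation carries the three components described above: an `op`, a `path` whose array segment is keyed by the tag urn rather than an index, and a `value` holding the object to insert at that path.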
#### Examples @@ -87,8 +135,7 @@ Breakdown: * `/upstreams` -> References the upstreams field of the UpstreamLineage aspect, this is an array of Upstream objects where the key is the Urn * `/urn:...` -> The dataset to be targeted by the operation - -A patch path for targeting a fine grained lineage upstream: +A patch path for targeting a fine-grained lineage upstream: `/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),foo)/urn:li:query:queryId/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created_upstream,PROD),bar)` @@ -118,7 +165,6 @@ using adds, but generally the most useful use case for patch is to add elements Remove operations require the path specified to be present, or an error will be thrown, otherwise they operate as one would expect. The specified path will be removed from the aspect. - ### Value Value is the actual information that will be stored at a path. If the path references an object then this will include the JSON key value pairs for that object. diff --git a/docs/api/tutorials/custom-properties.md b/docs/api/tutorials/custom-properties.md index fe0d7e62dcde8..86b1b2c0c54da 100644 --- a/docs/api/tutorials/custom-properties.md +++ b/docs/api/tutorials/custom-properties.md @@ -74,7 +74,7 @@ The following code adds custom properties `cluster_name` and `retention_time` to ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py show_path_as_comment }} ``` @@ -128,7 +128,7 @@ The following code shows you how can add and remove custom properties in the sam ```python -{{ inline /metadata-ingestion/examples/library/dataset_add_remove_properties.py show_path_as_comment }} +{{ inline /metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py show_path_as_comment }} ``` diff --git a/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py new file mode 100644 index 0000000000000..7231461fea322 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.add_custom_property("retention_time", "2 years") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py new file mode 100644 index 0000000000000..d0b9a866fde61 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_glossary_term_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from 
datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_term(GlossaryTermAssociationClass(make_term_urn("term-to-add-id"))) +patch_builder.remove_term(make_term_urn("term-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_owner_patch.py b/metadata-ingestion/examples/library/dataset_add_owner_patch.py new file mode 100644 index 0000000000000..8d3130c09c4bb --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_owner_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_group_urn, make_user_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Owners +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_owner( + OwnerClass(make_user_urn("user-to-add-id"), OwnershipTypeClass.TECHNICAL_OWNER) +) +patch_builder.remove_owner(make_group_urn("group-to-remove-id")) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_properties.py b/metadata-ingestion/examples/library/dataset_add_properties.py deleted file mode 100644 index b72aac5b82800..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_properties.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = "http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") - 
.add_custom_property("retention_time", "2 years") - .build() - ): - emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py new file mode 100644 index 0000000000000..c1db9c91d13ec --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_remove_custom_properties_patch.py @@ -0,0 +1,19 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add + Remove Custom Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_custom_property("cluster_name", "datahubproject.acryl.io") +patch_builder.remove_custom_property("retention_time") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_remove_properties.py b/metadata-ingestion/examples/library/dataset_add_remove_properties.py deleted file mode 100644 index 7109c0264f971..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_remove_properties.py +++ /dev/null @@ -1,46 +0,0 @@ -import logging -from typing import Union - -from datahub.configuration.kafka import KafkaProducerConnectionConfig -from datahub.emitter.kafka_emitter import DatahubKafkaEmitter, KafkaEmitterConfig -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -# Get an emitter, either REST or Kafka, this example shows you both -def get_emitter() -> Union[DataHubRestEmitter, DatahubKafkaEmitter]: - USE_REST_EMITTER = True - if USE_REST_EMITTER: - gms_endpoint = "http://localhost:8080" - return DataHubRestEmitter(gms_server=gms_endpoint) - else: - kafka_server = "localhost:9092" - schema_registry_url = "http://localhost:8081" - return DatahubKafkaEmitter( - config=KafkaEmitterConfig( - connection=KafkaProducerConnectionConfig( - bootstrap=kafka_server, schema_registry_url=schema_registry_url - ) - ) - ) - - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - -with get_emitter() as emitter: - for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_custom_property("cluster_name", "datahubproject.acryl.io") - .remove_custom_property("retention_time") - .build() - ): - emitter.emit(patch_mcp) - - -log.info( - f"Added cluster_name property, removed retention_time property from dataset {dataset_urn}" -) diff --git a/metadata-ingestion/examples/library/dataset_add_structured_properties.py b/metadata-ingestion/examples/library/dataset_add_structured_properties.py deleted file mode 100644 index fc2c379340592..0000000000000 --- a/metadata-ingestion/examples/library/dataset_add_structured_properties.py +++ /dev/null @@ -1,24 +0,0 @@ -import logging - -from datahub.emitter.mce_builder import make_dataset_urn -from datahub.emitter.rest_emitter import 
DataHubRestEmitter -from datahub.specific.dataset import DatasetPatchBuilder - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - -# Create rest emitter -rest_emitter = DataHubRestEmitter(gms_server="http://localhost:8080") - -dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") - - -for patch_mcp in ( - DatasetPatchBuilder(dataset_urn) - .add_structured_property("io.acryl.dataManagement.replicationSLA", 12) - .build() -): - rest_emitter.emit(patch_mcp) - - -log.info(f"Added cluster_name, retention_time properties to dataset {dataset_urn}") diff --git a/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py new file mode 100644 index 0000000000000..ef72ed58a4b82 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_structured_properties_patch.py @@ -0,0 +1,23 @@ +from datahub.emitter.mce_builder import make_dataset_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD") + +# Create Dataset Patch to Add and Remove Structured Properties +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_structured_property( + "urn:li:structuredProperty:retentionTimeInDays", 12 +) +patch_builder.remove_structured_property( + "urn:li:structuredProperty:customClassification" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_add_tag_patch.py new file mode 100644 index 0000000000000..0bc644d6865f6 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_tag_patch.py @@ -0,0 +1,22 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.add_tag(TagAssociationClass(make_tag_urn("tag-to-add-id"))) +patch_builder.remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py new file mode 100644 index 0000000000000..0b4e5e39bf627 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_add_upstream_lineage_patch.py @@ -0,0 +1,62 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import ( + DatasetLineageTypeClass, + FineGrainedLineageClass, + FineGrainedLineageUpstreamTypeClass, + 
UpstreamClass, +) +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) +upstream_to_remove_urn = make_dataset_urn( + platform="s3", name="fct_users_old", env="PROD" +) +upstream_to_add_urn = make_dataset_urn(platform="s3", name="fct_users_new", env="PROD") + +# Create Dataset Patch to Add & Remove Upstream Lineage Edges +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.remove_upstream_lineage(upstream_to_remove_urn) +patch_builder.add_upstream_lineage( + UpstreamClass(upstream_to_add_urn, DatasetLineageTypeClass.TRANSFORMED) +) + +# ...And also include schema field lineage +upstream_field_to_add_urn = make_schema_field_urn(upstream_to_add_urn, "profile_id") +downstream_field_to_add_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.add_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_add_urn], + [downstream_field_to_add_urn], + ) +) + +upstream_field_to_remove_urn = make_schema_field_urn( + upstream_to_remove_urn, "profile_id" +) +downstream_field_to_remove_urn = make_schema_field_urn(dataset_urn, "profile_id") + +patch_builder.remove_fine_grained_upstream_lineage( + FineGrainedLineageClass( + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + FineGrainedLineageUpstreamTypeClass.FIELD_SET, + [upstream_field_to_remove_urn], + [downstream_field_to_remove_urn], + ) +) + +patch_mcps = patch_builder.build() + + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py new file mode 100644 index 0000000000000..3f8da2c143c92 --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_glossary_term_patch.py @@ -0,0 +1,26 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn +from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import GlossaryTermAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Term for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_term( + GlossaryTermAssociationClass(make_term_urn("term-to-add-id")) +) +patch_builder.for_field("profile_id").remove_term( + "urn:li:glossaryTerm:term-to-remove-id" +) +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) diff --git a/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py new file mode 100644 index 0000000000000..3075cac5320ae --- /dev/null +++ b/metadata-ingestion/examples/library/dataset_field_add_tag_patch.py @@ -0,0 +1,24 @@ +from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn +from datahub.ingestion.graph.client import 
DataHubGraph, DataHubGraphConfig +from datahub.metadata.schema_classes import TagAssociationClass +from datahub.specific.dataset import DatasetPatchBuilder + +# Create DataHub Client +datahub_client = DataHubGraph(DataHubGraphConfig(server="http://localhost:8080")) + +# Create Dataset URN +dataset_urn = make_dataset_urn( + platform="snowflake", name="fct_users_created", env="PROD" +) + +# Create Dataset Patch to Add + Remove Tag for 'profile_id' column +patch_builder = DatasetPatchBuilder(dataset_urn) +patch_builder.for_field("profile_id").add_tag( + TagAssociationClass(make_tag_urn("tag-to-add-id")) +) +patch_builder.for_field("profile_id").remove_tag("urn:li:tag:tag-to-remove-id") +patch_mcps = patch_builder.build() + +# Emit Dataset Patch +for patch_mcp in patch_mcps: + datahub_client.emit(patch_mcp) From b7bb5ca7ee3e0e80c5f8ca1843e67671f779f27d Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Thu, 19 Dec 2024 10:20:06 -0800 Subject: [PATCH 12/17] feat(graphql/ml): Add custom properties to ml entities (#12152) --- .../types/mappers/EmbeddedModelMapper.java | 12 +++++++++++ .../mlmodel/mappers/MLFeatureMapper.java | 12 +++++++---- .../mappers/MLFeaturePropertiesMapper.java | 20 +++++++++++++------ .../mlmodel/mappers/MLFeatureTableMapper.java | 10 +++++----- .../MLFeatureTablePropertiesMapper.java | 18 ++++++++++------- .../mlmodel/mappers/MLModelGroupMapper.java | 11 ++++++---- .../mappers/MLModelGroupPropertiesMapper.java | 19 ++++++++++++------ .../mappers/MLModelPropertiesMapper.java | 12 ++++++----- .../mlmodel/mappers/MLPrimaryKeyMapper.java | 15 ++++++++------ .../mappers/MLPrimaryKeyPropertiesMapper.java | 19 ++++++++++++------ .../src/main/resources/entity.graphql | 12 ++++++++--- 11 files changed, 108 insertions(+), 52 deletions(-) create mode 100644 datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java new file mode 100644 index 0000000000000..62e7c90ab9b0e --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mappers/EmbeddedModelMapper.java @@ -0,0 +1,12 @@ +package com.linkedin.datahub.graphql.types.mappers; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.QueryContext; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +/** Made for models that are embedded in other models and thus do not encode their own URN. 
*/ +public interface EmbeddedModelMapper { + O apply( + @Nullable final QueryContext context, @Nonnull final I input, @Nonnull final Urn entityUrn); +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java index d5eb1a15624dc..74076fd2f4ee9 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureMapper.java @@ -75,7 +75,8 @@ public MLFeature apply( mlFeature.setOwnership( OwnershipMapper.map(context, new Ownership(dataMap), entityUrn))); mappingHelper.mapToResult( - context, ML_FEATURE_PROPERTIES_ASPECT_NAME, MLFeatureMapper::mapMLFeatureProperties); + ML_FEATURE_PROPERTIES_ASPECT_NAME, + (entity, dataMap) -> mapMLFeatureProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlFeature, dataMap) -> @@ -138,10 +139,13 @@ private static void mapMLFeatureKey(@Nonnull MLFeature mlFeature, @Nonnull DataM private static void mapMLFeatureProperties( @Nullable final QueryContext context, @Nonnull MLFeature mlFeature, - @Nonnull DataMap dataMap) { + @Nonnull DataMap dataMap, + @Nonnull Urn entityUrn) { MLFeatureProperties featureProperties = new MLFeatureProperties(dataMap); - mlFeature.setFeatureProperties(MLFeaturePropertiesMapper.map(context, featureProperties)); - mlFeature.setProperties(MLFeaturePropertiesMapper.map(context, featureProperties)); + com.linkedin.datahub.graphql.generated.MLFeatureProperties graphqlProperties = + MLFeaturePropertiesMapper.map(context, featureProperties, entityUrn); + mlFeature.setFeatureProperties(graphqlProperties); + mlFeature.setProperties(graphqlProperties); mlFeature.setDescription(featureProperties.getDescription()); if (featureProperties.getDataType() != null) { mlFeature.setDataType(MLFeatureDataType.valueOf(featureProperties.getDataType().toString())); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java index 92d090275867d..08ac3a1b5f138 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeaturePropertiesMapper.java @@ -1,29 +1,34 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.MLFeatureDataType; import com.linkedin.datahub.graphql.generated.MLFeatureProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLFeaturePropertiesMapper - implements ModelMapper { + implements EmbeddedModelMapper< + com.linkedin.ml.metadata.MLFeatureProperties, MLFeatureProperties> { public static final 
MLFeaturePropertiesMapper INSTANCE = new MLFeaturePropertiesMapper(); public static MLFeatureProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties) { - return INSTANCE.apply(context, mlFeatureProperties); + @Nonnull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlFeatureProperties, entityUrn); } @Override public MLFeatureProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureProperties mlFeatureProperties, + @Nonnull Urn entityUrn) { final MLFeatureProperties result = new MLFeatureProperties(); result.setDescription(mlFeatureProperties.getDescription()); @@ -45,6 +50,9 @@ public MLFeatureProperties apply( .collect(Collectors.toList())); } + result.setCustomProperties( + CustomPropertiesMapper.map(mlFeatureProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java index 51d3004d97a61..65bc8e84f7bbb 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTableMapper.java @@ -76,7 +76,7 @@ public MLFeatureTable apply( mappingHelper.mapToResult(ML_FEATURE_TABLE_KEY_ASPECT_NAME, this::mapMLFeatureTableKey); mappingHelper.mapToResult( ML_FEATURE_TABLE_PROPERTIES_ASPECT_NAME, - (entity, dataMap) -> this.mapMLFeatureTableProperties(context, entity, dataMap, entityUrn)); + (entity, dataMap) -> mapMLFeatureTableProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlFeatureTable, dataMap) -> @@ -146,10 +146,10 @@ private static void mapMLFeatureTableProperties( @Nonnull DataMap dataMap, Urn entityUrn) { MLFeatureTableProperties featureTableProperties = new MLFeatureTableProperties(dataMap); - mlFeatureTable.setFeatureTableProperties( - MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn)); - mlFeatureTable.setProperties( - MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn)); + com.linkedin.datahub.graphql.generated.MLFeatureTableProperties graphqlProperties = + MLFeatureTablePropertiesMapper.map(context, featureTableProperties, entityUrn); + mlFeatureTable.setFeatureTableProperties(graphqlProperties); + mlFeatureTable.setProperties(graphqlProperties); mlFeatureTable.setDescription(featureTableProperties.getDescription()); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java index d9fed13ed0d0b..3c054cb6a9a5b 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLFeatureTablePropertiesMapper.java @@ -8,26 +8,30 @@ import com.linkedin.datahub.graphql.generated.MLFeatureTableProperties; import 
com.linkedin.datahub.graphql.generated.MLPrimaryKey; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; -public class MLFeatureTablePropertiesMapper { +public class MLFeatureTablePropertiesMapper + implements EmbeddedModelMapper< + com.linkedin.ml.metadata.MLFeatureTableProperties, MLFeatureTableProperties> { public static final MLFeatureTablePropertiesMapper INSTANCE = new MLFeatureTablePropertiesMapper(); public static MLFeatureTableProperties map( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, + @Nonnull Urn entityUrn) { return INSTANCE.apply(context, mlFeatureTableProperties, entityUrn); } - public static MLFeatureTableProperties apply( + @Override + public MLFeatureTableProperties apply( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLFeatureTableProperties mlFeatureTableProperties, + @Nonnull Urn entityUrn) { final MLFeatureTableProperties result = new MLFeatureTableProperties(); result.setDescription(mlFeatureTableProperties.getDescription()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java index 6e3da1c153392..9009972a47616 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupMapper.java @@ -75,9 +75,8 @@ public MLModelGroup apply( mappingHelper.mapToResult( ML_MODEL_GROUP_KEY_ASPECT_NAME, MLModelGroupMapper::mapToMLModelGroupKey); mappingHelper.mapToResult( - context, ML_MODEL_GROUP_PROPERTIES_ASPECT_NAME, - MLModelGroupMapper::mapToMLModelGroupProperties); + (entity, dataMap) -> mapToMLModelGroupProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( STATUS_ASPECT_NAME, (mlModelGroup, dataMap) -> @@ -136,9 +135,13 @@ private static void mapToMLModelGroupKey(MLModelGroup mlModelGroup, DataMap data } private static void mapToMLModelGroupProperties( - @Nullable final QueryContext context, MLModelGroup mlModelGroup, DataMap dataMap) { + @Nullable final QueryContext context, + MLModelGroup mlModelGroup, + DataMap dataMap, + @Nonnull Urn entityUrn) { MLModelGroupProperties modelGroupProperties = new MLModelGroupProperties(dataMap); - mlModelGroup.setProperties(MLModelGroupPropertiesMapper.map(context, modelGroupProperties)); + mlModelGroup.setProperties( + MLModelGroupPropertiesMapper.map(context, modelGroupProperties, entityUrn)); if (modelGroupProperties.getDescription() != null) { mlModelGroup.setDescription(modelGroupProperties.getDescription()); } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java index 9f1918f9ec489..a6cfded9865d9 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelGroupPropertiesMapper.java @@ -1,27 +1,31 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.MLModelGroupProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLModelGroupPropertiesMapper - implements ModelMapper< + implements EmbeddedModelMapper< com.linkedin.ml.metadata.MLModelGroupProperties, MLModelGroupProperties> { public static final MLModelGroupPropertiesMapper INSTANCE = new MLModelGroupPropertiesMapper(); public static MLModelGroupProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties) { - return INSTANCE.apply(context, mlModelGroupProperties); + @Nonnull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlModelGroupProperties, entityUrn); } @Override public MLModelGroupProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties) { + @Nonnull final com.linkedin.ml.metadata.MLModelGroupProperties mlModelGroupProperties, + @Nonnull Urn entityUrn) { final MLModelGroupProperties result = new MLModelGroupProperties(); result.setDescription(mlModelGroupProperties.getDescription()); @@ -30,6 +34,9 @@ public MLModelGroupProperties apply( } result.setCreatedAt(mlModelGroupProperties.getCreatedAt()); + result.setCustomProperties( + CustomPropertiesMapper.map(mlModelGroupProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java index a89904b3ab915..265005c2caa9e 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLModelPropertiesMapper.java @@ -7,25 +7,27 @@ import com.linkedin.datahub.graphql.generated.MLModelGroup; import com.linkedin.datahub.graphql.generated.MLModelProperties; import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; -public class MLModelPropertiesMapper { +public class MLModelPropertiesMapper + implements EmbeddedModelMapper { public static final MLModelPropertiesMapper INSTANCE = new MLModelPropertiesMapper(); public static MLModelProperties map( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, + @Nonnull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, Urn entityUrn) { 
return INSTANCE.apply(context, mlModelProperties, entityUrn); } public MLModelProperties apply( @Nullable final QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, - Urn entityUrn) { + @Nonnull final com.linkedin.ml.metadata.MLModelProperties mlModelProperties, + @Nonnull Urn entityUrn) { final MLModelProperties result = new MLModelProperties(); result.setDate(mlModelProperties.getDate()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java index c446c892cb223..d48d93ede9c1a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyMapper.java @@ -74,9 +74,8 @@ public MLPrimaryKey apply( mappingHelper.mapToResult( ML_PRIMARY_KEY_KEY_ASPECT_NAME, MLPrimaryKeyMapper::mapMLPrimaryKeyKey); mappingHelper.mapToResult( - context, ML_PRIMARY_KEY_PROPERTIES_ASPECT_NAME, - MLPrimaryKeyMapper::mapMLPrimaryKeyProperties); + (entity, dataMap) -> mapMLPrimaryKeyProperties(context, entity, dataMap, entityUrn)); mappingHelper.mapToResult( INSTITUTIONAL_MEMORY_ASPECT_NAME, (mlPrimaryKey, dataMap) -> @@ -132,11 +131,15 @@ private static void mapMLPrimaryKeyKey(MLPrimaryKey mlPrimaryKey, DataMap dataMa } private static void mapMLPrimaryKeyProperties( - @Nullable final QueryContext context, MLPrimaryKey mlPrimaryKey, DataMap dataMap) { + @Nullable final QueryContext context, + MLPrimaryKey mlPrimaryKey, + DataMap dataMap, + @Nonnull Urn entityUrn) { MLPrimaryKeyProperties primaryKeyProperties = new MLPrimaryKeyProperties(dataMap); - mlPrimaryKey.setPrimaryKeyProperties( - MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties)); - mlPrimaryKey.setProperties(MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties)); + com.linkedin.datahub.graphql.generated.MLPrimaryKeyProperties graphqlProperties = + MLPrimaryKeyPropertiesMapper.map(context, primaryKeyProperties, entityUrn); + mlPrimaryKey.setPrimaryKeyProperties(graphqlProperties); + mlPrimaryKey.setProperties(graphqlProperties); mlPrimaryKey.setDescription(primaryKeyProperties.getDescription()); if (primaryKeyProperties.getDataType() != null) { mlPrimaryKey.setDataType( diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java index 09e41fe7ee4e8..0bbe8f53f3271 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/mlmodel/mappers/MLPrimaryKeyPropertiesMapper.java @@ -1,30 +1,34 @@ package com.linkedin.datahub.graphql.types.mlmodel.mappers; +import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.generated.Dataset; import com.linkedin.datahub.graphql.generated.MLFeatureDataType; import com.linkedin.datahub.graphql.generated.MLPrimaryKeyProperties; -import com.linkedin.datahub.graphql.types.mappers.ModelMapper; +import com.linkedin.datahub.graphql.types.common.mappers.CustomPropertiesMapper; +import 
com.linkedin.datahub.graphql.types.mappers.EmbeddedModelMapper; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import javax.annotation.Nullable; -import lombok.NonNull; public class MLPrimaryKeyPropertiesMapper - implements ModelMapper< + implements EmbeddedModelMapper< com.linkedin.ml.metadata.MLPrimaryKeyProperties, MLPrimaryKeyProperties> { public static final MLPrimaryKeyPropertiesMapper INSTANCE = new MLPrimaryKeyPropertiesMapper(); public static MLPrimaryKeyProperties map( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties) { - return INSTANCE.apply(context, mlPrimaryKeyProperties); + @Nonnull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties, + @Nonnull Urn entityUrn) { + return INSTANCE.apply(context, mlPrimaryKeyProperties, entityUrn); } @Override public MLPrimaryKeyProperties apply( @Nullable QueryContext context, - @NonNull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties) { + @Nonnull final com.linkedin.ml.metadata.MLPrimaryKeyProperties mlPrimaryKeyProperties, + @Nonnull Urn entityUrn) { final MLPrimaryKeyProperties result = new MLPrimaryKeyProperties(); result.setDescription(mlPrimaryKeyProperties.getDescription()); @@ -45,6 +49,9 @@ public MLPrimaryKeyProperties apply( }) .collect(Collectors.toList())); + result.setCustomProperties( + CustomPropertiesMapper.map(mlPrimaryKeyProperties.getCustomProperties(), entityUrn)); + return result; } } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 049527e5d77e3..926cd256a5c5a 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -9829,11 +9829,13 @@ type MLModelGroup implements EntityWithRelationships & Entity & BrowsableEntity type MLModelGroupProperties { -description: String + description: String createdAt: Long version: VersionTag + + customProperties: [CustomPropertiesEntry!] } """ @@ -10028,6 +10030,8 @@ type MLFeatureProperties { version: VersionTag sources: [Dataset] + + customProperties: [CustomPropertiesEntry!] } """ @@ -10164,13 +10168,15 @@ type MLPrimaryKey implements EntityWithRelationships & Entity { type MLPrimaryKeyProperties { -description: String + description: String dataType: MLFeatureDataType version: VersionTag sources: [Dataset] + + customProperties: [CustomPropertiesEntry!] } """ @@ -10347,7 +10353,7 @@ type MLModelGroupEditableProperties { type MLFeatureTableProperties { -description: String + description: String mlFeatures: [MLFeature] From 9762c46702dc4492d09a5810544dfa7922266fb1 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:41:44 -0600 Subject: [PATCH 13/17] chore(bump): ingestion-base & actions (#12171) --- docker/datahub-ingestion-base/build.gradle | 2 +- docker/datahub-ingestion/build.gradle | 2 +- docker/profiles/docker-compose.actions.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index ef482de9256a3..f19faa227ca61 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? 
'-slim' : ''}" - revision = 7 // increment to trigger rebuild + revision = 8 // increment to trigger rebuild } docker { diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index 113a6dcf0a1bd..b236a53c288f7 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" - revision = 8 // increment to trigger rebuild + revision = 9 // increment to trigger rebuild } dependencies { diff --git a/docker/profiles/docker-compose.actions.yml b/docker/profiles/docker-compose.actions.yml index c2985f4299326..459fffdd8acf3 100644 --- a/docker/profiles/docker-compose.actions.yml +++ b/docker/profiles/docker-compose.actions.yml @@ -6,7 +6,7 @@ x-search-datastore-elasticsearch-env: &search-datastore-env x-datahub-actions-service: &datahub-actions-service hostname: actions - image: ${DATAHUB_ACTIONS_IMAGE:-${DATAHUB_ACTIONS_REPO:-acryldata}/datahub-actions}:${ACTIONS_VERSION:-v0.1.1} + image: ${DATAHUB_ACTIONS_IMAGE:-${DATAHUB_ACTIONS_REPO:-acryldata}/datahub-actions}:${ACTIONS_VERSION:-v0.1.6} env_file: - datahub-actions/env/docker.env - ${DATAHUB_LOCAL_COMMON_ENV:-empty.env} From 45ace13fe26a9ae20ed9fcdd7df04bb7c197d52a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 19 Dec 2024 20:20:42 +0100 Subject: [PATCH 14/17] feat(mssql): platform instance aspect for dataflow and datajob entities (#12180) --- .../ingestion/source/sql/mssql/job_models.py | 31 +- .../ingestion/source/sql/mssql/source.py | 14 + .../golden_mces_mssql_to_file.json | 756 ++++++++++++------ .../sql_server/source_files/mssql_to_file.yml | 1 + 4 files changed, 574 insertions(+), 228 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 5107a4e38f64d..d3941e7add0fd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -1,11 +1,17 @@ from dataclasses import dataclass, field from typing import Dict, List, Optional, Union -from datahub.emitter.mce_builder import make_data_flow_urn, make_data_job_urn +from datahub.emitter.mce_builder import ( + make_data_flow_urn, + make_data_job_urn, + make_data_platform_urn, + make_dataplatform_instance_urn, +) from datahub.metadata.schema_classes import ( DataFlowInfoClass, DataJobInfoClass, DataJobInputOutputClass, + DataPlatformInstanceClass, ) @@ -204,6 +210,18 @@ def as_datajob_info_aspect(self) -> DataJobInfoClass: status=self.status, ) + @property + def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: + if self.entity.flow.platform_instance: + return DataPlatformInstanceClass( + platform=make_data_platform_urn(self.entity.flow.orchestrator), + instance=make_dataplatform_instance_urn( + platform=self.entity.flow.orchestrator, + instance=self.entity.flow.platform_instance, + ), + ) + return None + @dataclass class MSSQLDataFlow: @@ -238,3 +256,14 @@ def as_dataflow_info_aspect(self) -> DataFlowInfoClass: customProperties=self.flow_properties, externalUrl=self.external_url, ) + + @property + def as_maybe_platform_instance_aspect(self) -> Optional[DataPlatformInstanceClass]: + if self.entity.platform_instance: + return DataPlatformInstanceClass( + 
platform=make_data_platform_urn(self.entity.orchestrator), + instance=make_dataplatform_instance_urn( + self.entity.orchestrator, self.entity.platform_instance + ), + ) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 414c1faaa1661..9d8b67041998c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -639,6 +639,13 @@ def construct_job_workunits( aspect=data_job.as_datajob_info_aspect, ).as_workunit() + data_platform_instance_aspect = data_job.as_maybe_platform_instance_aspect + if data_platform_instance_aspect: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_platform_instance_aspect, + ).as_workunit() + if include_lineage: yield MetadataChangeProposalWrapper( entityUrn=data_job.urn, @@ -654,6 +661,13 @@ def construct_flow_workunits( entityUrn=data_flow.urn, aspect=data_flow.as_dataflow_info_aspect, ).as_workunit() + + data_platform_instance_aspect = data_flow.as_maybe_platform_instance_aspect + if data_platform_instance_aspect: + yield MetadataChangeProposalWrapper( + entityUrn=data_flow.urn, + aspect=data_platform_instance_aspect, + ).as_workunit() # TODO: Add SubType when it appear def get_inspectors(self) -> Iterable[Inspector]: diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index b67ebfb206883..b36188405e7e1 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -1,13 +1,14 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData" }, @@ -23,7 +24,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -39,12 +40,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -55,7 +57,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -73,12 +75,17 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "entityUrn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] + "path": [ + { + "id": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + ] } }, "systemMetadata": { @@ -89,7 +96,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -105,19 +112,36 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { "json": { "customProperties": { - "job_id": "c2d77890-83ba-435f-879b-1c77fa38dd47", + "job_id": "b8907be7-52f5-4df4-a870-f4fe0679ec45", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-12-05 16:44:43.910000", - "date_modified": "2024-12-05 16:44:44.043000", + "date_created": "2024-12-19 12:34:45.843000", + "date_modified": "2024-12-19 12:34:46.017000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -138,7 +162,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -156,12 +197,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -172,13 +213,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_accessadmin" @@ -195,7 +237,7 @@ }, { "entityType": "container", - 
"entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -211,12 +253,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -227,7 +270,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -245,15 +288,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:7da983a1581c33cce8a106587b150f02", + "entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -266,12 +313,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -282,13 +329,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_backupoperator" @@ -305,7 +353,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -321,12 +369,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -337,7 +386,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": "urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -355,15 +404,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:671f67227a05c22c9fa97c27abc56820", + "entityUrn": 
"urn:li:container:5d8a64d9bc388814ac06d9a4d7a3ad22", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -376,12 +429,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -392,13 +445,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_datareader" @@ -415,7 +469,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -431,12 +485,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -447,7 +502,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -465,15 +520,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:830660638ee785d5352ca300835af7ec", + "entityUrn": "urn:li:container:d5f6914a2b8e0dd461f1ad02e7b28c11", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -486,12 +545,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -502,13 +561,14 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_datawriter" @@ -525,7 +585,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -541,12 +601,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -557,7 +618,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -575,15 +636,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:e6b69ac2a511e798a89a4186881f70b8", + "entityUrn": "urn:li:container:e3f86c86f3794233740cad99cba0b854", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -596,12 +661,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -612,13 +677,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_ddladmin" @@ -635,7 +701,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -651,12 +717,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ 
-667,7 +734,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -685,15 +752,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:a5b29b900882d27c0d5fb0d5ccac92a5", + "entityUrn": "urn:li:container:c978c9ed6c196412685945ad89f8fbd6", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -706,12 +777,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -722,13 +793,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_denydatareader" @@ -745,7 +817,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -761,12 +833,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -777,7 +850,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -795,15 +868,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:b6baf19c5f148fba3d3385151a8c672f", + "entityUrn": "urn:li:container:17749025f27ce9ebd6febcaa6a49d715", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -816,12 +893,12 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -832,13 +909,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_denydatawriter" @@ -855,7 +933,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -871,12 +949,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -887,7 +966,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -905,15 +984,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:ee19bd6cf8db0a0d086fbe78f7539bf7", + "entityUrn": "urn:li:container:63c0518620c06ef7af76019fea52b862", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -926,12 +1009,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -942,13 +1025,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_owner" @@ -965,7 +1049,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -981,12 +1065,13 @@ }, { "entityType": 
"container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -997,7 +1082,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1015,15 +1100,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:6514a64e5b04f103c9c1dd0ebe3d8b47", + "entityUrn": "urn:li:container:c6e96aed010f9205f809c1ce9a530003", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1036,12 +1125,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1052,13 +1141,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "db_securityadmin" @@ -1075,7 +1165,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1091,12 +1181,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1107,7 +1198,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1125,15 +1216,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:fd80008628a03642d6e747c460a90619", + "entityUrn": "urn:li:container:895216bb602fb0002beac82d96507acf", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": 
"urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1146,12 +1241,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1162,13 +1257,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "dbo" @@ -1185,7 +1281,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1201,12 +1297,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1217,7 +1314,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1235,15 +1332,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", + "entityUrn": "urn:li:container:92899b29bb814fdeb1186eb99139073f", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1256,12 +1357,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1" + "container": "urn:li:container:92899b29bb814fdeb1186eb99139073f" } }, "systemMetadata": { @@ -1273,7 +1374,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1345,7 +1446,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1363,19 +1481,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.dbo.Products,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1", - "urn": "urn:li:container:61332a50b978d8ca7245ddb34565d7b1" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:92899b29bb814fdeb1186eb99139073f", + "urn": "urn:li:container:92899b29bb814fdeb1186eb99139073f" } ] } @@ -1388,12 +1510,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -1404,13 +1526,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "Foo" @@ -1427,7 +1550,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1443,12 +1566,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -1459,7 +1583,7 @@ }, { "entityType": "container", - "entityUrn": 
"urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1477,15 +1601,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "entityUrn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -1498,12 +1626,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1515,7 +1643,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1587,7 +1715,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1605,19 +1750,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.age_dist,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1630,12 +1779,12 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1647,7 +1796,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1720,7 +1869,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1738,19 +1904,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Items,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1763,12 +1933,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1780,7 +1950,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1877,7 +2047,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1895,19 +2082,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -1920,12 +2111,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -1937,7 +2128,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2012,12 +2203,12 @@ { "name": "FK_TempSales_SalesReason", "foreignFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD),ID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD),ID)" ], "sourceFields": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD),TempID)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD),TempID)" ], - "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Persons,PROD)" + "foreignDataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.Persons,PROD)" } ] } @@ -2033,7 +2224,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": 
"urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2051,19 +2259,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.SalesReason,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -2076,12 +2288,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } }, "systemMetadata": { @@ -2093,7 +2305,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2103,8 +2315,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n", - "is_view": "True" + "is_view": "True", + "view_definition": "CREATE VIEW Foo.PersonsView AS SELECT * FROM Foo.Persons;\n" }, "name": "PersonsView", "tags": [] @@ -2192,7 +2404,24 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2210,7 +2439,7 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "viewProperties", "aspect": { @@ -2228,19 +2457,23 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" }, { - "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", - "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" + }, + { + "id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680", + "urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680" } ] } @@ -2253,7 +2486,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "dataFlowInfo", "aspect": { @@ -2269,9 +2502,26 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -2282,8 +2532,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-12-05 16:44:43.800000", - "date_modified": "2024-12-05 16:44:43.800000" + "date_created": "2024-12-19 12:34:45.660000", + "date_modified": "2024-12-19 12:34:45.660000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2300,7 +2550,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -2310,8 +2577,8 @@ "depending_on_procedure": "{}", "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", "input parameters": "[]", - "date_created": "2024-12-05 16:44:43.803000", - "date_modified": "2024-12-05 16:44:43.803000" + "date_created": "2024-12-19 12:34:45.667000", + "date_modified": "2024-12-19 12:34:45.667000" }, "externalUrl": "", "name": "DemoData.Foo.NewProc", @@ -2326,14 +2593,31 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2344,13 +2628,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "guest" @@ -2367,7 +2652,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2383,12 +2668,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2399,7 +2685,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", 
"aspectName": "subTypes", "aspect": { @@ -2417,15 +2703,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", + "entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2438,12 +2728,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2454,13 +2744,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "INFORMATION_SCHEMA" @@ -2477,7 +2768,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2493,12 +2784,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2509,7 +2801,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2527,15 +2819,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:f84e3b6c61876e1625f9112cbc0e988f", + "entityUrn": "urn:li:container:63c0319e212536168ec5b7dce2b7da2f", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2548,12 +2844,12 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": 
"urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } }, "systemMetadata": { @@ -2564,13 +2860,14 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "mssql", + "instance": "my-instance", "env": "PROD", "database": "DemoData", "schema": "sys" @@ -2587,7 +2884,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2603,12 +2900,13 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:mssql" + "platform": "urn:li:dataPlatform:mssql", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" } }, "systemMetadata": { @@ -2619,7 +2917,7 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2637,15 +2935,19 @@ }, { "entityType": "container", - "entityUrn": "urn:li:container:d730a6ecf30bbb41cac5df5c0014168d", + "entityUrn": "urn:li:container:b0e2ef63fa03ab69f77b60844124ec97", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", - "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)" + }, + { + "id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a", + "urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a" } ] } @@ -2658,7 +2960,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.PersonsView,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.DemoData.Foo.PersonsView,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -2669,7 +2971,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,my-instance.demodata.foo.persons,PROD)", "type": "VIEW" } ] @@ -2683,7 +2985,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2699,7 +3001,7 @@ }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2715,7 +3017,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2731,7 +3033,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2747,7 +3049,7 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml index 40bef3ff104a3..e003ec39cd528 100644 --- a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml +++ b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_to_file.yml @@ -7,6 +7,7 @@ source: password: test!Password database: DemoData host_port: localhost:21433 + platform_instance: my-instance # use_odbc: True # uri_args: # driver: "ODBC Driver 17 for SQL Server" From acb76cd97c8fc104b5c26a438db862a8d5e87705 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 19 Dec 2024 20:26:58 +0100 Subject: [PATCH 15/17] fix(tableau): prevents warning in case of site admin creator role (#12175) --- .../src/datahub/ingestion/source/tableau/tableau.py | 2 +- .../datahub/ingestion/source/tableau/tableau_constant.py | 4 +++- .../ingestion/source/tableau/tableau_server_wrapper.py | 8 ++++++-- .../ingestion/source/tableau/tableau_validation.py | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 6cc2220d90fd9..7838e5fa256b8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -645,7 +645,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None: # the site-role might be different on another site logged_in_user: UserInfo = UserInfo.from_server(server=server) - if not logged_in_user.is_site_administrator_explorer(): + if not logged_in_user.has_site_administrator_explorer_privileges(): report.warning( title=title, message=message, diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py index ea0878143ef35..d69312f803021 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_constant.py @@ -82,4 +82,6 @@ SITE = "Site" IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql" SITE_PERMISSION = "sitePermission" -SITE_ROLE = "SiteAdministratorExplorer" +ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer" +ROLE_SITE_ADMIN_CREATOR = "SiteAdministratorCreator" +ROLE_SERVER_ADMIN = "ServerAdministrator" diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py 
b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py index f309622d12b91..482140a227511 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_server_wrapper.py @@ -11,8 +11,12 @@ class UserInfo: site_role: str site_id: str - def is_site_administrator_explorer(self): - return self.site_role == c.SITE_ROLE + def has_site_administrator_explorer_privileges(self): + return self.site_role in [ + c.ROLE_SITE_ADMIN_EXPLORER, + c.ROLE_SITE_ADMIN_CREATOR, + c.ROLE_SERVER_ADMIN, + ] @staticmethod def from_server(server: Server) -> "UserInfo": diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py index 4a703faf6091b..4ec0e5ef01d3c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_validation.py @@ -28,7 +28,7 @@ def check_user_role( try: # TODO: Add check for `Enable Derived Permissions` - if not logged_in_user.is_site_administrator_explorer(): + if not logged_in_user.has_site_administrator_explorer_privileges(): capability_dict[c.SITE_PERMISSION] = CapabilityReport( capable=False, failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.", From eceb799e634aa19340dbfe9da51714311f401996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 20 Dec 2024 08:37:21 +0100 Subject: [PATCH 16/17] fix(tableau): restart server object when reauthenticating (#12182) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/tableau/tableau.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 7838e5fa256b8..fadcb8ff8f396 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -896,10 +896,9 @@ def dataset_browse_prefix(self) -> str: return f"/{self.config.env.lower()}{self.no_env_browse_prefix}" def _re_authenticate(self): - tableau_auth: Union[ - TableauAuth, PersonalAccessTokenAuth - ] = self.config.get_tableau_auth(self.site_id) - self.server.auth.sign_in(tableau_auth) + # Sign-in again may not be enough because Tableau sometimes caches invalid sessions + # so we need to recreate the Tableau Server object + self.server = self.config.make_tableau_client(self.site_id) @property def site_content_url(self) -> Optional[str]: From 66df362c0f7f10f5f0230054977410c3f1eb688a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 20 Dec 2024 09:57:53 +0100 Subject: [PATCH 17/17] fix(dagster): support dagster v1.9.6 (#12189) --- .../src/datahub_dagster_plugin/client/dagster_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index 2fdd0a41edf6c..a87f490f2d947 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -522,7 +522,7 
@@ def generate_datajob( # Also, add datahub inputs/outputs if present in input/output metatdata. for input_def_snap in op_def_snap.input_def_snaps: job_property_bag[f"input.{input_def_snap.name}"] = str( - input_def_snap._asdict() + input_def_snap.__dict__ ) if Constant.DATAHUB_INPUTS in input_def_snap.metadata: datajob.inlets.extend( @@ -533,7 +533,7 @@ def generate_datajob( for output_def_snap in op_def_snap.output_def_snaps: job_property_bag[f"output_{output_def_snap.name}"] = str( - output_def_snap._asdict() + output_def_snap.__dict__ ) if ( Constant.DATAHUB_OUTPUTS in output_def_snap.metadata
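
A note on the last hunk: the switch from `input_def_snap._asdict()` to `input_def_snap.__dict__` suggests that dagster 1.9.6 no longer exposes the op input/output snapshot objects as NamedTuples. The short Python sketch below is illustrative only and is not part of the patch; `snap_to_dict` and the `Fake*Snap` classes are hypothetical stand-ins, not real dagster classes, used to show a defensive way to serialize either shape.

# Illustrative compatibility helper (NOT part of the patch above).
from dataclasses import asdict, dataclass, is_dataclass
from typing import Any, Dict, NamedTuple


class FakeInputDefSnapOld(NamedTuple):
    # Mimics the older NamedTuple-style snapshot objects that support _asdict().
    name: str
    dagster_type_key: str


@dataclass
class FakeInputDefSnapNew:
    # Mimics the newer attribute-based snapshot objects (no _asdict()).
    name: str
    dagster_type_key: str


def snap_to_dict(snap: Any) -> Dict[str, Any]:
    """Return a plain-dict view of a snapshot-like object, old or new style."""
    if hasattr(snap, "_asdict"):  # NamedTuple-style
        return snap._asdict()
    if is_dataclass(snap):  # dataclass-style
        return asdict(snap)
    return dict(vars(snap))  # plain object: fall back to __dict__, as the patch does


if __name__ == "__main__":
    old = FakeInputDefSnapOld(name="in1", dagster_type_key="Any")
    new = FakeInputDefSnapNew(name="in1", dagster_type_key="Any")
    # Both shapes serialize to the same property-bag string.
    print(str(snap_to_dict(old)))
    print(str(snap_to_dict(new)))

Here `asdict` is used for dataclass instances because it also recurses into nested dataclasses; the plugin's simpler `__dict__` read is sufficient when the values are only rendered into a string property bag, as in the hunk above.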