From a09575fb6f00c9a3f372ef5938a6ac971b877f12 Mon Sep 17 00:00:00 2001 From: sagar-salvi-apptware <159135491+sagar-salvi-apptware@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:04:07 +0530 Subject: [PATCH] fix(ingestion/glue): Add support for missing config options for profiling in Glue (#10858) --- docs/how/updating-datahub.md | 13 + .../src/datahub/ingestion/source/aws/glue.py | 71 +- .../ingestion/source/glue_profiling_config.py | 8 + .../tests/unit/glue/glue_mces_golden.json | 2448 ++++++++--------- .../unit/glue/glue_mces_golden_profiling.json | 289 ++ .../tests/unit/test_glue_source.py | 83 +- .../tests/unit/test_glue_source_stubs.py | 106 + 7 files changed, 1759 insertions(+), 1259 deletions(-) create mode 100644 metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 9618aff78075cf..2821b63e7d305a 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -55,6 +55,19 @@ New (optional fields `systemMetadata` and `headers`): "headers": {} } ``` +- #10858 Profiling configuration for Glue source has been updated. + +Previously, the configuration was: +```yaml +profiling: {} +``` + +Now, it needs to be: + +```yaml +profiling: + enabled: true +``` ### Potential Downtime diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index f7e1e2610e7e28..58fb1cb1eb2c28 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -167,8 +167,8 @@ class GlueSourceConfig( default=False, description="If an S3 Objects Tags should be created for the Tables ingested by Glue.", ) - profiling: Optional[GlueProfilingConfig] = Field( - default=None, + profiling: GlueProfilingConfig = Field( + default_factory=GlueProfilingConfig, description="Configs to ingest data profiles from glue table", ) # Custom Stateful Ingestion settings @@ -186,7 +186,7 @@ class GlueSourceConfig( ) def is_profiling_enabled(self) -> bool: - return self.profiling is not None and is_profiling_enabled( + return self.profiling.enabled and is_profiling_enabled( self.profiling.operation_config ) @@ -867,34 +867,39 @@ def _create_profile_mcp( # instantiate column profile class for each column column_profile = DatasetFieldProfileClass(fieldPath=column_name) - if self.source_config.profiling.unique_count in column_params: - column_profile.uniqueCount = int( - float(column_params[self.source_config.profiling.unique_count]) - ) - if self.source_config.profiling.unique_proportion in column_params: - column_profile.uniqueProportion = float( - column_params[self.source_config.profiling.unique_proportion] - ) - if self.source_config.profiling.null_count in column_params: - column_profile.nullCount = int( - float(column_params[self.source_config.profiling.null_count]) - ) - if self.source_config.profiling.null_proportion in column_params: - column_profile.nullProportion = float( - column_params[self.source_config.profiling.null_proportion] - ) - if self.source_config.profiling.min in column_params: - column_profile.min = column_params[self.source_config.profiling.min] - if self.source_config.profiling.max in column_params: - column_profile.max = column_params[self.source_config.profiling.max] - if self.source_config.profiling.mean in column_params: - column_profile.mean = column_params[self.source_config.profiling.mean] - if self.source_config.profiling.median in column_params: - column_profile.median = column_params[ - self.source_config.profiling.median - ] - if self.source_config.profiling.stdev in column_params: - column_profile.stdev = column_params[self.source_config.profiling.stdev] + if not self.source_config.profiling.profile_table_level_only: + if self.source_config.profiling.unique_count in column_params: + column_profile.uniqueCount = int( + float(column_params[self.source_config.profiling.unique_count]) + ) + if self.source_config.profiling.unique_proportion in column_params: + column_profile.uniqueProportion = float( + column_params[self.source_config.profiling.unique_proportion] + ) + if self.source_config.profiling.null_count in column_params: + column_profile.nullCount = int( + float(column_params[self.source_config.profiling.null_count]) + ) + if self.source_config.profiling.null_proportion in column_params: + column_profile.nullProportion = float( + column_params[self.source_config.profiling.null_proportion] + ) + if self.source_config.profiling.min in column_params: + column_profile.min = column_params[self.source_config.profiling.min] + if self.source_config.profiling.max in column_params: + column_profile.max = column_params[self.source_config.profiling.max] + if self.source_config.profiling.mean in column_params: + column_profile.mean = column_params[ + self.source_config.profiling.mean + ] + if self.source_config.profiling.median in column_params: + column_profile.median = column_params[ + self.source_config.profiling.median + ] + if self.source_config.profiling.stdev in column_params: + column_profile.stdev = column_params[ + self.source_config.profiling.stdev + ] dataset_profile.fieldProfiles.append(column_profile) @@ -914,9 +919,7 @@ def _create_profile_mcp( def get_profile_if_enabled( self, mce: MetadataChangeEventClass, database_name: str, table_name: str ) -> Iterable[MetadataWorkUnit]: - # We don't need both checks only the second one - # but then lint believes that GlueProfilingConfig can be None - if self.source_config.profiling and self.source_config.is_profiling_enabled(): + if self.source_config.is_profiling_enabled(): # for cross-account ingestion kwargs = dict( DatabaseName=database_name, diff --git a/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py index c0001f0d37531d..115d194da59587 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/glue_profiling_config.py @@ -7,6 +7,14 @@ class GlueProfilingConfig(ConfigModel): + enabled: bool = Field( + default=False, + description="Whether profiling should be done.", + ) + profile_table_level_only: bool = Field( + default=False, + description="Whether to perform profiling at table-level only, or include column-level profiling as well.", + ) row_count: Optional[str] = Field( default=None, description="The parameter name for row count in glue table.", diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index c688eb9a2d1150..77e8354e657a66 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -1,1342 +1,1342 @@ [ -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "glue", - "env": "PROD", - "database": "flights-database", - "param1": "value1", - "param2": "value2", - "LocationUri": "s3://test-bucket/test-prefix", - "CreateTime": "June 09, 2021 at 14:14:19" - }, - "name": "flights-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database" + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "flights-database", + "param1": "value1", + "param2": "value2", + "LocationUri": "s3://test-bucket/test-prefix", + "CreateTime": "June 09, 2021 at 14:14:19" + }, + "name": "flights-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:glue" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Database" - ] + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "glue", - "env": "PROD", - "database": "test-database", - "CreateTime": "June 01, 2021 at 14:55:02" - }, - "name": "test-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/test-database" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "test-database", + "CreateTime": "June 01, 2021 at 14:55:02" + }, + "name": "test-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/test-database" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:glue" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Database" - ] + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "glue", - "env": "PROD", - "database": "empty-database", - "CreateTime": "June 01, 2021 at 14:55:13" - }, - "name": "empty-database", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "empty-database", + "CreateTime": "June 01, 2021 at 14:55:13" + }, + "name": "empty-database", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:glue" + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } } - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Database" - ] + }, + { + "entityType": "container", + "entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "CrawlerSchemaDeserializerVersion": "1.0", - "CrawlerSchemaSerializerVersion": "1.0", - "UPDATED_BY_CRAWLER": "flights-crawler", - "averageRecordSize": "55", - "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", - "classification": "avro", - "compressionType": "none", - "objectCount": "30", - "recordCount": "169222196", - "sizeKey": "9503351413", - "typeOfData": "file", - "Location": "s3://crawler-public-us-west-2/flight/avro/", - "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", - "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", - "Compressed": "False", - "NumberOfBuckets": "-1", - "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", - "BucketColumns": "[]", - "SortColumns": "[]", - "StoredAsSubDirectories": "False" - }, - "name": "avro", - "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database/avro", - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "flights-database.avro", - "platform": "urn:li:dataPlatform:glue", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=int].yr", - "nullable": true, - "description": "test comment", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "avro", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database/avro", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "flights-database.avro", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" }, - { - "fieldPath": "[version=2.0].[type=string].flightdate", - "nullable": true, - "type": { + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, + "description": "test comment", "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].uniquecarrier", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].flightdate", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=int].airlineid", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].uniquecarrier", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].carrier", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].airlineid", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].flightnum", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].carrier", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].origin", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=string].flightnum", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].year", - "nullable": true, - "description": "partition test comment", - "type": { + { + "fieldPath": "[version=2.0].[type=string].origin", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "fieldPath": "[version=2.0].[type=string].year", + "nullable": true, + "description": "partition test comment", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] } - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - ] + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba" + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0b9f1f731ecf6743be6207fec3dc9cba" + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "CrawlerSchemaDeserializerVersion": "1.0", - "CrawlerSchemaSerializerVersion": "1.0", - "UPDATED_BY_CRAWLER": "test-jsons", - "averageRecordSize": "273", - "classification": "json", - "compressionType": "none", - "objectCount": "1", - "recordCount": "1", - "sizeKey": "273", - "typeOfData": "file", - "Location": "s3://test-glue-jsons/markers/", - "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", - "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", - "Compressed": "False", - "NumberOfBuckets": "-1", - "SerdeInfo": "{'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe', 'Parameters': {'paths': 'markers'}}", - "BucketColumns": "[]", - "SortColumns": "[]", - "StoredAsSubDirectories": "False" - }, - "name": "test_jsons_markers", - "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_jsons_markers", - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_jsons_markers", - "platform": "urn:li:dataPlatform:glue", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", - "nullable": true, - "type": { + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "test-jsons", + "averageRecordSize": "273", + "classification": "json", + "compressionType": "none", + "objectCount": "1", + "recordCount": "1", + "sizeKey": "273", + "typeOfData": "file", + "Location": "s3://test-glue-jsons/markers/", + "InputFormat": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.openx.data.jsonserde.JsonSerDe', 'Parameters': {'paths': 'markers'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "test_jsons_markers", + "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_jsons_markers", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test-database.test_jsons_markers", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "record" - ] + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } } - } + }, + "nativeDataType": "array,location:array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" }, - "nativeDataType": "array,location:array>>", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"array,location:array>>\"}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=string].name", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].position", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "double" - ] + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "double" + ] + } } - } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" }, - "nativeDataType": "array", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"array\"}" - }, - { - "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].markers.[type=array].[type=double].location", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": { - "nestedType": [ - "double" - ] + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "double" + ] + } } - } - }, - "nativeDataType": "array", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"array\"}" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] } - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - ] + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_jsons_markers,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "CrawlerSchemaDeserializerVersion": "1.0", - "CrawlerSchemaSerializerVersion": "1.0", - "UPDATED_BY_CRAWLER": "test", - "averageRecordSize": "19", - "classification": "parquet", - "compressionType": "none", - "objectCount": "60", - "recordCount": "167497743", - "sizeKey": "4463574900", - "typeOfData": "file", - "Location": "s3://crawler-public-us-west-2/flight/parquet/", - "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", - "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", - "Compressed": "False", - "NumberOfBuckets": "-1", - "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe', 'Parameters': {'serialization.format': '1'}}", - "BucketColumns": "[]", - "SortColumns": "[]", - "StoredAsSubDirectories": "False" - }, - "name": "test_parquet", - "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_parquet", - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "test-database.test_parquet", - "platform": "urn:li:dataPlatform:glue", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "[version=2.0].[type=int].yr", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "test", + "averageRecordSize": "19", + "classification": "parquet", + "compressionType": "none", + "objectCount": "60", + "recordCount": "167497743", + "sizeKey": "4463574900", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/parquet/", + "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe', 'Parameters': {'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "test_parquet", + "qualifiedName": "arn:aws:glue:us-west-2:795586375822:table/test-database/test_parquet", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "test-database.test_parquet", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" }, - { - "fieldPath": "[version=2.0].[type=int].quarter", - "nullable": true, - "type": { + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=int].month", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].quarter", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=int].dayofmonth", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].month", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" - }, - { - "fieldPath": "[version=2.0].[type=string].year", - "nullable": true, - "type": { + { + "fieldPath": "[version=2.0].[type=int].dayofmonth", + "nullable": true, "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false, - "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.common.DataPlatformInstance": { - "platform": "urn:li:dataPlatform:glue" - } - }, - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:owner", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "fieldPath": "[version=2.0].[type=string].year", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] } - } - }, - { - "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [ - { - "tag": "urn:li:tag:baz:bob" - }, - { - "tag": "urn:li:tag:foo:bar" + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" } - ] + } + }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:baz:bob" + }, + { + "tag": "urn:li:tag:foo:bar" + } + ] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } } - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + }, + { + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:bdf4342ea6899d162eae685bfe9074a7" + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { - "urn": "urn:li:dataFlow:(glue,test-job-1,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { - "customProperties": { - "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", - "created": "2021-06-10 16:51:25.690000", - "modified": "2021-06-10 16:55:35.307000", - "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-1.py" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1", - "description": "The first test job" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(glue,test-job-1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": { + "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", + "created": "2021-06-10 16:51:25.690000", + "modified": "2021-06-10 16:55:35.307000", + "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-1.py" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1", + "description": "The first test job" + } } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { - "urn": "urn:li:dataFlow:(glue,test-job-2,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { - "customProperties": { - "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", - "created": "2021-06-10 16:58:32.469000", - "modified": "2021-06-10 16:58:32.469000", - "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-2.py" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2", - "description": "The second test job" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataFlowSnapshot": { + "urn": "urn:li:dataFlow:(glue,test-job-2,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataFlowInfo": { + "customProperties": { + "role": "arn:aws:iam::123412341234:role/service-role/AWSGlueServiceRole-glue-crawler", + "created": "2021-06-10 16:58:32.469000", + "modified": "2021-06-10 16:58:32.469000", + "command": "s3://aws-glue-assets-123412341234-us-west-2/scripts/job-2.py" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2", + "description": "The second test job" + } } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "f": "lambda row : ()", - "transformation_ctx": "\"Transform0\"", - "transformType": "Filter", - "nodeId": "Transform0_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:Filter-Transform0_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "f": "lambda row : ()", + "transformation_ctx": "\"Transform0\"", + "transformType": "Filter", + "nodeId": "Transform0_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:Filter-Transform0_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", - "transformation_ctx": "\"Transform1\"", - "transformType": "ApplyMapping", - "nodeId": "Transform1_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform1_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform1\"", + "transformType": "ApplyMapping", + "nodeId": "Transform1_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform1_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", - "transformation_ctx": "\"Transform2\"", - "transformType": "ApplyMapping", - "nodeId": "Transform2_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform2_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform2\"", + "transformType": "ApplyMapping", + "nodeId": "Transform2_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform2_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database.avro,PROD)" - ], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "keys2": "[\"(right) flightdate\"]", - "transformation_ctx": "\"Transform3\"", - "keys1": "[\"yr\"]", - "transformType": "Join", - "nodeId": "Transform3_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:Join-Transform3_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "keys2": "[\"(right) flightdate\"]", + "transformation_ctx": "\"Transform3\"", + "keys1": "[\"yr\"]", + "transformType": "Join", + "nodeId": "Transform3_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:Join-Transform3_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", - "transformation_ctx": "\"Transform4\"", - "transformType": "ApplyMapping", - "nodeId": "Transform4_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform4_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\"), (\"flightnum\", \"string\", \"flightnum\", \"string\"), (\"origin\", \"string\", \"origin\", \"string\"), (\"dest\", \"string\", \"dest\", \"string\"), (\"depdelay\", \"int\", \"depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"weatherdelay\", \"int\"), (\"year\", \"string\", \"year\", \"string\")]", + "transformation_ctx": "\"Transform4\"", + "transformType": "ApplyMapping", + "nodeId": "Transform4_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform4_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"(right) yr\", \"int\"), (\"flightdate\", \"string\", \"(right) flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"(right) uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"(right) airlineid\", \"int\"), (\"carrier\", \"string\", \"(right) carrier\", \"string\"), (\"flightnum\", \"string\", \"(right) flightnum\", \"string\"), (\"origin\", \"string\", \"(right) origin\", \"string\"), (\"dest\", \"string\", \"(right) dest\", \"string\"), (\"depdelay\", \"int\", \"(right) depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"(right) carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"(right) weatherdelay\", \"int\"), (\"year\", \"string\", \"(right) year\", \"string\")]", - "transformation_ctx": "\"Transform5\"", - "transformType": "ApplyMapping", - "nodeId": "Transform5_job1" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", - "name": "test-job-1:ApplyMapping-Transform5_job1", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"(right) yr\", \"int\"), (\"flightdate\", \"string\", \"(right) flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"(right) uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"(right) airlineid\", \"int\"), (\"carrier\", \"string\", \"(right) carrier\", \"string\"), (\"flightnum\", \"string\", \"(right) flightnum\", \"string\"), (\"origin\", \"string\", \"(right) origin\", \"string\"), (\"dest\", \"string\", \"(right) dest\", \"string\"), (\"depdelay\", \"int\", \"(right) depdelay\", \"int\"), (\"carrierdelay\", \"int\", \"(right) carrierdelay\", \"int\"), (\"weatherdelay\", \"int\", \"(right) weatherdelay\", \"int\"), (\"year\", \"string\", \"(right) year\", \"string\")]", + "transformation_ctx": "\"Transform5\"", + "transformType": "ApplyMapping", + "nodeId": "Transform5_job1" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-1/graph", + "name": "test-job-1:ApplyMapping-Transform5_job1", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "connection_type": "s3", - "format": "json", - "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", - "transformation_ctx": "DataSink1" - }, - "tags": [] + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "connection_type": "s3", + "format": "json", + "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", + "transformation_ctx": "DataSink1" + }, + "tags": [] + } } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "paths": "[\"yr\", \"quarter\", \"month\", \"dayofmonth\", \"dayofweek\", \"flightdate\", \"uniquecarrier\"]", - "name2": "\"Transform0Output1\"", - "name1": "\"Transform0Output0\"", - "transformation_ctx": "\"Transform0\"", - "transformType": "SplitFields", - "nodeId": "Transform0_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:SplitFields-Transform0_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "paths": "[\"yr\", \"quarter\", \"month\", \"dayofmonth\", \"dayofweek\", \"flightdate\", \"uniquecarrier\"]", + "name2": "\"Transform0Output1\"", + "name1": "\"Transform0Output0\"", + "transformation_ctx": "\"Transform0\"", + "transformType": "SplitFields", + "nodeId": "Transform0_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:SplitFields-Transform0_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"quarter\", \"int\", \"quarter\", \"int\"), (\"month\", \"int\", \"month\", \"int\"), (\"dayofmonth\", \"int\", \"dayofmonth\", \"int\"), (\"dayofweek\", \"int\", \"dayofweek\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\")]", - "transformation_ctx": "\"Transform1\"", - "transformType": "ApplyMapping", - "nodeId": "Transform1_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:ApplyMapping-Transform1_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "mappings": "[(\"yr\", \"int\", \"yr\", \"int\"), (\"quarter\", \"int\", \"quarter\", \"int\"), (\"month\", \"int\", \"month\", \"int\"), (\"dayofmonth\", \"int\", \"dayofmonth\", \"int\"), (\"dayofweek\", \"int\", \"dayofweek\", \"int\"), (\"flightdate\", \"string\", \"flightdate\", \"string\"), (\"uniquecarrier\", \"string\", \"uniquecarrier\", \"string\"), (\"airlineid\", \"int\", \"airlineid\", \"int\"), (\"carrier\", \"string\", \"carrier\", \"string\")]", + "transformation_ctx": "\"Transform1\"", + "transformType": "ApplyMapping", + "nodeId": "Transform1_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:ApplyMapping-Transform1_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:glue,test-database.test_parquet,PROD)" - ], - "outputDatasets": [], - "inputDatajobs": [] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "missing_values_column": "\"dayofmonth\"", - "transformation_ctx": "\"Transform2\"", - "transformType": "FillMissingValues", - "nodeId": "Transform2_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:FillMissingValues-Transform2_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "missing_values_column": "\"dayofmonth\"", + "transformation_ctx": "\"Transform2\"", + "transformType": "FillMissingValues", + "nodeId": "Transform2_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:FillMissingValues-Transform2_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { - "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", - "aspects": [ - { - "com.linkedin.pegasus2avro.datajob.DataJobInfo": { - "customProperties": { - "paths": "[]", - "transformation_ctx": "\"Transform3\"", - "transformType": "SelectFields", - "nodeId": "Transform3_job2" - }, - "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", - "name": "test-job-2:SelectFields-Transform3_job2", - "type": { - "string": "GLUE" + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DataJobSnapshot": { + "urn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", + "aspects": [ + { + "com.linkedin.pegasus2avro.datajob.DataJobInfo": { + "customProperties": { + "paths": "[]", + "transformation_ctx": "\"Transform3\"", + "transformType": "SelectFields", + "nodeId": "Transform3_job2" + }, + "externalUrl": "https://us-west-2.console.aws.amazon.com/gluestudio/home?region=us-west-2#/editor/job/test-job-2/graph", + "name": "test-job-2:SelectFields-Transform3_job2", + "type": { + "string": "GLUE" + } + } + }, + { + "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)" + ] } } - }, - { - "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { - "inputDatasets": [], - "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)" - ], - "inputDatajobs": [ - "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)" - ] - } - } - ] + ] + } } - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "connection_type": "s3", - "format": "json", - "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", - "transformation_ctx": "DataSink0" - }, - "tags": [] + }, + { + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,test-glue-jsons,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "connection_type": "s3", + "format": "json", + "connection_options": "{'path': 's3://test-glue-jsons/', 'partitionKeys': []}", + "transformation_ctx": "DataSink0" + }, + "tags": [] + } } - } - ] + ] + } } - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(glue,test-job-1,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(glue,test-job-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(glue,test-job-2,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(glue,test-job-2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform1_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform2_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform4_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),ApplyMapping-Transform5_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Filter-Transform0_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-1,PROD),Join-Transform3_job1)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),ApplyMapping-Transform1_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),FillMissingValues-Transform2_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SelectFields-Transform3_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false + }, + { + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(glue,test-job-2,PROD),SplitFields-Transform0_job2)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } } - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:baz:bob", - "changeType": "UPSERT", - "aspectName": "tagKey", - "aspect": { - "json": { - "name": "baz:bob" + }, + { + "entityType": "tag", + "entityUrn": "urn:li:tag:baz:bob", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "baz:bob" + } } - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:foo:bar", - "changeType": "UPSERT", - "aspectName": "tagKey", - "aspect": { - "json": { - "name": "foo:bar" + }, + { + "entityType": "tag", + "entityUrn": "urn:li:tag:foo:bar", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "foo:bar" + } } } -} -] \ No newline at end of file + ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json new file mode 100644 index 00000000000000..800d36a13d2dc2 --- /dev/null +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden_profiling.json @@ -0,0 +1,289 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "glue", + "env": "PROD", + "database": "flights-database-profiling", + "param1": "value1", + "param2": "value2", + "LocationUri": "s3://test-bucket/test-prefix", + "CreateTime": "June 09, 2021 at 14:14:19" + }, + "name": "flights-database-profiling", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/flights-database-profiling" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:glue" + } + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": "{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}", + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": "False", + "NumberOfBuckets": "-1", + "SerdeInfo": "{'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe', 'Parameters': {'avro.schema.literal': '{\"type\":\"record\",\"name\":\"flights_avro_subset\",\"namespace\":\"default\",\"fields\":[{\"name\":\"yr\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"flightdate\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"uniquecarrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"airlineid\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrier\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"flightnum\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"origin\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"dest\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"depdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"carrierdelay\",\"type\":[\"null\",\"int\"],\"default\":null},{\"name\":\"weatherdelay\",\"type\":[\"null\",\"int\"],\"default\":null}]}', 'serialization.format': '1'}}", + "BucketColumns": "[]", + "SortColumns": "[]", + "StoredAsSubDirectories": "False" + }, + "name": "avro-profiling", + "qualifiedName": "arn:aws:glue:us-west-2:123412341234:table/flights-database-profiling/avro-profiling", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "flights-database-profiling.avro-profiling", + "platform": "urn:li:dataPlatform:glue", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=int].yr", + "nullable": true, + "description": "test comment", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightdate", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].uniquecarrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=int].airlineid", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"int\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].carrier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].flightnum", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=string].origin", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.DataPlatformInstance": { + "platform": "urn:li:dataPlatform:glue" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:owner", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:77f8f4c39b47069d3a71191de1333b0e" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:glue,flights-database-profiling.avro-profiling,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "fieldProfiles": [ + { + "fieldPath": "yr", + "uniqueCount": 1, + "uniqueProportion": 2.0, + "nullCount": 0, + "nullProportion": 11.0, + "min": "1", + "max": "10", + "mean": "1", + "median": "2", + "stdev": "3" + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index b43db47ae00711..45b9899eacaa77 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -13,7 +13,11 @@ from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.ingestion.sink.file import write_metadata_file -from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig +from datahub.ingestion.source.aws.glue import ( + GlueProfilingConfig, + GlueSource, + GlueSourceConfig, +) from datahub.ingestion.source.state.sql_common_state import ( BaseSQLAlchemyCheckpointState, ) @@ -38,6 +42,7 @@ get_databases_delta_response, get_databases_response, get_databases_response_for_lineage, + get_databases_response_profiling, get_databases_response_with_resource_link, get_dataflow_graph_response_1, get_dataflow_graph_response_2, @@ -54,9 +59,11 @@ get_tables_response_1, get_tables_response_2, get_tables_response_for_target_database, + get_tables_response_profiling_1, resource_link_database, tables_1, tables_2, + tables_profiling_1, target_database_tables, ) @@ -93,6 +100,42 @@ def glue_source( ) +def glue_source_with_profiling( + platform_instance: Optional[str] = None, + use_s3_bucket_tags: bool = False, + use_s3_object_tags: bool = False, + extract_delta_schema_from_parameters: bool = False, +) -> GlueSource: + profiling_config = GlueProfilingConfig( + enabled=True, + profile_table_level_only=False, + row_count="row_count", + column_count="column_count", + unique_count="unique_count", + unique_proportion="unique_proportion", + null_count="null_count", + null_proportion="null_proportion", + min="min", + max="max", + mean="mean", + median="median", + stdev="stdev", + ) + + return GlueSource( + ctx=PipelineContext(run_id="glue-source-test"), + config=GlueSourceConfig( + aws_region="us-west-2", + extract_transforms=False, + platform_instance=platform_instance, + use_s3_bucket_tags=use_s3_bucket_tags, + use_s3_object_tags=use_s3_object_tags, + extract_delta_schema_from_parameters=extract_delta_schema_from_parameters, + profiling=profiling_config, + ), + ) + + column_type_test_cases: Dict[str, Tuple[str, Type]] = { "char": ("char", StringTypeClass), "array": ("array", ArrayTypeClass), @@ -641,3 +684,41 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: output_path=tmp_path / mce_file, golden_path=test_resources_dir / mce_golden_file, ) + + +@freeze_time(FROZEN_TIME) +def test_glue_ingest_with_profiling( + tmp_path: Path, + pytestconfig: PytestConfig, +) -> None: + glue_source_instance = glue_source_with_profiling() + mce_file = "glue_mces.json" + mce_golden_file = "glue_mces_golden_profiling.json" + with Stubber(glue_source_instance.glue_client) as glue_stubber: + glue_stubber.add_response("get_databases", get_databases_response_profiling, {}) + + glue_stubber.add_response( + "get_tables", + get_tables_response_profiling_1, + {"DatabaseName": "flights-database-profiling"}, + ) + + glue_stubber.add_response( + "get_table", + {"Table": tables_profiling_1[0]}, + {"DatabaseName": "flights-database-profiling", "Name": "avro-profiling"}, + ) + + mce_objects = [wu.metadata for wu in glue_source_instance.get_workunits()] + + glue_stubber.assert_no_pending_responses() + + write_metadata_file(tmp_path / mce_file, mce_objects) + + # Verify the output. + test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / mce_file, + golden_path=test_resources_dir / mce_golden_file, + ) diff --git a/metadata-ingestion/tests/unit/test_glue_source_stubs.py b/metadata-ingestion/tests/unit/test_glue_source_stubs.py index f44a384a02c4ab..46ab65234c22df 100644 --- a/metadata-ingestion/tests/unit/test_glue_source_stubs.py +++ b/metadata-ingestion/tests/unit/test_glue_source_stubs.py @@ -973,6 +973,112 @@ get_tables_lineage_response_1 = {"TableList": tables_lineage_1} +get_databases_response_profiling = { + "DatabaseList": [ + { + "Name": "flights-database-profiling", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 14, 19), + "CreateTableDefaultPermissions": [ + { + "Principal": { + "DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS" + }, + "Permissions": ["ALL"], + } + ], + "CatalogId": "123412341234", + "LocationUri": "s3://test-bucket/test-prefix", + "Parameters": {"param1": "value1", "param2": "value2"}, + }, + ] +} + +tables_profiling_1 = [ + { + "Name": "avro-profiling", + "DatabaseName": "flights-database-profiling", + "Owner": "owner", + "CreateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "UpdateTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "LastAccessTime": datetime.datetime(2021, 6, 9, 14, 17, 35), + "Retention": 0, + "StorageDescriptor": { + "Columns": [ + { + "Name": "yr", + "Type": "int", + "Comment": "test comment", + "Parameters": { + "unique_proportion": "2", + "min": "1", + "median": "2", + "max": "10", + "mean": "1", + "null_proportion": "11", + "unique_count": "1", + "stdev": "3", + "null_count": "0", + }, + }, + {"Name": "flightdate", "Type": "string"}, + {"Name": "uniquecarrier", "Type": "string"}, + {"Name": "airlineid", "Type": "int"}, + {"Name": "carrier", "Type": "string"}, + {"Name": "flightnum", "Type": "string"}, + {"Name": "origin", "Type": "string"}, + ], + "Location": "s3://crawler-public-us-west-2/flight/avro/", + "InputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", + "OutputFormat": "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", + "Compressed": False, + "NumberOfBuckets": -1, + "SerdeInfo": { + "SerializationLibrary": "org.apache.hadoop.hive.serde2.avro.AvroSerDe", + "Parameters": { + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "serialization.format": "1", + }, + }, + "BucketColumns": [], + "SortColumns": [], + "Parameters": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + }, + "StoredAsSubDirectories": False, + }, + "PartitionKeys": [], + "TableType": "EXTERNAL_TABLE", + "Parameters": { + "CrawlerSchemaDeserializerVersion": "1.0", + "CrawlerSchemaSerializerVersion": "1.0", + "UPDATED_BY_CRAWLER": "flights-crawler", + "averageRecordSize": "55", + "avro.schema.literal": '{"type":"record","name":"flights_avro_subset","namespace":"default","fields":[{"name":"yr","type":["null","int"],"default":null},{"name":"flightdate","type":["null","string"],"default":null},{"name":"uniquecarrier","type":["null","string"],"default":null},{"name":"airlineid","type":["null","int"],"default":null},{"name":"carrier","type":["null","string"],"default":null},{"name":"flightnum","type":["null","string"],"default":null},{"name":"origin","type":["null","string"],"default":null},{"name":"dest","type":["null","string"],"default":null},{"name":"depdelay","type":["null","int"],"default":null},{"name":"carrierdelay","type":["null","int"],"default":null},{"name":"weatherdelay","type":["null","int"],"default":null}]}', + "classification": "avro", + "compressionType": "none", + "objectCount": "30", + "recordCount": "169222196", + "sizeKey": "9503351413", + "typeOfData": "file", + }, + "CreatedBy": "arn:aws:sts::123412341234:assumed-role/AWSGlueServiceRole-flights-crawler/AWS-Crawler", + "IsRegisteredWithLakeFormation": False, + "CatalogId": "123412341234", + } +] +get_tables_response_profiling_1 = {"TableList": tables_profiling_1} + + def mock_get_object_response(raw_body: str) -> Dict[str, Any]: """ Mock s3 client get_object() response object.