Skip to content

Commit

Permalink
feat(models): Introducing Dataset Partitions Aspect (datahub-project#…
Browse files Browse the repository at this point in the history
…10997)

Co-authored-by: John Joyce <[email protected]>
Co-authored-by: John Joyce <[email protected]>
  • Loading branch information
3 people authored Jul 30, 2024
1 parent 9321e94 commit 6f09b96
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 10 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
namespace com.linkedin.dataset

import com.linkedin.common.AuditStamp

/**
* Summary of a single partition: its identifying value together with
* optional audit stamps for when the partition was created and last modified.
*/
record PartitionSummary {
/**
* A unique id / value for the partition for which statistics were collected,
* generated by applying the key definition to a given row.
* This field is required.
*/
partition: string

/**
* Audit stamp describing the creation of the given partition. Optional.
*/
created: optional AuditStamp

/**
* Audit stamp describing the last modification / touch of the given partition. Optional.
*/
lastModified: optional AuditStamp
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace com.linkedin.dataset

/**
* Aspect (named "partitionsSummary") that summarizes the partitions of a dataset by
* recording its minimum and maximum partition.
* Intended for Data Lake tables (e.g. Hive, S3, Iceberg, Delta, Hudi, etc).
*/
@Aspect = {
"name": "partitionsSummary"
}
record PartitionsSummary {
/**
* The minimum partition under the partition ordering. Optional.
* NOTE(review): the ordering criterion (e.g. lexicographic on the partition id vs. time-based)
* is not specified in this schema - confirm with the producers of this aspect.
*/
minPartition: optional PartitionSummary

/**
* The maximum partition under the partition ordering. Optional.
* NOTE(review): uses the same (unspecified) ordering as minPartition - confirm.
*/
maxPartition: optional PartitionSummary
}
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ record SchemaField {

/**
* For Datasets which are partitioned, this indicates whether this field is part of the partitioning key.
* Note that multiple columns can be part of a partitioning key, but currently we do not support
* rendering the ordered partitioning key.
*/
isPartitioningKey: optional boolean

Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
namespace com.linkedin.timeseries

/**
* Defines how the data is partitioned
* A reference to a specific partition in a dataset.
*/
record PartitionSpec {

type: enum PartitionType {
FULL_TABLE,
QUERY,
PARTITION
} = "PARTITION"

/**
* String representation of the partition
* A unique id / value for the partition for which statistics were collected,
* generated by applying the key definition to a given row.
*/
@TimeseriesField = {}
partition: string

/**
* Time window of the partition if applicable
* Time window of the partition, if we are able to extract it from the partition key.
*/
timePartition: optional TimeWindow

/**
* Unused!
*/
@deprecated
type: enum PartitionType {
FULL_TABLE,
QUERY,
PARTITION
} = "PARTITION"
}
1 change: 1 addition & 0 deletions metadata-models/src/main/resources/entity-registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ entities:
- access
- structuredProperties
- forms
- partitionsSummary
- name: dataHubPolicy
doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc.
category: internal
Expand Down

0 comments on commit 6f09b96

Please sign in to comment.