forked from pinot-contrib/pinot-docs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added doc on unnesting JSON array into separate records (pinot-contri…
- Loading branch information
Showing
4 changed files
with
176 additions
and
0 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
--- | ||
description: Unnest JSON records in Apache Pinot. | ||
--- | ||
|
||
# Unnest JSON records | ||
In this example, we look at unnesting JSON records that are batched together as an array under a single key at the root | ||
level. We use the [ComplexType](complex-type.md) configs to persist the individual student records as | ||
separate rows in Pinot. | ||
|
||
Consider the following JSON object, in which the root-level `students` key holds an array of student records. | ||
```json | ||
{ | ||
"students": [ | ||
{ | ||
"firstName": "Jane", | ||
"id": "100", | ||
"scores": { | ||
"physics": 91, | ||
"chemistry": 93, | ||
"maths": 99 | ||
} | ||
}, | ||
{ | ||
"firstName": "John", | ||
"id": "101", | ||
"scores": { | ||
"physics": 97, | ||
"chemistry": 98, | ||
"maths": 99 | ||
} | ||
}, | ||
{ | ||
"firstName": "Jen", | ||
"id": "102", | ||
"scores": { | ||
"physics": 96, | ||
"chemistry": 95, | ||
"maths": 100 | ||
} | ||
} | ||
] | ||
} | ||
``` | ||
|
||
|
||
# Pinot Schema | ||
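The Pinot schema for this example looks as follows. Each column name carries the `students.` prefix because the | ||
fields are unnested from the `students` array. | ||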
|
||
```json | ||
{ | ||
"schemaName": "students001", | ||
"enableColumnBasedNullHandling": false, | ||
"dimensionFieldSpecs": [ | ||
{ | ||
"name": "students.firstName", | ||
"dataType": "STRING", | ||
"notNull": false, | ||
"fieldType": "DIMENSION" | ||
}, | ||
{ | ||
"name": "students.id", | ||
"dataType": "STRING", | ||
"notNull": false, | ||
"fieldType": "DIMENSION" | ||
}, | ||
{ | ||
"name": "students.scores", | ||
"dataType": "JSON", | ||
"notNull": false, | ||
"fieldType": "DIMENSION" | ||
} | ||
], | ||
"dateTimeFieldSpecs": [ | ||
{ | ||
"name": "ts", | ||
"fieldType": "DATE_TIME", | ||
"dataType": "LONG", | ||
"format": "1:MILLISECONDS:EPOCH", | ||
"granularity": "1:MILLISECONDS" | ||
} | ||
], | ||
"metricFieldSpecs": [] | ||
} | ||
``` | ||
|
||
# Pinot Table Configuration | ||
|
||
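The Pinot table configuration for this schema looks as follows. The setting that drives the unnesting is | ||
`fieldsToUnnest` under `ingestionConfig.complexTypeConfig`, which lists the `students` field. | ||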
|
||
```json | ||
{ | ||
"description": "Pinot table config inferred for: S3", | ||
"type": "PINOT", | ||
"config": { | ||
"tableName": "students001_OFFLINE", | ||
"tableType": "OFFLINE", | ||
"segmentsConfig": { | ||
"deletedSegmentsRetentionPeriod": "7d", | ||
"segmentPushType": "APPEND", | ||
"minimizeDataMovement": false, | ||
"replication": "1", | ||
"timeColumnName": "ts", | ||
"retentionTimeUnit": "DAYS", | ||
"retentionTimeValue": "180" | ||
}, | ||
"tenants": { | ||
"broker": "DefaultTenant", | ||
"server": "DefaultTenant" | ||
}, | ||
"tableIndexConfig": { | ||
"optimizeDictionaryForMetrics": false, | ||
"noDictionarySizeRatioThreshold": 0, | ||
"aggregateMetrics": false, | ||
"columnMajorSegmentBuilderEnabled": true, | ||
"loadMode": "MMAP", | ||
"varLengthDictionaryColumns": [ | ||
"students.firstName", | ||
"students.id", | ||
"students.scores" | ||
], | ||
"enableDefaultStarTree": false, | ||
"enableDynamicStarTreeCreation": false, | ||
"nullHandlingEnabled": true, | ||
"autoGeneratedInvertedIndex": false, | ||
"createInvertedIndexDuringSegmentGeneration": true, | ||
"rangeIndexVersion": 2, | ||
"optimizeDictionary": false, | ||
"invertedIndexColumns": [ | ||
"students.firstName", | ||
"students.id" | ||
] | ||
}, | ||
"metadata": {}, | ||
"task": { | ||
"taskTypeConfigsMap": { | ||
|
||
} | ||
}, | ||
"ingestionConfig": { | ||
"complexTypeConfig": { | ||
"fieldsToUnnest": [ | ||
"students" | ||
] | ||
}, | ||
"transformConfigs": [ | ||
{ | ||
"columnName": "ts", | ||
"transformFunction": "now()" | ||
} | ||
], | ||
"rowTimeValueCheck": true, | ||
"segmentTimeValueCheck": false, | ||
"continueOnError": true, | ||
"batchIngestionConfig": { | ||
"segmentIngestionType": "APPEND", | ||
"consistentDataPush": false | ||
} | ||
}, | ||
"isDimTable": false | ||
} | ||
} | ||
``` | ||
|
||
# Data in Pinot | ||
|
||
After ingestion, each student record appears as a separate row in Pinot. Note that the nested field `scores` is | ||
captured as a JSON field. | ||
|
||
![Unnested Student Records](../../.gitbook/unnested-student-records-json.png) |