diff --git a/README.md b/README.md
index badd375d..3bd32993 100644
--- a/README.md
+++ b/README.md
@@ -277,6 +277,34 @@
 of that, knowing about the type of a field allows us to compress the
 remaining data more efficiently.
 
+Nested Lists for Hive / Athena
+------------------------------
+
+Lists have to be annotated to be queryable with AWS Athena. See [parquet-format](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) for more detail and a full working example with comments in the test directory ([`test/list.js`](test/list.js)).
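+
+A minimal sketch of such an annotated schema (mirroring the `a`/`b` element fields used in [`test/list.js`](test/list.js)):
+
+``` js
+const parquet = require('parquetjs');
+
+const schema = new parquet.ParquetSchema({
+  id: { type: 'UTF8' },
+  test: {
+    type: 'LIST',        // annotates the wrapper group as a Parquet LIST
+    fields: {
+      list: {            // the wrapper names `list` ...
+        repeated: true,  // one repetition per list entry
+        fields: {
+          element: {     // ... and `element` come from the parquet-format LIST spec
+            fields: {
+              a: { type: 'UTF8' },
+              b: { type: 'INT64' }
+            }
+          }
+        }
+      }
+    }
+  }
+});
+```
+
+Rows then have to be supplied in the same wrapped shape, e.g. `{id: 'Row1', test: {list: [{element: {a: 'test1', b: 1}}]}}`.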
+
+
 List of Supported Types & Encodings
 -----------------------------------
diff --git a/lib/schema.js b/lib/schema.js
index 897c3552..790f9aa5 100644
--- a/lib/schema.js
+++ b/lib/schema.js
@@ -117,6 +117,8 @@ function buildFields(schema, rLevelParentMax, dLevelParentMax, path) {
           path.concat([name]))
       };
 
+      if (opts.type === 'LIST' || opts.type === 'MAP') fieldList[name].originalType = opts.type;
+
       continue;
     }
 
diff --git a/test/list.js b/test/list.js
new file mode 100644
index 00000000..7d597a54
--- /dev/null
+++ b/test/list.js
@@ -0,0 +1,95 @@
+'use strict';
+const chai = require('chai');
+const assert = chai.assert;
+const parquet = require('../parquet.js');
+
+
+/*
+  This test writes a parquet file with an annotated LIST wrapper that works with AWS Athena.
+  Currently the schema (and the input data) needs to follow the specification for an annotated list.
+
+  The Athena schema for this test is `id string, test (array<struct<a:string,b:bigint>>)`,
+  but instead of input data `{id: 'Row1', test: [{a: 'test1', b: 1}, {a: 'test2', b: 2}, {a: 'test3', b: 3}]}`
+  we need to wrap the data inside `list` and every element inside `element`, i.e.:
+  `{id: 'Row1', test: {list: [{element: {a: 'test1', b: 1}}, {element: {a: 'test2', b: 2}}, {element: {a: 'test3', b: 3}}]}}`
+  and the schema needs to match this structure as well (see `listSchema` below).
+
+  To see a working example on Athena, run this test and copy the list.parquet file to an s3 bucket.
+
+  In Athena create the listTest table with the following command:
+
+  CREATE EXTERNAL TABLE `listTest`(
+    id string,
+    `test` array<struct<a:string,b:bigint>>
+  )
+  ROW FORMAT SERDE
+    'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
+  STORED AS INPUTFORMAT
+    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
+  OUTPUTFORMAT
+    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
+  LOCATION
+    's3://s3bucket/.../list.parquet'
+
+  Then verify that Athena parses the parquet file correctly with `SELECT * FROM listTest`.
+*/
+
+
+
+const listSchema = new parquet.ParquetSchema({
+  id: { type: 'UTF8' },
+  test: {
+    type: 'LIST',
+    fields: {
+      list: {
+        repeated: true,
+        fields: {
+          element: {
+            fields: {
+              a: { type: 'UTF8' },
+              b: { type: 'INT64' }
+            }
+          }
+        }
+      }
+    }
+  }
+});
+
+describe('list', function() {
+  let reader;
+  const row1 = {
+    id: 'Row1',
+    test: {list: [{element: {a: 'test1', b: 1}}, {element: {a: 'test2', b: 2}}, {element: {a: 'test3', b: 3}}]}
+  };
+
+  const row2 = {
+    id: 'Row2',
+    test: {list: [{element: {a: 'test4', b: 4}}]}
+  };
+
+  before(async function() {
+    const writer = await parquet.ParquetWriter.openFile(listSchema, 'list.parquet', {pageSize: 100});
+
+    await writer.appendRow(row1);
+    await writer.appendRow(row2);
+
+    await writer.close();
+    reader = await parquet.ParquetReader.openFile('list.parquet');
+  });
+
+  it('schema is encoded correctly', async function() {
+    const schema = reader.metadata.schema;
+    assert.equal(schema.length, 7); // root + id + test + list + element + a + b
+    assert.equal(schema[2].name, 'test');
+    assert.equal(schema[2].converted_type, 3); // 3 = LIST in the Thrift ConvertedType enum
+  });
+
+  it('output matches input', async function() {
+    const cursor = reader.getCursor();
+    let row = await cursor.next();
+    assert.deepEqual(row, row1);
+    row = await cursor.next();
+    assert.deepEqual(row, row2);
+  });
+});