From f19981ed0af3e3537e5853d315afe28614234f26 Mon Sep 17 00:00:00 2001 From: Gary Irick Date: Wed, 20 Apr 2022 12:31:22 -0500 Subject: [PATCH 1/2] Fixed RLE encoding and decoding when a dictionary has > 255 entries. --- .gitattributes | 2 ++ lib/codec/rle.js | 21 ++++++++++++++++----- test/codec_rle.js | 12 ++++++------ 3 files changed, 24 insertions(+), 11 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..f9cbac8b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# This is important, or the unit tests fail on a Windows box. +*.csv text eol=lf diff --git a/lib/codec/rle.js b/lib/codec/rle.js index e7a22865..dcff5f2c 100644 --- a/lib/codec/rle.js +++ b/lib/codec/rle.js @@ -1,3 +1,7 @@ +// For questions about RLE encoding, see the spec: +// +// https://github.com/apache/parquet-format/blob/master/Encodings.md + const varint = require('varint') function encodeRunBitpacked(values, opts) { @@ -20,10 +24,13 @@ function encodeRunBitpacked(values, opts) { function encodeRunRepeated(value, count, opts) { let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8)); + let remainingValue = value + // This is encoded LSB to MSB, so we pick off the least + // significant byte and shift to get the next one. for (let i = 0; i < buf.length; ++i) { - buf.writeUInt8(value & 0xff, i); - value >> 8; + buf.writeUInt8(remainingValue & 0xff, i); + remainingValue = remainingValue >> 8; } return Buffer.concat([ @@ -109,10 +116,14 @@ function decodeRunBitpacked(cursor, count, opts) { } function decodeRunRepeated(cursor, count, opts) { + var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8); let value = 0; - for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) { - value << 8; - value += cursor.buffer[cursor.offset]; + + for (let i = 0; i < bytesNeededForFixedBitWidth; ++i) { + const byte = cursor.buffer[cursor.offset] + // Bytes are stored LSB to MSB, so we need to shift + // each new byte appropriately. + value += byte << (i * 8); cursor.offset += 1; } diff --git a/test/codec_rle.js b/test/codec_rle.js index a20583d7..6d27b80d 100644 --- a/test/codec_rle.js +++ b/test/codec_rle.js @@ -66,29 +66,29 @@ describe('ParquetCodec::RLE', function() { it('should encode repeated values', function() { let buf = parquet_codec_rle.encodeValues( 'INT32', - [42, 42, 42, 42, 42, 42, 42, 42], + [1234567, 1234567, 1234567, 1234567, 1234567, 1234567, 1234567, 1234567], { disableEnvelope: true, - bitWidth: 6 + bitWidth: 21 }); - assert.deepEqual(buf, Buffer.from([0x10, 0x2a])); + assert.deepEqual(buf, Buffer.from([0x10, 0x87, 0xD6, 0x12])); }); it('should decode repeated values', function() { let vals = parquet_codec_rle.decodeValues( 'INT32', { - buffer: Buffer.from([0x10, 0x2a]), + buffer: Buffer.from([0x10, 0x87, 0xD6, 0x12]), offset: 0, }, 8, { disableEnvelope: true, - bitWidth: 3 + bitWidth: 21 }); - assert.deepEqual(vals, [42, 42, 42, 42, 42, 42, 42, 42]); + assert.deepEqual(vals, [1234567, 1234567, 1234567, 1234567, 1234567, 1234567, 1234567, 1234567]); }); it('should encode mixed runs', function() { From b89663135ca429f83f3f7a59f8988aa3b3f9ef80 Mon Sep 17 00:00:00 2001 From: Gary Irick Date: Wed, 20 Apr 2022 12:34:45 -0500 Subject: [PATCH 2/2] Fixed decoding of a column chunk with pages in PLAIN and PLAIN_DICTIONARY in the same chunk. --- lib/reader.js | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/reader.js b/lib/reader.js index aa60de7f..2d887b63 100644 --- a/lib/reader.js +++ b/lib/reader.js @@ -733,7 +733,11 @@ function decodePages(buffer, opts) { continue; } - if (opts.dictionary) { + // It's possible to have a column chunk where some pages should use + // the dictionary (PLAIN_DICTIONARY for example) and others should + // not (PLAIN for example). + + if (opts.dictionary && pageData.useDictionary) { pageData.values = pageData.values.map(d => opts.dictionary[d]); } @@ -862,7 +866,8 @@ function decodeDataPage(cursor, header, opts) { dlevels: dLevels, rlevels: rLevels, values: values, - count: valueCount + count: valueCount, + useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' }; } @@ -938,7 +943,8 @@ function decodeDataPageV2(cursor, header, opts) { dlevels: dLevels, rlevels: rLevels, values: values, - count: valueCount + count: valueCount, + useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' }; }