From 6fdb9dac9864d27ae4ae62ea5d4ce8a3f49c29e2 Mon Sep 17 00:00:00 2001 From: Sarit Vakrat Date: Thu, 18 Jan 2024 21:56:18 +0200 Subject: [PATCH] Update index.ts to support RLE_DICTIONARY (#112) Problem ======= problem statement - when trying to read a parquet file that was generated using V2 parquet and had RLE_DICTIONARY, got an error: invalid encoding: RLE_DICTIONARY #96 Reported issue: https://github.com/LibertyDSNP/parquetjs/issues/96 Solution ======== What I/we did to solve this problem added: export * as RLE_DICTIONARY from './plain_dictionary'; ---------------- I added this line to an existing project in the node modules and it works. Without this line I get an error; with this line added, it passes. --------- Co-authored-by: Wil Wade --- lib/codec/index.ts | 2 +- test/test-files.js | 27 +++++++++++++++--- .../rle/rle-dict-snappy-checksum.parquet | Bin 0 -> 822 bytes ...dict-uncompressed-corrupt-checksum.parquet | Bin 0 -> 814 bytes .../rle/rle_boolean_encoding.parquet | Bin 0 -> 192 bytes 5 files changed, 24 insertions(+), 5 deletions(-) create mode 100644 test/test-files/rle/rle-dict-snappy-checksum.parquet create mode 100644 test/test-files/rle/rle-dict-uncompressed-corrupt-checksum.parquet create mode 100644 test/test-files/rle/rle_boolean_encoding.parquet diff --git a/lib/codec/index.ts b/lib/codec/index.ts index af182ab1..85a1e507 100644 --- a/lib/codec/index.ts +++ b/lib/codec/index.ts @@ -1,5 +1,5 @@ export * as PLAIN from './plain' export * as RLE from './rle' export * as PLAIN_DICTIONARY from './plain_dictionary' - +export * as RLE_DICTIONARY from './plain_dictionary' diff --git a/test/test-files.js b/test/test-files.js index baf36dbf..39a11df6 100644 --- a/test/test-files.js +++ b/test/test-files.js @@ -146,7 +146,7 @@ describe('test-files', function() { const scale = schema.fields["value"].scale; assert.equal(scale, 2); const divider = 10 ** scale; - + for (let i = 0; i < data.length; i++) { const valueToMatch = i + 1; // Decimal values whose 
primitive types are fixed length byte array will @@ -160,11 +160,11 @@ describe('test-files', function() { assert.equal(numericalValue, valueToMatch); } }); - + it('byte_array_decimal.parquet loads', async function () { const schema = await readSchema('byte_array_decimal.parquet'); const data = await readData('byte_array_decimal.parquet'); - + const scale = schema.fields["value"].scale; assert.equal(scale, 2); const divider = 10 ** scale; @@ -173,7 +173,7 @@ describe('test-files', function() { const valueToMatch = i + 1; // Decimal values whose primitive types are byte array will // be returned as raw buffer values. - // For the test data, the actual decimal values and the corresponding buffer lengths + // For the test data, the actual decimal values and the corresponding buffer lengths // are small enough so we can treat the buffer as a positive integer and compare the values. // In reality, the user will need to use a more novel approach to parse the // buffer to an object that can handle large fractional numbers. 
@@ -188,4 +188,23 @@ describe('test-files', function() { assert.equal(decimalValue, valueToMatch); } }); + + describe("RLE", function () { + // Tracked in https://github.com/LibertyDSNP/parquetjs/issues/113 + it.skip('rle_boolean_encoding.parquet loads', async function() { + const data = await readData('rle/rle_boolean_encoding.parquet'); + assert.deepEqual(data[0],{ datatype_boolean: true }); + assert.deepEqual(data[1],{ datatype_boolean: false }); + }); + + it('rle-dict-snappy-checksum.parquet loads', async function() { + const data = await readData('rle/rle-dict-snappy-checksum.parquet'); + assert.deepEqual(data[0],{ binary_field: "c95e263a-f5d4-401f-8107-5ca7146a1f98", long_field: "0" }); + }); + + it('rle-dict-uncompressed-corrupt-checksum.parquet loads', async function() { + const data = await readData('rle/rle-dict-uncompressed-corrupt-checksum.parquet'); + assert.deepEqual(data[0],{ binary_field: "6325c32b-f417-41aa-9e02-9b8601542aff", long_field: "0" }); + }); + }) }); diff --git a/test/test-files/rle/rle-dict-snappy-checksum.parquet b/test/test-files/rle/rle-dict-snappy-checksum.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4c183d89b4a3c5ee0967fac6257feacd3bbf48ec GIT binary patch literal 822 zcma)5OKaOe5FRO_Rf0<#@GM%;!G}b+U>VES%Mc3jDW}H7ITp%VS``(^PLxL<=l+Bq za>y+?l~8DM>Cecef1>C7fQ({Or%;M^VP|%}*?D}+jCS@N)Ub{X{BeH!OW#Hn0YEqD zcwG35_Sze~TSq95Itc<+bfAldBRs&%%P&7y7Y|z(n}kLKPxk!2uoEwKZMW+rcIb2i z+lxftxV~_bLHNJGd_(1iJ}&^^iHa&`WGLayDoHB3Ey9{?h#|!XZnAr2t`fAptK_Hw zYFl8eWwYtKw@E6q7_df$+9aKd;?u&!?$)`<+t$!Tjc?Vd^d}EcC78Wug*CKvbJ^cg zW6=83_0kHwO-uu-K%CV{06W*P!LC(&1=hV{Vr`!5uUh8WzA5ZRAs_3rS&gx3UQ}o$ z{;Ng%M}yZIGWJ`=?+UQ5C2y9_uGB{PHO9t=B0Z5sC!fVK>r6#1jpy>C$PY6K#`sVa zN2l_{&Wq-mERNIJwCQx5ey3}XU+;{@yL$)C*5owJ;%3ixk|6H+!NBbXQP}T$t~c@g d#En8vxI%^#FB-_$eaYK24&_9=Anr#rF%Qvpq`CkA literal 0 HcmV?d00001 diff --git a/test/test-files/rle/rle-dict-uncompressed-corrupt-checksum.parquet 
b/test/test-files/rle/rle-dict-uncompressed-corrupt-checksum.parquet new file mode 100644 index 0000000000000000000000000000000000000000..20e23aaabea75cb4db325a77bdc9aa98b23de49d GIT binary patch literal 814 zcma)5PfO!K6n_a}w@4Q&_=XJR(1QjSn$jebv;`@6^Q2|FM?ogdWQioLW_8t1pl1zhU|t-&{1XM)LUk>Y=)hI>LP6r-|&~ z{UXBTsF9kWWE$8fdAc~etIU#C2nc;I82Vo1B*JYw!j;nLs+Q+;qfXdz1L4Ue`F~Nq zq;s?KxBVLvbv>CDCM7rDh7cF#hCuhxLSO+X-+?Xf+(XIHHGd~=Jz+P|siUf>O48Bhme ztVRmhxq-{v*dH zyVQv{yFz+-@-*hsagiRXqM46ll{H5)SLSDRBJ=%BfjQWh#lev}bn>G9OBFxU@u==L zUBB6K23s5b!RGe&`dW09W^vsM-J~6RVY@3@?P16F0}(_)n22E~kV2|X6b!p67GHV8 N0-z0Fi2FYMlvk5Jqd@=w literal 0 HcmV?d00001 diff --git a/test/test-files/rle/rle_boolean_encoding.parquet b/test/test-files/rle/rle_boolean_encoding.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6a6de0a9422bb42b08139f58a5b18f1df4ed6ed6 GIT binary patch literal 192 zcmWG=3^EjD6EzWyi4pB!6y*UCY@%YKEDQ`CjO@Sb>tz@@|NY)?FW=3<00ECv5)uS# zs*T(8w|y-0W+?vTIfWM}Bg!PH