From 18baa234ad328867df843875a1da7c1e9ab62153 Mon Sep 17 00:00:00 2001 From: ilhan2316 Date: Sun, 6 Oct 2024 10:16:09 -0500 Subject: [PATCH] string file updated and naming of suite --- testing/regress/ecl/key/parquetCompress.xml | 44 +++ testing/regress/ecl/key/parquetCorrupt.xml | 2 + testing/regress/ecl/key/parquetEmpty.xml | 3 + testing/regress/ecl/key/parquetOverwrite.xml | 2 + testing/regress/ecl/key/parquetPartition.xml | 1 + testing/regress/ecl/key/parquetSchema.xml | 7 + testing/regress/ecl/key/parquetSize.xml | 13 + testing/regress/ecl/key/parquetString.xml | 8 + testing/regress/ecl/key/parquetWrite.xml | 304 +++++++++++++++++++ testing/regress/ecl/parquetCompress.ecl | 186 ++++++++++++ testing/regress/ecl/parquetCorrupt.ecl | 36 +++ testing/regress/ecl/parquetEmpty.ecl | 36 +++ testing/regress/ecl/parquetOverwrite.ecl | 23 ++ testing/regress/ecl/parquetPartition.ecl | 69 +++++ testing/regress/ecl/parquetSchema.ecl | 47 +++ testing/regress/ecl/parquetSize.ecl | 40 +++ testing/regress/ecl/parquetString.ecl | 53 ++++ testing/regress/ecl/parquetWrite.ecl | 49 +++ 18 files changed, 923 insertions(+) create mode 100644 testing/regress/ecl/key/parquetCompress.xml create mode 100644 testing/regress/ecl/key/parquetCorrupt.xml create mode 100644 testing/regress/ecl/key/parquetEmpty.xml create mode 100644 testing/regress/ecl/key/parquetOverwrite.xml create mode 100644 testing/regress/ecl/key/parquetPartition.xml create mode 100644 testing/regress/ecl/key/parquetSchema.xml create mode 100644 testing/regress/ecl/key/parquetSize.xml create mode 100644 testing/regress/ecl/key/parquetString.xml create mode 100644 testing/regress/ecl/key/parquetWrite.xml create mode 100644 testing/regress/ecl/parquetCompress.ecl create mode 100644 testing/regress/ecl/parquetCorrupt.ecl create mode 100644 testing/regress/ecl/parquetEmpty.ecl create mode 100644 testing/regress/ecl/parquetOverwrite.ecl create mode 100644 testing/regress/ecl/parquetPartition.ecl create mode 100644 
testing/regress/ecl/parquetSchema.ecl create mode 100644 testing/regress/ecl/parquetSize.ecl create mode 100644 testing/regress/ecl/parquetString.ecl create mode 100644 testing/regress/ecl/parquetWrite.ecl diff --git a/testing/regress/ecl/key/parquetCompress.xml b/testing/regress/ecl/key/parquetCompress.xml new file mode 100644 index 00000000000..94bb5c620c3 --- /dev/null +++ b/testing/regress/ecl/key/parquetCompress.xml @@ -0,0 +1,44 @@ + + 0aaatrue + 1aabfalse + + + 0min-9223372036854775808 + 1max9223372036854775807 + + + 0min2.225073858507201e-308 + 1max1.797693134862316e+308 + + + 0max0 + 1min0 + + + 0emp + 1lonThis is a long string to test the maximum length of a STRING field in HPCC + + + 0quoSTRING WITH "QUOTES" AND SPACES + 1speSTRING WITH NEWLINE AND TAB + + + 0mixASCII and Unicode こんにちは + 1emoEmoji test: 🚀🌟💬😊 + + + 0chi中文测试 + 1mixMix of scripts: АБВ αβγ こんにちは + + + 0bin0123456789ABCDEF + 1all00FF + + + 0shoShort + 1lonThis is a longer varstring to test variable-length behavior + + + 0ascASCII only + 1mixMixed scripts: Latin, Кириллица, 日本語 + diff --git a/testing/regress/ecl/key/parquetCorrupt.xml b/testing/regress/ecl/key/parquetCorrupt.xml new file mode 100644 index 00000000000..f35199613d5 --- /dev/null +++ b/testing/regress/ecl/key/parquetCorrupt.xml @@ -0,0 +1,2 @@ +eclagentError: 0: parquet: Invalid: Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.. 
+ diff --git a/testing/regress/ecl/key/parquetEmpty.xml b/testing/regress/ecl/key/parquetEmpty.xml new file mode 100644 index 00000000000..887f998e548 --- /dev/null +++ b/testing/regress/ecl/key/parquetEmpty.xml @@ -0,0 +1,3 @@ + + 4105813624 + diff --git a/testing/regress/ecl/key/parquetOverwrite.xml b/testing/regress/ecl/key/parquetOverwrite.xml new file mode 100644 index 00000000000..71ed7f9f5fb --- /dev/null +++ b/testing/regress/ecl/key/parquetOverwrite.xml @@ -0,0 +1,2 @@ +eclagentError: 0: parquet: The target file /var/lib/HPCCSystems/mydropzone/SingleRowTest.parquet already exists. To delete the file set the overwrite option to true. + diff --git a/testing/regress/ecl/key/parquetPartition.xml b/testing/regress/ecl/key/parquetPartition.xml new file mode 100644 index 00000000000..a2f3f511b45 --- /dev/null +++ b/testing/regress/ecl/key/parquetPartition.xml @@ -0,0 +1 @@ +eclagentError: 0: parquet: Error processing result row diff --git a/testing/regress/ecl/key/parquetSchema.xml b/testing/regress/ecl/key/parquetSchema.xml new file mode 100644 index 00000000000..b8d2f924ad8 --- /dev/null +++ b/testing/regress/ecl/key/parquetSchema.xml @@ -0,0 +1,7 @@ + + + + Alice50000.51 + Bob60000.752 + + diff --git a/testing/regress/ecl/key/parquetSize.xml b/testing/regress/ecl/key/parquetSize.xml new file mode 100644 index 00000000000..1b2940ea6e3 --- /dev/null +++ b/testing/regress/ecl/key/parquetSize.xml @@ -0,0 +1,13 @@ + + 1Alice10.5yes + 2Bob20.75no + 3Charlie15.25yes + 1A10.01 + + + 1A10.01 + 3Charlie15.25yes + 2Bob20.75no + 1A10.01 + 1Alice10.5yes + diff --git a/testing/regress/ecl/key/parquetString.xml b/testing/regress/ecl/key/parquetString.xml new file mode 100644 index 00000000000..9a8e2a60d3a --- /dev/null +++ b/testing/regress/ecl/key/parquetString.xml @@ -0,0 +1,8 @@ + + + + + + + All records match + diff --git a/testing/regress/ecl/key/parquetWrite.xml b/testing/regress/ecl/key/parquetWrite.xml new file mode 100644 index 00000000000..a04a4cfda02 --- /dev/null 
+++ b/testing/regress/ecl/key/parquetWrite.xml @@ -0,0 +1,304 @@ + + 1B10false + 2C10true + 3D10false + 4E10true + 5F10false + 6G10true + 7H10false + 8I10true + 9J10false + 10A10true + 11B10false + 12C10true + 13D10false + 14E10true + 15F10false + 16G10true + 17H10false + 18I10true + 19J10false + 20A10true + 21B10false + 22C10true + 23D10false + 24E10true + 25F10false + 26G10true + 27H10false + 28I10true + 29J10false + 30A10true + 31B10false + 32C10true + 33D10false + 34E10true + 35F10false + 36G10true + 37H10false + 38I10true + 39J10false + 40A10true + 41B10false + 42C10true + 43D10false + 44E10true + 45F10false + 46G10true + 47H10false + 48I10true + 49J10false + 50A10true + + + 1B20false + 2C20true + 3D20false + 4E20true + 5F20false + 6G20true + 7H20false + 8I20true + 9J20false + 10A20true + 11B20false + 12C20true + 13D20false + 14E20true + 15F20false + 16G20true + 17H20false + 18I20true + 19J20false + 20A20true + 21B20false + 22C20true + 23D20false + 24E20true + 25F20false + 26G20true + 27H20false + 28I20true + 29J20false + 30A20true + 31B20false + 32C20true + 33D20false + 34E20true + 35F20false + 36G20true + 37H20false + 38I20true + 39J20false + 40A20true + 41B20false + 42C20true + 43D20false + 44E20true + 45F20false + 46G20true + 47H20false + 48I20true + 49J20false + 50A20true + 51B20false + 52C20true + 53D20false + 54E20true + 55F20false + 56G20true + 57H20false + 58I20true + 59J20false + 60A20true + 61B20false + 62C20true + 63D20false + 64E20true + 65F20false + 66G20true + 67H20false + 68I20true + 69J20false + 70A20true + 71B20false + 72C20true + 73D20false + 74E20true + 75F20false + 76G20true + 77H20false + 78I20true + 79J20false + 80A20true + 81B20false + 82C20true + 83D20false + 84E20true + 85F20false + 86G20true + 87H20false + 88I20true + 89J20false + 90A20true + 91B20false + 92C20true + 93D20false + 94E20true + 95F20false + 96G20true + 97H20false + 98I20true + 99J20false + 100A20true + 101B20false + 102C20true + 103D20false + 104E20true + 105F20false + 
106G20true + 107H20false + 108I20true + 109J20false + 110A20true + 111B20false + 112C20true + 113D20false + 114E20true + 115F20false + 116G20true + 117H20false + 118I20true + 119J20false + 120A20true + 121B20false + 122C20true + 123D20false + 124E20true + 125F20false + 126G20true + 127H20false + 128I20true + 129J20false + 130A20true + 131B20false + 132C20true + 133D20false + 134E20true + 135F20false + 136G20true + 137H20false + 138I20true + 139J20false + 140A20true + 141B20false + 142C20true + 143D20false + 144E20true + 145F20false + 146G20true + 147H20false + 148I20true + 149J20false + 150A20true + 151B20false + 152C20true + 153D20false + 154E20true + 155F20false + 156G20true + 157H20false + 158I20true + 159J20false + 160A20true + 161B20false + 162C20true + 163D20false + 164E20true + 165F20false + 166G20true + 167H20false + 168I20true + 169J20false + 170A20true + 171B20false + 172C20true + 173D20false + 174E20true + 175F20false + 176G20true + 177H20false + 178I20true + 179J20false + 180A20true + 181B20false + 182C20true + 183D20false + 184E20true + 185F20false + 186G20true + 187H20false + 188I20true + 189J20false + 190A20true + 191B20false + 192C20true + 193D20false + 194E20true + 195F20false + 196G20true + 197H20false + 198I20true + 199J20false + 200A20true + 201B20false + 202C20true + 203D20false + 204E20true + 205F20false + 206G20true + 207H20false + 208I20true + 209J20false + 210A20true + 211B20false + 212C20true + 213D20false + 214E20true + 215F20false + 216G20true + 217H20false + 218I20true + 219J20false + 220A20true + 221B20false + 222C20true + 223D20false + 224E20true + 225F20false + 226G20true + 227H20false + 228I20true + 229J20false + 230A20true + 231B20false + 232C20true + 233D20false + 234E20true + 235F20false + 236G20true + 237H20false + 238I20true + 239J20false + 240A20true + 241B20false + 242C20true + 243D20false + 244E20true + 245F20false + 246G20true + 247H20false + 248I20true + 249J20false + 250A20true + diff --git 
a/testing/regress/ecl/parquetCompress.ecl b/testing/regress/ecl/parquetCompress.ecl
new file mode 100644
index 00000000000..f9e1b891d0c
--- /dev/null
+++ b/testing/regress/ecl/parquetCompress.ecl
@@ -0,0 +1,196 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+//version compressionType='UNCOMPRESSED'
+//version compressionType='Snappy'
+//version compressionType='GZip'
+//version compressionType='Brotli'
+//version compressionType='LZ4'
+//version compressionType='ZSTD'
+
+// Writes one small dataset per ECL value type with ParquetIO.Write, using the
+// compression codec selected by the //version lines above, then reads each
+// file back with ParquetIO.Read and outputs it.
+
+IMPORT ^ AS root;
+IMPORT Std;
+IMPORT Parquet;
+
+// Codec for this run; falls back to 'SNAPPY' when no //version value is defined.
+compressionType := #IFDEFINED(root.compressionType, 'SNAPPY');
+
+// Record layouts, one per value type. testname is STRING3, so the longer labels
+// in the inline datasets below are truncated to their first three characters.
+BooleanRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    BOOLEAN value;
+END;
+
+IntegerRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    INTEGER value;
+END;
+
+RealRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    REAL value;
+END;
+
+DecimalRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    DECIMAL10_2 value;
+END;
+
+StringRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    STRING value;
+END;
+
+// QSTRING (not STRING): the expected output in key/parquetCompress.xml is upper-cased.
+QStringRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    QSTRING value;
+END;
+
+UnicodeRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    UNICODE value;
+END;
+
+UTF8Rec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    UTF8 value;
+END;
+
+DataRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    DATA value;
+END;
+
+VarstringRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    VARSTRING value;
+END;
+
+VarunicodeRec := RECORD
+    UNSIGNED testid;
+    STRING3 testname;
+    VARUNICODE value;
+END;
+
+BooleanData := DATASET([
+    {0, 'aaa', TRUE},
+    {1, 'aab', FALSE}
+], BooleanRec);
+
+IntegerData := DATASET([
+    {0, 'min', -9223372036854775808}, // Minimum value of a signed 64-bit integer
+    {1, 'max', 9223372036854775807}   // Maximum value of a signed 64-bit integer
+], IntegerRec);
+
+RealData := DATASET([
+    {0, 'min', 2.2250738585072014e-308}, // Smallest positive normalized double-precision float
+    {1, 'max', 1.7976931348623157e+308}  // Largest finite double-precision float
+], RealRec);
+
+// Both literals are outside the range of DECIMAL10_2; key/parquetCompress.xml
+// expects 0 for each row.
+DecimalData := DATASET([
+    {0, 'max', 9999999999999999999.99},
+    {1, 'min', -9999999999999999999.99}
+], DecimalRec);
+
+StringData := DATASET([
+    {0, 'empty', ''},
+    {1, 'long', 'This is a long string to test the maximum length of a STRING field in HPCC'}
+], StringRec);
+
+QStringData := DATASET([
+    {0, 'quoted', 'String with "quotes" and spaces '},
+    {1, 'special', 'String with \n newline and \t tab'}
+], QStringRec);
+
+UnicodeData := DATASET([
+    {0, 'mixed', U'ASCII and Unicode こんにちは'},
+    {1, 'emoji', U'Emoji test: 🚀🌟💬😊'}
+], UnicodeRec);
+
+UTF8Data := DATASET([
+    {0, 'chinese', U'中文测试'},
+    {1, 'mixed', U'Mix of scripts: АБВ αβγ こんにちは'}
+], UTF8Rec);
+
+DataData := DATASET([
+    {0, 'binary', X'0123456789ABCDEF'},
+    {1, 'allbits', X'00FF'} // A 0x00 byte followed by a 0xFF byte
+], DataRec);
+
+VarstringData := DATASET([
+    {0, 'short', 'Short'},
+    {1, 'long', 'This is a longer varstring to test variable-length behavior'}
+], VarstringRec);
+
+VarunicodeData := DATASET([
+    {0, 'ascii', U'ASCII only'},
+    {1, 'mixed', U'Mixed scripts: Latin, Кириллица, 日本語'}
+], VarunicodeRec);
+
+// Write each dataset (overwriting any existing file) with the selected codec
+ParquetIO.Write(BooleanData, '/var/lib/HPCCSystems/mydropzone/Boolean.parquet', TRUE, compressionType);
+ParquetIO.Write(IntegerData, '/var/lib/HPCCSystems/mydropzone/Integer.parquet', TRUE, compressionType);
+ParquetIO.Write(RealData, '/var/lib/HPCCSystems/mydropzone/Real.parquet', TRUE, compressionType);
+ParquetIO.Write(DecimalData, '/var/lib/HPCCSystems/mydropzone/Decimal.parquet', TRUE, compressionType);
+ParquetIO.Write(StringData, '/var/lib/HPCCSystems/mydropzone/String.parquet', TRUE, compressionType);
+ParquetIO.Write(QStringData, '/var/lib/HPCCSystems/mydropzone/QString.parquet', TRUE, compressionType);
+ParquetIO.Write(UnicodeData, '/var/lib/HPCCSystems/mydropzone/Unicode.parquet', TRUE, compressionType);
+ParquetIO.Write(UTF8Data, '/var/lib/HPCCSystems/mydropzone/UTF8.parquet', TRUE, compressionType);
+ParquetIO.Write(DataData, '/var/lib/HPCCSystems/mydropzone/Data.parquet', TRUE, compressionType);
+ParquetIO.Write(VarstringData, '/var/lib/HPCCSystems/mydropzone/Varstring.parquet', TRUE, compressionType);
+ParquetIO.Write(VarunicodeData, '/var/lib/HPCCSystems/mydropzone/Varunicode.parquet', TRUE, compressionType);
+
+// Read each file back using the layout it was written with
+BooleanDataRead := ParquetIO.Read(BooleanRec, '/var/lib/HPCCSystems/mydropzone/Boolean.parquet');
+IntegerDataRead := ParquetIO.Read(IntegerRec, '/var/lib/HPCCSystems/mydropzone/Integer.parquet');
+RealDataRead := ParquetIO.Read(RealRec, '/var/lib/HPCCSystems/mydropzone/Real.parquet');
+DecimalDataRead := ParquetIO.Read(DecimalRec, '/var/lib/HPCCSystems/mydropzone/Decimal.parquet');
+StringDataRead := ParquetIO.Read(StringRec, '/var/lib/HPCCSystems/mydropzone/String.parquet');
+QStringDataRead := ParquetIO.Read(QStringRec, '/var/lib/HPCCSystems/mydropzone/QString.parquet');
+UnicodeDataRead := ParquetIO.Read(UnicodeRec, '/var/lib/HPCCSystems/mydropzone/Unicode.parquet');
+UTF8DataRead := ParquetIO.Read(UTF8Rec, '/var/lib/HPCCSystems/mydropzone/UTF8.parquet');
+DataDataRead := ParquetIO.Read(DataRec, '/var/lib/HPCCSystems/mydropzone/Data.parquet');
+VarstringDataRead := ParquetIO.Read(VarstringRec, '/var/lib/HPCCSystems/mydropzone/Varstring.parquet');
+VarunicodeDataRead := ParquetIO.Read(VarunicodeRec, '/var/lib/HPCCSystems/mydropzone/Varunicode.parquet');
+
+// Output the read-back datasets in the order the key file lists them
+OUTPUT(BooleanDataRead, NAMED('BooleanData'));
+OUTPUT(IntegerDataRead, NAMED('IntegerData'));
+OUTPUT(RealDataRead, NAMED('RealData'));
+OUTPUT(DecimalDataRead, NAMED('DecimalData'));
+OUTPUT(StringDataRead, NAMED('StringData'));
+OUTPUT(QStringDataRead, NAMED('QStringData'));
+OUTPUT(UnicodeDataRead, NAMED('UnicodeData'));
+OUTPUT(UTF8DataRead, NAMED('UTF8Data'));
+OUTPUT(DataDataRead, NAMED('DataData'));
+OUTPUT(VarstringDataRead, NAMED('VarstringData'));
+OUTPUT(VarunicodeDataRead, NAMED('VarunicodeData'));
diff --git a/testing/regress/ecl/parquetCorrupt.ecl b/testing/regress/ecl/parquetCorrupt.ecl
new file mode 100644
index 00000000000..7e51e5a5be4
--- /dev/null
+++ b/testing/regress/ecl/parquetCorrupt.ecl
@@ -0,0 +1,36 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+//fail
+
+//Reads a Parquet file that is expected to be corrupt; the job is expected to fail (see //fail above).
+//If the read succeeds but returns no rows, a single placeholder record is output instead.
+
+IMPORT Parquet, Std.Uni;
+
+// Layout the file is read with
+CorruptFileLayout := RECORD
+    UNSIGNED4 index;
+    STRING name;
+    STRING director;
+END;
+
+corruptRows := ParquetIO.Read(CorruptFileLayout, '/var/lib/HPCCSystems/mydropzone/corrupt.parquet');
+
+// Placeholder emitted when the read yields no rows
+placeholderRow := DATASET([{0, 'Corrupt Parquet File', ''}], CorruptFileLayout);
+
+rowsOrPlaceholder := IF(EXISTS(corruptRows), corruptRows, placeholderRow);
+
+OUTPUT(rowsOrPlaceholder);
diff --git a/testing/regress/ecl/parquetEmpty.ecl b/testing/regress/ecl/parquetEmpty.ecl
new file mode 100644
index 00000000000..d45c3d7f6f9
--- /dev/null
+++ b/testing/regress/ecl/parquetEmpty.ecl
@@ -0,0 +1,36 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+//fail
+
+//This ECL code reads an empty Parquet file, handling the case where it might be empty
+//by outputting either the file contents or a single informative record if the file is empty.
+
+IMPORT Parquet, Std.Uni;
+
+// Layout the file is read with
+EmptyFileLayout := RECORD
+    UNSIGNED4 index;
+    STRING name;
+    STRING director;
+END;
+
+emptyRows := ParquetIO.Read(EmptyFileLayout, '/var/lib/HPCCSystems/mydropzone/empty.parquet');
+
+// Placeholder emitted when the read yields no rows
+placeholderRow := DATASET([{0, 'Empty Parquet File', ''}], EmptyFileLayout);
+
+rowsOrPlaceholder := IF(EXISTS(emptyRows), emptyRows, placeholderRow);
+
+OUTPUT(rowsOrPlaceholder);
diff --git a/testing/regress/ecl/parquetOverwrite.ecl b/testing/regress/ecl/parquetOverwrite.ecl
new file mode 100644
index 00000000000..856c24f77ec
--- /dev/null
+++ b/testing/regress/ecl/parquetOverwrite.ecl
@@ -0,0 +1,23 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+//fail
+
+IMPORT Parquet;
+
+// Writes the same file twice in sequence without the overwrite flag; the expected result is the 'already exists' error.
+oneRow := DATASET([{1, 'SingleRow', TRUE}], {UNSIGNED id, STRING name, BOOLEAN flag});
+writeOnce := ParquetIO.Write(oneRow, '/var/lib/HPCCSystems/mydropzone/SingleRowTest.parquet');
+
+SEQUENTIAL(writeOnce, writeOnce);
diff --git a/testing/regress/ecl/parquetPartition.ecl b/testing/regress/ecl/parquetPartition.ecl
new file mode 100644
index 00000000000..f9a4400e2a8
--- /dev/null
+++ b/testing/regress/ecl/parquetPartition.ecl
@@ -0,0 +1,69 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+
+IMPORT Std;
+IMPORT Parquet;
+
+// Layout shared by the source rows and both read-backs
+datasetRecordLayout := RECORD
+    INTEGER id;
+    STRING name;
+    INTEGER age;
+    STRING city;
+END;
+
+// Three rows with distinct city and age values
+smallData := DATASET([
+    {1, 'Alice', 30, 'New York'},
+    {2, 'Bob', 25, 'Los Angeles'},
+    {3, 'Charlie', 40, 'Chicago'}
+], datasetRecordLayout);
+
+// Write options
+overwriteOption := TRUE;
+rowSize := 1;
+
+// Drop zone joined with an explicit '/', as parquetString.ecl does (NOTE(review): confirm GetDefaultDropZone() has no trailing separator)
+basePath := Std.File.GetDefaultDropZone() + '/regress/parquet/';
+
+// One target directory per partitioning scheme, used by both the write and the read-back
+hiveFilePath := basePath + 'hive_partitioned/';
+dirFilePath := basePath + 'dir_partitioned/';
+
+// Write out the dataset with Hive partitioning on CITY
+ParquetIO.HivePartition.Write(
+    smallData,
+    rowSize,             // Number of rows per file
+    hiveFilePath,
+    overwriteOption,     // Overwrite existing files
+    'city'               // Partition key
+);
+
+// Write out the dataset with Directory partitioning on AGE
+ParquetIO.DirectoryPartition.Write(
+    smallData,           // Data to write
+    rowSize,             // Number of rows per file
+    dirFilePath,
+    overwriteOption,     // Overwrite existing files
+    'age'                // Partition key
+);
+
+// Read back the partitioned data
+readBackHiveData := ParquetIO.HivePartition.Read(datasetRecordLayout, hiveFilePath);
+readBackDirData := ParquetIO.DirectoryPartition.Read(datasetRecordLayout, dirFilePath, 'age');
+
+// Output the entire dataset for verification
+OUTPUT(readBackHiveData, NAMED('HivePartitionedSampleData'));
+OUTPUT(readBackDirData, NAMED('DirPartitionedSampleData'));
\ No newline at end of file
diff --git a/testing/regress/ecl/parquetSchema.ecl b/testing/regress/ecl/parquetSchema.ecl
new file mode 100644
index 00000000000..1e6284fb101
--- /dev/null
+++ b/testing/regress/ecl/parquetSchema.ecl
@@ -0,0 +1,47 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+//fail
+
+IMPORT Std;
+IMPORT Parquet;
+
+// Field order used when the file is written
+WriteLayout := RECORD
+    INTEGER id;
+    STRING name;
+    REAL salary;
+END;
+
+// Same fields in a different order, used when the file is read back
+ReadLayout := RECORD
+    STRING name;
+    REAL salary;
+    INTEGER id;
+END;
+
+targetFile := '/var/lib/HPCCSystems/mydropzone/reorder_test.parquet';
+
+sourceRows := DATASET([
+    { 1, 'Alice', 50000.50 },
+    { 2, 'Bob', 60000.75 }
+], WriteLayout);
+
+// Write with the first field order (overwriting any existing file) ...
+ParquetIO.Write(sourceRows, targetFile, TRUE);
+
+// ... and read the same file back with the reordered layout
+reorderedRows := ParquetIO.Read(ReadLayout, targetFile);
+
+OUTPUT(reorderedRows, NAMED('ReadData'));
diff --git a/testing/regress/ecl/parquetSize.ecl b/testing/regress/ecl/parquetSize.ecl
new file mode 100644
index 00000000000..4f1bb533b93
--- /dev/null
+++ b/testing/regress/ecl/parquetSize.ecl
@@ -0,0 +1,40 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+##############################################################################*/
+
+//class=parquet
+
+IMPORT Parquet;
+
+// Layout both input files are read with
+SizeTestLayout := RECORD
+    UNSIGNED4 id;
+    STRING name;
+    REAL8 price;
+    STRING isactive;
+END;
+
+// Read the two existing input files; this test does not write them itself
+singleRows := ParquetIO.Read(SizeTestLayout, '/var/lib/HPCCSystems/mydropzone/single.parquet');
+multiRows := ParquetIO.Read(SizeTestLayout, '/var/lib/HPCCSystems/mydropzone/multi.parquet');
+
+// Emit both results, single file first
+emitSingle := OUTPUT(singleRows, NAMED('singleDataset'));
+emitMulti := OUTPUT(multiRows, NAMED('multiDataset'));
+
+SEQUENTIAL(
+    emitSingle,
+    emitMulti
+);
\ No newline at end of file
diff --git a/testing/regress/ecl/parquetString.ecl b/testing/regress/ecl/parquetString.ecl
new file mode 100644
index 00000000000..359e238dd0a
--- /dev/null
+++ b/testing/regress/ecl/parquetString.ecl
@@ -0,0 +1,53 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+
+IMPORT Std;
+IMPORT Parquet;
+
+// Three fixed-length string fields of increasing size
+FixedStringLayout := RECORD
+    STRING10 s1;
+    STRING20 s2;
+    STRING30 s3;
+END;
+
+writtenRows := DATASET([
+    {'Hello', 'World', 'Test Data 1'},
+    {'HPCC', 'Systems', 'Test Data 2'},
+    {'Parquet', 'I/O', 'Test Data 3'}
+], FixedStringLayout);
+
+targetFile := Std.File.GetDefaultDropZone() + '/regress/string_test.parquet';
+
+ParquetIO.Write(writtenRows, targetFile, TRUE);
+readBackRows := ParquetIO.Read(FixedStringLayout, targetFile);
+
+// Each output field is blank when the two sides agree, otherwise it names the field
+FixedStringLayout flagDifferences(FixedStringLayout written, FixedStringLayout readBack) := TRANSFORM
+    SELF.s1 := IF(written.s1 = readBack.s1, '', 'Mismatch in s1');
+    SELF.s2 := IF(written.s2 = readBack.s2, '', 'Mismatch in s2');
+    SELF.s3 := IF(written.s3 = readBack.s3, '', 'Mismatch in s3');
+END;
+
+// FULL OUTER keeps rows that exist on only one side, so they are flagged as well
+comparison := JOIN(writtenRows, readBackRows,
+                   LEFT.s1 = RIGHT.s1 AND LEFT.s2 = RIGHT.s2 AND LEFT.s3 = RIGHT.s3,
+                   flagDifferences(LEFT, RIGHT), FULL OUTER);
+
+OUTPUT(comparison, NAMED('ComparisonResult'));
+
+allMatch := NOT EXISTS(comparison(s1 != '' OR s2 != '' OR s3 != ''));
+OUTPUT(IF(allMatch, 'All records match', 'Mismatches found'), NAMED('TestResult'));
+
diff --git a/testing/regress/ecl/parquetWrite.ecl b/testing/regress/ecl/parquetWrite.ecl
new file mode 100644
index 00000000000..237e08b3521
--- /dev/null
+++ b/testing/regress/ecl/parquetWrite.ecl
@@ -0,0 +1,49 @@
+/*##############################################################################
+    HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+//class=parquet
+
+IMPORT Parquet;
+IMPORT Std;
+
+SimpleRecord := RECORD
+    INTEGER id;
+    STRING name;
+    DECIMAL8 price;
+    BOOLEAN isActive;
+END;
+
+// Letter for an id: entry (id % 10 + 1) of 'A'..'J', so id 1 gives 'B' and id 10 gives 'A'
+STRING letterFor(INTEGER id) := CHOOSE(id % 10 + 1, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J');
+
+// 50 rows priced at 10.00; isActive is TRUE for even ids
+fiftyRows := DATASET(50, TRANSFORM(SimpleRecord,
+    SELF.id := COUNTER,
+    SELF.name := letterFor(COUNTER),
+    SELF.price := 10.00,
+    SELF.isActive := COUNTER % 2 = 0
+));
+
+// 250 rows priced at 20.00, otherwise generated the same way
+manyRows := DATASET(250, TRANSFORM(SimpleRecord,
+    SELF.id := COUNTER, SELF.name := letterFor(COUNTER),
+    SELF.price := 20.00, SELF.isActive := COUNTER % 2 = 0
+));
+
+ParquetIO.Write(fiftyRows, '/var/lib/HPCCSystems/mydropzone/small1.parquet', TRUE);
+ParquetIO.Write(manyRows, '/var/lib/HPCCSystems/mydropzone/medium1.parquet', TRUE);
+
+fiftyRowsBack := ParquetIO.Read(SimpleRecord, '/var/lib/HPCCSystems/mydropzone/small1.parquet');
+manyRowsBack := ParquetIO.Read(SimpleRecord, '/var/lib/HPCCSystems/mydropzone/medium1.parquet');
+OUTPUT(fiftyRowsBack, NAMED('SmallDataset'));
+OUTPUT(manyRowsBack, NAMED('MediumDataset'));