diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java index 49339ac218c8..c4dc8c167f56 100644 --- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java +++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java @@ -137,11 +137,11 @@ public void init(File dataFile, @Nullable Set fieldsToRead, @Nullable Re } if (_isHeaderProvided) { + _headerMap = parseLineAsHeader(config.getHeader()); + _format = _format.builder().setHeader(_headerMap.keySet().toArray(new String[0])).build(); if (!_useLineIterator) { validateHeaderForDelimiter(delimiter, config.getHeader(), _format); } - _headerMap = parseLineAsHeader(config.getHeader()); - _format = _format.builder().setHeader(_headerMap.keySet().toArray(new String[0])).build(); } if (config.isMultiValueDelimiterEnabled()) { @@ -329,7 +329,12 @@ private void initLineIteratorResources() // read the first line String headerLine = _bufferedReader.readLine(); _headerMap = parseLineAsHeader(headerLine); - _format = _format.builder().setHeader(_headerMap.keySet().toArray(new String[0])).build(); + _format = _format.builder() + // If header isn't provided, the first line would be set as header and the 'skipHeader' property + // is set to false. + .setSkipHeaderRecord(false) + .setHeader(_headerMap.keySet().toArray(new String[0])) + .build(); } _nextLine = _bufferedReader.readLine(); } diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java index e174ced7e50c..d245fb33c0e3 100644 --- a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java +++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java @@ -532,6 +532,46 @@ public void testReadingDataFileWithNoRecords() // Note: The default CSVRecordReader cannot handle unparseable rows } + @Test + public void testReadingDataFileWithNoHeaderAndDataRecordsWithEmptyValues() + throws URISyntaxException, IOException { + URI uri = ClassLoader.getSystemResource("dataFileWithNoHeader2.csv").toURI(); + File dataFile = new File(uri); + + // test using line iterator + CSVRecordReaderConfig readerConfig = new CSVRecordReaderConfig(); + readerConfig.setSkipUnParseableLines(true); + readerConfig.setHeader("key,num0,num1"); + List genericRows = readCSVRecords(dataFile, readerConfig, null, false); + Assert.assertEquals(4, genericRows.size()); + + // test using default CSVRecordReader + readerConfig.setSkipUnParseableLines(false); + genericRows = readCSVRecords(dataFile, readerConfig, null, false); + Assert.assertEquals(4, genericRows.size()); + } + + @Test + public void testReadingDataFileWithValidHeaders() + throws URISyntaxException, IOException { + URI uri = ClassLoader.getSystemResource("dataFileWithValidHeaders.csv").toURI(); + File dataFile = new File(uri); + + // test using line iterator + CSVRecordReaderConfig readerConfig = new CSVRecordReaderConfig(); + readerConfig.setSkipUnParseableLines(true); + // No explicit header is set and attempt to skip the header should be ignored. 1st line would be treated as the + // header line. + readerConfig.setSkipHeader(false); + List genericRows = readCSVRecords(dataFile, readerConfig, null, false); + Assert.assertEquals(4, genericRows.size()); + + // test using default CSVRecordReader + readerConfig.setSkipUnParseableLines(false); + genericRows = readCSVRecords(dataFile, readerConfig, null, false); + Assert.assertEquals(4, genericRows.size()); + } + private List readCSVRecords(File dataFile, CSVRecordReaderConfig readerConfig, GenericRow genericRow, boolean rewind) throws IOException { @@ -543,10 +583,11 @@ private List readCSVRecords(File dataFile, while (recordReader.hasNext()) { if (genericRow != null) { recordReader.next(reuse); + genericRows.add(reuse); } else { - recordReader.next(); + GenericRow nextRow = recordReader.next(); + genericRows.add(nextRow); } - genericRows.add(genericRow); } if (rewind) { diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv new file mode 100644 index 000000000000..e54016ac4c94 --- /dev/null +++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv @@ -0,0 +1,4 @@ +"key00",12.3,8.42 +"key01",,7.1 +"key02",,16.81 +"key03",,7.12 diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv new file mode 100644 index 000000000000..010cab05fa30 --- /dev/null +++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv @@ -0,0 +1,5 @@ +"key","num0","num1" +"key00",12.3,8.42 +"key01",,7.1 +"key02",,16.81 +"key03",,7.12