Skip to content

Commit

Permalink
Re-implement CSV record reader to skip unparseable lines (apache#14396)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackie-Jiang authored Nov 8, 2024
1 parent 381097d commit d03241a
Show file tree
Hide file tree
Showing 22 changed files with 657 additions and 745 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,14 @@ public class CSVRecordReaderConfig implements RecordReaderConfig {
private Character _escapeCharacter; // Default is null
private String _nullStringValue;
private boolean _skipHeader;
private boolean _skipUnParseableLines = false;
private boolean _ignoreEmptyLines = true;
private boolean _ignoreSurroundingSpaces = true;
private Character _quoteCharacter = '"';
private String _quoteMode;
private String _recordSeparator;

// When set to true, the record reader will stop processing the file if it encounters an error.
private boolean _stopOnError;

public String getFileFormat() {
return _fileFormat;
Expand Down Expand Up @@ -77,14 +78,6 @@ public void setMultiValueDelimiter(char multiValueDelimiter) {
_multiValueDelimiter = multiValueDelimiter;
}

/**
 * Returns the current value of the {@code _skipUnParseableLines} flag.
 *
 * @return the stored flag value
 */
public boolean isSkipUnParseableLines() {
  return this._skipUnParseableLines;
}

/**
 * Stores the given value in the {@code _skipUnParseableLines} flag.
 *
 * @param skipUnParseableLines new value for the flag
 */
public void setSkipUnParseableLines(boolean skipUnParseableLines) {
  this._skipUnParseableLines = skipUnParseableLines;
}

/**
 * Returns the current value of the {@code _multiValueDelimiterEnabled} flag.
 *
 * @return the stored flag value
 */
public boolean isMultiValueDelimiterEnabled() {
  return this._multiValueDelimiterEnabled;
}
Expand Down Expand Up @@ -165,6 +158,14 @@ public void setRecordSeparator(String recordSeparator) {
_recordSeparator = recordSeparator;
}

/**
 * Returns whether the record reader is configured to stop processing the file when it encounters an error.
 *
 * @return the current value of the stop-on-error flag
 */
public boolean isStopOnError() {
  return this._stopOnError;
}

/**
 * Sets whether the record reader should stop processing the file when it encounters an error.
 *
 * @param stopOnError new value for the stop-on-error flag
 */
public void setStopOnError(boolean stopOnError) {
  this._stopOnError = stopOnError;
}

@Override
public String toString() {
return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
id,name
"100","John"
"101","Jane"
"102","Alice"
"103","Bob"
100,John
101,Jane
102,Alice
103,Bob
Empty file.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id|fisrtName|lastName
id|firstName|lastName
100|John|Doe
101|Jane|Doe
102|Jen|Doe
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id,name
100,NULL
,Jane
NULL,NULL
,
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id,name
100,"John
101,Jane"
102,Alice
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,14 @@ id,name
102,Jerry

103,Suzanne
# below line is unparseable by the commons-csv library
# below line is unparseable by the commons-csv library
"104","Yu"s"
"105","Zack"

# below line is multi-line value
"105","Zack
Zack"

# below line is escaped quotes
\"106\",\"Ze\"

107,Zu

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name
John
Jane
Jen

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
firstName , lastName , id
John , Doe,100
Jane, Doe, 101
Jen,Doe ,102
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,name
"100","John"
"101","Jane"s"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id,name
"100","John"
"101","Jane"s"
"102","Alice"

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,14 @@ void init(File dataFile, @Nullable Set<String> fieldsToRead, @Nullable RecordRea

/**
 * Returns <code>true</code> if more records remain to be read.
 * <p>Implementations should not throw exceptions from this method; the caller is not responsible for handling
 * exceptions thrown by it.
 *
 * @return <code>true</code> if more records remain to be read
 */
boolean hasNext();

/**
* Get the next record.
* <p>This method should be called only if {@link #hasNext()} returns <code>true</code>. The caller is responsible
* for handling exceptions from this method and skipping the row if the user wants to continue reading the remaining
* rows.
*/
default GenericRow next()
throws IOException {
Expand All @@ -60,6 +63,8 @@ default GenericRow next()
/**
* Get the next record. Re-use the given row to reduce garbage.
* <p>The passed in row should be cleared before calling this method.
* <p>This method should be called only if {@link #hasNext()} returns <code>true</code>. The caller is responsible
* for handling exceptions from this method and skipping the row if the user wants to continue reading the remaining
* rows.
*
* TODO: Consider clearing the row within the record reader to simplify the caller
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,13 @@ public void closeRecordReader()

// Return true if RecordReader is done processing.
public boolean isRecordReaderDone() {
if (_isRecordReaderInitialized) {
return !_recordReader.hasNext();
if (!_isRecordReaderInitialized) {
return false;
}
if (_isRecordReaderClosed) {
return true;
}
return false;
return !_recordReader.hasNext();
}

// For testing purposes only.
Expand Down

0 comments on commit d03241a

Please sign in to comment.