From b16ef63abbcde3cbcd131d0673cca9150c1c8a21 Mon Sep 17 00:00:00 2001 From: Robert Hague Date: Sun, 2 Apr 2023 18:14:17 +0200 Subject: [PATCH] Expose GetFieldSpan on IParser Co-authored-by: JanEggers --- src/CsvHelper/CsvParser.cs | 230 +++++++++--------- src/CsvHelper/FieldCache.cs | 27 +- src/CsvHelper/IParser.cs | 25 +- tests/CsvHelper.Tests/CsvParserTests.cs | 13 + tests/CsvHelper.Tests/Mocks/ParserMock.cs | 4 + tests/CsvHelper.Tests/Parsing/BadDataTests.cs | 30 +++ .../Parsing/FieldCacheTests.cs | 15 +- 7 files changed, 204 insertions(+), 140 deletions(-) diff --git a/src/CsvHelper/CsvParser.cs b/src/CsvHelper/CsvParser.cs index 99bdbcbea..8fdfccec1 100644 --- a/src/CsvHelper/CsvParser.cs +++ b/src/CsvHelper/CsvParser.cs @@ -61,12 +61,12 @@ public class CsvParser : IParser, IDisposable private bool inQuotes; private bool inEscape; private Field[] fields; - private string[] processedFields; + private Memory[] processedFields; private int fieldsPosition; private bool disposed; private int quoteCount; private char[] processFieldBuffer; - private int processFieldBufferSize; + private int processFieldBufferUsedLength; private ParserState state; private int delimiterPosition = 1; private int newLinePosition = 1; @@ -76,6 +76,11 @@ public class CsvParser : IParser, IDisposable private bool isRecordProcessed; private string[] record; + /// + /// Gets a span representing the currently unused space in . + /// + private Span AvailableProcessFieldBuffer => processFieldBuffer.AsSpan(processFieldBufferUsedLength); + /// public long CharCount => charCount; @@ -117,6 +122,15 @@ public string[] Record /// public string RawRecord => new string(buffer, rowStartPosition, bufferPosition - rowStartPosition); + /// + /// + /// WARNING: This is an advanced API. The underlying memory of the returned span may + /// change upon subsequent calls to , resulting in undefined behavior. + /// If this is unclear or you want a consistent view of the raw record at this + /// point in time, use instead. + /// + public ReadOnlySpan RawRecordSpan => buffer.AsSpan(rowStartPosition, bufferPosition - rowStartPosition); + /// public int Count => fieldsPosition; @@ -133,27 +147,14 @@ public string[] Record public IParserConfiguration Configuration => configuration; /// + /// is negative or greater than or equal to . public string this[int index] { get { - if (isProcessingField) - { - var message = - $"You can't access {nameof(IParser)}[int] or {nameof(IParser)}.{nameof(IParser.Record)} inside of the {nameof(BadDataFound)} callback. " + - $"Use {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.Field)} and {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.RawRecord)} instead." - ; - - throw new ParserException(Context, message); - } - - isProcessingField = true; + ReadOnlySpan fieldSpan = GetFieldSpan(index); - var field = GetField(index); - - isProcessingField = false; - - return field; + return cacheFields ? fieldCache.GetField(fieldSpan) : fieldSpan.ToString(); } } @@ -200,15 +201,14 @@ public CsvParser(TextReader reader, IParserConfiguration configuration, bool lea newLine = configuration.NewLine; newLineFirstChar = configuration.NewLine[0]; mode = configuration.Mode; - processFieldBufferSize = configuration.ProcessFieldBufferSize; quote = configuration.Quote; whiteSpaceChars = configuration.WhiteSpaceChars; trimOptions = configuration.TrimOptions; buffer = new char[bufferSize]; - processFieldBuffer = new char[processFieldBufferSize]; + processFieldBuffer = new char[configuration.ProcessFieldBufferSize]; fields = new Field[128]; - processedFields = new string[128]; + processedFields = new Memory[128]; } /// @@ -219,6 +219,7 @@ public bool Read() fieldStartPosition = rowStartPosition; fieldsPosition = 0; quoteCount = 0; + processFieldBufferUsedLength = 0; row++; rawRow++; var c = '\0'; @@ -254,6 +255,7 @@ public async Task ReadAsync() fieldStartPosition = rowStartPosition; fieldsPosition = 0; quoteCount = 0; + processFieldBufferUsedLength = 0; row++; rawRow++; var c = '\0'; @@ -785,30 +787,55 @@ private async Task FillBufferAsync() return true; } - private string GetField(int index) + /// + /// + /// WARNING: This is an advanced API. The underlying memory of the returned span may + /// change upon subsequent calls to , resulting in undefined behavior. + /// If this is unclear or you want a consistent view of the field at this point + /// in time, use instead. + /// + /// is negative or greater than or equal to . + public ReadOnlySpan GetFieldSpan(int index) { - if (index > fieldsPosition) + if (index >= fieldsPosition) { - throw new IndexOutOfRangeException(); + throw new IndexOutOfRangeException($"Index was out of range. Must be non-negative and less than {nameof(Count)}"); } ref var field = ref fields[index]; if (field.Length == 0) { - return string.Empty; + return ReadOnlySpan.Empty; } if (field.IsProcessed) { - return processedFields[index]; + return processedFields[index].Span; } + if (isProcessingField) + { + // This check is to guard against stack overflow in the case that + // someone tries to access a (bad and unprocessed) field from + // within BadDataFound, and that access calls BadDataFound, + // which then tries to access... (and so on until the process crashes). + + var message = + $"You can't access {nameof(IParser)}[int], {nameof(IParser)}.{nameof(IParser.GetFieldSpan)} or " + + $"{nameof(IParser)}.{nameof(IParser.Record)} inside of the {nameof(BadDataFound)} callback. " + + $"Use {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.Field)} and {nameof(BadDataFoundArgs)}.{nameof(BadDataFoundArgs.RawRecord)} instead."; + + throw new ParserException(Context, message); + } + + isProcessingField = true; + var start = field.Start + rowStartPosition; var length = field.Length; var quoteCount = field.QuoteCount; - ProcessedField processedField; + Memory processedField; switch (mode) { case CsvMode.RFC4180: @@ -826,14 +853,12 @@ private string GetField(int index) throw new InvalidOperationException($"ParseMode '{mode}' is not handled."); } - var value = cacheFields - ? fieldCache.GetField(processedField.Buffer, processedField.Start, processedField.Length) - : new string(processedField.Buffer, processedField.Start, processedField.Length); - - processedFields[index] = value; + processedFields[index] = processedField; field.IsProcessed = true; - return value; + isProcessingField = false; + + return processedField.Span; } /// @@ -843,7 +868,7 @@ private string GetField(int index) /// The length of the field. /// The number of counted quotes. /// The processed field. - protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCount) + protected Memory ProcessRFC4180Field(int start, int length, int quoteCount) { var newStart = start; var newLength = length; @@ -857,8 +882,7 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou { // Not quoted. // No processing needed. - - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); } if (buffer[newStart] != quote || buffer[newStart + newLength - 1] != quote || newLength == 1 && buffer[newStart] == quote) @@ -891,19 +915,10 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou { // The only quotes are the ends of the field. // No more processing is needed. - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); } - if (newLength > processFieldBuffer.Length) - { - // Make sure the field processing buffer is large engough. - while (newLength > processFieldBufferSize) - { - processFieldBufferSize *= 2; - } - - processFieldBuffer = new char[processFieldBufferSize]; - } + EnsureAvailableProcessFieldBuffer(newLength); // Remove escapes. var inEscape = false; @@ -923,11 +938,14 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou continue; } - processFieldBuffer[position] = c; + AvailableProcessFieldBuffer[position] = c; position++; } - return new ProcessedField(0, position, processFieldBuffer); + int fieldStartIndex = processFieldBufferUsedLength; + processFieldBufferUsedLength += position; + + return processFieldBuffer.AsMemory(fieldStartIndex, position); } /// @@ -936,12 +954,10 @@ protected ProcessedField ProcessRFC4180Field(int start, int length, int quoteCou /// The start index of the field. /// The length of the field. /// The processed field. - protected ProcessedField ProcessRFC4180BadField(int start, int length) + protected Memory ProcessRFC4180BadField(int start, int length) { // If field is already known to be bad, different rules can be applied. - - var args = new BadDataFoundArgs(new string(buffer, start, length), RawRecord, Context); - badDataFound?.Invoke(args); + badDataFound?.Invoke(new BadDataFoundArgs(new string(buffer, start, length), RawRecord, Context)); var newStart = start; var newLength = length; @@ -954,19 +970,10 @@ protected ProcessedField ProcessRFC4180BadField(int start, int length) if (buffer[newStart] != quote) { // If the field doesn't start with a quote, don't process it. - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); } - if (newLength > processFieldBuffer.Length) - { - // Make sure the field processing buffer is large engough. - while (newLength > processFieldBufferSize) - { - processFieldBufferSize *= 2; - } - - processFieldBuffer = new char[processFieldBufferSize]; - } + EnsureAvailableProcessFieldBuffer(newLength); // Remove escapes until the last quote is found. var inEscape = false; @@ -1007,11 +1014,14 @@ protected ProcessedField ProcessRFC4180BadField(int start, int length) continue; } - processFieldBuffer[position] = c; + AvailableProcessFieldBuffer[position] = c; position++; } - return new ProcessedField(0, position, processFieldBuffer); + int fieldStartIndex = processFieldBufferUsedLength; + processFieldBufferUsedLength += position; + + return processFieldBuffer.AsMemory(fieldStartIndex, position); } /// @@ -1020,7 +1030,7 @@ protected ProcessedField ProcessRFC4180BadField(int start, int length) /// The start index of the field. /// The length of the field. /// The processed field. - protected ProcessedField ProcessEscapeField(int start, int length) + protected Memory ProcessEscapeField(int start, int length) { var newStart = start; var newLength = length; @@ -1030,16 +1040,7 @@ protected ProcessedField ProcessEscapeField(int start, int length) ArrayHelper.Trim(buffer, ref newStart, ref newLength, whiteSpaceChars); } - if (newLength > processFieldBuffer.Length) - { - // Make sure the field processing buffer is large engough. - while (newLength > processFieldBufferSize) - { - processFieldBufferSize *= 2; - } - - processFieldBuffer = new char[processFieldBufferSize]; - } + EnsureAvailableProcessFieldBuffer(newLength); // Remove escapes. var inEscape = false; @@ -1058,11 +1059,14 @@ protected ProcessedField ProcessEscapeField(int start, int length) continue; } - processFieldBuffer[position] = c; + AvailableProcessFieldBuffer[position] = c; position++; } - return new ProcessedField(0, position, processFieldBuffer); + int fieldStartIndex = processFieldBufferUsedLength; + processFieldBufferUsedLength += position; + + return processFieldBuffer.AsMemory(fieldStartIndex, position); } /// @@ -1072,7 +1076,7 @@ protected ProcessedField ProcessEscapeField(int start, int length) /// The start index of the field. /// The length of the field. /// The processed field. - protected ProcessedField ProcessNoEscapeField(int start, int length) + protected Memory ProcessNoEscapeField(int start, int length) { var newStart = start; var newLength = length; @@ -1082,7 +1086,35 @@ protected ProcessedField ProcessNoEscapeField(int start, int length) ArrayHelper.Trim(buffer, ref newStart, ref newLength, whiteSpaceChars); } - return new ProcessedField(newStart, newLength, buffer); + return buffer.AsMemory(newStart, newLength); + } + + private void EnsureAvailableProcessFieldBuffer(int length) + { + if (AvailableProcessFieldBuffer.Length < length) + { + // We iteratively double the buffer size (rather than sizing it exactly + // to "length") to reduce the number of resizes for subsequent fields. + + int newProcessFieldBufferLength; + checked + { + int minimumSize = processFieldBufferUsedLength + length; + + newProcessFieldBufferLength = 1 + processFieldBufferUsedLength * 2; + + while (newProcessFieldBufferLength < minimumSize) + { + newProcessFieldBufferLength *= 2; + } + } + + Debug.Assert(newProcessFieldBufferLength > processFieldBuffer.Length); + + Array.Resize(ref processFieldBuffer, newProcessFieldBufferLength); + + Debug.Assert(AvailableProcessFieldBuffer.Length >= length); + } } /// @@ -1120,42 +1152,6 @@ protected virtual void Dispose(bool disposing) disposed = true; } - /// - /// Processes a raw field based on configuration. - /// This will remove quotes, remove escapes, and trim if configured to. - /// - [DebuggerDisplay("Start = {Start}, Length = {Length}, Buffer.Length = {Buffer.Length}")] - protected readonly struct ProcessedField - { - /// - /// The start of the field in the buffer. - /// - public readonly int Start; - - /// - /// The length of the field in the buffer. - /// - public readonly int Length; - - /// - /// The buffer that contains the field. - /// - public readonly char[] Buffer; - - /// - /// Creates a new instance of ProcessedField. - /// - /// The start of the field in the buffer. - /// The length of the field in the buffer. - /// The buffer that contains the field. - public ProcessedField(int start, int length, char[] buffer) - { - Start = start; - Length = length; - Buffer = buffer; - } - } - private enum ReadLineResult { None = 0, diff --git a/src/CsvHelper/FieldCache.cs b/src/CsvHelper/FieldCache.cs index 37686e527..9392be37c 100644 --- a/src/CsvHelper/FieldCache.cs +++ b/src/CsvHelper/FieldCache.cs @@ -7,6 +7,7 @@ using System.Diagnostics; using System.Linq; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Text; using System.Threading.Tasks; @@ -34,26 +35,26 @@ public FieldCache(int initialSize = 128, int maxFieldSize = 128) entries = new Entry[size]; } - public string GetField(char[] buffer, int start, int length) + public string GetField(ReadOnlySpan buffer) { - if (length == 0) + if (buffer.IsEmpty) { return string.Empty; } - if (length > maxFieldSize) + if (buffer.Length > maxFieldSize) { - return new string(buffer, start, length); + return buffer.ToString(); } - var hashCode = GetHashCode(buffer, start, length); + var hashCode = GetHashCode(buffer); ref var bucket = ref GetBucket(hashCode); int i = bucket - 1; while ((uint)i < (uint)entries.Length) { ref var entry = ref entries[i]; - if (entry.HashCode == hashCode && entry.Value.AsSpan().SequenceEqual(new Span(buffer, start, length))) + if (entry.HashCode == hashCode && entry.Value.AsSpan().SequenceEqual(buffer)) { return entry.Value; } @@ -70,7 +71,7 @@ public string GetField(char[] buffer, int start, int length) ref var reference = ref entries[count]; reference.HashCode = hashCode; reference.Next = bucket - 1; - reference.Value = new string(buffer, start, length); + reference.Value = buffer.ToString(); bucket = count + 1; count++; @@ -78,17 +79,23 @@ public string GetField(char[] buffer, int start, int length) } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private uint GetHashCode(char[] buffer, int start, int length) + private static uint GetHashCode(ReadOnlySpan buffer) { unchecked { +#if NET6_0_OR_GREATER + HashCode hash = new(); + hash.AddBytes(MemoryMarshal.AsBytes(buffer)); + return (uint)hash.ToHashCode(); +#else uint hash = 17; - for (var i = start; i < start + length; i++) + foreach (char c in buffer) { - hash = hash * 31 + buffer[i]; + hash = hash * 31 + c; } return hash; +#endif } } diff --git a/src/CsvHelper/IParser.cs b/src/CsvHelper/IParser.cs index f3ddd6445..fd527d472 100644 --- a/src/CsvHelper/IParser.cs +++ b/src/CsvHelper/IParser.cs @@ -33,10 +33,21 @@ public interface IParser : IDisposable /// /// Gets the field at the specified index for the current row. /// - /// The index. - /// The field. + /// The index of the field in the current row. + /// A representing the field at the specified index in the current row. string this[int index] { get; } + /// + /// Gets a span over the field at the specified index in the current row. + /// + /// The index of the field in the current row. + /// A span representing the field at the specified index in the current row. + ReadOnlySpan GetFieldSpan(int index) +#if NET + => this[index] +#endif + ; + /// /// Gets the record for the current row. Note: /// It is much more efficient to only get the fields you need. If @@ -49,6 +60,16 @@ public interface IParser : IDisposable /// string RawRecord { get; } + /// + /// Gets a span over the raw record for the current row. + /// + ReadOnlySpan RawRecordSpan +#if NET + => RawRecord; +#else + { get; } +#endif + /// /// Gets the CSV row the parser is currently on. /// diff --git a/tests/CsvHelper.Tests/CsvParserTests.cs b/tests/CsvHelper.Tests/CsvParserTests.cs index e9b899ee3..4daa96784 100644 --- a/tests/CsvHelper.Tests/CsvParserTests.cs +++ b/tests/CsvHelper.Tests/CsvParserTests.cs @@ -1366,5 +1366,18 @@ public void RawRowCountWithSingleLineAndNoLineEndingTest() Assert.Equal(1, parser.RawRow); } } + + [Theory] + [InlineData(-1)] + [InlineData(2)] + public void Parser_IndexOutOfRangeException(int index) + { + using (var reader = new StringReader("1,2\r\n")) + using (var parser = new CsvParser(reader, CultureInfo.InvariantCulture)) + { + Assert.True(parser.Read()); + Assert.Throws(() => parser[index]); + } + } } } diff --git a/tests/CsvHelper.Tests/Mocks/ParserMock.cs b/tests/CsvHelper.Tests/Mocks/ParserMock.cs index c3b224a74..29af6ee12 100644 --- a/tests/CsvHelper.Tests/Mocks/ParserMock.cs +++ b/tests/CsvHelper.Tests/Mocks/ParserMock.cs @@ -29,6 +29,8 @@ public class ParserMock : IParser, IEnumerable public string RawRecord => string.Empty; + public ReadOnlySpan RawRecordSpan => ReadOnlySpan.Empty; + public int Row => row; public int RawRow => row; @@ -41,6 +43,8 @@ public class ParserMock : IParser, IEnumerable public string this[int index] => record[index]; + public ReadOnlySpan GetFieldSpan(int index) => record[index]; + public ParserMock() : this(new CsvConfiguration(CultureInfo.InvariantCulture)) { } public ParserMock(CsvConfiguration configuration) diff --git a/tests/CsvHelper.Tests/Parsing/BadDataTests.cs b/tests/CsvHelper.Tests/Parsing/BadDataTests.cs index 92a60fff5..3fc5b59d1 100644 --- a/tests/CsvHelper.Tests/Parsing/BadDataTests.cs +++ b/tests/CsvHelper.Tests/Parsing/BadDataTests.cs @@ -149,5 +149,35 @@ public void Read_AccessingParserRecordInBadDataFound_ThrowsParserException() Assert.Throws(() => parser[1]); } + + [Fact] + public void ConsecutiveBadDataTest() + { + var config = new CsvConfiguration(CultureInfo.InvariantCulture) + { + BadDataFound = null, + CacheFields = false, + ProcessFieldBufferSize = 4 + }; + // These 3 fields each use the processFieldBuffer. + // The test is to ensure consistency of the fields during a read, + // i.e. the memory that each field points to is not overwritten + // during the processing of the other fields in the same row. + string csv = "\"\"\"\",\"two\" \"2,\"three\" \"3\r\n"; // """","two" "2,"three" "3 + using (var reader = new StringReader(csv)) + using (var parser = new CsvParser(reader, config)) + { + Assert.True(parser.Read()); + + Assert.Equal(3, parser.Count); + Assert.Equal("\"", parser.GetFieldSpan(0).ToString()); + Assert.Equal("two \"2", parser.GetFieldSpan(1).ToString()); + Assert.Equal("three \"3", parser.GetFieldSpan(2).ToString()); + Assert.Equal("two \"2", parser.GetFieldSpan(1).ToString()); + Assert.Equal("\"", parser.GetFieldSpan(0).ToString()); + + Assert.False(parser.Read()); + } + } } } diff --git a/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs b/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs index cab3236fb..0cb60d2c7 100644 --- a/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs +++ b/tests/CsvHelper.Tests/Parsing/FieldCacheTests.cs @@ -64,8 +64,6 @@ public void Read_WithFieldCacheDisabled_ReturnsDifferentFieldInstance() [Fact] public void Test1() { - // "542008", "27721116", "98000820" have hash code 3769566006 - var value1 = "542008"; var value2 = "27721116"; var value3 = "98000820"; @@ -73,21 +71,16 @@ public void Test1() var cache = new FieldCache(1); - var field1 = cache.GetField(value1.ToCharArray(), 0, value1.Length); - var field2 = cache.GetField(value2.ToCharArray(), 0, value2.Length); - var field3 = cache.GetField(value3.ToCharArray(), 0, value3.Length); - var field4 = cache.GetField(value4.ToCharArray(), 0, value4.Length); + var field1 = cache.GetField(value1); + var field2 = cache.GetField(value2); + var field3 = cache.GetField(value3); + var field4 = cache.GetField(value4); Assert.Equal(value1, field1); Assert.Equal(value2, field2); Assert.Equal(value3, field3); Assert.Equal(value4, field4); - Assert.NotSame(value1, field1); - Assert.NotSame(value2, field2); - Assert.NotSame(value3, field3); - Assert.NotSame(value4, field4); - Assert.Same(field1, field4); } }