From 105b44d819a37d9329df16ca4417c3b3f6c2ad98 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Sat, 15 Jun 2024 12:48:26 -0700 Subject: [PATCH] Fix memory issues in CSVFieldList (#237) --- .github/workflows/cmake-multi-platform.yml | 2 +- include/csv.hpp | 2 +- include/internal/csv_row.cpp | 6 ++--- include/internal/csv_row.hpp | 22 ++++++++++------ programs/csv_bench.cpp | 8 ++++++ single_include/csv.hpp | 30 +++++++++++++--------- single_include_test/csv.hpp | 30 +++++++++++++--------- 7 files changed, 63 insertions(+), 37 deletions(-) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index e4c85ab8..e2cce85a 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -4,7 +4,7 @@ name: CMake on multiple platforms on: push: - branches: [ "master", "remove-werror" ] + branches: [ "master", "memory-fix-csvfieldlist" ] pull_request: branches: [ "master" ] diff --git a/include/csv.hpp b/include/csv.hpp index e39b3e38..a9ba1f81 100644 --- a/include/csv.hpp +++ b/include/csv.hpp @@ -1,5 +1,5 @@ /* -CSV for C++, version 2.2.3 +CSV for C++, version 2.3.0 https://github.com/vincentlaucsb/csv-parser MIT License diff --git a/include/internal/csv_row.cpp b/include/internal/csv_row.cpp index 93e92917..9c9aaba3 100644 --- a/include/internal/csv_row.cpp +++ b/include/internal/csv_row.cpp @@ -15,10 +15,10 @@ namespace csv { } CSV_INLINE void CSVFieldList::allocate() { - RawCSVField * buffer = new RawCSVField[_single_buffer_capacity]; - buffers.push_back(buffer); + buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); + _current_buffer_size = 0; - _back = &(buffers.back()[0]); + _back = buffers.back().get(); } } diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp index bd92c0ae..0ab935d0 100644 --- a/include/internal/csv_row.hpp +++ b/include/internal/csv_row.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include // For CSVField #include // For CSVField @@ -73,16 +74,15 @@ namespace csv { // CSVFieldArrays may be moved CSVFieldList(CSVFieldList&& other) : _single_buffer_capacity(other._single_buffer_capacity) { - buffers = std::move(other.buffers); + + for (auto&& buffer : other.buffers) { + this->buffers.emplace_back(std::move(buffer)); + } + _current_buffer_size = other._current_buffer_size; _back = other._back; } - ~CSVFieldList() { - for (auto& buffer : buffers) - delete[] buffer; - } - template void emplace_back(Args&&... args) { if (this->_current_buffer_size == this->_single_buffer_capacity) { @@ -102,7 +102,14 @@ namespace csv { private: const size_t _single_buffer_capacity; - std::vector buffers = {}; + /** + * Prefer std::deque over std::vector because it does not + * reallocate upon expansion, allowing pointers to its members + * to remain valid & avoiding potential race conditions when + * CSVFieldList is accesssed simulatenously by a reading thread and + * a writing thread + */ + std::deque> buffers = {}; /** Number of items in the current buffer */ size_t _current_buffer_size = 0; @@ -114,7 +121,6 @@ namespace csv { void allocate(); }; - /** A class for storing raw CSV data and associated metadata */ struct RawCSVData { std::shared_ptr _data = nullptr; diff --git a/programs/csv_bench.cpp b/programs/csv_bench.cpp index cde8e587..6793050a 100644 --- a/programs/csv_bench.cpp +++ b/programs/csv_bench.cpp @@ -21,8 +21,15 @@ int main(int argc, char** argv) { std::chrono::duration diff = end - start; std::cout << "Parsing took (including disk IO): " << diff.count() << std::endl; + std::cout << "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns " << std::endl; + std::cout << "Columns: "; + for (auto& col : info.col_names) { + std::cout << " " << col; + } + std::cout << std::endl; // Benchmark 2: Parsing Only + /* std::ifstream csv(filename); std::stringstream buffer; buffer << csv.rdbuf(); @@ -35,6 +42,7 @@ int main(int argc, char** argv) { diff = end - start; std::cout << "Parsing took: " << diff.count() << std::endl; + */ return 0; } \ No newline at end of file diff --git a/single_include/csv.hpp b/single_include/csv.hpp index a65bc44c..811c8e14 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -1,6 +1,6 @@ #pragma once /* -CSV for C++, version 2.2.3 +CSV for C++, version 2.3.0 https://github.com/vincentlaucsb/csv-parser MIT License @@ -5051,6 +5051,7 @@ namespace csv { */ #include +#include #include #include // For CSVField #include // For CSVField @@ -5468,16 +5469,15 @@ namespace csv { // CSVFieldArrays may be moved CSVFieldList(CSVFieldList&& other) : _single_buffer_capacity(other._single_buffer_capacity) { - buffers = std::move(other.buffers); + + for (auto&& buffer : other.buffers) { + this->buffers.emplace_back(std::move(buffer)); + } + _current_buffer_size = other._current_buffer_size; _back = other._back; } - ~CSVFieldList() { - for (auto& buffer : buffers) - delete[] buffer; - } - template void emplace_back(Args&&... args) { if (this->_current_buffer_size == this->_single_buffer_capacity) { @@ -5497,7 +5497,14 @@ namespace csv { private: const size_t _single_buffer_capacity; - std::vector buffers = {}; + /** + * Prefer std::deque over std::vector because it does not + * reallocate upon expansion, allowing pointers to its members + * to remain valid & avoiding potential race conditions when + * CSVFieldList is accesssed simulatenously by a reading thread and + * a writing thread + */ + std::deque> buffers = {}; /** Number of items in the current buffer */ size_t _current_buffer_size = 0; @@ -5509,7 +5516,6 @@ namespace csv { void allocate(); }; - /** A class for storing raw CSV data and associated metadata */ struct RawCSVData { std::shared_ptr _data = nullptr; @@ -7708,10 +7714,10 @@ namespace csv { } CSV_INLINE void CSVFieldList::allocate() { - RawCSVField * buffer = new RawCSVField[_single_buffer_capacity]; - buffers.push_back(buffer); + buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); + _current_buffer_size = 0; - _back = &(buffers.back()[0]); + _back = buffers.back().get(); } } diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp index a65bc44c..811c8e14 100644 --- a/single_include_test/csv.hpp +++ b/single_include_test/csv.hpp @@ -1,6 +1,6 @@ #pragma once /* -CSV for C++, version 2.2.3 +CSV for C++, version 2.3.0 https://github.com/vincentlaucsb/csv-parser MIT License @@ -5051,6 +5051,7 @@ namespace csv { */ #include +#include #include #include // For CSVField #include // For CSVField @@ -5468,16 +5469,15 @@ namespace csv { // CSVFieldArrays may be moved CSVFieldList(CSVFieldList&& other) : _single_buffer_capacity(other._single_buffer_capacity) { - buffers = std::move(other.buffers); + + for (auto&& buffer : other.buffers) { + this->buffers.emplace_back(std::move(buffer)); + } + _current_buffer_size = other._current_buffer_size; _back = other._back; } - ~CSVFieldList() { - for (auto& buffer : buffers) - delete[] buffer; - } - template void emplace_back(Args&&... args) { if (this->_current_buffer_size == this->_single_buffer_capacity) { @@ -5497,7 +5497,14 @@ namespace csv { private: const size_t _single_buffer_capacity; - std::vector buffers = {}; + /** + * Prefer std::deque over std::vector because it does not + * reallocate upon expansion, allowing pointers to its members + * to remain valid & avoiding potential race conditions when + * CSVFieldList is accesssed simulatenously by a reading thread and + * a writing thread + */ + std::deque> buffers = {}; /** Number of items in the current buffer */ size_t _current_buffer_size = 0; @@ -5509,7 +5516,6 @@ namespace csv { void allocate(); }; - /** A class for storing raw CSV data and associated metadata */ struct RawCSVData { std::shared_ptr _data = nullptr; @@ -7708,10 +7714,10 @@ namespace csv { } CSV_INLINE void CSVFieldList::allocate() { - RawCSVField * buffer = new RawCSVField[_single_buffer_capacity]; - buffers.push_back(buffer); + buffers.push_back(std::unique_ptr(new RawCSVField[_single_buffer_capacity])); + _current_buffer_size = 0; - _back = &(buffers.back()[0]); + _back = buffers.back().get(); } }