diff --git a/CMakeLists.txt b/CMakeLists.txt index 0549d43d..4a8884fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,67 +19,17 @@ else() set(CMAKE_CXX_FLAGS_DEBUG "-Og -g -lgcov --coverage") endif(MSVC) -message("CSV for C++ ${CMAKE_BUILD_TYPE} Build with ${CMAKE_CXX_COMPILER}") +set(CSV_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/) +set(CSV_SOURCE_DIR ${CSV_INCLUDE_DIR}/internal/) +set(CSV_TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests) -set(SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/internal/) -set(TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests) - -# file(GLOB_RECURSE SOURCES include/ *.hpp *.cpp) -set(SOURCES - ${SOURCE_DIR}/csv_reader.cpp - ${SOURCE_DIR}/csv_reader_iterator.cpp - ${SOURCE_DIR}/csv_row.cpp - ${SOURCE_DIR}/csv_stat.cpp - ${SOURCE_DIR}/csv_utility.cpp - ${SOURCE_DIR}/data_type.cpp - ${SOURCE_DIR}/giant_string_buffer.cpp -) -set(TEST_SOURCES - ${TEST_DIR}/catch.hpp - ${TEST_DIR}/main.cpp - ${TEST_DIR}/test_csv_iterator.cpp - ${TEST_DIR}/test_csv_buffer.cpp - ${TEST_DIR}/test_csv_row.cpp - ${TEST_DIR}/test_csv_stat.cpp - ${TEST_DIR}/test_read_csv.cpp - ${TEST_DIR}/test_write_csv.cpp - ${TEST_DIR}/test_data_type.cpp -) - -include_directories(${CMAKE_CURRENT_LIST_DIR}/include/) -include_directories(${TEST_DIR}) +include_directories(${CSV_INCLUDE_DIR}) ## Main Library -add_library(csv STATIC ${SOURCES}) -set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX) +add_subdirectory(${CSV_SOURCE_DIR}) ## Executables -add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/programs/csv_info.cpp) -target_link_libraries(csv_info csv) - -add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/programs/csv_bench.cpp) -target_link_libraries(csv_bench csv) - -add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/programs/csv_guess_bench.cpp) -target_link_libraries(csv_guess_bench csv) - -add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/programs/csv_stats.cpp) -target_link_libraries(csv_stats csv) - -add_executable(csv_generator 
${CMAKE_CURRENT_LIST_DIR}/programs/csv_generator.cpp) -target_link_libraries(csv_generator csv) - -add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/programs/data_type_bench.cpp) -target_link_libraries(data_type_bench csv) +add_subdirectory("programs") ## Tests -add_executable(csv_test ${TEST_SOURCES}) -target_link_libraries(csv_test csv) -add_custom_command( - TARGET csv_test POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory - ${TEST_DIR}/data $/tests/data -) - -enable_testing() -add_test(test csv_test) \ No newline at end of file +add_subdirectory("tests") \ No newline at end of file diff --git a/include/internal/CMakeLists.txt b/include/internal/CMakeLists.txt new file mode 100644 index 00000000..53dcfa09 --- /dev/null +++ b/include/internal/CMakeLists.txt @@ -0,0 +1,14 @@ +add_library(csv STATIC "") + +target_sources(csv + PRIVATE + csv_reader.cpp + csv_reader_iterator.cpp + csv_row.cpp + csv_stat.cpp + csv_utility.cpp + data_type.cpp + giant_string_buffer.cpp +) + +set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX) \ No newline at end of file diff --git a/include/internal/csv_reader.cpp b/include/internal/csv_reader.cpp index 393de4d7..d631c31b 100644 --- a/include/internal/csv_reader.cpp +++ b/include/internal/csv_reader.cpp @@ -267,8 +267,8 @@ namespace csv { return CSV_NOT_FOUND; } - void CSVReader::feed(std::unique_ptr&& buff) { - this->feed(csv::string_view(buff.get())); + void CSVReader::feed(WorkItem&& buff) { + this->feed( csv::string_view(buff.first.get(), buff.second) ); } void CSVReader::feed(csv::string_view in) { @@ -296,56 +296,72 @@ namespace csv { this->record_buffer->reserve(in.size()); std::string& _record_buffer = *(this->record_buffer.get()); - for (size_t i = 0; i < in.size(); i++) { - if (!quote_escape) { - switch (this->parse_flags[in[i] + 128]) { - case NOT_SPECIAL: - _record_buffer +=in[i]; - break; + const size_t in_size = in.size(); + for (size_t i = 0; i < in_size; i++) { + switch (this->parse_flags[in[i] + 128]) 
{ case DELIMITER: - this->split_buffer.push_back(this->record_buffer.size()); - break; + if (!quote_escape) { + this->split_buffer.push_back(this->record_buffer.size()); + break; + } case NEWLINE: - // End of record -> Write record - if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF) - ++i; - this->write_record(); - break; - default: // Quote - // Case: Previous character was delimiter or newline - if (i) { // Don't deref past beginning - auto prev_ch = this->parse_flags[in[i - 1] + 128]; - if (prev_ch >= DELIMITER) quote_escape = true; + if (!quote_escape) { + // End of record -> Write record + if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF) + ++i; + this->write_record(); + break; } + case NOT_SPECIAL: { + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + #if __cplusplus >= 201703L + size_t start = i; + while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + i++; + } + + _record_buffer += in.substr(start, i - start + 1); + #else + _record_buffer += in[i]; + + while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + _record_buffer += in[++i]; + } + #endif + break; } - } - else { - switch (this->parse_flags[in[i] + 128]) { - case NOT_SPECIAL: - case DELIMITER: - case NEWLINE: - // Treat as a regular character - _record_buffer +=in[i]; - break; default: // Quote + if (!quote_escape) { + // Don't deref past beginning + if (i && this->parse_flags[in[i - 1] + 128] >= DELIMITER) { + // Case: Previous character was delimiter or newline + quote_escape = true; + } + + break; + } + auto next_ch = this->parse_flags[in[i + 1] + 128]; if (next_ch >= DELIMITER) { // Case: Delim or newline => end of field quote_escape = false; + break; } - else { - // Case: Escaped quote - _record_buffer +=in[i]; - - if (next_ch == QUOTE) - ++i; // Case: Two consecutive 
quotes - else if (this->strict) - throw std::runtime_error("Unescaped single quote around line " + - std::to_string(this->correct_rows) + " near:\n" + - std::string(in.substr(i, 100))); - } - } + + // Case: Escaped quote + _record_buffer += in[i]; + + if (next_ch == QUOTE) + ++i; // Case: Two consecutive quotes + else if (this->strict) + throw std::runtime_error("Unescaped single quote around line " + + std::to_string(this->correct_rows) + " near:\n" + + std::string(in.substr(i, 100))); + + break; } } @@ -415,7 +431,7 @@ namespace csv { this->feed_buffer.pop_front(); // Nullptr --> Die - if (!in) break; + if (!in.first) break; lock.unlock(); // Release lock this->feed(std::move(in)); @@ -455,11 +471,12 @@ namespace csv { char * result = std::fgets(line_buffer, internals::PAGE_SIZE, this->infile); if (result == NULL) break; line_buffer += std::strlen(line_buffer); + size_t current_strlen = line_buffer - buffer.get(); - if ((line_buffer - buffer.get()) >= 0.9 * BUFFER_UPPER_LIMIT) { + if (current_strlen >= 0.9 * BUFFER_UPPER_LIMIT) { processed += (line_buffer - buffer.get()); std::unique_lock lock{ this->feed_lock }; - this->feed_buffer.push_back(std::move(buffer)); + this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), current_strlen)); this->feed_cond.notify_one(); buffer = std::unique_ptr(new char[BUFFER_UPPER_LIMIT]); // New pointer @@ -470,8 +487,8 @@ namespace csv { // Feed remaining bits std::unique_lock lock{ this->feed_lock }; - this->feed_buffer.push_back(std::move(buffer)); - this->feed_buffer.push_back(nullptr); // Termination signal + this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), line_buffer - buffer.get())); + this->feed_buffer.push_back(std::make_pair<>(nullptr, 0)); // Termination signal this->feed_cond.notify_one(); lock.unlock(); worker.join(); diff --git a/include/internal/csv_reader.hpp b/include/internal/csv_reader.hpp index b93d2607..8cf40e63 100644 --- a/include/internal/csv_reader.hpp +++ 
b/include/internal/csv_reader.hpp @@ -151,6 +151,9 @@ namespace csv { NEWLINE }; + using WorkItem = std::pair, size_t>; /**< + @brief A string buffer and its size */ + std::vector make_flags() const; internals::GiantStringBuffer record_buffer; /**< @@ -195,7 +198,7 @@ namespace csv { /** @name Multi-Threaded File Reading Functions */ ///@{ - void feed(std::unique_ptr&&); /**< @brief Helper for read_csv_worker() */ + void feed(WorkItem&&); /**< @brief Helper for read_csv_worker() */ void read_csv( const std::string& filename, const size_t& bytes = internals::ITERATION_CHUNK_SIZE @@ -208,8 +211,7 @@ namespace csv { std::FILE* infile = nullptr; /**< @brief Current file handle. Destroyed by ~CSVReader(). */ - std::deque> - feed_buffer; /**< @brief Message queue for worker */ + std::deque feed_buffer; /**< @brief Message queue for worker */ std::mutex feed_lock; /**< @brief Allow only one worker to write */ std::condition_variable feed_cond; /**< @brief Wake up worker */ diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt new file mode 100644 index 00000000..7981b06c --- /dev/null +++ b/programs/CMakeLists.txt @@ -0,0 +1,17 @@ +add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/csv_info.cpp) +target_link_libraries(csv_info csv) + +add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/csv_bench.cpp) +target_link_libraries(csv_bench csv) + +add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/csv_guess_bench.cpp) +target_link_libraries(csv_guess_bench csv) + +add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/csv_stats.cpp) +target_link_libraries(csv_stats csv) + +add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/csv_generator.cpp) +target_link_libraries(csv_generator csv) + +add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/data_type_bench.cpp) +target_link_libraries(data_type_bench csv) \ No newline at end of file diff --git a/single_include/csv.hpp b/single_include/csv.hpp index f9cdeff4..a921a4e7 100644 --- a/single_include/csv.hpp +++ 
b/single_include/csv.hpp @@ -1355,8 +1355,6 @@ nssv_RESTORE_WARNINGS() #define SUPPRESS_UNUSED_WARNING(x) (void)x namespace csv { - using namespace nonstd; - #if __cplusplus >= 201703L #include using string_view = std::string_view; @@ -1820,24 +1818,32 @@ namespace csv { namespace csv { - // Get operating system specific details - #if defined(_WIN32) - #include - #undef max - #undef min - inline int getpagesize() { - _SYSTEM_INFO sys_info = {}; - GetSystemInfo(&sys_info); - return sys_info.dwPageSize; - } + namespace internals { + // Get operating system specific details + #if defined(_WIN32) + #include + #undef max + #undef min + + inline int getpagesize() { + _SYSTEM_INFO sys_info = {}; + GetSystemInfo(&sys_info); + return sys_info.dwPageSize; + } - const int PAGE_SIZE = getpagesize(); - #elif defined(__linux__) - #include - const int PAGE_SIZE = getpagesize(); - #else - const int PAGE_SIZE = 4096; - #endif + const int PAGE_SIZE = getpagesize(); + #elif defined(__linux__) + #include + const int PAGE_SIZE = getpagesize(); + #else + const int PAGE_SIZE = 4096; + #endif + + /** @brief For functions that lazy load a large CSV, this determines how + * many bytes are read at a time + */ + const size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB + } /** @brief Used for counting number of rows */ using RowCount = long long int; @@ -1846,11 +1852,6 @@ namespace csv { /** @name Global Constants */ ///@{ - /** @brief For functions that lazy load a large CSV, this determines how - * many bytes are read at a time - */ - const size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB - /** @brief A dummy variable used to indicate delimiter should be guessed */ const CSVFormat GUESS_CSV = { '\0', '"', 0, {}, false, true }; @@ -2007,6 +2008,9 @@ namespace csv { NEWLINE }; + using WorkItem = std::pair, size_t>; /**< + @brief A string buffer and its size */ + std::vector make_flags() const; internals::GiantStringBuffer record_buffer; /**< @@ -2051,10 +2055,10 @@ namespace csv { /** @name 
Multi-Threaded File Reading Functions */ ///@{ - void feed(std::unique_ptr&&); /**< @brief Helper for read_csv_worker() */ + void feed(WorkItem&&); /**< @brief Helper for read_csv_worker() */ void read_csv( const std::string& filename, - const size_t& bytes = ITERATION_CHUNK_SIZE + const size_t& bytes = internals::ITERATION_CHUNK_SIZE ); void read_csv_worker(); ///@} @@ -2064,8 +2068,7 @@ namespace csv { std::FILE* infile = nullptr; /**< @brief Current file handle. Destroyed by ~CSVReader(). */ - std::deque> - feed_buffer; /**< @brief Message queue for worker */ + std::deque feed_buffer; /**< @brief Message queue for worker */ std::mutex feed_lock; /**< @brief Allow only one worker to write */ std::condition_variable feed_cond; /**< @brief Wake up worker */ @@ -2462,8 +2465,8 @@ namespace csv { return CSV_NOT_FOUND; } - void CSVReader::feed(std::unique_ptr&& buff) { - this->feed(csv::string_view(buff.get())); + void CSVReader::feed(WorkItem&& buff) { + this->feed( csv::string_view(buff.first.get(), buff.second) ); } void CSVReader::feed(csv::string_view in) { @@ -2491,56 +2494,72 @@ namespace csv { this->record_buffer->reserve(in.size()); std::string& _record_buffer = *(this->record_buffer.get()); - for (size_t i = 0; i < in.size(); i++) { - if (!quote_escape) { - switch (this->parse_flags[in[i] + 128]) { - case NOT_SPECIAL: - _record_buffer +=in[i]; - break; + const size_t in_size = in.size(); + for (size_t i = 0; i < in_size; i++) { + switch (this->parse_flags[in[i] + 128]) { case DELIMITER: - this->split_buffer.push_back(this->record_buffer.size()); - break; + if (!quote_escape) { + this->split_buffer.push_back(this->record_buffer.size()); + break; + } case NEWLINE: - // End of record -> Write record - if (i + 1 < in.size() && in[i + 1] == '\n') // Catches CRLF (or LFLF) - ++i; - this->write_record(); - break; - default: // Quote - // Case: Previous character was delimiter or newline - if (i) { // Don't deref past beginning - auto prev_ch = 
this->parse_flags[in[i - 1] + 128]; - if (prev_ch >= DELIMITER) quote_escape = true; + if (!quote_escape) { + // End of record -> Write record + if (i + 1 < in_size && in[i + 1] == '\n') // Catches CRLF (or LFLF) + ++i; + this->write_record(); + break; + } + case NOT_SPECIAL: { + // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous + // sequences, use the loop below to avoid having to go through the outer + // switch statement as much as possible + #if __cplusplus >= 201703L + size_t start = i; + while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + i++; } + + _record_buffer += in.substr(start, i - start + 1); + #else + _record_buffer += in[i]; + + while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + _record_buffer += in[++i]; + } + #endif + break; } - } - else { - switch (this->parse_flags[in[i] + 128]) { - case NOT_SPECIAL: - case DELIMITER: - case NEWLINE: - // Treat as a regular character - _record_buffer +=in[i]; - break; default: // Quote + if (!quote_escape) { + // Don't deref past beginning + if (i && this->parse_flags[in[i - 1] + 128] >= DELIMITER) { + // Case: Previous character was delimiter or newline + quote_escape = true; + } + + break; + } + auto next_ch = this->parse_flags[in[i + 1] + 128]; if (next_ch >= DELIMITER) { // Case: Delim or newline => end of field quote_escape = false; + break; } - else { - // Case: Escaped quote - _record_buffer +=in[i]; - - if (next_ch == QUOTE) - ++i; // Case: Two consecutive quotes - else if (this->strict) - throw std::runtime_error("Unescaped single quote around line " + - std::to_string(this->correct_rows) + " near:\n" + - std::string(in.substr(i, 100))); - } - } + + // Case: Escaped quote + _record_buffer += in[i]; + + if (next_ch == QUOTE) + ++i; // Case: Two consecutive quotes + else if (this->strict) + throw std::runtime_error("Unescaped single quote around line " + + std::to_string(this->correct_rows) + " near:\n" + + 
std::string(in.substr(i, 100))); + + break; } } @@ -2610,7 +2629,7 @@ namespace csv { this->feed_buffer.pop_front(); // Nullptr --> Die - if (!in) break; + if (!in.first) break; lock.unlock(); // Release lock this->feed(std::move(in)); @@ -2647,14 +2666,15 @@ namespace csv { std::thread worker(&CSVReader::read_csv_worker, this); for (size_t processed = 0; processed < bytes; ) { - char * result = std::fgets(line_buffer, PAGE_SIZE, this->infile); + char * result = std::fgets(line_buffer, internals::PAGE_SIZE, this->infile); if (result == NULL) break; line_buffer += std::strlen(line_buffer); + size_t current_strlen = line_buffer - buffer.get(); - if ((line_buffer - buffer.get()) >= 0.9 * BUFFER_UPPER_LIMIT) { + if (current_strlen >= 0.9 * BUFFER_UPPER_LIMIT) { processed += (line_buffer - buffer.get()); std::unique_lock lock{ this->feed_lock }; - this->feed_buffer.push_back(std::move(buffer)); + this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), current_strlen)); this->feed_cond.notify_one(); buffer = std::unique_ptr(new char[BUFFER_UPPER_LIMIT]); // New pointer @@ -2665,8 +2685,8 @@ namespace csv { // Feed remaining bits std::unique_lock lock{ this->feed_lock }; - this->feed_buffer.push_back(std::move(buffer)); - this->feed_buffer.push_back(nullptr); // Termination signal + this->feed_buffer.push_back(std::make_pair<>(std::move(buffer), line_buffer - buffer.get())); + this->feed_buffer.push_back(std::make_pair<>(nullptr, 0)); // Termination signal this->feed_cond.notify_one(); lock.unlock(); worker.join(); @@ -2701,7 +2721,7 @@ namespace csv { bool CSVReader::read_row(CSVRow &row) { if (this->records.empty()) { if (!this->eof()) { - this->read_csv("", ITERATION_CHUNK_SIZE); + this->read_csv("", internals::ITERATION_CHUNK_SIZE); } else return false; // Stop reading } @@ -2815,22 +2835,30 @@ namespace csv { */ csv::string_view CSVRow::get_string_view(size_t n) const { csv::string_view ret(this->row_str); - size_t beg = 0, end = row_str.size(), + size_t beg 
= 0, + end = 0, r_size = this->size(); if (n >= r_size) throw std::runtime_error("Index out of bounds."); if (!splits.empty()) { - if (n == 0 || r_size == 2) { - if (n == 0) end = this->splits[0]; - else beg = this->splits[0]; + if (n == 0) { + end = this->splits[0]; + } + else if (r_size == 2) { + beg = this->splits[0]; } else { beg = this->splits[n - 1]; if (n != r_size - 1) end = this->splits[n]; } } + + // end == 0 is used as the "not set" sentinel: the field then runs to the end of the row. NOTE(review): a split offset of 0 (leading empty field) also leaves end == 0 -- confirm this is intended + if (end == 0) { + end = row_str.size(); + } return ret.substr( beg, @@ -3024,7 +3052,7 @@ namespace csv { * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. */ while (!this->eof()) { - this->read_csv("", ITERATION_CHUNK_SIZE); + this->read_csv("", internals::ITERATION_CHUNK_SIZE); this->calc(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 00000000..d3f6ebb9 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,23 @@ +add_executable(csv_test "") +target_sources(csv_test + PRIVATE + ${CSV_INCLUDE_DIR}/csv.hpp + catch.hpp + main.cpp + test_csv_iterator.cpp + test_csv_buffer.cpp + test_csv_row.cpp + test_csv_stat.cpp + test_read_csv.cpp + test_write_csv.cpp + test_data_type.cpp +) +target_link_libraries(csv_test csv) +add_custom_command( + TARGET csv_test POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + data $<TARGET_FILE_DIR:csv_test>/tests/data +) + +enable_testing() +add_test(test csv_test) \ No newline at end of file