From b565755a31311e16af1ba369ae8d8ed2750392ac Mon Sep 17 00:00:00 2001 From: Vincent La Date: Sun, 19 May 2019 16:31:47 -0700 Subject: [PATCH] Performance enhancements + CSVFormat interface changes (#37) ## Public API changes * Performance increased from ~200MB/s to ~220MB/s * Made `CSVFormat` more readable and user-friendly ## Behind the scenes * Resolved #34 by storing all CSV row data in contiguous memory regions * Modified CMake to improve developer experience * Simplified CSVRow constructors/definition * Simplified CSVReader::write_record logic * Various other code clean-up actions --- .gitignore | 6 +- CMakeLists.txt | 30 +- CMakeSettings.json | 8 +- CodeCoverage.cmake | 307 +++++ Doxyfile | 4 +- Makefile | 2 +- README.md | 17 +- _CMakeLists2.txt | 12 + codecov.yml | 7 +- cpp.hint | 4 + docs/source/Doxy.md | 17 +- include/internal/CMakeLists.txt | 14 +- include/internal/compatibility.hpp | 29 +- include/internal/constants.hpp | 15 +- include/internal/csv_format.cpp | 64 + include/internal/csv_format.hpp | 96 +- include/internal/csv_reader.cpp | 226 ++-- include/internal/csv_reader.hpp | 138 ++- include/internal/csv_reader_iterator.cpp | 10 +- include/internal/csv_row.cpp | 93 +- include/internal/csv_row.hpp | 150 ++- include/internal/csv_stat.cpp | 10 +- include/internal/csv_stat.hpp | 8 +- include/internal/csv_utility.cpp | 3 +- include/internal/csv_utility.hpp | 9 +- include/internal/data_type.cpp | 188 --- include/internal/data_type.h | 246 +++- include/internal/giant_string_buffer.cpp | 43 - include/internal/giant_string_buffer.hpp | 22 - include/internal/row_buffer.cpp | 71 ++ include/internal/row_buffer.hpp | 77 ++ programs/CMakeLists.txt | 35 +- programs/csv_guess_bench.cpp | 2 +- programs/data_type_bench.cpp | 54 +- single_include/csv.hpp | 1392 +++++++++++++--------- tests/CMakeLists.txt | 21 +- tests/test_csv_buffer.cpp | 33 +- tests/test_csv_row.cpp | 14 +- tests/test_csv_stat.cpp | 4 +- tests/test_read_csv.cpp | 25 +- 40 files changed, 2208 
insertions(+), 1298 deletions(-) create mode 100644 CodeCoverage.cmake create mode 100644 _CMakeLists2.txt create mode 100644 cpp.hint create mode 100644 include/internal/csv_format.cpp delete mode 100644 include/internal/data_type.cpp delete mode 100644 include/internal/giant_string_buffer.cpp delete mode 100644 include/internal/giant_string_buffer.hpp create mode 100644 include/internal/row_buffer.cpp create mode 100644 include/internal/row_buffer.hpp diff --git a/.gitignore b/.gitignore index 8fcd72f8..7431a3ab 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,15 @@ +# Custom Settings +CMakeLists2.txt + # Build bin/ +build/ # Doxygen docs/html *.tmp # Visual Studio -Debug/ -Release/ .vs/ *.pdb *.i* diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a8884fd..b7484476 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,16 @@ cmake_minimum_required(VERSION 3.9) -project(csv_parser) +project(csv) -set(CMAKE_CXX_STANDARD 17) +if(CSV_CXX_STANDARD) + set(CMAKE_CXX_STANDARD ${CSV_CXX_STANDARD}) +else() + set(CMAKE_CXX_STANDARD 17) +endif(CSV_CXX_STANDARD) + +message("Building CSV library using C++${CMAKE_CXX_STANDARD}") +add_compile_definitions(CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}) -if (MSVC) +if(MSVC) # Make Visual Studio report accurate C++ version # See: https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ set(CMAKE_CXX_FLAGS "/EHsc /Zc:__cplusplus") @@ -12,19 +19,27 @@ if (MSVC) # /Wall emits warnings about the C++ standard library set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") endif(CMAKE_BUILD_TYPE MATCHES Debug) - else() - set(CMAKE_CXX_FLAGS "-pthread") - set(CMAKE_CXX_FLAGS_RELEASE "-O3") - set(CMAKE_CXX_FLAGS_DEBUG "-Og -g -lgcov --coverage") + # Ignore Visual Studio pragma regions + set(CMAKE_CXX_FLAGS "-pthread -Wno-unknown-pragmas") + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Og -g") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + endif(CMAKE_BUILD_TYPE MATCHES Debug)
endif(MSVC) +set(CSV_ROOT_DIR ${CMAKE_CURRENT_LIST_DIR}) +set(CSV_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(CSV_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/include/) set(CSV_SOURCE_DIR ${CSV_INCLUDE_DIR}/internal/) set(CSV_TEST_DIR ${CMAKE_CURRENT_LIST_DIR}/tests) include_directories(${CSV_INCLUDE_DIR}) +## Load developer specific CMake settings +include(CMakeLists2.txt OPTIONAL) + ## Main Library add_subdirectory(${CSV_SOURCE_DIR}) @@ -32,4 +47,5 @@ add_subdirectory(${CSV_SOURCE_DIR}) add_subdirectory("programs") ## Tests +enable_testing() add_subdirectory("tests") \ No newline at end of file diff --git a/CMakeSettings.json b/CMakeSettings.json index f82f8033..a2915fc5 100644 --- a/CMakeSettings.json +++ b/CMakeSettings.json @@ -7,8 +7,8 @@ "inheritEnvironments": [ "msvc_x64_x64" ], - "buildRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\build\\${name}", - "installRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\install\\${name}", + "buildRoot": "${projectDir}\\build\\${name}", + "installRoot": "${projectDir}\\install\\${name}", "cmakeCommandArgs": "", "buildCommandArgs": "-v", "ctestCommandArgs": "" @@ -20,8 +20,8 @@ "inheritEnvironments": [ "msvc_x64_x64" ], - "buildRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\build\\${name}", - "installRoot": "${env.USERPROFILE}\\CMakeBuilds\\${workspaceHash}\\install\\${name}", + "buildRoot": "${projectDir}\\build\\${name}", + "installRoot": "${projectDir}\\install\\${name}", "cmakeCommandArgs": "", "buildCommandArgs": "-v", "ctestCommandArgs": "" diff --git a/CodeCoverage.cmake b/CodeCoverage.cmake new file mode 100644 index 00000000..cc849c84 --- /dev/null +++ b/CodeCoverage.cmake @@ -0,0 +1,307 @@ +# Copyright (c) 2012 - 2017, Lars Bilke +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1.
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# CHANGES: +# +# 2012-01-31, Lars Bilke +# - Enable Code Coverage +# +# 2013-09-17, Joakim Söderberg +# - Added support for Clang. +# - Some additional usage instructions. +# +# 2016-02-03, Lars Bilke +# - Refactored functions to use named parameters +# +# 2017-06-02, Lars Bilke +# - Merged with modified version from github.com/ufz/ogs +# +# +# USAGE: +# +# 1. Copy this file into your cmake modules path. +# +# 2. Add the following line to your CMakeLists.txt: +# include(CodeCoverage) +# +# 3. Append necessary compiler flags: +# APPEND_COVERAGE_COMPILER_FLAGS() +# +# 3.a (OPTIONAL) Set appropriate optimization flags, e.g. -O0, -O1 or -Og +# +# 4. 
If you need to exclude additional directories from the report, specify them +# using the COVERAGE_LCOV_EXCLUDES variable before calling SETUP_TARGET_FOR_COVERAGE_LCOV. +# Example: +# set(COVERAGE_LCOV_EXCLUDES 'dir1/*' 'dir2/*') +# +# 5. Use the functions described below to create a custom make target which +# runs your test executable and produces a code coverage report. +# +# 6. Build a Debug build: +# cmake -DCMAKE_BUILD_TYPE=Debug .. +# make +# make my_coverage_target +# + +include(CMakeParseArguments) + +# Check prereqs +find_program( GCOV_PATH gcov ) +find_program( LCOV_PATH NAMES lcov lcov.bat lcov.exe lcov.perl) +find_program( GENHTML_PATH NAMES genhtml genhtml.perl genhtml.bat ) +find_program( GCOVR_PATH gcovr PATHS ${CMAKE_SOURCE_DIR}/scripts/test) +find_package(Python COMPONENTS Interpreter) + +if(NOT GCOV_PATH) + message(FATAL_ERROR "gcov not found! Aborting...") +endif() # NOT GCOV_PATH + +if("${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?[Cc]lang") + if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3) + message(FATAL_ERROR "Clang version must be 3.0.0 or greater! Aborting...") + endif() +elseif(NOT CMAKE_COMPILER_IS_GNUCXX) + message(FATAL_ERROR "Compiler is not GNU gcc! Aborting...") +endif() + +set(COVERAGE_COMPILER_FLAGS "-g --coverage -fprofile-arcs -ftest-coverage" + CACHE INTERNAL "") + +set(CMAKE_CXX_FLAGS_COVERAGE + ${COVERAGE_COMPILER_FLAGS} + CACHE STRING "Flags used by the C++ compiler during coverage builds." + FORCE ) +set(CMAKE_C_FLAGS_COVERAGE + ${COVERAGE_COMPILER_FLAGS} + CACHE STRING "Flags used by the C compiler during coverage builds." + FORCE ) +set(CMAKE_EXE_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used for linking binaries during coverage builds." + FORCE ) +set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used by the shared libraries linker during coverage builds." 
+ FORCE ) +mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_EXE_LINKER_FLAGS_COVERAGE + CMAKE_SHARED_LINKER_FLAGS_COVERAGE ) + +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + message(WARNING "Code coverage results with an optimised (non-Debug) build may be misleading") +endif() # NOT CMAKE_BUILD_TYPE STREQUAL "Debug" + +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + link_libraries(gcov) +else() + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage") +endif() + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. +# +# SETUP_TARGET_FOR_COVERAGE_LCOV( +# NAME testrunner_coverage # New target name +# EXECUTABLE testrunner -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES testrunner # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE_LCOV) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES LCOV_ARGS GENHTML_ARGS) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT LCOV_PATH) + message(FATAL_ERROR "lcov not found! Aborting...") + endif() # NOT LCOV_PATH + + if(NOT GENHTML_PATH) + message(FATAL_ERROR "genhtml not found! Aborting...") + endif() # NOT GENHTML_PATH + + # Setup target + add_custom_target(${Coverage_NAME} + + # Cleanup lcov + COMMAND ${LCOV_PATH} ${Coverage_LCOV_ARGS} --gcov-tool ${GCOV_PATH} -directory . --zerocounters + # Create baseline to make sure untouched files show up in the report + COMMAND ${LCOV_PATH} ${Coverage_LCOV_ARGS} --gcov-tool ${GCOV_PATH} -c -i -d . 
-o ${Coverage_NAME}.base + + # Run tests + COMMAND ${Coverage_EXECUTABLE} ${Coverage_EXECUTABLE_ARGS} + + # Capturing lcov counters and generating report + COMMAND ${LCOV_PATH} ${Coverage_LCOV_ARGS} --gcov-tool ${GCOV_PATH} --directory . --capture --output-file ${Coverage_NAME}.info + # add baseline counters + COMMAND ${LCOV_PATH} ${Coverage_LCOV_ARGS} --gcov-tool ${GCOV_PATH} -a ${Coverage_NAME}.base -a ${Coverage_NAME}.info --output-file ${CMAKE_CURRENT_LIST_DIR}/${Coverage_NAME}.total + COMMAND ${LCOV_PATH} ${Coverage_LCOV_ARGS} --gcov-tool ${GCOV_PATH} --remove ${Coverage_NAME}.total ${COVERAGE_LCOV_EXCLUDES} --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + COMMAND ${GENHTML_PATH} ${Coverage_GENHTML_ARGS} -o ${Coverage_NAME} ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + COMMAND ${CMAKE_COMMAND} -E remove ${Coverage_NAME}.base ${Coverage_NAME}.total ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned + + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Resetting code coverage counters to zero.\nProcessing code coverage counters and generating report." + ) + + # Show where to find the lcov info report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Lcov code coverage info report saved in ${Coverage_NAME}.info." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report." + ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE_LCOV + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. 
+# +# SETUP_TARGET_FOR_COVERAGE_GCOVR_XML( +# NAME ctest_coverage # New target name +# EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES executable_target # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE_GCOVR_XML) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT Python_FOUND) + message(FATAL_ERROR "python not found! Aborting...") + endif() + + if(NOT GCOVR_PATH) + message(FATAL_ERROR "gcovr not found! Aborting...") + endif() # NOT GCOVR_PATH + + # Combine excludes to several -e arguments + set(GCOVR_EXCLUDES "") + foreach(EXCLUDE ${COVERAGE_GCOVR_EXCLUDES}) + string(REPLACE "*" "\\*" EXCLUDE_REPLACED ${EXCLUDE}) + list(APPEND GCOVR_EXCLUDES "-e") + list(APPEND GCOVR_EXCLUDES "${EXCLUDE_REPLACED}") + endforeach() + + add_custom_target(${Coverage_NAME} + # Run tests + ${Coverage_EXECUTABLE} ${Coverage_EXECUTABLE_ARGS} + + # Running gcovr + COMMAND ${GCOVR_PATH} --xml + -r ${PROJECT_SOURCE_DIR} ${GCOVR_EXCLUDES} + --object-directory=${PROJECT_BINARY_DIR} + -o ${Coverage_NAME}.xml + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Running gcovr to produce Cobertura code coverage report." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Cobertura code coverage report saved in ${Coverage_NAME}.xml." + ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE_GCOVR_XML + +# Defines a target for running and collection code coverage information +# Builds dependencies, runs the given executable and outputs reports. +# NOTE! The executable should always have a ZERO as exit code otherwise +# the coverage generation will not complete. 
+# +# SETUP_TARGET_FOR_COVERAGE_GCOVR_HTML( +# NAME ctest_coverage # New target name +# EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR +# DEPENDENCIES executable_target # Dependencies to build first +# ) +function(SETUP_TARGET_FOR_COVERAGE_GCOVR_HTML) + + set(options NONE) + set(oneValueArgs NAME) + set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES) + cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT Python_FOUND) + message(FATAL_ERROR "python not found! Aborting...") + endif() + + if(NOT GCOVR_PATH) + message(FATAL_ERROR "gcovr not found! Aborting...") + endif() # NOT GCOVR_PATH + + # Combine excludes to several -e arguments + set(GCOVR_EXCLUDES "") + foreach(EXCLUDE ${COVERAGE_GCOVR_EXCLUDES}) + string(REPLACE "*" "\\*" EXCLUDE_REPLACED ${EXCLUDE}) + list(APPEND GCOVR_EXCLUDES "-e") + list(APPEND GCOVR_EXCLUDES "${EXCLUDE_REPLACED}") + endforeach() + + add_custom_target(${Coverage_NAME} + # Run tests + ${Coverage_EXECUTABLE} ${Coverage_EXECUTABLE_ARGS} + + # Create folder + COMMAND ${CMAKE_COMMAND} -E make_directory ${PROJECT_BINARY_DIR}/${Coverage_NAME} + + # Running gcovr + COMMAND ${Python_EXECUTABLE} ${GCOVR_PATH} --html --html-details + -r ${PROJECT_SOURCE_DIR} ${GCOVR_EXCLUDES} + --object-directory=${PROJECT_BINARY_DIR} + -o ${Coverage_NAME}/index.html + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + DEPENDS ${Coverage_DEPENDENCIES} + COMMENT "Running gcovr to produce HTML code coverage report." + ) + + # Show info where to find the report + add_custom_command(TARGET ${Coverage_NAME} POST_BUILD + COMMAND ; + COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report." 
+ ) + +endfunction() # SETUP_TARGET_FOR_COVERAGE_GCOVR_HTML + +function(APPEND_COVERAGE_COMPILER_FLAGS) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COVERAGE_COMPILER_FLAGS}" PARENT_SCOPE) + message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}") +endfunction() # APPEND_COVERAGE_COMPILER_FLAGS \ No newline at end of file diff --git a/Doxyfile b/Doxyfile index 599540c4..1c4cff65 100644 --- a/Doxyfile +++ b/Doxyfile @@ -177,7 +177,7 @@ SHORT_NAMES = NO # description.) # The default value is: NO. -JAVADOC_AUTOBRIEF = NO +JAVADOC_AUTOBRIEF = YES # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If @@ -2037,7 +2037,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/Makefile b/Makefile index 6110cb56..2c68781d 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ ifeq ($(CXX), g++-8) TEST_OFLAGS = -Og endif -TEST_FLAGS = -Itests/ $(CFLAGS) $(TEST_OFLAGS) -g --coverage -Wall +TEST_FLAGS = -Itests/ $(CFLAGS) $(TEST_OFLAGS) -g --coverage -Wno-unknown-pragmas -Wall # Main Library SOURCES = $(wildcard include/internal/*.cpp) diff --git a/README.md b/README.md index 999cc94c..d23971ee 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,9 @@ CSVReader reader("very_big_file.csv"); for (CSVRow& row: reader) { // Input iterator for (CSVField& field: row) { - // For efficiency, get<>() produces a string_view + // By default, get<>() produces a std::string. 
+ // A more efficient get() is also available, where the resulting + // string_view is valid as long as the parent CSVRow is alive std::cout << field.get<>() << ... } } @@ -146,12 +148,13 @@ Although the CSV parser has a decent guessing mechanism, in some cases it is pre using namespace csv; -CSVFormat format = { - '\t', // Delimiter - '~', // Quote-character - '2', // Line number of header - {} // Column names -- if empty, then filled by reading header row -}; +CSVFormat format; +format.delimiter('\t') + .quote('~') + .header_row(2); // Header is on 3rd row (zero-indexed) + +// Alternatively, we can use format.delimiter({ '\t', ',', ... }) +// to tell the CSV guesser which delimiters to try out CSVReader reader("wierd_csv_dialect.csv", {}, format); diff --git a/_CMakeLists2.txt b/_CMakeLists2.txt new file mode 100644 index 00000000..1790683e --- /dev/null +++ b/_CMakeLists2.txt @@ -0,0 +1,12 @@ +# This file contains settings intended only for those developing the CSV +# library itself. +# +# To enable this file, rename it from '_CMakeLists2.txt' -> 'CMakeLists2.txt' + +set(CSV_DEVELOPER TRUE) + +# Generate a single header library +add_custom_target(generate_single_header + COMMAND python single_header.py > single_include/csv.hpp + WORKING_DIRECTORY ${CSV_ROOT_DIR} +) \ No newline at end of file diff --git a/codecov.yml b/codecov.yml index c75dcbee..ffb7786f 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,3 +1,8 @@ ignore: - "include/external" - - "tests" \ No newline at end of file + - "tests" +coverage: + status: + project: + default: + target: 95% \ No newline at end of file diff --git a/cpp.hint b/cpp.hint new file mode 100644 index 00000000..bca4ed9d --- /dev/null +++ b/cpp.hint @@ -0,0 +1,4 @@ +// Hint files help the Visual Studio IDE interpret Visual C++ identifiers +// such as names of functions and macros. 
+// For more information see https://go.microsoft.com/fwlink/?linkid=865984 +#define CONSTEXPR diff --git a/docs/source/Doxy.md b/docs/source/Doxy.md index 20a2ff4f..1f5da98b 100644 --- a/docs/source/Doxy.md +++ b/docs/source/Doxy.md @@ -18,7 +18,9 @@ CSVReader reader("very_big_file.csv"); for (CSVRow& row: reader) { // Input iterator for (CSVField& field: row) { - // For efficiency, get<>() produces a string_view + // By default, get<>() produces a std::string. + // A more efficient get() is also available, where the resulting + // string_view is valid as long as the parent CSVRow is alive std::cout << field.get<>() << ... } } @@ -94,12 +96,13 @@ Although the CSV parser has a decent guessing mechanism, in some cases it is pre using namespace csv; -CSVFormat format = { - '\t', // Delimiter - '~', // Quote-character - '2', // Line number of header - {} // Column names -- if empty, then filled by reading header row -}; +CSVFormat format; +format.delimiter('\t') + .quote('~') + .header_row(2); // Header is on 3rd row (zero-indexed) + +// Alternatively, we can use format.delimiter({ '\t', ',', ... 
}) +// to tell the CSV guesser which delimiters to try out CSVReader reader("wierd_csv_dialect.csv", {}, format); diff --git a/include/internal/CMakeLists.txt b/include/internal/CMakeLists.txt index 53dcfa09..99fae94a 100644 --- a/include/internal/CMakeLists.txt +++ b/include/internal/CMakeLists.txt @@ -2,13 +2,23 @@ add_library(csv STATIC "") target_sources(csv PRIVATE + compatibility.hpp + constants.hpp + csv_format.hpp + csv_format.cpp + csv_reader.hpp csv_reader.cpp csv_reader_iterator.cpp + csv_row.hpp csv_row.cpp csv_stat.cpp + csv_stat.hpp csv_utility.cpp - data_type.cpp - giant_string_buffer.cpp + csv_utility.hpp + csv_writer.hpp + data_type.h + row_buffer.hpp + row_buffer.cpp ) set_target_properties(csv PROPERTIES LINKER_LANGUAGE CXX) \ No newline at end of file diff --git a/include/internal/compatibility.hpp b/include/internal/compatibility.hpp index 4ff1cc05..a5ba251a 100644 --- a/include/internal/compatibility.hpp +++ b/include/internal/compatibility.hpp @@ -4,10 +4,37 @@ #define SUPPRESS_UNUSED_WARNING(x) (void)x namespace csv { - #if __cplusplus >= 201703L + #if CMAKE_CXX_STANDARD == 17 || __cplusplus >= 201703L + #define CSV_HAS_CXX17 + #endif + + #ifdef CSV_HAS_CXX17 #include + /** @typedef string_view + * The string_view class used by this library. + */ using string_view = std::string_view; #else + /** @typedef string_view + * The string_view class used by this library. 
+ */ using string_view = nonstd::string_view; #endif + + // Resolves g++ bug with regard to constexpr methods + #ifdef __GNUC__ + #if __GNUC__ >= 7 + #if defined(CSV_HAS_CXX17) && (__GNUC_MINOR__ >= 2 || __GNUC__ >= 8) + #define CONSTEXPR constexpr + #endif + #endif + #else + #ifdef CSV_HAS_CXX17 + #define CONSTEXPR constexpr + #endif + #endif + + #ifndef CONSTEXPR + #define CONSTEXPR inline + #endif } \ No newline at end of file diff --git a/include/internal/constants.hpp b/include/internal/constants.hpp index 33a338d6..2ccb5dd7 100644 --- a/include/internal/constants.hpp +++ b/include/internal/constants.hpp @@ -18,6 +18,7 @@ namespace csv { return sys_info.dwPageSize; } + /** Size of a memory page in bytes */ const int PAGE_SIZE = getpagesize(); #elif defined(__linux__) #include @@ -29,23 +30,11 @@ namespace csv { /** @brief For functions that lazy load a large CSV, this determines how * many bytes are read at a time */ - const size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB + const size_t ITERATION_CHUNK_SIZE = 50000000; // 50MB } /** @brief Used for counting number of rows */ using RowCount = long long int; using CSVCollection = std::deque; - - /** @name Global Constants */ - ///@{ - /** @brief A dummy variable used to indicate delimiter should be guessed */ - const CSVFormat GUESS_CSV = { '\0', '"', 0, {}, false, true }; - - /** @brief RFC 4180 CSV format */ - const CSVFormat DEFAULT_CSV = { ',', '"', 0, {}, false, true }; - - /** @brief RFC 4180 CSV format with strict parsing */ - const CSVFormat DEFAULT_CSV_STRICT = { ',', '"', 0, {}, true, true }; - ///@} } \ No newline at end of file diff --git a/include/internal/csv_format.cpp b/include/internal/csv_format.cpp new file mode 100644 index 00000000..cb7a7bab --- /dev/null +++ b/include/internal/csv_format.cpp @@ -0,0 +1,64 @@ +#include "csv_format.hpp" + +namespace csv { + CSVFormat create_default_csv_strict() { + CSVFormat format; + format.delimiter(',') + .quote('"') + .header_row(0) + .detect_bom(true) + 
.strict_parsing(true); + + return format; + } + + CSVFormat create_guess_csv() { + CSVFormat format; + format.delimiter({ ',', '|', '\t', ';', '^' }) + .quote('"') + .header_row(0) + .detect_bom(true); + + return format; + } + + const CSVFormat CSVFormat::RFC4180_STRICT = create_default_csv_strict(); + const CSVFormat CSVFormat::GUESS_CSV = create_guess_csv(); + + CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + return *this; + } + + CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + return *this; + } + + CSVFormat& CSVFormat::quote(char quote) { + this->quote_char = quote; + return *this; + } + + CSVFormat& CSVFormat::column_names(const std::vector& col_names) { + this->col_names = col_names; + this->header = -1; + return *this; + } + + CSVFormat& CSVFormat::header_row(int row) { + this->header = row; + this->col_names = {}; + return *this; + } + + CSVFormat& CSVFormat::strict_parsing(bool throw_error) { + this->strict = throw_error; + return *this; + } + + CSVFormat& CSVFormat::detect_bom(bool detect) { + this->unicode_detect = detect; + return *this; + } +} \ No newline at end of file diff --git a/include/internal/csv_format.hpp b/include/internal/csv_format.hpp index e74f5fa6..f795063e 100644 --- a/include/internal/csv_format.hpp +++ b/include/internal/csv_format.hpp @@ -1,31 +1,89 @@ #pragma once +#include #include #include namespace csv { - /** - * @brief Stores information about how to parse a CSV file - * - * - Can be used to initialize a csv::CSVReader() object - * - The preferred way to pass CSV format information between functions - * - * @see csv::DEFAULT_CSV, csv::GUESS_CSV - * + class CSVReader; + + /** Stores information about how to parse a CSV file. + * Can be used to construct a csv::CSVReader. 
*/ - struct CSVFormat { - char delim; - char quote_char; + class CSVFormat { + public: + /** Settings for parsing an RFC 4180 CSV file */ + CSVFormat() = default; + + /** Sets the delimiter of the CSV file */ + CSVFormat& delimiter(char delim); + + /** Sets a list of potential delimiters + * + * @param[in] delim An array of possible delimiters to try parsing the CSV with + */ + CSVFormat& delimiter(const std::vector & delim); + + /** Sets the quote character */ + CSVFormat& quote(char quote); + + /** Sets the column names */ + CSVFormat& column_names(const std::vector& col_names); + + /** Sets the header row */ + CSVFormat& header_row(int row); + + /** Tells the parser to throw an std::runtime_error if an + * invalid CSV sequence is found + */ + CSVFormat& strict_parsing(bool strict = true); + + /** Tells the parser to detect and remove UTF-8 byte order marks */ + CSVFormat& detect_bom(bool detect = true); + + + #ifndef DOXYGEN_SHOULD_SKIP_THIS + char get_delim() { + // This error should never be received by end users.
+ if (this->possible_delimiters.size() > 1) { + throw std::runtime_error("There is more than one possible delimiter."); + } + + return this->possible_delimiters.at(0); + } + + int get_header() { + return this->header; + } + #endif + + /** CSVFormat for guessing the delimiter */ + static const CSVFormat GUESS_CSV; + + /** CSVFormat for strict RFC 4180 parsing */ + static const CSVFormat RFC4180_STRICT; + + friend CSVReader; + private: + bool guess_delim() { + return this->possible_delimiters.size() > 1; + } + + /** Set of possible delimiters */ + std::vector possible_delimiters = { ',' }; + + /** Quote character */ + char quote_char = '"'; - /**< @brief Row number with columns (ignored if col_names is non-empty) */ - int header; + /** Row number with columns (ignored if col_names is non-empty) */ + int header = 0; - /**< @brief Should be left empty unless file doesn't include header */ - std::vector col_names; + /** Should be left empty unless file doesn't include header */ + std::vector col_names = {}; - /**< @brief RFC 4180 non-compliance -> throw an error */ - bool strict; + /** RFC 4180 non-compliance -> throw an error */ + bool strict = false; - /**< @brief Detect and strip out Unicode byte order marks */ - bool unicode_detect; + /** Detect and strip out Unicode byte order marks */ + bool unicode_detect = true; }; } \ No newline at end of file diff --git a/include/internal/csv_reader.cpp b/include/internal/csv_reader.cpp index d631c31b..6e197cb4 100644 --- a/include/internal/csv_reader.cpp +++ b/include/internal/csv_reader.cpp @@ -1,3 +1,6 @@ +#include "csv_reader.hpp" +#include "csv_reader.hpp" +#include "csv_reader.hpp" #include #include // For read_csv() #include // For read_csv() @@ -37,13 +40,16 @@ namespace csv { } } - void CSVGuesser::guess_delim() { + CSVFormat CSVGuesser::guess_delim() { /** Guess the delimiter of a CSV by scanning the first 100 lines by * First assuming that the header is on the first row * If the first guess returns too few
rows, then we move to the second * guess method */ + CSVFormat format; if (!first_guess()) second_guess(); + + return format.delimiter(this->delim).header_row(this->header_row); } bool CSVGuesser::first_guess() { @@ -58,7 +64,7 @@ namespace csv { * Returns True if guess was a good one and second guess isn't needed */ - CSVFormat format = DEFAULT_CSV; + CSVFormat format; char current_delim{ ',' }; RowCount max_rows = 0, temp_rows = 0; @@ -67,8 +73,8 @@ namespace csv { // Read first 500KB of the CSV file this->get_csv_head(); - for (char delim: this->delims) { - format.delim = delim; + for (char cand_delim: this->delims) { + format.delimiter(cand_delim); CSVReader guesser(format); guesser.feed(this->head); guesser.end_feed(); @@ -81,7 +87,7 @@ namespace csv { (guesser.get_col_names().size() > max_cols)) { max_rows = temp_rows; max_cols = guesser.get_col_names().size(); - current_delim = delim; + current_delim = cand_delim; } } @@ -98,12 +104,12 @@ namespace csv { * the mode row length. */ - CSVFormat format = DEFAULT_CSV; + CSVFormat format; size_t max_rlen = 0, header = 0; - for (char delim: this->delims) { - format.delim = delim; + for (char cand_delim: this->delims) { + format.delimiter(cand_delim); Guesser guess(format); guess.feed(this->head); guess.end_feed(); @@ -152,30 +158,25 @@ namespace csv { } /** @brief Guess the delimiter used by a delimiter-separated values file */ - CSVFormat guess_format(const std::string& filename) { - internals::CSVGuesser guesser(filename); - guesser.guess_delim(); - return { guesser.delim, '"', guesser.header_row }; + CSVFormat guess_format(const std::string& filename, const std::vector& delims) { + internals::CSVGuesser guesser(filename, delims); + return guesser.guess_delim(); } - std::vector CSVReader::make_flags() const { - /** Create a vector v where each index i corresponds to the - * ASCII number for a character and, v[i + 128] labels it according to - * the CSVReader::ParseFlags enum - */ - - std::vector ret; + CONSTEXPR 
std::array CSVReader::make_flags() const { + std::array ret = {}; for (int i = -128; i < 128; i++) { + const int arr_idx = i + 128; char ch = char(i); if (ch == this->delimiter) - ret.push_back(DELIMITER); + ret[arr_idx] = DELIMITER; else if (ch == this->quote_char) - ret.push_back(QUOTE); + ret[arr_idx] = QUOTE; else if (ch == '\r' || ch == '\n') - ret.push_back(NEWLINE); + ret[arr_idx] = NEWLINE; else - ret.push_back(NOT_SPECIAL); + ret[arr_idx] = NOT_SPECIAL; } return ret; @@ -203,13 +204,14 @@ namespace csv { * @brief Allows parsing in-memory sources (by calling feed() and end_feed()). */ CSVReader::CSVReader(CSVFormat format) : - delimiter(format.delim), quote_char(format.quote_char), + delimiter(format.get_delim()), quote_char(format.quote_char), header_row(format.header), strict(format.strict), unicode_bom_scan(!format.unicode_detect) { if (!format.col_names.empty()) { - this->header_row = -1; - this->col_names = std::make_shared(format.col_names); + this->set_col_names(format.col_names); } + + parse_flags = this->make_flags(); }; /** @@ -228,27 +230,35 @@ namespace csv { * */ CSVReader::CSVReader(const std::string& filename, CSVFormat format) { - if (format.delim == '\0') - format = guess_format(filename); + if (format.guess_delim()) + format = guess_format(filename, format.possible_delimiters); - this->col_names = std::make_shared(format.col_names); - delimiter = format.delim; + if (!format.col_names.empty()) { + this->set_col_names(format.col_names); + } + else { + header_row = format.header; + } + + delimiter = format.get_delim(); quote_char = format.quote_char; - header_row = format.header; strict = format.strict; + parse_flags = this->make_flags(); // Read first 500KB of CSV - read_csv(filename, 500000); + this->fopen(filename); + this->read_csv(500000); } /** @brief Return the format of the original raw CSV */ CSVFormat CSVReader::get_format() const { - return { - this->delimiter, - this->quote_char, - this->header_row, - this->col_names->col_names - 
}; + CSVFormat format; + format.delimiter(this->delimiter) + .quote(this->quote_char) + .header_row(this->header_row) + .column_names(this->col_names->col_names); + + return format; } /** @brief Return the CSV's column names as a vector of strings. */ @@ -271,6 +281,12 @@ namespace csv { this->feed( csv::string_view(buff.first.get(), buff.second) ); } + CONSTEXPR void CSVReader::move_to_end_of_field(csv::string_view in, size_t& i, const size_t& in_size) { + while (i + 1 < in_size && parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + i++; + } + } + void CSVReader::feed(csv::string_view in) { /** @brief Parse a CSV-formatted string. * @@ -278,30 +294,22 @@ namespace csv { * **Note**: end_feed() should be called after the last string */ - if (parse_flags.empty()) parse_flags = this->make_flags(); - + this->handle_unicode_bom(in); bool quote_escape = false; // Are we currently in a quote escaped field? - // Unicode BOM Handling - if (!this->unicode_bom_scan) { - if (in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xEF) { - in.remove_prefix(3); // Remove BOM from input string - this->utf8_bom = true; - } - - this->unicode_bom_scan = true; - } - - // Optimization - this->record_buffer->reserve(in.size()); - std::string& _record_buffer = *(this->record_buffer.get()); + // Optimizations + auto& row_buffer = *(this->record_buffer.get()); + auto& text_buffer = row_buffer.buffer; + auto& split_buffer = row_buffer.split_buffer; + text_buffer.reserve(in.size()); + split_buffer.reserve(in.size() / 10); const size_t in_size = in.size(); for (size_t i = 0; i < in_size; i++) { - switch (this->parse_flags[in[i] + 128]) { + switch (parse_flags[in[i] + 128]) { case DELIMITER: if (!quote_escape) { - this->split_buffer.push_back(this->record_buffer.size()); + split_buffer.push_back((unsigned short)row_buffer.size()); break; } case NEWLINE: @@ -312,22 +320,23 @@ namespace csv { this->write_record(); break; } + + // Treat as regular character + text_buffer += in[i]; + break; case NOT_SPECIAL: { 
// Optimization: Since NOT_SPECIAL characters tend to occur in contiguous // sequences, use the loop below to avoid having to go through the outer // switch statement as much as possible - #if __cplusplus >= 201703L + #ifdef CSV_HAS_CXX17 size_t start = i; - while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { - i++; - } - - _record_buffer += in.substr(start, i - start + 1); + this->move_to_end_of_field(in, i, in_size); + text_buffer += in.substr(start, i - start + 1); #else - _record_buffer += in[i]; + text_buffer += in[i]; - while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { - _record_buffer += in[++i]; + while (i + 1 < in_size && parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + text_buffer += in[++i]; } #endif @@ -336,7 +345,7 @@ namespace csv { default: // Quote if (!quote_escape) { // Don't deref past beginning - if (i && this->parse_flags[in[i - 1] + 128] >= DELIMITER) { + if (i && parse_flags[in[i - 1] + 128] >= DELIMITER) { // Case: Previous character was delimiter or newline quote_escape = true; } @@ -344,7 +353,7 @@ namespace csv { break; } - auto next_ch = this->parse_flags[in[i + 1] + 128]; + auto next_ch = parse_flags[in[i + 1] + 128]; if (next_ch >= DELIMITER) { // Case: Delim or newline => end of field quote_escape = false; @@ -352,7 +361,7 @@ namespace csv { } // Case: Escaped quote - _record_buffer += in[i]; + text_buffer += in[i]; if (next_ch == QUOTE) ++i; // Case: Two consecutive quotes @@ -365,7 +374,7 @@ namespace csv { } } - this->record_buffer.reset(); + this->record_buffer = row_buffer.reset(); } void CSVReader::end_feed() { @@ -375,45 +384,43 @@ namespace csv { this->write_record(); } + CONSTEXPR void CSVReader::handle_unicode_bom(csv::string_view& in) { + if (!this->unicode_bom_scan) { + if (in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xEF) { + in.remove_prefix(3); // Remove BOM from input string + this->utf8_bom = true; + } + + this->unicode_bom_scan = true; + } + } + void 
CSVReader::write_record() { /** Push the current row into a queue if it is the right length. * Drop it otherwise. */ - size_t col_names_size = this->col_names->size(); - - auto row = CSVRow( - this->record_buffer.buffer, - this->record_buffer.get_row(), - std::move(this->split_buffer), - this->col_names - ); - - if (this->row_num > this->header_row) { + if (header_was_parsed) { // Make sure record is of the right length - if (row.size() == col_names_size) { + const size_t row_size = this->record_buffer->splits_size(); + if (row_size + 1 == this->n_cols) { this->correct_rows++; - this->records.push_back(std::move(row)); + this->records.push_back(CSVRow(this->record_buffer)); } else { /* 1) Zero-length record, probably caused by extraneous newlines * 2) Too short or too long */ this->row_num--; - if (!row.empty()) - bad_row_handler(std::vector(row)); + if (row_size > 0) + bad_row_handler(std::vector(CSVRow( + this->record_buffer))); } } else if (this->row_num == this->header_row) { - this->col_names = std::make_shared( - std::vector(row)); + this->set_col_names(std::vector(CSVRow(this->record_buffer))); } // else: Ignore rows before header row - // Some memory allocation optimizations - this->split_buffer = {}; - if (this->split_buffer.capacity() < col_names_size) - split_buffer.reserve(col_names_size); - this->row_num++; } @@ -438,28 +445,41 @@ namespace csv { } } - /** - * @brief Parse a CSV file using multiple threads - * - * @param[in] nrows Number of rows to read. Set to -1 to read entire file. 
- * - * @see CSVReader::read_row() - * - */ - void CSVReader::read_csv(const std::string& filename, const size_t& bytes) { + void CSVReader::fopen(const std::string& filename) { if (!this->infile) { - #ifdef _MSC_BUILD +#ifdef _MSC_BUILD // Silence compiler warnings in Microsoft Visual C++ size_t err = fopen_s(&(this->infile), filename.c_str(), "rb"); if (err) throw std::runtime_error("Cannot open file " + filename); - #else +#else this->infile = std::fopen(filename.c_str(), "rb"); if (!this->infile) throw std::runtime_error("Cannot open file " + filename); - #endif +#endif } + } + + /** + * @param[in] names Column names + */ + void CSVReader::set_col_names(const std::vector& names) + { + this->col_names = std::make_shared(names); + this->record_buffer->col_names = this->col_names; + this->header_was_parsed = true; + this->n_cols = names.size(); + } + /** + * Parse a CSV file using multiple threads + * + * @pre CSVReader::infile points to a valid file handle, i.e. CSVReader::fopen was called + * + * @param[in] bytes Number of bytes to read. + * @see CSVReader::read_row() + */ + void CSVReader::read_csv(const size_t& bytes) { const size_t BUFFER_UPPER_LIMIT = std::min(bytes, (size_t)1000000); std::unique_ptr buffer(new char[BUFFER_UPPER_LIMIT]); auto line_buffer = buffer.get(); @@ -523,7 +543,9 @@ namespace csv { bool CSVReader::read_row(CSVRow &row) { if (this->records.empty()) { if (!this->eof()) { - this->read_csv("", internals::ITERATION_CHUNK_SIZE); + // TODO/Suggestion: Make this call non-blocking, + // i.e. 
move to it another thread + this->read_csv(internals::ITERATION_CHUNK_SIZE); } else return false; // Stop reading } diff --git a/include/internal/csv_reader.hpp b/include/internal/csv_reader.hpp index 8cf40e63..24baabdf 100644 --- a/include/internal/csv_reader.hpp +++ b/include/internal/csv_reader.hpp @@ -1,4 +1,5 @@ #pragma once +#include #include #include #include @@ -13,14 +14,14 @@ #include "csv_format.hpp" #include "csv_row.hpp" #include "compatibility.hpp" -#include "giant_string_buffer.hpp" +#include "row_buffer.hpp" /** @namespace csv * @brief The all encompassing namespace */ namespace csv { /** @brief Integer indicating a requested column wasn't found. */ - const int CSV_NOT_FOUND = -1; + constexpr int CSV_NOT_FOUND = -1; /** @namespace csv::internals * @brief Stuff that is generally not of interest to end-users @@ -51,11 +52,13 @@ namespace csv { */ class iterator { public: + #ifndef DOXYGEN_SHOULD_SKIP_THIS using value_type = CSVRow; using difference_type = std::ptrdiff_t; using pointer = CSVRow * ; using reference = CSVRow & ; using iterator_category = std::input_iterator_tag; + #endif iterator() = default; iterator(CSVReader* reader) : daddy(reader) {}; @@ -63,8 +66,8 @@ namespace csv { reference operator*(); pointer operator->(); - iterator& operator++(); // Pre-inc - iterator operator++(int); // Post-inc + iterator& operator++(); /**< Pre-increment iterator */ + iterator operator++(int); /**< Post-increment ierator */ iterator& operator--(); bool operator==(const iterator&) const; @@ -80,8 +83,8 @@ namespace csv { * Constructors for iterating over large files and parsing in-memory sources. 
*/ ///@{ - CSVReader(const std::string& filename, CSVFormat format = GUESS_CSV); - CSVReader(CSVFormat format = DEFAULT_CSV); + CSVReader(const std::string& filename, CSVFormat format = CSVFormat::GUESS_CSV); + CSVReader(CSVFormat format = CSVFormat()); ///@} CSVReader(const CSVReader&) = delete; // No copy constructor @@ -118,18 +121,15 @@ namespace csv { /** @name CSV Metadata: Attributes */ ///@{ - RowCount row_num = 0; /**< @brief How many lines have - * been parsed so far + RowCount row_num = 0; /**< How many lines have been parsed so far */ + RowCount correct_rows = 0; /**< How many correct rows (minus header) + * have been parsed so far */ - RowCount correct_rows = 0; /**< @brief How many correct rows - * (minus header) have been parsed so far - */ - bool utf8_bom = false; /**< @brief Set to true if UTF-8 BOM was detected */ + bool utf8_bom = false; /**< Set to true if UTF-8 BOM was detected */ ///@} - void close(); /**< @brief Close the open file handle. - * Automatically called by ~CSVReader(). - */ + /** Close the open file handle. Automatically called by ~CSVReader(). */ + void close(); friend CSVCollection parse(const std::string&, CSVFormat); protected: @@ -141,80 +141,94 @@ namespace csv { */ /** @typedef ParseFlags - * @brief An enum used for describing the significance of each character - * with respect to CSV parsing + * An enum used for describing the significance of each character + * with respect to CSV parsing */ enum ParseFlags { - NOT_SPECIAL, - QUOTE, - DELIMITER, - NEWLINE + NOT_SPECIAL, /**< Characters with no special meaning */ + QUOTE, /**< Characters which may signify a quote escape */ + DELIMITER, /**< Characters which may signify a new field */ + NEWLINE /**< Characters which may signify a new row */ }; - using WorkItem = std::pair, size_t>; /**< - @brief A string buffer and its size */ + /** A string buffer and its size. Consumed by read_csv_worker(). 
*/ + using WorkItem = std::pair, size_t>; + + /** Create a vector v where each index i corresponds to the + * ASCII number for a character and, v[i + 128] labels it according to + * the CSVReader::ParseFlags enum + */ + CONSTEXPR std::array make_flags() const; + + /** Open a file for reading. Implementation is compiler specific. */ + void fopen(const std::string& filename); - std::vector make_flags() const; + /** Sets this reader's column names and associated data */ + void set_col_names(const std::vector&); - internals::GiantStringBuffer record_buffer; /**< - @brief Buffer for current row being parsed */ + /** Returns true if we have reached end of file */ + bool eof() { return !(this->infile); }; - std::vector split_buffer; /**< - @brief Positions where current row is split */ + /** Buffer for current row being parsed */ + internals::BufferPtr record_buffer = internals::BufferPtr(new internals::RawRowBuffer()); - std::deque records; /**< @brief Queue of parsed CSV rows */ - inline bool eof() { return !(this->infile); }; + /** Queue of parsed CSV rows */ + std::deque records; /** @name CSV Parsing Callbacks * The heart of the CSV parser. * These methods are called by feed(). 
- */ + */ ///@{ void write_record(); + + /** Handles possible Unicode byte order mark */ + CONSTEXPR void handle_unicode_bom(csv::string_view& in); virtual void bad_row_handler(std::vector); ///@} /** @name CSV Settings **/ ///@{ - char delimiter; /**< @brief Delimiter character */ - char quote_char; /**< @brief Quote character */ - int header_row; /**< @brief Line number of the header row (zero-indexed) */ - bool strict = false; /**< @brief Strictness of parser */ + char delimiter; /**< Delimiter character */ + char quote_char; /**< Quote character */ + int header_row; /**< Line number of the header row (zero-indexed) */ + bool strict = false; /**< Strictness of parser */ - std::vector parse_flags; /**< @brief - A table where the (i + 128)th slot gives the ParseFlags for ASCII character i */ + /** An array where the (i + 128)th slot gives the ParseFlags for ASCII character i */ + std::array parse_flags; ///@} /** @name Parser State */ ///@{ - /** <@brief Pointer to a object containing column information - */ - std::shared_ptr col_names = - std::make_shared(std::vector({})); + /** Pointer to a object containing column information */ + internals::ColNamesPtr col_names = std::make_shared( + std::vector({})); - /** <@brief Whether or not an attempt to find Unicode BOM has been made */ + /** Whether or not an attempt to find Unicode BOM has been made */ bool unicode_bom_scan = false; + + /** Whether or not we have parsed the header row */ + bool header_was_parsed = false; + + /** The number of columns in this CSV */ + size_t n_cols = 0; ///@} /** @name Multi-Threaded File Reading Functions */ ///@{ void feed(WorkItem&&); /**< @brief Helper for read_csv_worker() */ - void read_csv( - const std::string& filename, - const size_t& bytes = internals::ITERATION_CHUNK_SIZE - ); + CONSTEXPR void move_to_end_of_field(csv::string_view in, size_t & i, const size_t& in_size); + void read_csv(const size_t& bytes = internals::ITERATION_CHUNK_SIZE); void read_csv_worker(); ///@} /** 
@name Multi-Threaded File Reading: Flags and State */ ///@{ - std::FILE* infile = nullptr; /**< @brief Current file handle. + std::FILE* infile = nullptr; /**< Current file handle. Destroyed by ~CSVReader(). */ - - std::deque feed_buffer; /**< @brief Message queue for worker */ - - std::mutex feed_lock; /**< @brief Allow only one worker to write */ - std::condition_variable feed_cond; /**< @brief Wake up worker */ + std::deque feed_buffer; /**< Message queue for worker */ + std::mutex feed_lock; /**< Allow only one worker to write */ + std::condition_variable feed_cond; /**< Wake up worker */ ///@} /**@}*/ // End of parser internals @@ -236,19 +250,21 @@ namespace csv { }; public: - CSVGuesser(const std::string& _filename) : filename(_filename) {}; - std::vector delims = { ',', '|', '\t', ';', '^' }; - void guess_delim(); + CSVGuesser(const std::string& _filename, const std::vector& _delims) : + filename(_filename), delims(_delims) {}; + CSVFormat guess_delim(); bool first_guess(); void second_guess(); - char delim; - int header_row = 0; - private: - void get_csv_head(); - std::string filename; - std::string head; + std::string filename; /**< File to read */ + std::string head; /**< First x bytes of file */ + std::vector delims; /**< Candidate delimiters */ + + char delim; /**< Chosen delimiter (set by guess_delim()) */ + int header_row = 0; /**< Chosen header row (set by guess_delim()) */ + + void get_csv_head(); /**< Retrieve the first x bytes of a file */ }; } } \ No newline at end of file diff --git a/include/internal/csv_reader_iterator.cpp b/include/internal/csv_reader_iterator.cpp index 3a19ffab..f7c5acd0 100644 --- a/include/internal/csv_reader_iterator.cpp +++ b/include/internal/csv_reader_iterator.cpp @@ -1,19 +1,15 @@ #include "csv_reader.hpp" namespace csv { - /** - * @brief Return an iterator to the first row in the reader - * - */ + /** Return an iterator to the first row in the reader */ CSVReader::iterator CSVReader::begin() { CSVReader::iterator 
ret(this, std::move(this->records.front())); this->records.pop_front(); return ret; } - /** - * @brief A placeholder for the imaginary past the end row in a CSV. - * Attempting to deference this will lead to bad things. + /** A placeholder for the imaginary past the end row in a CSV. + * Attempting to deference this will lead to bad things. */ CSVReader::iterator CSVReader::end() { return CSVReader::iterator(); diff --git a/include/internal/csv_row.cpp b/include/internal/csv_row.cpp index e7fc6ae9..9368a74f 100644 --- a/include/internal/csv_row.cpp +++ b/include/internal/csv_row.cpp @@ -1,34 +1,12 @@ +/** @file + * Defines the data type used for storing information about a CSV row + */ + #include #include #include "csv_row.hpp" namespace csv { - namespace internals { - ////////////// - // ColNames // - ////////////// - - ColNames::ColNames(const std::vector& _cnames) - : col_names(_cnames) { - for (size_t i = 0; i < _cnames.size(); i++) { - this->col_pos[_cnames[i]] = i; - } - } - - std::vector ColNames::get_col_names() const { - return this->col_names; - } - - size_t ColNames::size() const { - return this->col_names.size(); - } - } - - /** @brief Return the number of fields in this row */ - size_t CSVRow::size() const { - return splits.size() + 1; - } - /** @brief Return a string view of the nth field * @complexity Constant */ @@ -41,16 +19,16 @@ namespace csv { if (n >= r_size) throw std::runtime_error("Index out of bounds."); - if (!splits.empty()) { + if (r_size > 1) { if (n == 0) { - end = this->splits[0]; + end = this->split_at(0); } else if (r_size == 2) { - beg = this->splits[0]; + beg = this->split_at(0); } else { - beg = this->splits[n - 1]; - if (n != r_size - 1) end = this->splits[n]; + beg = this->split_at(n - 1); + if (n != r_size - 1) end = this->split_at(n); } } @@ -86,18 +64,19 @@ namespace csv { * @param[in] col_name The column to look for */ CSVField CSVRow::operator[](const std::string& col_name) const { - auto col_pos = 
this->col_names->col_pos.find(col_name); - if (col_pos != this->col_names->col_pos.end()) + auto & col_names = this->buffer->col_names; + auto col_pos = col_names->col_pos.find(col_name); + if (col_pos != col_names->col_pos.end()) return this->operator[](col_pos->second); throw std::runtime_error("Can't find a column named " + col_name); } + /** Convert this CSVRow into a vector of strings. + * **Note**: This is a less efficient method of + * accessing data than using the [] operator. + */ CSVRow::operator std::vector() const { - /** Convert this CSVRow into a vector of strings. - * **Note**: This is a less efficient method of - * accessing data than using the [] operator. - */ std::vector ret; for (size_t i = 0; i < size(); i++) @@ -106,34 +85,7 @@ namespace csv { return ret; } - ////////////////////// - // CSVField Methods // - ////////////////////// - - /**< @brief Return the type number of the stored value in - * accordance with the DataType enum - */ - DataType CSVField::type() { - this->get_value(); - return (DataType)_type; - } - - #ifndef DOXYGEN_SHOULD_SKIP_THIS - void CSVField::get_value() { - /* Check to see if value has been cached previously, if not - * evaluate it - */ - if (_type < 0) { - auto dtype = internals::data_type(this->sv, &this->value); - this->_type = (int)dtype; - } - } - #endif - - // - // CSVField Utility Methods - // - +#pragma region CSVField Methods bool CSVField::operator==(csv::string_view other) const { return other == this->sv; } @@ -142,10 +94,9 @@ namespace csv { return other == this->get(); } - ///////////////////// - // CSVRow Iterator // - ///////////////////// +#pragma endregion CSVField Methods +#pragma region CSVRow Iterator /** @brief Return an iterator pointing to the first field. 
*/ CSVRow::iterator CSVRow::begin() const { return CSVRow::iterator(this, 0); @@ -167,6 +118,11 @@ namespace csv { return std::reverse_iterator(this->begin()); } + unsigned short CSVRow::split_at(size_t n) const + { + return this->buffer->split_buffer[this->start + n]; + } + CSVRow::iterator::iterator(const CSVRow* _reader, int _i) : daddy(_reader), i(_i) { if (_i < (int)this->daddy->size()) @@ -236,4 +192,5 @@ namespace csv { bool CSVRow::iterator::operator==(const iterator& other) const { return this->i == other.i; } +#pragma endregion CSVRow Iterator } \ No newline at end of file diff --git a/include/internal/csv_row.hpp b/include/internal/csv_row.hpp index fc169138..d6792402 100644 --- a/include/internal/csv_row.hpp +++ b/include/internal/csv_row.hpp @@ -1,9 +1,8 @@ -#pragma once -// Auxiliary data structures for CSV parser - -#include "data_type.h" -#include "compatibility.hpp" +/** @file + * Defines the data type used for storing information about a CSV row + */ +#pragma once #include #include #include @@ -12,25 +11,11 @@ #include // For CSVField #include // For CSVField -namespace csv { - namespace internals { - /** @struct ColNames - * @brief A data structure for handling column name information. - * - * These are created by CSVReader and passed (via smart pointer) - * to CSVRow objects it creates, thus - * allowing for indexing by column name. - */ - struct ColNames { - ColNames(const std::vector&); - std::vector col_names; - std::unordered_map col_pos; - - std::vector get_col_names() const; - size_t size() const; - }; - } +#include "data_type.h" +#include "compatibility.hpp" +#include "row_buffer.hpp" +namespace csv { /** * @class CSVField * @brief Data type representing individual CSV values. 
@@ -38,7 +23,8 @@ namespace csv { */ class CSVField { public: - CSVField(csv::string_view _sv) : sv(_sv) { }; + /** Constructs a CSVField from a string_view */ + constexpr CSVField(csv::string_view _sv) : sv(_sv) { }; /** Returns the value casted to the requested type, performing type checking before. * An std::runtime_error will be thrown if a type mismatch occurs, with the exception @@ -52,8 +38,14 @@ namespace csv { * - long long * - double * - long double + * + @warning Any string_views returned are only guaranteed to be valid + * if the parent CSVRow is still alive. If you are concerned + * about object lifetimes, then grab a std::string or a + * numeric value. + * */ - template T get() { + template T get() { auto dest_type = internals::type_num(); if (dest_type >= CSV_INT && is_num()) { if (internals::type_num() < this->type()) @@ -69,20 +61,41 @@ namespace csv { bool operator==(csv::string_view other) const; bool operator==(const long double& other); - DataType type(); - bool is_null() { return type() == CSV_NULL; } - bool is_str() { return type() == CSV_STRING; } - bool is_num() { return type() >= CSV_INT; } - bool is_int() { + /** Returns true if field is an empty string or string of whitespace characters */ + CONSTEXPR bool is_null() { return type() == CSV_NULL; } + + /** Returns true if field is a non-numeric string */ + CONSTEXPR bool is_str() { return type() == CSV_STRING; } + + /** Returns true if field is an integer or float */ + CONSTEXPR bool is_num() { return type() >= CSV_INT; } + + /** Returns true if field is an integer */ + CONSTEXPR bool is_int() { return (type() >= CSV_INT) && (type() <= CSV_LONG_LONG_INT); } - bool is_float() { return type() == CSV_DOUBLE; }; + + /** Returns true if field is a float*/ + CONSTEXPR bool is_float() { return type() == CSV_DOUBLE; }; + + /** Return the type of the underlying CSV data */ + CONSTEXPR DataType type() { + this->get_value(); + return (DataType)_type; + } private: - long double value = 0; - 
csv::string_view sv = ""; - int _type = -1; - void get_value(); + long double value = 0; /**< Cached numeric value */ + csv::string_view sv = ""; /**< A pointer to this field's text */ + DataType _type = UNKNOWN; /**< Cached data type value */ + CONSTEXPR void get_value() { + /* Check to see if value has been cached previously, if not + * evaluate it + */ + if (_type < 0) { + this->_type = internals::data_type(this->sv, &this->value); + } + } }; /** @@ -100,31 +113,27 @@ namespace csv { class CSVRow { public: CSVRow() = default; - CSVRow( - std::shared_ptr _str, - csv::string_view _row_str, - std::vector&& _splits, - std::shared_ptr _cnames = nullptr) : - str(_str), - row_str(_row_str), - splits(std::move(_splits)), - col_names(_cnames) - {}; - - CSVRow( - std::string _row_str, - std::vector&& _splits, - std::shared_ptr _cnames = nullptr - ) : - str(std::make_shared(_row_str)), - splits(std::move(_splits)), - col_names(_cnames) + + /** Construct a CSVRow from a RawRowBuffer. Should be called by CSVReader::write_record. 
*/ + CSVRow(const internals::BufferPtr& _str) : buffer(_str) { - row_str = csv::string_view(this->str->c_str()); + this->row_str = _str->get_row(); + + auto splits = _str->get_splits(); + this->start = splits.start; + this->n_cols = splits.n_cols; }; - bool empty() const { return this->row_str.empty(); } - size_t size() const; + /** Constructor for testing */ + CSVRow(const std::string& str, const std::vector splits, + const std::shared_ptr& col_names) + : CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {}; + + /** Indicates whether row is empty or not */ + CONSTEXPR bool empty() const { return this->row_str.empty(); } + + /** @brief Return the number of fields in this row */ + CONSTEXPR size_t size() const { return this->n_cols; } /** @name Value Retrieval */ ///@{ @@ -139,6 +148,7 @@ namespace csv { */ class iterator { public: + #ifndef DOXYGEN_SHOULD_SKIP_THIS using value_type = CSVField; using difference_type = int; @@ -152,6 +162,7 @@ namespace csv { using reference = CSVField & ; using iterator_category = std::random_access_iterator_tag; + #endif iterator(const CSVRow*, int i); @@ -192,28 +203,39 @@ namespace csv { ///@} private: - std::shared_ptr str = nullptr; - csv::string_view row_str = ""; - std::vector splits = {}; - std::shared_ptr col_names = nullptr; + /** Get the index in CSVRow's text buffer where the n-th field begins */ + unsigned short split_at(size_t n) const; + + internals::BufferPtr buffer = nullptr; /**< Memory buffer containing data for this row. 
*/ + csv::string_view row_str = ""; /**< Text data for this row */ + size_t start; /**< Where in split buffer this row begins */ + unsigned short n_cols; /**< Numbers of columns this row has */ }; - // get() specializations +#pragma region CSVField::get Specializations + /** Retrieve this field's original string */ template<> inline std::string CSVField::get() { return std::string(this->sv); } + /** Retrieve a view over this field's string + * + * @warning This string_view is only guaranteed to be valid as long as this + * CSVRow is still alive. + */ template<> - inline csv::string_view CSVField::get() { + CONSTEXPR csv::string_view CSVField::get() { return this->sv; } + /** Retrieve this field's value as a long double */ template<> - inline long double CSVField::get() { + CONSTEXPR long double CSVField::get() { if (!is_num()) throw std::runtime_error("Not a number."); return this->value; } +#pragma endregion CSVField::get Specializations } \ No newline at end of file diff --git a/include/internal/csv_stat.cpp b/include/internal/csv_stat.cpp index d0036ba3..8f6f6192 100644 --- a/include/internal/csv_stat.cpp +++ b/include/internal/csv_stat.cpp @@ -1,11 +1,11 @@ +/** @file + * Calculates statistics from CSV files + */ + #include #include "csv_stat.hpp" namespace csv { - /** @file - * Calculates statistics from CSV files - */ - CSVStat::CSVStat(std::string filename, CSVFormat format) : CSVReader(filename, format) { /** Lazily calculate statistics for a potentially large file. Once this constructor @@ -13,7 +13,7 @@ namespace csv { * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. 
*/ while (!this->eof()) { - this->read_csv("", internals::ITERATION_CHUNK_SIZE); + this->read_csv(internals::ITERATION_CHUNK_SIZE); this->calc(); } diff --git a/include/internal/csv_stat.hpp b/include/internal/csv_stat.hpp index 3cb0da28..ed2021e7 100644 --- a/include/internal/csv_stat.hpp +++ b/include/internal/csv_stat.hpp @@ -1,3 +1,7 @@ +/** @file + * Calculates statistics from CSV files + */ + #pragma once #include #include @@ -24,8 +28,8 @@ namespace csv { std::vector get_counts() const; std::vector get_dtypes() const; - CSVStat(std::string filename, CSVFormat format = GUESS_CSV); - CSVStat(CSVFormat format = DEFAULT_CSV) : CSVReader(format) {}; + CSVStat(std::string filename, CSVFormat format = CSVFormat::GUESS_CSV); + CSVStat(CSVFormat format = CSVFormat()) : CSVReader(format) {}; private: // An array of rolling averages // Each index corresponds to the rolling mean for the column at said index diff --git a/include/internal/csv_utility.cpp b/include/internal/csv_utility.cpp index 3989016c..2aef665e 100644 --- a/include/internal/csv_utility.cpp +++ b/include/internal/csv_utility.cpp @@ -1,6 +1,5 @@ #include -#include "constants.hpp" #include "csv_utility.hpp" #include "csv_reader.hpp" @@ -74,7 +73,7 @@ namespace csv { CSVFileInfo info = { filename, reader.get_col_names(), - format.delim, + format.get_delim(), reader.correct_rows, (int)reader.get_col_names().size() }; diff --git a/include/internal/csv_utility.hpp b/include/internal/csv_utility.hpp index 6b79f17c..feb8b40d 100644 --- a/include/internal/csv_utility.hpp +++ b/include/internal/csv_utility.hpp @@ -18,19 +18,20 @@ namespace csv { */ ///@{ CSVCollection operator ""_csv(const char*, size_t); - CSVCollection parse(const std::string& in, CSVFormat format = DEFAULT_CSV); + CSVCollection parse(const std::string& in, CSVFormat format = CSVFormat()); ///@} /** @name Utility Functions */ ///@{ std::unordered_map csv_data_types(const std::string&); CSVFileInfo get_file_info(const std::string& filename); - 
CSVFormat guess_format(const std::string& filename); + CSVFormat guess_format(const std::string& filename, + const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); std::vector get_col_names( const std::string& filename, - const CSVFormat format = GUESS_CSV); + const CSVFormat format = CSVFormat::GUESS_CSV); int get_col_pos(const std::string filename, const std::string col_name, - const CSVFormat format = GUESS_CSV); + const CSVFormat format = CSVFormat::GUESS_CSV); ///@} namespace internals { diff --git a/include/internal/data_type.cpp b/include/internal/data_type.cpp deleted file mode 100644 index 8cd29612..00000000 --- a/include/internal/data_type.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include - -#include "data_type.h" -#include "compatibility.hpp" - -/** @file - * @brief Provides numeric parsing functionality - */ - -namespace csv { - namespace internals { - #ifndef DOXYGEN_SHOULD_SKIP_THIS - std::string type_name(const DataType& dtype) { - switch (dtype) { - case CSV_STRING: - return "string"; - case CSV_INT: - return "int"; - case CSV_LONG_INT: - return "long int"; - case CSV_LONG_LONG_INT: - return "long long int"; - case CSV_DOUBLE: - return "double"; - default: - return "null"; - } - }; - #endif - - constexpr long double _INT_MAX = (long double)std::numeric_limits::max(); - constexpr long double _LONG_MAX = (long double)std::numeric_limits::max(); - constexpr long double _LONG_LONG_MAX = (long double)std::numeric_limits::max(); - - /** Given a pointer to the start of what is start of - * the exponential part of a number written (possibly) in scientific notation - * parse the exponent - */ - inline DataType _process_potential_exponential( - csv::string_view exponential_part, - const long double& coeff, - long double * const out) { - long double exponent = 0; - auto result = data_type(exponential_part, &exponent); - - if (result >= CSV_INT && result <= CSV_DOUBLE) { - if (out) *out = coeff * pow10(exponent); - return CSV_DOUBLE; - } - - return 
CSV_STRING; - } - - /** Given the absolute value of an integer, determine what numeric type - * it fits in - */ - inline DataType _determine_integral_type(const long double& number) { - // We can assume number is always non-negative - assert(number >= 0); - - if (number < _INT_MAX) - return CSV_INT; - else if (number < _LONG_MAX) - return CSV_LONG_INT; - else if (number < _LONG_LONG_MAX) - return CSV_LONG_LONG_INT; - else // Conversion to long long will cause an overflow - return CSV_DOUBLE; - } - - DataType data_type(csv::string_view in, long double* const out) { - /** Distinguishes numeric from other text values. Used by various - * type casting functions, like csv_parser::CSVReader::read_row() - * - * #### Rules - * - Leading and trailing whitespace ("padding") ignored - * - A string of just whitespace is NULL - * - * @param[in] in String value to be examined - */ - - // Empty string --> NULL - if (in.size() == 0) - return CSV_NULL; - - bool ws_allowed = true, - neg_allowed = true, - dot_allowed = true, - digit_allowed = true, - has_digit = false, - prob_float = false; - - unsigned places_after_decimal = 0; - long double integral_part = 0, - decimal_part = 0; - - for (size_t i = 0, ilen = in.size(); i < ilen; i++) { - const char& current = in[i]; - - switch (current) { - case ' ': - if (!ws_allowed) { - if (isdigit(in[i - 1])) { - digit_allowed = false; - ws_allowed = true; - } - else { - // Ex: '510 123 4567' - return CSV_STRING; - } - } - break; - case '-': - if (!neg_allowed) { - // Ex: '510-123-4567' - return CSV_STRING; - } - - neg_allowed = false; - break; - case '.': - if (!dot_allowed) { - return CSV_STRING; - } - - dot_allowed = false; - prob_float = true; - break; - case 'e': - case 'E': - // Process scientific notation - if (prob_float) { - size_t exponent_start_idx = i + 1; - - // Strip out plus sign - if (in[i + 1] == '+') { - exponent_start_idx++; - } - - return _process_potential_exponential( - in.substr(exponent_start_idx), - neg_allowed ? 
integral_part + decimal_part : -(integral_part + decimal_part), - out - ); - } - - return CSV_STRING; - break; - default: - if (isdigit(current)) { - // Process digit - has_digit = true; - - if (!digit_allowed) - return CSV_STRING; - else if (ws_allowed) // Ex: '510 456' - ws_allowed = false; - - // Build current number - unsigned digit = current - '0'; - if (prob_float) { - decimal_part += digit / pow10(++places_after_decimal); - } - else { - integral_part = (integral_part * 10) + digit; - } - } - else { - return CSV_STRING; - } - } - } - - // No non-numeric/non-whitespace characters found - if (has_digit) { - long double number = integral_part + decimal_part; - if (out) { - *out = neg_allowed ? number : -number; - } - - return prob_float ? CSV_DOUBLE : _determine_integral_type(number); - } - - // Just whitespace - return CSV_NULL; - } - } -} \ No newline at end of file diff --git a/include/internal/data_type.h b/include/internal/data_type.h index 6e1f3f7d..67fb3c86 100644 --- a/include/internal/data_type.h +++ b/include/internal/data_type.h @@ -2,25 +2,27 @@ #include #include #include +#include #include "compatibility.hpp" namespace csv { /** Enumerates the different CSV field types that are - * recognized by this library - * - * - 0. CSV_NULL (empty string) - * - 1. CSV_STRING - * - 2. CSV_INT - * - 3. CSV_LONG_INT - * - 4. CSV_LONG_LONG_INT - * - 5. CSV_DOUBLE - * - * **Note**: Overflowing integers will be stored and classified as doubles. - * Furthermore, the same number may either be a CSV_LONG_INT or CSV_INT depending on - * compiler and platform. - */ + * recognized by this library + * + * - 0. CSV_NULL (empty string) + * - 1. CSV_STRING + * - 2. CSV_INT + * - 3. CSV_LONG_INT + * - 4. CSV_LONG_LONG_INT + * - 5. CSV_DOUBLE + * + * **Note**: Overflowing integers will be stored and classified as doubles. + * Furthermore, the same number may either be a CSV_LONG_INT or CSV_INT depending on + * compiler and platform. 
+ */ enum DataType { + UNKNOWN = -1, CSV_NULL, CSV_STRING, CSV_INT, @@ -30,6 +32,36 @@ namespace csv { }; namespace internals { + /** Compute 10 to the power of n */ + template + CONSTEXPR long double pow10(const T& n) { + long double multiplicand = n > 0 ? 10 : 0.1, + ret = 1; + + // Make all numbers positive + T iterations = n > 0 ? n : -n; + + for (T i = 0; i < iterations; i++) { + ret *= multiplicand; + } + + return ret; + } + + /** Compute 10 to the power of n */ + template<> + CONSTEXPR long double pow10(const unsigned& n) { + long double multiplicand = n > 0 ? 10 : 0.1, + ret = 1; + + for (unsigned i = 0; i < n; i++) { + ret *= multiplicand; + } + + return ret; + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS template DataType type_num(); @@ -41,21 +73,185 @@ namespace csv { template<> inline DataType type_num() { return CSV_NULL; } template<> inline DataType type_num() { return CSV_STRING; } - /* Compute 10 to the power of n */ - template - const long double pow10(const T& n) { - long double multiplicand = n > 0 ? 10 : 0.1, - ret = 1; - T iterations = n > 0 ? 
n : -n; - - for (T i = 0; i < iterations; i++) { - ret *= multiplicand; + inline std::string type_name(const DataType& dtype) { + switch (dtype) { + case CSV_STRING: + return "string"; + case CSV_INT: + return "int"; + case CSV_LONG_INT: + return "long int"; + case CSV_LONG_LONG_INT: + return "long long int"; + case CSV_DOUBLE: + return "double"; + default: + return "null"; } + }; - return ret; + CONSTEXPR DataType data_type(csv::string_view in, long double* const out = nullptr); +#endif + + /** Largest number that can be stored in an integer */ + constexpr long double _INT_MAX = (long double)std::numeric_limits::max(); + + /** Largest number that can be stored in a long int */ + constexpr long double _LONG_MAX = (long double)std::numeric_limits::max(); + + /** Largest number that can be stored in a long long int */ + constexpr long double _LONG_LONG_MAX = (long double)std::numeric_limits::max(); + + /** Given a string_view over what is (possibly) the exponential part + * of a number written in scientific notation, + * parse the exponent + */ + CONSTEXPR DataType _process_potential_exponential( + csv::string_view exponential_part, + const long double& coeff, + long double * const out) { + long double exponent = 0; + auto result = data_type(exponential_part, &exponent); + + if (result >= CSV_INT && result <= CSV_DOUBLE) { + if (out) *out = coeff * pow10(exponent); + return CSV_DOUBLE; + } + + return CSV_STRING; + } + + /** Given the absolute value of an integer, determine what numeric type + * it fits in + */ + CONSTEXPR DataType _determine_integral_type(const long double& number) { + // We can assume number is always non-negative + assert(number >= 0); + + if (number < _INT_MAX) + return CSV_INT; + else if (number < _LONG_MAX) + return CSV_LONG_INT; + else if (number < _LONG_LONG_MAX) + return CSV_LONG_LONG_INT; + else // Conversion to long long will cause an overflow + return CSV_DOUBLE; } - std::string type_name(const DataType&); - DataType 
data_type(csv::string_view in, long double* const out = nullptr); + /** Distinguishes numeric from other text values. Used by various + * type casting functions, like csv_parser::CSVReader::read_row() + * + * #### Rules + * - Leading and trailing whitespace ("padding") ignored + * - A string of just whitespace is NULL + * + * @param[in] in String value to be examined + * @param[out] out Pointer to long double where results of numeric parsing + * get stored + */ + CONSTEXPR DataType data_type(csv::string_view in, long double* const out) { + // Empty string --> NULL + if (in.size() == 0) + return CSV_NULL; + + bool ws_allowed = true, + neg_allowed = true, + dot_allowed = true, + digit_allowed = true, + has_digit = false, + prob_float = false; + + unsigned places_after_decimal = 0; + long double integral_part = 0, + decimal_part = 0; + + for (size_t i = 0, ilen = in.size(); i < ilen; i++) { + const char& current = in[i]; + + switch (current) { + case ' ': + if (!ws_allowed) { + if (isdigit(in[i - 1])) { + digit_allowed = false; + ws_allowed = true; + } + else { + // Ex: '510 123 4567' + return CSV_STRING; + } + } + break; + case '-': + if (!neg_allowed) { + // Ex: '510-123-4567' + return CSV_STRING; + } + + neg_allowed = false; + break; + case '.': + if (!dot_allowed) { + return CSV_STRING; + } + + dot_allowed = false; + prob_float = true; + break; + case 'e': + case 'E': + // Process scientific notation + if (prob_float) { + size_t exponent_start_idx = i + 1; + + // Strip out plus sign + if (in[i + 1] == '+') { + exponent_start_idx++; + } + + return _process_potential_exponential( + in.substr(exponent_start_idx), + neg_allowed ? 
integral_part + decimal_part : -(integral_part + decimal_part), + out + ); + } + + return CSV_STRING; + break; + default: + short digit = current - '0'; + if (digit >= 0 && digit <= 9) { + // Process digit + has_digit = true; + + if (!digit_allowed) + return CSV_STRING; + else if (ws_allowed) // Ex: '510 456' + ws_allowed = false; + + // Build current number + if (prob_float) + decimal_part += digit / pow10(++places_after_decimal); + else + integral_part = (integral_part * 10) + digit; + } + else { + return CSV_STRING; + } + } + } + + // No non-numeric/non-whitespace characters found + if (has_digit) { + long double number = integral_part + decimal_part; + if (out) { + *out = neg_allowed ? number : -number; + } + + return prob_float ? CSV_DOUBLE : _determine_integral_type(number); + } + + // Just whitespace + return CSV_NULL; + } } } \ No newline at end of file diff --git a/include/internal/giant_string_buffer.cpp b/include/internal/giant_string_buffer.cpp deleted file mode 100644 index d93d0051..00000000 --- a/include/internal/giant_string_buffer.cpp +++ /dev/null @@ -1,43 +0,0 @@ -#include "giant_string_buffer.hpp" - -namespace csv { - namespace internals { - /** - * Return a string_view over the current_row - */ - csv::string_view GiantStringBuffer::get_row() { - csv::string_view ret( - this->buffer->c_str() + this->current_end, // Beginning of string - (this->buffer->size() - this->current_end) // Count - ); - - this->current_end = this->buffer->size(); - return ret; - } - - /** Return size of current row */ - size_t GiantStringBuffer::size() const { - return (this->buffer->size() - this->current_end); - } - - std::string* GiantStringBuffer::get() const { - return this->buffer.get(); - } - - std::string* GiantStringBuffer::operator->() const { - return this->buffer.operator->(); - } - - /** Clear out the buffer, but save current row in progress */ - void GiantStringBuffer::reset() { - // Save current row in progress - auto temp_str = this->buffer->substr( - 
this->current_end, // Position - (this->buffer->size() - this->current_end) // Count - ); - - this->current_end = 0; - this->buffer = std::make_shared(temp_str); - } - } -} \ No newline at end of file diff --git a/include/internal/giant_string_buffer.hpp b/include/internal/giant_string_buffer.hpp deleted file mode 100644 index 46ea52fb..00000000 --- a/include/internal/giant_string_buffer.hpp +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include - -#include "compatibility.hpp" // For string view - -namespace csv { - namespace internals { - /** Class for reducing number of new string malloc() calls */ - class GiantStringBuffer { - public: - csv::string_view get_row(); - size_t size() const; - std::string* get() const; - std::string* operator->() const; - std::shared_ptr buffer = std::make_shared(); - void reset(); - - private: - size_t current_end = 0; - }; - } -} \ No newline at end of file diff --git a/include/internal/row_buffer.cpp b/include/internal/row_buffer.cpp new file mode 100644 index 00000000..4f1c9bba --- /dev/null +++ b/include/internal/row_buffer.cpp @@ -0,0 +1,71 @@ +#include "row_buffer.hpp" + +namespace csv { + namespace internals { + ////////////// + // ColNames // + ////////////// + + ColNames::ColNames(const std::vector& _cnames) + : col_names(_cnames) { + for (size_t i = 0; i < _cnames.size(); i++) { + this->col_pos[_cnames[i]] = i; + } + } + + std::vector ColNames::get_col_names() const { + return this->col_names; + } + + size_t ColNames::size() const { + return this->col_names.size(); + } + + csv::string_view RawRowBuffer::get_row() { + csv::string_view ret( + this->buffer.c_str() + this->current_end, // Beginning of string + (this->buffer.size() - this->current_end) // Count + ); + + this->current_end = this->buffer.size(); + return ret; + } + + ColumnPositions RawRowBuffer::get_splits() + { + const size_t head_idx = this->current_split_idx, + new_split_idx = this->split_buffer.size(); + + this->current_split_idx = new_split_idx; + return 
ColumnPositions(*this, head_idx, new_split_idx - head_idx + 1); + } + + size_t RawRowBuffer::size() const { + return this->buffer.size() - this->current_end; + } + + size_t RawRowBuffer::splits_size() const { + return this->split_buffer.size() - this->current_split_idx; + } + + BufferPtr RawRowBuffer::reset() const { + // Save current row in progress + auto new_buff = BufferPtr(new RawRowBuffer()); + + new_buff->buffer = this->buffer.substr( + this->current_end, // Position + (this->buffer.size() - this->current_end) // Count + ); + + new_buff->col_names = this->col_names; + + // No need to remove unnecessary bits from this buffer + // (memory savings would be marginal anyways) + return new_buff; + } + + unsigned short ColumnPositions::split_at(int n) const { + return this->parent->split_buffer[this->start + n]; + } + } +} \ No newline at end of file diff --git a/include/internal/row_buffer.hpp b/include/internal/row_buffer.hpp new file mode 100644 index 00000000..7f8e4639 --- /dev/null +++ b/include/internal/row_buffer.hpp @@ -0,0 +1,77 @@ +#pragma once +#include +#include +#include + +#include "compatibility.hpp" // For string view + +namespace csv { + namespace internals { + class RawRowBuffer; + struct ColumnPositions; + struct ColNames; + using BufferPtr = std::shared_ptr; + using ColNamesPtr = std::shared_ptr; + using SplitArray = std::vector; + + /** @struct ColNames + * A data structure for handling column name information. + * + * These are created by CSVReader and passed (via smart pointer) + * to CSVRow objects it creates, thus + * allowing for indexing by column name. 
+ */ + struct ColNames { + ColNames(const std::vector&); + std::vector col_names; + std::unordered_map col_pos; + + std::vector get_col_names() const; + size_t size() const; + }; + + /** Class for reducing number of new string malloc() calls */ + class RawRowBuffer { + public: + RawRowBuffer() = default; + + /** Constructor mainly used for testing + * @param[in] _buffer CSV text without delimiters or newlines + * @param[in] _splits Positions in buffer where CSV fields begin + * @param[in] _col_names Pointer to a vector of column names + */ + RawRowBuffer(const std::string& _buffer, const std::vector& _splits, + const std::shared_ptr& _col_names) : + buffer(_buffer), split_buffer(_splits), col_names(_col_names) {}; + + csv::string_view get_row(); /**< Return a string_view over the current_row */ + ColumnPositions get_splits(); /**< Return the field start positions for the current row */ + + size_t size() const; /**< Return size of current row */ + size_t splits_size() const; /**< Return (num columns - 1) for current row */ + BufferPtr reset() const; /**< Create a new RawRowBuffer with this buffer's unfinished work */ + + std::string buffer; /**< Buffer for storing text */ + SplitArray split_buffer = {}; /**< Array for storing indices (in buffer) + of where CSV fields start */ + ColNamesPtr col_names = nullptr; /**< Pointer to column names */ + + private: + size_t current_end = 0; /**< Where we are currently in the text buffer */ + size_t current_split_idx = 0; /**< Where we are currently in the split buffer */ + }; + + struct ColumnPositions { + ColumnPositions() : parent(nullptr) {}; + constexpr ColumnPositions(const RawRowBuffer& _parent, + size_t _start, unsigned short _size) : parent(&_parent), start(_start), n_cols(_size) {}; + + const RawRowBuffer * parent; /**< RawRowBuffer to grab data from */ + size_t start; /**< Where in split_buffer the array of column positions begins */ + unsigned short n_cols; /**< Number of columns */ + + /// Get the n-th column index 
+ unsigned short split_at(int n) const; + }; + } +} \ No newline at end of file diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 7981b06c..0377cf63 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -1,17 +1,32 @@ add_executable(csv_info ${CMAKE_CURRENT_LIST_DIR}/csv_info.cpp) target_link_libraries(csv_info csv) -add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/csv_bench.cpp) -target_link_libraries(csv_bench csv) - -add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/csv_guess_bench.cpp) -target_link_libraries(csv_guess_bench csv) - add_executable(csv_stats ${CMAKE_CURRENT_LIST_DIR}/csv_stats.cpp) target_link_libraries(csv_stats csv) -add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/csv_generator.cpp) -target_link_libraries(csv_generator csv) +# Provide rudimentary benchmarks +if(CSV_DEVELOPER) + add_executable(csv_guess_bench ${CMAKE_CURRENT_LIST_DIR}/csv_guess_bench.cpp) + target_link_libraries(csv_guess_bench csv) + + add_executable(csv_generator ${CMAKE_CURRENT_LIST_DIR}/csv_generator.cpp) + target_link_libraries(csv_generator csv) + + # Benchmarks for parsing speed + add_executable(csv_bench ${CMAKE_CURRENT_LIST_DIR}/csv_bench.cpp) + target_link_libraries(csv_bench csv) + + add_custom_target(generate_csv_bench + COMMAND csv_bench 2015_StateDepartment.csv + WORKING_DIRECTORY ${CSV_TEST_DIR}/data/real_data + ) + + # Benchmarks for data_type() function + add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/data_type_bench.cpp) + target_link_libraries(data_type_bench csv) -add_executable(data_type_bench ${CMAKE_CURRENT_LIST_DIR}/data_type_bench.cpp) -target_link_libraries(data_type_bench csv) \ No newline at end of file + add_custom_target(generate_dtype_bench + COMMAND data_type_bench 2015_StateDepartment.csv "Regular Pay" + WORKING_DIRECTORY ${CSV_TEST_DIR}/data/real_data + ) +endif() \ No newline at end of file diff --git a/programs/csv_guess_bench.cpp b/programs/csv_guess_bench.cpp index 
faa267ed..bf6568b0 100644 --- a/programs/csv_guess_bench.cpp +++ b/programs/csv_guess_bench.cpp @@ -21,7 +21,7 @@ int main(int argc, char** argv) { auto start = std::chrono::system_clock::now(); // This reads just the first 500 kb of a file - CSVReader reader(filename, GUESS_CSV); + CSVReader reader(filename, CSVFormat::GUESS_CSV); auto end = std::chrono::system_clock::now(); std::chrono::duration diff = end - start; diff --git a/programs/data_type_bench.cpp b/programs/data_type_bench.cpp index e7be40c2..aa76be5c 100644 --- a/programs/data_type_bench.cpp +++ b/programs/data_type_bench.cpp @@ -5,25 +5,11 @@ #define NDEBUG #endif -int main(int argc, char** argv) { - using namespace csv; - - if (argc < 3) { - std::cout << "Usage: " << argv[0] << " [file] [column]" << std::endl; - exit(1); - } - - std::string file = argv[1], - column = argv[2]; - - bool use_std = false; - - if (argc == 4) { - use_std = true; - } +long double get_max(std::string file, std::string column, bool use_std = false); +long double get_max(std::string file, std::string column, bool use_std) { + using namespace csv; long double max = -std::numeric_limits::infinity(); - CSVReader reader(file); for (auto& row : reader) { @@ -47,6 +33,40 @@ int main(int argc, char** argv) { } } + return max; +} + +int main(int argc, char** argv) { + using namespace csv; + + if (argc < 3) { + std::cout << "Usage: " << argv[0] << " [file] [column]" << std::endl; + exit(1); + } + + std::string file = argv[1], + column = argv[2]; + + long double max = 0, std_avg = 0, csv_avg = 0; + const long double trials = 5; + + + for (size_t i = 0; i < trials; i++) { + auto start = std::chrono::system_clock::now(); + max = get_max(file, column, true); + auto end = std::chrono::system_clock::now(); + std::chrono::duration diff = end - start; + std_avg += diff.count() / trials; + + start = std::chrono::system_clock::now(); + max = get_max(file, column, false); + end = std::chrono::system_clock::now(); + diff = end - start; + csv_avg 
+= diff.count() / trials; + } + + std::cout << "std::from_chars: " << std_avg << std::endl; + std::cout << "csv::data_type: " << csv_avg << std::endl; std::cout << "Maximum value: " << max << std::endl; return 0; diff --git a/single_include/csv.hpp b/single_include/csv.hpp index a921a4e7..80b0672d 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -1355,41 +1355,126 @@ nssv_RESTORE_WARNINGS() #define SUPPRESS_UNUSED_WARNING(x) (void)x namespace csv { - #if __cplusplus >= 201703L + #if CMAKE_CXX_STANDARD == 17 || __cplusplus >= 201703L + #define CSV_HAS_CXX17 + #endif + + #ifdef CSV_HAS_CXX17 #include + /** @typedef string_view + * The string_view class used by this library. + */ using string_view = std::string_view; #else + /** @typedef string_view + * The string_view class used by this library. + */ using string_view = nonstd::string_view; #endif + + // Resolves g++ bug with regard to constexpr methods + #ifdef __GNUC__ + #if __GNUC__ >= 7 + #if defined(CSV_HAS_CXX17) && (__GNUC_MINOR__ >= 2 || __GNUC__ >= 8) + #define CONSTEXPR constexpr + #endif + #endif + #else + #ifdef CSV_HAS_CXX17 + #define CONSTEXPR constexpr + #endif + #endif + + #ifndef CONSTEXPR + #define CONSTEXPR inline + #endif } +#include #include #include namespace csv { - /** - * @brief Stores information about how to parse a CSV file - * - * - Can be used to initialize a csv::CSVReader() object - * - The preferred way to pass CSV format information between functions - * - * @see csv::DEFAULT_CSV, csv::GUESS_CSV - * + class CSVReader; + + /** Stores information about how to parse a CSV file. + * Can be used to construct a csv::CSVReader. 
*/ - struct CSVFormat { - char delim; - char quote_char; + class CSVFormat { + public: + /** Settings for parsing a RFC 4180 CSV file */ + CSVFormat() = default; + + /** Sets the delimiter of the CSV file */ + CSVFormat& delimiter(char delim); - /**< @brief Row number with columns (ignored if col_names is non-empty) */ - int header; + /** Sets a list of potential delimiters + * + * @param[in] delim An array of possible delimiters to try parsing the CSV with + */ + CSVFormat& delimiter(const std::vector & delim); - /**< @brief Should be left empty unless file doesn't include header */ - std::vector col_names; + /** Sets the quote character */ + CSVFormat& quote(char quote); - /**< @brief RFC 4180 non-compliance -> throw an error */ - bool strict; + /** Sets the column names */ + CSVFormat& column_names(const std::vector& col_names); - /**< @brief Detect and strip out Unicode byte order marks */ - bool unicode_detect; + /** Sets the header row */ + CSVFormat& header_row(int row); + + /** Tells the parser to throw an std::runtime_error if an + * invalid CSV sequence is found + */ + CSVFormat& strict_parsing(bool strict = true); + + /** Tells the parser to detect and remove UTF-8 byte order marks */ + CSVFormat& detect_bom(bool detect = true); + + + #ifndef DOXYGEN_SHOULD_SKIP_THIS + char get_delim() { + // This error should never be received by end users. 
+ if (this->possible_delimiters.size() > 1) { + throw std::runtime_error("There is more than one possible delimiter."); + } + + return this->possible_delimiters.at(0); + } + + int get_header() { + return this->header; + } + #endif + + /** CSVFormat for guessing the delimiter */ + static const CSVFormat GUESS_CSV; + + /** CSVFormat for strict RFC 4180 parsing */ + static const CSVFormat RFC4180_STRICT; + + friend CSVReader; + private: + bool guess_delim() { + return this->possible_delimiters.size() > 1; + } + + /**< Set of possible delimiters */ + std::vector possible_delimiters = { ',' }; + + /**< Quote character */ + char quote_char = '"'; + + /**< Row number with columns (ignored if col_names is non-empty) */ + int header = 0; + + /**< Should be left empty unless file doesn't include header */ + std::vector col_names = {}; + + /**< RFC 4180 non-compliance -> throw an error */ + bool strict = false; + + /**< Detect and strip out Unicode byte order marks */ + bool unicode_detect = true; }; } #include @@ -1522,24 +1607,26 @@ namespace csv { #include #include #include +#include namespace csv { /** Enumerates the different CSV field types that are - * recognized by this library - * - * - 0. CSV_NULL (empty string) - * - 1. CSV_STRING - * - 2. CSV_INT - * - 3. CSV_LONG_INT - * - 4. CSV_LONG_LONG_INT - * - 5. CSV_DOUBLE - * - * **Note**: Overflowing integers will be stored and classified as doubles. - * Furthermore, the same number may either be a CSV_LONG_INT or CSV_INT depending on - * compiler and platform. - */ + * recognized by this library + * + * - 0. CSV_NULL (empty string) + * - 1. CSV_STRING + * - 2. CSV_INT + * - 3. CSV_LONG_INT + * - 4. CSV_LONG_LONG_INT + * - 5. CSV_DOUBLE + * + * **Note**: Overflowing integers will be stored and classified as doubles. + * Furthermore, the same number may either be a CSV_LONG_INT or CSV_INT depending on + * compiler and platform. 
+ */ enum DataType { + UNKNOWN = -1, CSV_NULL, CSV_STRING, CSV_INT, @@ -1549,6 +1636,36 @@ namespace csv { }; namespace internals { + /** Compute 10 to the power of n */ + template + CONSTEXPR long double pow10(const T& n) { + long double multiplicand = n > 0 ? 10 : 0.1, + ret = 1; + + // Make all numbers positive + T iterations = n > 0 ? n : -n; + + for (T i = 0; i < iterations; i++) { + ret *= multiplicand; + } + + return ret; + } + + /** Compute 10 to the power of n */ + template<> + CONSTEXPR long double pow10(const unsigned& n) { + long double multiplicand = n > 0 ? 10 : 0.1, + ret = 1; + + for (unsigned i = 0; i < n; i++) { + ret *= multiplicand; + } + + return ret; + } + +#ifndef DOXYGEN_SHOULD_SKIP_THIS template DataType type_num(); @@ -1560,59 +1677,204 @@ namespace csv { template<> inline DataType type_num() { return CSV_NULL; } template<> inline DataType type_num() { return CSV_STRING; } - /* Compute 10 to the power of n */ - template - const long double pow10(const T& n) { - long double multiplicand = n > 0 ? 10 : 0.1, - ret = 1; - T iterations = n > 0 ? 
n : -n; - - for (T i = 0; i < iterations; i++) { - ret *= multiplicand; + inline std::string type_name(const DataType& dtype) { + switch (dtype) { + case CSV_STRING: + return "string"; + case CSV_INT: + return "int"; + case CSV_LONG_INT: + return "long int"; + case CSV_LONG_LONG_INT: + return "long long int"; + case CSV_DOUBLE: + return "double"; + default: + return "null"; } + }; - return ret; + CONSTEXPR DataType data_type(csv::string_view in, long double* const out = nullptr); +#endif + + /** Largest number that can be stored in an integer */ + constexpr long double _INT_MAX = (long double)std::numeric_limits::max(); + + /** Largest number that can be stored in a long int */ + constexpr long double _LONG_MAX = (long double)std::numeric_limits::max(); + + /** Largest number that can be stored in a long long int */ + constexpr long double _LONG_LONG_MAX = (long double)std::numeric_limits::max(); + + /** Given a string_view over what is (possibly) the exponential part + * of a number written in scientific notation, + * parse the exponent + */ + CONSTEXPR DataType _process_potential_exponential( + csv::string_view exponential_part, + const long double& coeff, + long double * const out) { + long double exponent = 0; + auto result = data_type(exponential_part, &exponent); + + if (result >= CSV_INT && result <= CSV_DOUBLE) { + if (out) *out = coeff * pow10(exponent); + return CSV_DOUBLE; + } + + return CSV_STRING; } - std::string type_name(const DataType&); - DataType data_type(csv::string_view in, long double* const out = nullptr); - } -} -#include + /** Given the absolute value of an integer, determine what numeric type + * it fits in + */ + CONSTEXPR DataType _determine_integral_type(const long double& number) { + // We can assume number is always non-negative + assert(number >= 0); + if (number < _INT_MAX) + return CSV_INT; + else if (number < _LONG_MAX) + return CSV_LONG_INT; + else if (number < _LONG_LONG_MAX) + return CSV_LONG_LONG_INT; + else // 
Conversion to long long will cause an overflow + return CSV_DOUBLE; + } -namespace csv { - namespace internals { - /** Class for reducing number of new string malloc() calls */ - class GiantStringBuffer { - public: - csv::string_view get_row(); - size_t size() const; - std::string* get() const; - std::string* operator->() const; - std::shared_ptr buffer = std::make_shared(); - void reset(); + /** Distinguishes numeric from other text values. Used by various + * type casting functions, like csv_parser::CSVReader::read_row() + * + * #### Rules + * - Leading and trailing whitespace ("padding") ignored + * - A string of just whitespace is NULL + * + * @param[in] in String value to be examined + * @param[out] out Pointer to long double where results of numeric parsing + * get stored + */ + CONSTEXPR DataType data_type(csv::string_view in, long double* const out) { + // Empty string --> NULL + if (in.size() == 0) + return CSV_NULL; - private: - size_t current_end = 0; - }; - } -} -// Auxiliary data structures for CSV parser + bool ws_allowed = true, + neg_allowed = true, + dot_allowed = true, + digit_allowed = true, + has_digit = false, + prob_float = false; + unsigned places_after_decimal = 0; + long double integral_part = 0, + decimal_part = 0; -#include + for (size_t i = 0, ilen = in.size(); i < ilen; i++) { + const char& current = in[i]; + + switch (current) { + case ' ': + if (!ws_allowed) { + if (isdigit(in[i - 1])) { + digit_allowed = false; + ws_allowed = true; + } + else { + // Ex: '510 123 4567' + return CSV_STRING; + } + } + break; + case '-': + if (!neg_allowed) { + // Ex: '510-123-4567' + return CSV_STRING; + } + + neg_allowed = false; + break; + case '.': + if (!dot_allowed) { + return CSV_STRING; + } + + dot_allowed = false; + prob_float = true; + break; + case 'e': + case 'E': + // Process scientific notation + if (prob_float) { + size_t exponent_start_idx = i + 1; + + // Strip out plus sign + if (in[i + 1] == '+') { + exponent_start_idx++; + } + + return 
_process_potential_exponential( + in.substr(exponent_start_idx), + neg_allowed ? integral_part + decimal_part : -(integral_part + decimal_part), + out + ); + } + + return CSV_STRING; + break; + default: + short digit = current - '0'; + if (digit >= 0 && digit <= 9) { + // Process digit + has_digit = true; + + if (!digit_allowed) + return CSV_STRING; + else if (ws_allowed) // Ex: '510 456' + ws_allowed = false; + + // Build current number + if (prob_float) + decimal_part += digit / pow10(++places_after_decimal); + else + integral_part = (integral_part * 10) + digit; + } + else { + return CSV_STRING; + } + } + } + + // No non-numeric/non-whitespace characters found + if (has_digit) { + long double number = integral_part + decimal_part; + if (out) { + *out = neg_allowed ? number : -number; + } + + return prob_float ? CSV_DOUBLE : _determine_integral_type(number); + } + + // Just whitespace + return CSV_NULL; + } + } +} +#include #include -#include -#include -#include // For ColNames -#include // For CSVField -#include // For CSVField +#include + namespace csv { namespace internals { + class RawRowBuffer; + struct ColumnPositions; + struct ColNames; + using BufferPtr = std::shared_ptr; + using ColNamesPtr = std::shared_ptr; + using SplitArray = std::vector; + /** @struct ColNames - * @brief A data structure for handling column name information. + * A data structure for handling column name information. 
* * These are created by CSVReader and passed (via smart pointer) * to CSVRow objects it creates, thus @@ -1626,8 +1888,66 @@ namespace csv { std::vector get_col_names() const; size_t size() const; }; + + /** Class for reducing number of new string malloc() calls */ + class RawRowBuffer { + public: + RawRowBuffer() = default; + + /** Constructor mainly used for testing + * @param[in] _buffer CSV text without delimiters or newlines + * @param[in] _splits Positions in buffer where CSV fields begin + * @param[in] _col_names Pointer to a vector of column names + */ + RawRowBuffer(const std::string& _buffer, const std::vector& _splits, + const std::shared_ptr& _col_names) : + buffer(_buffer), split_buffer(_splits), col_names(_col_names) {}; + + csv::string_view get_row(); /**< Return a string_view over the current_row */ + ColumnPositions get_splits(); /**< Return the field start positions for the current row */ + + size_t size() const; /**< Return size of current row */ + size_t splits_size() const; /**< Return (num columns - 1) for current row */ + BufferPtr reset() const; /**< Create a new RawRowBuffer with this buffer's unfinished work */ + + std::string buffer; /**< Buffer for storing text */ + SplitArray split_buffer = {}; /**< Array for storing indices (in buffer) + of where CSV fields start */ + ColNamesPtr col_names = nullptr; /**< Pointer to column names */ + + private: + size_t current_end = 0; /**< Where we are currently in the text buffer */ + size_t current_split_idx = 0; /**< Where we are currently in the split buffer */ + }; + + struct ColumnPositions { + ColumnPositions() : parent(nullptr) {}; + constexpr ColumnPositions(const RawRowBuffer& _parent, + size_t _start, unsigned short _size) : parent(&_parent), start(_start), n_cols(_size) {}; + + const RawRowBuffer * parent; /**< RawRowBuffer to grab data from */ + size_t start; /**< Where in split_buffer the array of column positions begins */ + unsigned short n_cols; /**< Number of columns */ + + /// Get 
the n-th column index + unsigned short split_at(int n) const; + }; } +} +/** @file + * Defines the data type used for storing information about a CSV row + */ + +#include +#include +#include +#include +#include // For ColNames +#include // For CSVField +#include // For CSVField + +namespace csv { /** * @class CSVField * @brief Data type representing individual CSV values. @@ -1635,7 +1955,8 @@ namespace csv { */ class CSVField { public: - CSVField(csv::string_view _sv) : sv(_sv) { }; + /** Constructs a CSVField from a string_view */ + constexpr CSVField(csv::string_view _sv) : sv(_sv) { }; /** Returns the value casted to the requested type, performing type checking before. * An std::runtime_error will be thrown if a type mismatch occurs, with the exception @@ -1649,8 +1970,14 @@ namespace csv { * - long long * - double * - long double + * + @warning Any string_views returned are only guaranteed to be valid + * if the parent CSVRow is still alive. If you are concerned + * about object lifetimes, then grab a std::string or a + * numeric value. 
+ * */ - template T get() { + template T get() { auto dest_type = internals::type_num(); if (dest_type >= CSV_INT && is_num()) { if (internals::type_num() < this->type()) @@ -1666,20 +1993,41 @@ namespace csv { bool operator==(csv::string_view other) const; bool operator==(const long double& other); - DataType type(); - bool is_null() { return type() == CSV_NULL; } - bool is_str() { return type() == CSV_STRING; } - bool is_num() { return type() >= CSV_INT; } - bool is_int() { + /** Returns true if field is an empty string or string of whitespace characters */ + CONSTEXPR bool is_null() { return type() == CSV_NULL; } + + /** Returns true if field is a non-numeric string */ + CONSTEXPR bool is_str() { return type() == CSV_STRING; } + + /** Returns true if field is an integer or float */ + CONSTEXPR bool is_num() { return type() >= CSV_INT; } + + /** Returns true if field is an integer */ + CONSTEXPR bool is_int() { return (type() >= CSV_INT) && (type() <= CSV_LONG_LONG_INT); } - bool is_float() { return type() == CSV_DOUBLE; }; + + /** Returns true if field is a float*/ + CONSTEXPR bool is_float() { return type() == CSV_DOUBLE; }; + + /** Return the type of the underlying CSV data */ + CONSTEXPR DataType type() { + this->get_value(); + return (DataType)_type; + } private: - long double value = 0; - csv::string_view sv = ""; - int _type = -1; - void get_value(); + long double value = 0; /**< Cached numeric value */ + csv::string_view sv = ""; /**< A pointer to this field's text */ + DataType _type = UNKNOWN; /**< Cached data type value */ + CONSTEXPR void get_value() { + /* Check to see if value has been cached previously, if not + * evaluate it + */ + if (_type < 0) { + this->_type = internals::data_type(this->sv, &this->value); + } + } }; /** @@ -1697,31 +2045,27 @@ namespace csv { class CSVRow { public: CSVRow() = default; - CSVRow( - std::shared_ptr _str, - csv::string_view _row_str, - std::vector&& _splits, - std::shared_ptr _cnames = nullptr) : - str(_str), - 
row_str(_row_str), - splits(std::move(_splits)), - col_names(_cnames) - {}; - - CSVRow( - std::string _row_str, - std::vector&& _splits, - std::shared_ptr _cnames = nullptr - ) : - str(std::make_shared(_row_str)), - splits(std::move(_splits)), - col_names(_cnames) + + /** Construct a CSVRow from a RawRowBuffer. Should be called by CSVReader::write_record. */ + CSVRow(const internals::BufferPtr& _str) : buffer(_str) { - row_str = csv::string_view(this->str->c_str()); + this->row_str = _str->get_row(); + + auto splits = _str->get_splits(); + this->start = splits.start; + this->n_cols = splits.n_cols; }; - bool empty() const { return this->row_str.empty(); } - size_t size() const; + /** Constructor for testing */ + CSVRow(const std::string& str, const std::vector splits, + const std::shared_ptr& col_names) + : CSVRow(internals::BufferPtr(new internals::RawRowBuffer(str, splits, col_names))) {}; + + /** Indicates whether row is empty or not */ + CONSTEXPR bool empty() const { return this->row_str.empty(); } + + /** @brief Return the number of fields in this row */ + CONSTEXPR size_t size() const { return this->n_cols; } /** @name Value Retrieval */ ///@{ @@ -1736,6 +2080,7 @@ namespace csv { */ class iterator { public: + #ifndef DOXYGEN_SHOULD_SKIP_THIS using value_type = CSVField; using difference_type = int; @@ -1749,6 +2094,7 @@ namespace csv { using reference = CSVField & ; using iterator_category = std::random_access_iterator_tag; + #endif iterator(const CSVRow*, int i); @@ -1789,30 +2135,41 @@ namespace csv { ///@} private: - std::shared_ptr str = nullptr; - csv::string_view row_str = ""; - std::vector splits = {}; - std::shared_ptr col_names = nullptr; + /** Get the index in CSVRow's text buffer where the n-th field begins */ + unsigned short split_at(size_t n) const; + + internals::BufferPtr buffer = nullptr; /**< Memory buffer containing data for this row. 
*/ + csv::string_view row_str = ""; /**< Text data for this row */ + size_t start; /**< Where in split buffer this row begins */ + unsigned short n_cols; /**< Numbers of columns this row has */ }; - // get() specializations +#pragma region CSVField::get Specializations + /** Retrieve this field's original string */ template<> inline std::string CSVField::get() { return std::string(this->sv); } + /** Retrieve a view over this field's string + * + * @warning This string_view is only guaranteed to be valid as long as this + * CSVRow is still alive. + */ template<> - inline csv::string_view CSVField::get() { + CONSTEXPR csv::string_view CSVField::get() { return this->sv; } + /** Retrieve this field's value as a long double */ template<> - inline long double CSVField::get() { + CONSTEXPR long double CSVField::get() { if (!is_num()) throw std::runtime_error("Not a number."); return this->value; } +#pragma endregion CSVField::get Specializations } #include @@ -1831,6 +2188,7 @@ namespace csv { return sys_info.dwPageSize; } + /** Size of a memory page in bytes */ const int PAGE_SIZE = getpagesize(); #elif defined(__linux__) #include @@ -1842,26 +2200,15 @@ namespace csv { /** @brief For functions that lazy load a large CSV, this determines how * many bytes are read at a time */ - const size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB + const size_t ITERATION_CHUNK_SIZE = 50000000; // 50MB } /** @brief Used for counting number of rows */ using RowCount = long long int; using CSVCollection = std::deque; - - /** @name Global Constants */ - ///@{ - /** @brief A dummy variable used to indicate delimiter should be guessed */ - const CSVFormat GUESS_CSV = { '\0', '"', 0, {}, false, true }; - - /** @brief RFC 4180 CSV format */ - const CSVFormat DEFAULT_CSV = { ',', '"', 0, {}, false, true }; - - /** @brief RFC 4180 CSV format with strict parsing */ - const CSVFormat DEFAULT_CSV_STRICT = { ',', '"', 0, {}, true, true }; - ///@} } +#include #include #include #include @@ -1877,7 
+2224,7 @@ namespace csv { */ namespace csv { /** @brief Integer indicating a requested column wasn't found. */ - const int CSV_NOT_FOUND = -1; + constexpr int CSV_NOT_FOUND = -1; /** @namespace csv::internals * @brief Stuff that is generally not of interest to end-users @@ -1908,11 +2255,13 @@ namespace csv { */ class iterator { public: + #ifndef DOXYGEN_SHOULD_SKIP_THIS using value_type = CSVRow; using difference_type = std::ptrdiff_t; using pointer = CSVRow * ; using reference = CSVRow & ; using iterator_category = std::input_iterator_tag; + #endif iterator() = default; iterator(CSVReader* reader) : daddy(reader) {}; @@ -1920,8 +2269,8 @@ namespace csv { reference operator*(); pointer operator->(); - iterator& operator++(); // Pre-inc - iterator operator++(int); // Post-inc + iterator& operator++(); /**< Pre-increment iterator */ + iterator operator++(int); /**< Post-increment ierator */ iterator& operator--(); bool operator==(const iterator&) const; @@ -1937,8 +2286,8 @@ namespace csv { * Constructors for iterating over large files and parsing in-memory sources. 
*/ ///@{ - CSVReader(const std::string& filename, CSVFormat format = GUESS_CSV); - CSVReader(CSVFormat format = DEFAULT_CSV); + CSVReader(const std::string& filename, CSVFormat format = CSVFormat::GUESS_CSV); + CSVReader(CSVFormat format = CSVFormat()); ///@} CSVReader(const CSVReader&) = delete; // No copy constructor @@ -1975,18 +2324,15 @@ namespace csv { /** @name CSV Metadata: Attributes */ ///@{ - RowCount row_num = 0; /**< @brief How many lines have - * been parsed so far - */ - RowCount correct_rows = 0; /**< @brief How many correct rows - * (minus header) have been parsed so far + RowCount row_num = 0; /**< How many lines have been parsed so far */ + RowCount correct_rows = 0; /**< How many correct rows (minus header) + * have been parsed so far */ - bool utf8_bom = false; /**< @brief Set to true if UTF-8 BOM was detected */ + bool utf8_bom = false; /**< Set to true if UTF-8 BOM was detected */ ///@} - void close(); /**< @brief Close the open file handle. - * Automatically called by ~CSVReader(). - */ + /** Close the open file handle. Automatically called by ~CSVReader(). */ + void close(); friend CSVCollection parse(const std::string&, CSVFormat); protected: @@ -1998,80 +2344,94 @@ namespace csv { */ /** @typedef ParseFlags - * @brief An enum used for describing the significance of each character - * with respect to CSV parsing + * An enum used for describing the significance of each character + * with respect to CSV parsing */ enum ParseFlags { - NOT_SPECIAL, - QUOTE, - DELIMITER, - NEWLINE + NOT_SPECIAL, /**< Characters with no special meaning */ + QUOTE, /**< Characters which may signify a quote escape */ + DELIMITER, /**< Characters which may signify a new field */ + NEWLINE /**< Characters which may signify a new row */ }; - using WorkItem = std::pair, size_t>; /**< - @brief A string buffer and its size */ + /** A string buffer and its size. Consumed by read_csv_worker(). 
*/ + using WorkItem = std::pair, size_t>; - std::vector make_flags() const; + /** Create a vector v where each index i corresponds to the + * ASCII number for a character and, v[i + 128] labels it according to + * the CSVReader::ParseFlags enum + */ + CONSTEXPR std::array make_flags() const; + + /** Open a file for reading. Implementation is compiler specific. */ + void fopen(const std::string& filename); + + /** Sets this reader's column names and associated data */ + void set_col_names(const std::vector&); - internals::GiantStringBuffer record_buffer; /**< - @brief Buffer for current row being parsed */ + /** Returns true if we have reached end of file */ + bool eof() { return !(this->infile); }; - std::vector split_buffer; /**< - @brief Positions where current row is split */ + /** Buffer for current row being parsed */ + internals::BufferPtr record_buffer = internals::BufferPtr(new internals::RawRowBuffer()); - std::deque records; /**< @brief Queue of parsed CSV rows */ - inline bool eof() { return !(this->infile); }; + /** Queue of parsed CSV rows */ + std::deque records; /** @name CSV Parsing Callbacks * The heart of the CSV parser. * These methods are called by feed(). 
- */ + */ ///@{ void write_record(); + + /** Handles possible Unicode byte order mark */ + CONSTEXPR void handle_unicode_bom(csv::string_view& in); virtual void bad_row_handler(std::vector); ///@} /** @name CSV Settings **/ ///@{ - char delimiter; /**< @brief Delimiter character */ - char quote_char; /**< @brief Quote character */ - int header_row; /**< @brief Line number of the header row (zero-indexed) */ - bool strict = false; /**< @brief Strictness of parser */ + char delimiter; /**< Delimiter character */ + char quote_char; /**< Quote character */ + int header_row; /**< Line number of the header row (zero-indexed) */ + bool strict = false; /**< Strictness of parser */ - std::vector parse_flags; /**< @brief - A table where the (i + 128)th slot gives the ParseFlags for ASCII character i */ + /** An array where the (i + 128)th slot gives the ParseFlags for ASCII character i */ + std::array parse_flags; ///@} /** @name Parser State */ ///@{ - /** <@brief Pointer to a object containing column information - */ - std::shared_ptr col_names = - std::make_shared(std::vector({})); + /** Pointer to a object containing column information */ + internals::ColNamesPtr col_names = std::make_shared( + std::vector({})); - /** <@brief Whether or not an attempt to find Unicode BOM has been made */ + /** Whether or not an attempt to find Unicode BOM has been made */ bool unicode_bom_scan = false; + + /** Whether or not we have parsed the header row */ + bool header_was_parsed = false; + + /** The number of columns in this CSV */ + size_t n_cols = 0; ///@} /** @name Multi-Threaded File Reading Functions */ ///@{ void feed(WorkItem&&); /**< @brief Helper for read_csv_worker() */ - void read_csv( - const std::string& filename, - const size_t& bytes = internals::ITERATION_CHUNK_SIZE - ); + CONSTEXPR void move_to_end_of_field(csv::string_view in, size_t & i, const size_t& in_size); + void read_csv(const size_t& bytes = internals::ITERATION_CHUNK_SIZE); void read_csv_worker(); ///@} /** 
@name Multi-Threaded File Reading: Flags and State */ ///@{ - std::FILE* infile = nullptr; /**< @brief Current file handle. + std::FILE* infile = nullptr; /**< Current file handle. Destroyed by ~CSVReader(). */ - - std::deque feed_buffer; /**< @brief Message queue for worker */ - - std::mutex feed_lock; /**< @brief Allow only one worker to write */ - std::condition_variable feed_cond; /**< @brief Wake up worker */ + std::deque feed_buffer; /**< Message queue for worker */ + std::mutex feed_lock; /**< Allow only one worker to write */ + std::condition_variable feed_cond; /**< Wake up worker */ ///@} /**@}*/ // End of parser internals @@ -2093,22 +2453,28 @@ namespace csv { }; public: - CSVGuesser(const std::string& _filename) : filename(_filename) {}; - std::vector delims = { ',', '|', '\t', ';', '^' }; - void guess_delim(); + CSVGuesser(const std::string& _filename, const std::vector& _delims) : + filename(_filename), delims(_delims) {}; + CSVFormat guess_delim(); bool first_guess(); void second_guess(); - char delim; - int header_row = 0; - private: - void get_csv_head(); - std::string filename; - std::string head; + std::string filename; /**< File to read */ + std::string head; /**< First x bytes of file */ + std::vector delims; /**< Candidate delimiters */ + + char delim; /**< Chosen delimiter (set by guess_delim()) */ + int header_row = 0; /**< Chosen header row (set by guess_delim()) */ + + void get_csv_head(); /**< Retrieve the first x bytes of a file */ }; } } +/** @file + * Calculates statistics from CSV files + */ + #include #include @@ -2133,8 +2499,8 @@ namespace csv { std::vector get_counts() const; std::vector get_dtypes() const; - CSVStat(std::string filename, CSVFormat format = GUESS_CSV); - CSVStat(CSVFormat format = DEFAULT_CSV) : CSVReader(format) {}; + CSVStat(std::string filename, CSVFormat format = CSVFormat::GUESS_CSV); + CSVStat(CSVFormat format = CSVFormat()) : CSVReader(format) {}; private: // An array of rolling averages // Each index 
corresponds to the rolling mean for the column at said index @@ -2174,19 +2540,20 @@ namespace csv { */ ///@{ CSVCollection operator ""_csv(const char*, size_t); - CSVCollection parse(const std::string& in, CSVFormat format = DEFAULT_CSV); + CSVCollection parse(const std::string& in, CSVFormat format = CSVFormat()); ///@} /** @name Utility Functions */ ///@{ std::unordered_map csv_data_types(const std::string&); CSVFileInfo get_file_info(const std::string& filename); - CSVFormat guess_format(const std::string& filename); + CSVFormat guess_format(const std::string& filename, + const std::vector& delims = { ',', '|', '\t', ';', '^', '~' }); std::vector get_col_names( const std::string& filename, - const CSVFormat format = GUESS_CSV); + const CSVFormat format = CSVFormat::GUESS_CSV); int get_col_pos(const std::string filename, const std::string col_name, - const CSVFormat format = GUESS_CSV); + const CSVFormat format = CSVFormat::GUESS_CSV); ///@} namespace internals { @@ -2198,6 +2565,69 @@ namespace csv { } } + +namespace csv { + CSVFormat create_default_csv_strict() { + CSVFormat format; + format.delimiter(',') + .quote('"') + .header_row(0) + .detect_bom(true) + .strict_parsing(true); + + return format; + } + + CSVFormat create_guess_csv() { + CSVFormat format; + format.delimiter({ ',', '|', '\t', ';', '^' }) + .quote('"') + .header_row(0) + .detect_bom(true); + + return format; + } + + const CSVFormat CSVFormat::RFC4180_STRICT = create_default_csv_strict(); + const CSVFormat CSVFormat::GUESS_CSV = create_guess_csv(); + + CSVFormat& CSVFormat::delimiter(char delim) { + this->possible_delimiters = { delim }; + return *this; + } + + CSVFormat& CSVFormat::delimiter(const std::vector & delim) { + this->possible_delimiters = delim; + return *this; + } + + CSVFormat& CSVFormat::quote(char quote) { + this->quote_char = quote; + return *this; + } + + CSVFormat& CSVFormat::column_names(const std::vector& col_names) { + this->col_names = col_names; + this->header = -1; + 
return *this; + } + + CSVFormat& CSVFormat::header_row(int row) { + this->header = row; + this->col_names = {}; + return *this; + } + + CSVFormat& CSVFormat::strict_parsing(bool throw_error) { + this->strict = throw_error; + return *this; + } + + CSVFormat& CSVFormat::detect_bom(bool detect) { + this->unicode_detect = detect; + return *this; + } +} #include #include // For read_csv() #include // For read_csv() @@ -2235,13 +2665,16 @@ namespace csv { } } - void CSVGuesser::guess_delim() { + CSVFormat CSVGuesser::guess_delim() { /** Guess the delimiter of a CSV by scanning the first 100 lines by * First assuming that the header is on the first row * If the first guess returns too few rows, then we move to the second * guess method */ + CSVFormat format; if (!first_guess()) second_guess(); + + return format.delimiter(this->delim).header_row(this->header_row); } bool CSVGuesser::first_guess() { @@ -2256,7 +2689,7 @@ namespace csv { * Returns True if guess was a good one and second guess isn't needed */ - CSVFormat format = DEFAULT_CSV; + CSVFormat format; char current_delim{ ',' }; RowCount max_rows = 0, temp_rows = 0; @@ -2265,8 +2698,8 @@ namespace csv { // Read first 500KB of the CSV file this->get_csv_head(); - for (char delim: this->delims) { - format.delim = delim; + for (char cand_delim: this->delims) { + format.delimiter(cand_delim); CSVReader guesser(format); guesser.feed(this->head); guesser.end_feed(); @@ -2279,7 +2712,7 @@ namespace csv { (guesser.get_col_names().size() > max_cols)) { max_rows = temp_rows; max_cols = guesser.get_col_names().size(); - current_delim = delim; + current_delim = cand_delim; } } @@ -2296,12 +2729,12 @@ namespace csv { * the mode row length. 
*/ - CSVFormat format = DEFAULT_CSV; + CSVFormat format; size_t max_rlen = 0, header = 0; - for (char delim: this->delims) { - format.delim = delim; + for (char cand_delim: this->delims) { + format.delimiter(cand_delim); Guesser guess(format); guess.feed(this->head); guess.end_feed(); @@ -2350,30 +2783,25 @@ namespace csv { } /** @brief Guess the delimiter used by a delimiter-separated values file */ - CSVFormat guess_format(const std::string& filename) { - internals::CSVGuesser guesser(filename); - guesser.guess_delim(); - return { guesser.delim, '"', guesser.header_row }; + CSVFormat guess_format(const std::string& filename, const std::vector& delims) { + internals::CSVGuesser guesser(filename, delims); + return guesser.guess_delim(); } - std::vector CSVReader::make_flags() const { - /** Create a vector v where each index i corresponds to the - * ASCII number for a character and, v[i + 128] labels it according to - * the CSVReader::ParseFlags enum - */ - - std::vector ret; + CONSTEXPR std::array CSVReader::make_flags() const { + std::array ret = {}; for (int i = -128; i < 128; i++) { + const int arr_idx = i + 128; char ch = char(i); if (ch == this->delimiter) - ret.push_back(DELIMITER); + ret[arr_idx] = DELIMITER; else if (ch == this->quote_char) - ret.push_back(QUOTE); + ret[arr_idx] = QUOTE; else if (ch == '\r' || ch == '\n') - ret.push_back(NEWLINE); + ret[arr_idx] = NEWLINE; else - ret.push_back(NOT_SPECIAL); + ret[arr_idx] = NOT_SPECIAL; } return ret; @@ -2401,13 +2829,14 @@ namespace csv { * @brief Allows parsing in-memory sources (by calling feed() and end_feed()). 
*/ CSVReader::CSVReader(CSVFormat format) : - delimiter(format.delim), quote_char(format.quote_char), + delimiter(format.get_delim()), quote_char(format.quote_char), header_row(format.header), strict(format.strict), unicode_bom_scan(!format.unicode_detect) { if (!format.col_names.empty()) { - this->header_row = -1; - this->col_names = std::make_shared(format.col_names); + this->set_col_names(format.col_names); } + + parse_flags = this->make_flags(); }; /** @@ -2426,27 +2855,35 @@ namespace csv { * */ CSVReader::CSVReader(const std::string& filename, CSVFormat format) { - if (format.delim == '\0') - format = guess_format(filename); + if (format.guess_delim()) + format = guess_format(filename, format.possible_delimiters); - this->col_names = std::make_shared(format.col_names); - delimiter = format.delim; + if (!format.col_names.empty()) { + this->set_col_names(format.col_names); + } + else { + header_row = format.header; + } + + delimiter = format.get_delim(); quote_char = format.quote_char; - header_row = format.header; strict = format.strict; + parse_flags = this->make_flags(); // Read first 500KB of CSV - read_csv(filename, 500000); + this->fopen(filename); + this->read_csv(500000); } /** @brief Return the format of the original raw CSV */ CSVFormat CSVReader::get_format() const { - return { - this->delimiter, - this->quote_char, - this->header_row, - this->col_names->col_names - }; + CSVFormat format; + format.delimiter(this->delimiter) + .quote(this->quote_char) + .header_row(this->header_row) + .column_names(this->col_names->col_names); + + return format; } /** @brief Return the CSV's column names as a vector of strings. 
*/ @@ -2469,6 +2906,12 @@ namespace csv { this->feed( csv::string_view(buff.first.get(), buff.second) ); } + CONSTEXPR void CSVReader::move_to_end_of_field(csv::string_view in, size_t& i, const size_t& in_size) { + while (i + 1 < in_size && parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + i++; + } + } + void CSVReader::feed(csv::string_view in) { /** @brief Parse a CSV-formatted string. * @@ -2476,30 +2919,22 @@ namespace csv { * **Note**: end_feed() should be called after the last string */ - if (parse_flags.empty()) parse_flags = this->make_flags(); - + this->handle_unicode_bom(in); bool quote_escape = false; // Are we currently in a quote escaped field? - // Unicode BOM Handling - if (!this->unicode_bom_scan) { - if (in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xEF) { - in.remove_prefix(3); // Remove BOM from input string - this->utf8_bom = true; - } - - this->unicode_bom_scan = true; - } - - // Optimization - this->record_buffer->reserve(in.size()); - std::string& _record_buffer = *(this->record_buffer.get()); + // Optimizations + auto& row_buffer = *(this->record_buffer.get()); + auto& text_buffer = row_buffer.buffer; + auto& split_buffer = row_buffer.split_buffer; + text_buffer.reserve(in.size()); + split_buffer.reserve(in.size() / 10); const size_t in_size = in.size(); for (size_t i = 0; i < in_size; i++) { - switch (this->parse_flags[in[i] + 128]) { + switch (parse_flags[in[i] + 128]) { case DELIMITER: if (!quote_escape) { - this->split_buffer.push_back(this->record_buffer.size()); + split_buffer.push_back((unsigned short)row_buffer.size()); break; } case NEWLINE: @@ -2510,22 +2945,23 @@ namespace csv { this->write_record(); break; } + + // Treat as regular character + text_buffer += in[i]; + break; case NOT_SPECIAL: { // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous // sequences, use the loop below to avoid having to go through the outer // switch statement as much as possible - #if __cplusplus >= 201703L + #ifdef CSV_HAS_CXX17 
size_t start = i; - while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { - i++; - } - - _record_buffer += in.substr(start, i - start + 1); + this->move_to_end_of_field(in, i, in_size); + text_buffer += in.substr(start, i - start + 1); #else - _record_buffer += in[i]; + text_buffer += in[i]; - while (i + 1 < in_size && this->parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { - _record_buffer += in[++i]; + while (i + 1 < in_size && parse_flags[in[i + 1] + 128] == NOT_SPECIAL) { + text_buffer += in[++i]; } #endif @@ -2534,7 +2970,7 @@ namespace csv { default: // Quote if (!quote_escape) { // Don't deref past beginning - if (i && this->parse_flags[in[i - 1] + 128] >= DELIMITER) { + if (i && parse_flags[in[i - 1] + 128] >= DELIMITER) { // Case: Previous character was delimiter or newline quote_escape = true; } @@ -2542,7 +2978,7 @@ namespace csv { break; } - auto next_ch = this->parse_flags[in[i + 1] + 128]; + auto next_ch = parse_flags[in[i + 1] + 128]; if (next_ch >= DELIMITER) { // Case: Delim or newline => end of field quote_escape = false; @@ -2550,7 +2986,7 @@ namespace csv { } // Case: Escaped quote - _record_buffer += in[i]; + text_buffer += in[i]; if (next_ch == QUOTE) ++i; // Case: Two consecutive quotes @@ -2563,7 +2999,7 @@ namespace csv { } } - this->record_buffer.reset(); + this->record_buffer = row_buffer.reset(); } void CSVReader::end_feed() { @@ -2573,45 +3009,43 @@ namespace csv { this->write_record(); } + CONSTEXPR void CSVReader::handle_unicode_bom(csv::string_view& in) { + if (!this->unicode_bom_scan) { + if (in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xEF) { + in.remove_prefix(3); // Remove BOM from input string + this->utf8_bom = true; + } + + this->unicode_bom_scan = true; + } + } + void CSVReader::write_record() { /** Push the current row into a queue if it is the right length. * Drop it otherwise. 
- */ - - size_t col_names_size = this->col_names->size(); - - auto row = CSVRow( - this->record_buffer.buffer, - this->record_buffer.get_row(), - std::move(this->split_buffer), - this->col_names - ); + */ - if (this->row_num > this->header_row) { + if (header_was_parsed) { // Make sure record is of the right length - if (row.size() == col_names_size) { + const size_t row_size = this->record_buffer->splits_size(); + if (row_size + 1 == this->n_cols) { this->correct_rows++; - this->records.push_back(std::move(row)); + this->records.push_back(CSVRow(this->record_buffer)); } else { /* 1) Zero-length record, probably caused by extraneous newlines * 2) Too short or too long */ this->row_num--; - if (!row.empty()) - bad_row_handler(std::vector(row)); + if (row_size > 0) + bad_row_handler(std::vector(CSVRow( + this->record_buffer))); } } else if (this->row_num == this->header_row) { - this->col_names = std::make_shared( - std::vector(row)); + this->set_col_names(std::vector(CSVRow(this->record_buffer))); } // else: Ignore rows before header row - // Some memory allocation optimizations - this->split_buffer = {}; - if (this->split_buffer.capacity() < col_names_size) - split_buffer.reserve(col_names_size); - this->row_num++; } @@ -2636,28 +3070,41 @@ namespace csv { } } - /** - * @brief Parse a CSV file using multiple threads - * - * @param[in] nrows Number of rows to read. Set to -1 to read entire file. 
- * - * @see CSVReader::read_row() - * - */ - void CSVReader::read_csv(const std::string& filename, const size_t& bytes) { + void CSVReader::fopen(const std::string& filename) { if (!this->infile) { - #ifdef _MSC_BUILD +#ifdef _MSC_BUILD // Silence compiler warnings in Microsoft Visual C++ size_t err = fopen_s(&(this->infile), filename.c_str(), "rb"); if (err) throw std::runtime_error("Cannot open file " + filename); - #else +#else this->infile = std::fopen(filename.c_str(), "rb"); if (!this->infile) throw std::runtime_error("Cannot open file " + filename); - #endif +#endif } + } + + /** + * @param[in] names Column names + */ + void CSVReader::set_col_names(const std::vector& names) + { + this->col_names = std::make_shared(names); + this->record_buffer->col_names = this->col_names; + this->header_was_parsed = true; + this->n_cols = names.size(); + } + /** + * Parse a CSV file using multiple threads + * + * @pre CSVReader::infile points to a valid file handle, i.e. CSVReader::fopen was called + * + * @param[in] bytes Number of bytes to read. + * @see CSVReader::read_row() + */ + void CSVReader::read_csv(const size_t& bytes) { const size_t BUFFER_UPPER_LIMIT = std::min(bytes, (size_t)1000000); std::unique_ptr buffer(new char[BUFFER_UPPER_LIMIT]); auto line_buffer = buffer.get(); @@ -2721,7 +3168,9 @@ namespace csv { bool CSVReader::read_row(CSVRow &row) { if (this->records.empty()) { if (!this->eof()) { - this->read_csv("", internals::ITERATION_CHUNK_SIZE); + // TODO/Suggestion: Make this call non-blocking, + // i.e. 
move to it another thread + this->read_csv(internals::ITERATION_CHUNK_SIZE); } else return false; // Stop reading } @@ -2734,19 +3183,15 @@ namespace csv { } namespace csv { - /** - * @brief Return an iterator to the first row in the reader - * - */ + /** Return an iterator to the first row in the reader */ CSVReader::iterator CSVReader::begin() { CSVReader::iterator ret(this, std::move(this->records.front())); this->records.pop_front(); return ret; } - /** - * @brief A placeholder for the imaginary past the end row in a CSV. - * Attempting to deference this will lead to bad things. + /** A placeholder for the imaginary past the end row in a CSV. + * Attempting to deference this will lead to bad things. */ CSVReader::iterator CSVReader::end() { return CSVReader::iterator(); @@ -2800,36 +3245,14 @@ namespace csv { return (this->daddy == other.daddy) && (this->i == other.i); } } +/** @file + * Defines the data type used for storing information about a CSV row + */ + #include #include namespace csv { - namespace internals { - ////////////// - // ColNames // - ////////////// - - ColNames::ColNames(const std::vector& _cnames) - : col_names(_cnames) { - for (size_t i = 0; i < _cnames.size(); i++) { - this->col_pos[_cnames[i]] = i; - } - } - - std::vector ColNames::get_col_names() const { - return this->col_names; - } - - size_t ColNames::size() const { - return this->col_names.size(); - } - } - - /** @brief Return the number of fields in this row */ - size_t CSVRow::size() const { - return splits.size() + 1; - } - /** @brief Return a string view of the nth field * @complexity Constant */ @@ -2842,16 +3265,16 @@ namespace csv { if (n >= r_size) throw std::runtime_error("Index out of bounds."); - if (!splits.empty()) { + if (r_size > 1) { if (n == 0) { - end = this->splits[0]; + end = this->split_at(0); } else if (r_size == 2) { - beg = this->splits[0]; + beg = this->split_at(0); } else { - beg = this->splits[n - 1]; - if (n != r_size - 1) end = this->splits[n]; + beg = 
this->split_at(n - 1); + if (n != r_size - 1) end = this->split_at(n); } } @@ -2887,18 +3310,19 @@ namespace csv { * @param[in] col_name The column to look for */ CSVField CSVRow::operator[](const std::string& col_name) const { - auto col_pos = this->col_names->col_pos.find(col_name); - if (col_pos != this->col_names->col_pos.end()) + auto & col_names = this->buffer->col_names; + auto col_pos = col_names->col_pos.find(col_name); + if (col_pos != col_names->col_pos.end()) return this->operator[](col_pos->second); throw std::runtime_error("Can't find a column named " + col_name); } + /** Convert this CSVRow into a vector of strings. + * **Note**: This is a less efficient method of + * accessing data than using the [] operator. + */ CSVRow::operator std::vector() const { - /** Convert this CSVRow into a vector of strings. - * **Note**: This is a less efficient method of - * accessing data than using the [] operator. - */ std::vector ret; for (size_t i = 0; i < size(); i++) @@ -2907,34 +3331,7 @@ namespace csv { return ret; } - ////////////////////// - // CSVField Methods // - ////////////////////// - - /**< @brief Return the type number of the stored value in - * accordance with the DataType enum - */ - DataType CSVField::type() { - this->get_value(); - return (DataType)_type; - } - - #ifndef DOXYGEN_SHOULD_SKIP_THIS - void CSVField::get_value() { - /* Check to see if value has been cached previously, if not - * evaluate it - */ - if (_type < 0) { - auto dtype = internals::data_type(this->sv, &this->value); - this->_type = (int)dtype; - } - } - #endif - - // - // CSVField Utility Methods - // - +#pragma region CSVField Methods bool CSVField::operator==(csv::string_view other) const { return other == this->sv; } @@ -2943,10 +3340,9 @@ namespace csv { return other == this->get(); } - ///////////////////// - // CSVRow Iterator // - ///////////////////// +#pragma endregion CSVField Methods +#pragma region CSVRow Iterator /** @brief Return an iterator pointing to the first 
field. */ CSVRow::iterator CSVRow::begin() const { return CSVRow::iterator(this, 0); @@ -2968,6 +3364,11 @@ namespace csv { return std::reverse_iterator(this->begin()); } + unsigned short CSVRow::split_at(size_t n) const + { + return this->buffer->split_buffer[this->start + n]; + } + CSVRow::iterator::iterator(const CSVRow* _reader, int _i) : daddy(_reader), i(_i) { if (_i < (int)this->daddy->size()) @@ -3037,14 +3438,15 @@ namespace csv { bool CSVRow::iterator::operator==(const iterator& other) const { return this->i == other.i; } +#pragma endregion CSVRow Iterator } +/** @file + * Calculates statistics from CSV files + */ + #include namespace csv { - /** @file - * Calculates statistics from CSV files - */ - CSVStat::CSVStat(std::string filename, CSVFormat format) : CSVReader(filename, format) { /** Lazily calculate statistics for a potentially large file. Once this constructor @@ -3052,7 +3454,7 @@ namespace csv { * methods like get_mean(), get_counts(), etc... can be used to retrieve statistics. 
*/ while (!this->eof()) { - this->read_csv("", internals::ITERATION_CHUNK_SIZE); + this->read_csv(internals::ITERATION_CHUNK_SIZE); this->calc(); } @@ -3355,7 +3757,7 @@ namespace csv { CSVFileInfo info = { filename, reader.get_col_names(), - format.delim, + format.get_delim(), reader.correct_rows, (int)reader.get_col_names().size() }; @@ -3363,231 +3765,73 @@ namespace csv { return info; } } -#include - - -/** @file - * @brief Provides numeric parsing functionality - */ namespace csv { namespace internals { - #ifndef DOXYGEN_SHOULD_SKIP_THIS - std::string type_name(const DataType& dtype) { - switch (dtype) { - case CSV_STRING: - return "string"; - case CSV_INT: - return "int"; - case CSV_LONG_INT: - return "long int"; - case CSV_LONG_LONG_INT: - return "long long int"; - case CSV_DOUBLE: - return "double"; - default: - return "null"; - } - }; - #endif - - constexpr long double _INT_MAX = (long double)std::numeric_limits::max(); - constexpr long double _LONG_MAX = (long double)std::numeric_limits::max(); - constexpr long double _LONG_LONG_MAX = (long double)std::numeric_limits::max(); - - /** Given a pointer to the start of what is start of - * the exponential part of a number written (possibly) in scientific notation - * parse the exponent - */ - inline DataType _process_potential_exponential( - csv::string_view exponential_part, - const long double& coeff, - long double * const out) { - long double exponent = 0; - auto result = data_type(exponential_part, &exponent); + ////////////// + // ColNames // + ////////////// - if (result >= CSV_INT && result <= CSV_DOUBLE) { - if (out) *out = coeff * pow10(exponent); - return CSV_DOUBLE; + ColNames::ColNames(const std::vector& _cnames) + : col_names(_cnames) { + for (size_t i = 0; i < _cnames.size(); i++) { + this->col_pos[_cnames[i]] = i; } - - return CSV_STRING; } - /** Given the absolute value of an integer, determine what numeric type - * it fits in - */ - inline DataType _determine_integral_type(const long double& 
number) { - // We can assume number is always non-negative - assert(number >= 0); - - if (number < _INT_MAX) - return CSV_INT; - else if (number < _LONG_MAX) - return CSV_LONG_INT; - else if (number < _LONG_LONG_MAX) - return CSV_LONG_LONG_INT; - else // Conversion to long long will cause an overflow - return CSV_DOUBLE; + std::vector ColNames::get_col_names() const { + return this->col_names; } - DataType data_type(csv::string_view in, long double* const out) { - /** Distinguishes numeric from other text values. Used by various - * type casting functions, like csv_parser::CSVReader::read_row() - * - * #### Rules - * - Leading and trailing whitespace ("padding") ignored - * - A string of just whitespace is NULL - * - * @param[in] in String value to be examined - */ - - // Empty string --> NULL - if (in.size() == 0) - return CSV_NULL; - - bool ws_allowed = true, - neg_allowed = true, - dot_allowed = true, - digit_allowed = true, - has_digit = false, - prob_float = false; - - unsigned places_after_decimal = 0; - long double integral_part = 0, - decimal_part = 0; - - for (size_t i = 0, ilen = in.size(); i < ilen; i++) { - const char& current = in[i]; - - switch (current) { - case ' ': - if (!ws_allowed) { - if (isdigit(in[i - 1])) { - digit_allowed = false; - ws_allowed = true; - } - else { - // Ex: '510 123 4567' - return CSV_STRING; - } - } - break; - case '-': - if (!neg_allowed) { - // Ex: '510-123-4567' - return CSV_STRING; - } - - neg_allowed = false; - break; - case '.': - if (!dot_allowed) { - return CSV_STRING; - } - - dot_allowed = false; - prob_float = true; - break; - case 'e': - case 'E': - // Process scientific notation - if (prob_float) { - size_t exponent_start_idx = i + 1; - - // Strip out plus sign - if (in[i + 1] == '+') { - exponent_start_idx++; - } - - return _process_potential_exponential( - in.substr(exponent_start_idx), - neg_allowed ? 
integral_part + decimal_part : -(integral_part + decimal_part), - out - ); - } - - return CSV_STRING; - break; - default: - if (isdigit(current)) { - // Process digit - has_digit = true; - - if (!digit_allowed) - return CSV_STRING; - else if (ws_allowed) // Ex: '510 456' - ws_allowed = false; - - // Build current number - unsigned digit = current - '0'; - if (prob_float) { - decimal_part += digit / pow10(++places_after_decimal); - } - else { - integral_part = (integral_part * 10) + digit; - } - } - else { - return CSV_STRING; - } - } - } - - // No non-numeric/non-whitespace characters found - if (has_digit) { - long double number = integral_part + decimal_part; - if (out) { - *out = neg_allowed ? number : -number; - } - - return prob_float ? CSV_DOUBLE : _determine_integral_type(number); - } - - // Just whitespace - return CSV_NULL; + size_t ColNames::size() const { + return this->col_names.size(); } - } -} -namespace csv { - namespace internals { - /** - * Return a string_view over the current_row - */ - csv::string_view GiantStringBuffer::get_row() { + csv::string_view RawRowBuffer::get_row() { csv::string_view ret( - this->buffer->c_str() + this->current_end, // Beginning of string - (this->buffer->size() - this->current_end) // Count + this->buffer.c_str() + this->current_end, // Beginning of string + (this->buffer.size() - this->current_end) // Count ); - this->current_end = this->buffer->size(); + this->current_end = this->buffer.size(); return ret; } - /** Return size of current row */ - size_t GiantStringBuffer::size() const { - return (this->buffer->size() - this->current_end); + ColumnPositions RawRowBuffer::get_splits() + { + const size_t head_idx = this->current_split_idx, + new_split_idx = this->split_buffer.size(); + + this->current_split_idx = new_split_idx; + return ColumnPositions(*this, head_idx, new_split_idx - head_idx + 1); } - std::string* GiantStringBuffer::get() const { - return this->buffer.get(); + size_t RawRowBuffer::size() const { + 
return this->buffer.size() - this->current_end; } - std::string* GiantStringBuffer::operator->() const { - return this->buffer.operator->(); + size_t RawRowBuffer::splits_size() const { + return this->split_buffer.size() - this->current_split_idx; } - /** Clear out the buffer, but save current row in progress */ - void GiantStringBuffer::reset() { + BufferPtr RawRowBuffer::reset() const { // Save current row in progress - auto temp_str = this->buffer->substr( + auto new_buff = BufferPtr(new RawRowBuffer()); + + new_buff->buffer = this->buffer.substr( this->current_end, // Position - (this->buffer->size() - this->current_end) // Count + (this->buffer.size() - this->current_end) // Count ); - this->current_end = 0; - this->buffer = std::make_shared(temp_str); + new_buff->col_names = this->col_names; + + // No need to remove unnecessary bits from this buffer + // (memory savings would be marginal anyways) + return new_buff; + } + + unsigned short ColumnPositions::split_at(int n) const { + return this->parent->split_buffer[this->start + n]; } } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d3f6ebb9..569ad253 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,11 +13,18 @@ target_sources(csv_test test_data_type.cpp ) target_link_libraries(csv_test csv) -add_custom_command( - TARGET csv_test POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory - data $/tests/data -) -enable_testing() -add_test(test csv_test) \ No newline at end of file +if(MSVC) + # Workaround to enable debugging unit tests in Visual Studio + add_custom_command( + TARGET csv_test POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CSV_TEST_DIR}/data $/tests/data + ) +endif() + +add_test( + NAME test + COMMAND csv_test + WORKING_DIRECTORY ${CSV_ROOT_DIR} +) \ No newline at end of file diff --git a/tests/test_csv_buffer.cpp b/tests/test_csv_buffer.cpp index 00ea6553..5055995d 100644 --- a/tests/test_csv_buffer.cpp +++ b/tests/test_csv_buffer.cpp @@ -1,23 +1,46 
@@ -// Tests for GiantStringBuffer +// Tests for RawRowBuffer #include "catch.hpp" #include "csv.hpp" using namespace csv::internals; TEST_CASE("GiantStringBufferTest", "[test_giant_string_buffer]") { - GiantStringBuffer buffer; + RawRowBuffer buffer; - buffer->append("1234"); + buffer.buffer.append("1234"); std::string first_row = std::string(buffer.get_row()); - buffer->append("5678"); + buffer.buffer.append("5678"); std::string second_row = std::string(buffer.get_row()); buffer.reset(); - buffer->append("abcd"); + buffer.buffer.append("abcd"); std::string third_row = std::string(buffer.get_row()); REQUIRE(first_row == "1234"); REQUIRE(second_row == "5678"); REQUIRE(third_row == "abcd"); +} + +TEST_CASE("GiantSplitBufferTest", "[test_giant_split_buffer]") { + RawRowBuffer buffer; + auto & splits = buffer.split_buffer; + + splits.push_back(1); + splits.push_back(2); + splits.push_back(3); + + auto pos = buffer.get_splits(); + REQUIRE(pos.split_at(0) == 1); + REQUIRE(pos.split_at(1) == 2); + REQUIRE(pos.split_at(2) == 3); + REQUIRE(pos.n_cols == 4); + + splits.push_back(4); + splits.push_back(5); + + pos = buffer.get_splits(); + REQUIRE(pos.split_at(0) == 4); + REQUIRE(pos.split_at(1) == 5); + REQUIRE(pos.n_cols == 3); } \ No newline at end of file diff --git a/tests/test_csv_row.cpp b/tests/test_csv_row.cpp index c0031146..605341e9 100644 --- a/tests/test_csv_row.cpp +++ b/tests/test_csv_row.cpp @@ -16,13 +16,9 @@ TEST_CASE("CSVRow Test", "[test_csv_row]") { "Col3" "Col4"; - std::vector splits = { 4, 8, 12 }; + std::vector splits = { 4, 8, 12 }; - CSVRow row( - std::move(str), - std::move(splits), - col_names - ); + CSVRow row(str, splits, col_names); bool error_caught = false; @@ -77,11 +73,11 @@ TEST_CASE("CSVField operator==", "[test_csv_field_equal]") { "3" "3.14"; - std::vector splits = { 1, 2, 3 }; - CSVRow row(std::move(str), std::move(splits), col_names); + std::vector splits = { 1, 2, 3 }; + CSVRow row(str, splits, col_names); REQUIRE(row["A"] == 1); 
REQUIRE(row["B"] == 2); REQUIRE(row["C"] == 3); - REQUIRE(row["D"].get() == 3.14); + REQUIRE(internals::is_equal(row["D"].get(), 3.14L)); } \ No newline at end of file diff --git a/tests/test_csv_stat.cpp b/tests/test_csv_stat.cpp index b10948c8..dd511b39 100644 --- a/tests/test_csv_stat.cpp +++ b/tests/test_csv_stat.cpp @@ -13,8 +13,8 @@ TEST_CASE("Calculating Statistics from Direct Input", "[read_csv_stat_direct]" ) } // Expected results - CSVFormat format = DEFAULT_CSV; - format.col_names = { "A", "B", "C" }; + CSVFormat format; + format.column_names({ "A", "B", "C" }); CSVStat reader(format); reader.feed(int_list); diff --git a/tests/test_read_csv.cpp b/tests/test_read_csv.cpp index e21c36b1..61b11c09 100644 --- a/tests/test_read_csv.cpp +++ b/tests/test_read_csv.cpp @@ -23,22 +23,22 @@ TEST_CASE("col_pos() Test", "[test_col_pos]") { TEST_CASE("guess_delim() Test - Pipe", "[test_guess_pipe]") { CSVFormat format = guess_format( "./tests/data/real_data/2009PowerStatus.txt"); - REQUIRE(format.delim == '|'); - REQUIRE(format.header == 0); + REQUIRE(format.get_delim() == '|'); + REQUIRE(format.get_header() == 0); } TEST_CASE("guess_delim() Test - Semi-Colon", "[test_guess_scolon]") { CSVFormat format = guess_format( "./tests/data/real_data/YEAR07_CBSA_NAC3.txt"); - REQUIRE(format.delim == ';'); - REQUIRE(format.header == 0); + REQUIRE(format.get_delim() == ';'); + REQUIRE(format.get_header() == 0); } TEST_CASE("guess_delim() Test - CSV with Comments", "[test_guess_comment]") { CSVFormat format = guess_format( "./tests/data/fake_data/ints_comments.csv"); - REQUIRE(format.delim == ','); - REQUIRE(format.header == 5); + REQUIRE(format.get_delim() == ','); + REQUIRE(format.get_header() == 5); } // get_file_info() @@ -136,10 +136,7 @@ TEST_CASE( "Test Escaped Quote", "[read_csv_quote]" ) { std::string error_message(""); try { - auto strict_format = DEFAULT_CSV; - strict_format.strict = true; - - auto should_fail = parse(csv_string, strict_format); + auto should_fail = 
parse(csv_string, CSVFormat::RFC4180_STRICT); } catch (std::runtime_error& err) { caught_single_quote = true; @@ -160,7 +157,7 @@ TEST_CASE("Test Bad Row Handling", "[read_csv_strict]") { bool error_caught = false; try { - parse(csv_string, DEFAULT_CSV_STRICT); + parse(csv_string, CSVFormat::RFC4180_STRICT); } catch (std::runtime_error& err) { error_caught = true; @@ -189,7 +186,7 @@ TEST_CASE("Non-Existent CSV", "[read_ghost_csv]") { TEST_CASE( "Test Read CSV with Header Row", "[read_csv_header]" ) { // Header on first row const std::string data_file = "./tests/data/real_data/2015_StateDepartment.csv"; - CSVReader reader(data_file, DEFAULT_CSV); + CSVReader reader(data_file, CSVFormat()); CSVRow row; reader.read_row(row); // Populate row with first line @@ -242,8 +239,8 @@ TEST_CASE("Test read_row() CSVField - Easy", "[read_row_csvf1]") { //! [CSVField Example] TEST_CASE("Test read_row() CSVField - Memory", "[read_row_csvf2]") { - CSVFormat format = DEFAULT_CSV; - format.col_names = { "A", "B" }; + CSVFormat format; + format.column_names({ "A", "B" }); std::stringstream csv_string; double big_num = ((double)std::numeric_limits::max() * 2.0);