Skip to content

Commit

Permalink
Added UTF-8 BOM Detection
Browse files Browse the repository at this point in the history
  • Loading branch information
vincentlaucsb committed Sep 17, 2018
1 parent 95f56ca commit a26a6bc
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 136 deletions.
13 changes: 10 additions & 3 deletions src/csv_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ namespace csv {

/**< @brief RFC 4180 non-compliance -> throw an error */
bool strict;

/**< @brief Detect and strip out a leading UTF-8 byte order mark (BOM) */
bool unicode_detect;
};

/** Returned by get_file_info() */
Expand Down Expand Up @@ -98,13 +101,13 @@ namespace csv {
const size_t ITERATION_CHUNK_SIZE = 10000000; // 10MB

/** @brief A dummy variable used to indicate delimiter should be guessed */
const CSVFormat GUESS_CSV = { '\0', '"', 0, {}, false };
const CSVFormat GUESS_CSV = { '\0', '"', 0, {}, false, true };

/** @brief RFC 4180 CSV format */
const CSVFormat DEFAULT_CSV = { ',', '"', 0, {}, false };
const CSVFormat DEFAULT_CSV = { ',', '"', 0, {}, false, true };

/** @brief RFC 4180 CSV format with strict parsing */
const CSVFormat DEFAULT_CSV_STRICT = { ',', '"', 0, {}, true };
const CSVFormat DEFAULT_CSV_STRICT = { ',', '"', 0, {}, true, true };
///@}

/** @class CSVReader
Expand Down Expand Up @@ -201,6 +204,7 @@ namespace csv {
RowCount correct_rows = 0; /**< @brief How many correct rows
* (minus header) have been parsed so far
*/
bool utf8_bom = false; /**< @brief Set to true if UTF-8 BOM was detected */
///@}

void close(); /**< @brief Close the open file handle.
Expand Down Expand Up @@ -264,6 +268,9 @@ namespace csv {
*/
std::shared_ptr<internals::ColNames> col_names =
std::make_shared<internals::ColNames>(std::vector<std::string>({}));

/** @brief Whether or not an attempt to find Unicode BOM has been made */
bool unicode_bom_scan = false;
///@}

/** @name Multi-Threaded File Reading Functions */
Expand Down
15 changes: 13 additions & 2 deletions src/csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace csv {
//
std::string_view GiantStringBuffer::get_row() {
/**
* Return a string_viewo ver the current_row
* Return a string_view over the current_row
*/

std::string_view ret(
Expand Down Expand Up @@ -299,7 +299,8 @@ namespace csv {
*/
CSVReader::CSVReader(CSVFormat format) :
delimiter(format.delim), quote_char(format.quote_char),
header_row(format.header), strict(format.strict) {
header_row(format.header), strict(format.strict),
unicode_bom_scan(!format.unicode_detect) {
if (!format.col_names.empty()) {
this->header_row = -1;
this->col_names = std::make_shared<internals::ColNames>(format.col_names);
Expand Down Expand Up @@ -376,6 +377,16 @@ namespace csv {

bool quote_escape = false; // Are we currently in a quote escaped field?

// Unicode BOM handling.
// The scan is attempted once: unicode_bom_scan is set afterwards, whether or
// not a BOM was found, so later calls skip this block.
if (!this->unicode_bom_scan) {
    // The UTF-8 byte order mark is the three-byte sequence EF BB BF.
    // - Check the length first so that indexing in[0..2] is always in range.
    // - Compare as unsigned char: plain char may be signed, in which case the
    //   bytes would promote to negative ints and never equal 0xEF/0xBB/0xBF.
    const bool has_utf8_bom =
        in.size() >= 3 &&
        static_cast<unsigned char>(in[0]) == 0xEF &&
        static_cast<unsigned char>(in[1]) == 0xBB &&
        static_cast<unsigned char>(in[2]) == 0xBF;

    if (has_utf8_bom) {
        in.remove_prefix(3); // Remove BOM from input string
        this->utf8_bom = true;
    }

    this->unicode_bom_scan = true;
}

// Optimization
this->record_buffer->reserve(in.size());
std::string& _record_buffer = *(this->record_buffer.get());
Expand Down
136 changes: 62 additions & 74 deletions tests/test_csv_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,74 +6,65 @@
#include "csv_parser.hpp"
using namespace csv;

auto make_csv_row();
auto make_csv_row() {
//////////////////////
// CSVRow Iterators //
//////////////////////

TEST_CASE("Test CSVRow Interator", "[test_csv_row_iter]") {
auto rows = "A,B,C\r\n" // Header row
"123,234,345\r\n"
"1,2,3\r\n"
"1,2,3"_csv;
auto row = rows.front();

return rows.front();
}

//////////////////////
// CSVRow Iterators //
//////////////////////
SECTION("Forwards and Backwards Iterators") {
// Forwards
REQUIRE(row.begin()->get<int>() == 123);
REQUIRE((row.end() - 1)->get<>() == "345");

TEST_CASE("Test CSVRow Iterator", "[csv_iter]") {
auto row = make_csv_row();
size_t i = 0;
for (auto it = row.begin(); it != row.end(); ++it) {
if (i == 0) REQUIRE(it->get<>() == "123");
else if (i == 1) REQUIRE(it->get<>() == "234");
else REQUIRE(it->get<>() == "345");

// Forwards
REQUIRE(row.begin()->get<int>() == 123);
REQUIRE((row.end() - 1)->get<>() == "345");
i++;
}

size_t i = 0;
for (auto it = row.begin(); it != row.end(); ++it) {
if (i == 0) REQUIRE(it->get<>() == "123");
else if (i == 1) REQUIRE(it->get<>() == "234");
else REQUIRE(it->get<>() == "345");

i++;
// Backwards
REQUIRE(row.rbegin()->get<int>() == 345);
REQUIRE((row.rend() - 1)->get<>() == "123");
}

// Backwards
REQUIRE(row.rbegin()->get<int>() == 345);
REQUIRE((row.rend() - 1)->get<>() == "123");
}

TEST_CASE("Test CSVRow Iterator Arithmetic", "[csv_iter_math]") {
auto row = make_csv_row();
SECTION("Iterator Arithmetic") {
REQUIRE(row.begin()->get<int>() == 123);
REQUIRE((row.end() - 1)->get<>() == "345");

REQUIRE(row.begin()->get<int>() == 123);
REQUIRE((row.end() - 1)->get<>() == "345");
auto row_start = row.begin();
REQUIRE(*(row_start + 1) == "234");
REQUIRE(*(row_start + 2) == "345");

auto row_start = row.begin();
REQUIRE(*(row_start + 1) == "234");
REQUIRE(*(row_start + 2) == "345");
}

}
SECTION("Post-Increment Iterator") {
auto it = row.begin();

TEST_CASE("Test CSVRow Post-Increment Iterator", "[csv_iter_postinc]") {
auto row = make_csv_row();
auto it = row.begin();

REQUIRE(it++->get<int>() == 123);
REQUIRE(it->get<int>() == 234);

REQUIRE(it--->get<int>() == 234);
REQUIRE(it->get<int>() == 123);
}
REQUIRE(it++->get<int>() == 123);
REQUIRE(it->get<int>() == 234);

TEST_CASE("Test CSVRow Range Based For", "[csv_iter_for]") {
auto row = make_csv_row();
REQUIRE(it--->get<int>() == 234);
REQUIRE(it->get<int>() == 123);
}

size_t i = 0;
for (auto& field: row) {
if (i == 0) REQUIRE(field.get<>() == "123");
else if (i == 1) REQUIRE(field.get<>() == "234");
else REQUIRE(field.get<>() == "345");
SECTION("Range Based For") {
size_t i = 0;
for (auto& field : row) {
if (i == 0) REQUIRE(field.get<>() == "123");
else if (i == 1) REQUIRE(field.get<>() == "234");
else REQUIRE(field.get<>() == "345");

i++;
i++;
}
}
}

Expand All @@ -83,38 +74,35 @@ TEST_CASE("Test CSVRow Range Based For", "[csv_iter_for]") {

//! [CSVReader Iterator 1]
TEST_CASE("Basic CSVReader Iterator Test", "[read_ints_iter]") {
// A file where each value in the ith row is the number i
// There are 100 rows
CSVReader reader("./tests/data/fake_data/ints.csv");

size_t i = 1;
for (auto it = reader.begin(); it != reader.end(); ++it) {
REQUIRE((*it)[0].get<int>() == i);
i++;
}
}

TEST_CASE("Basic CSVReader Range-Based For Test", "[read_ints_range]") {
// A file with 100 rows and columns A, B, ... J
// where every value in the ith row is the number i
CSVReader reader("./tests/data/fake_data/ints.csv");
std::vector<std::string> col_names = {
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J"
};

size_t i = 1;
for (auto& row : reader) {
for (auto& j : col_names) REQUIRE(row[j].get<int>() == i);
i++;

SECTION("Basic Iterator") {
for (auto it = reader.begin(); it != reader.end(); ++it) {
REQUIRE((*it)[0].get<int>() == i);
i++;
}
}
}
//! [CSVReader Iterator 1]

TEST_CASE("CSVReader Post-Increment Iterator", "[read_ints_post_iter]") {
CSVReader reader("./tests/data/fake_data/ints.csv");
SECTION("Iterator Post-Increment") {
auto it = reader.begin();
REQUIRE((it++)->operator[]("A").get<int>() == 1);
REQUIRE(it->operator[]("A").get<int>() == 2);
}

auto it = reader.begin();
REQUIRE((it++)->operator[]("A").get<int>() == 1);
REQUIRE(it->operator[]("A").get<int>() == 2);
SECTION("Range-Based For Loop") {
for (auto& row : reader) {
for (auto& j : col_names) REQUIRE(row[j].get<int>() == i);
i++;
}
}
}
//! [CSVReader Iterator 1]

//! [CSVReader Iterator 2]
TEST_CASE("CSVReader Iterator + std::max_elem", "[iter_max_elem]") {
Expand Down
105 changes: 49 additions & 56 deletions tests/test_csv_row.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,69 @@
#include "csv_parser.hpp"
using namespace csv;

CSVRow make_row();
CSVRow make_numeric_row();

CSVRow make_row() {
// Construct a CSVRow and assert that its interface works as expected
TEST_CASE("CSVRow Test", "[test_csv_row]") {
// Create a row of size 4
auto col_names = std::make_shared<internals::ColNames>(
std::vector<std::string>({ "A", "B", "C", "D" })
);
);

std::string str;
str += "Col1"
std::string str = "Col1"
"Col2"
"Col3"
"Col4";

std::vector<size_t> splits = { 4, 8, 12 };
return CSVRow(

CSVRow row(
std::move(str),
std::move(splits),
col_names
);

bool error_caught = false;

SECTION("size() Check") {
REQUIRE(row.size() == 4);
}

SECTION("operator[]") {
REQUIRE(row[1] == "Col2");
REQUIRE(row["B"] == "Col2");

REQUIRE(row[2] == "Col3");
REQUIRE(row["C"] == "Col3");
}

SECTION("operator[] Out of Bounds") {
try {
auto dne = row[4].get<>();
}
catch (std::runtime_error& err) {
error_caught = true;
}

REQUIRE(error_caught);
}

SECTION("operator[] Access Non-Existent Column") {
try {
row["Col5"].get<>();
}
catch (std::runtime_error& err) {
error_caught = true;
}

REQUIRE(error_caught);
}

SECTION("Content Check") {
REQUIRE(std::vector<std::string>(row) ==
std::vector<std::string>({ "Col1", "Col2", "Col3", "Col4" }));
}
}

CSVRow make_numeric_row() {
TEST_CASE("CSVField operator==", "[test_csv_field_equal]") {
auto col_names = std::make_shared<internals::ColNames>(
std::vector<std::string>({ "A", "B", "C", "D" })
);
Expand All @@ -39,54 +78,8 @@ CSVRow make_numeric_row() {
"3.14";

std::vector<size_t> splits = { 1, 2, 3 };
return CSVRow(std::move(str), std::move(splits), col_names);
}

TEST_CASE("CSVRow Size Check", "[test_csv_row_size]") {
auto row = make_row();
REQUIRE(row.size() == 4);
}

TEST_CASE("CSVRow operator[]", "[test_csv_row_index]") {
auto row = make_row();
REQUIRE(row[1] == "Col2");
REQUIRE(row["B"] == "Col2");
CSVRow row(std::move(str), std::move(splits), col_names);

REQUIRE(row[2] == "Col3");
REQUIRE(row["C"] == "Col3");
}

TEST_CASE("CSVRow operator[] Out of Bounds", "[test_csv_row_index_error]") {
auto row = make_row();
bool error_caught = false;
try {
auto dne = row[4].get<>();
}
catch (std::runtime_error& err) {
error_caught = true;
}

REQUIRE(error_caught);

// Try accessing a non-existent column
try {
auto dne = row["Col5"].get<>();
}
catch (std::runtime_error& err) {
error_caught = true;
}

REQUIRE(error_caught);
}

TEST_CASE("CSVRow Content Check", "[test_csv_row_contents]") {
auto row = make_row();
REQUIRE(std::vector<std::string>(row) ==
std::vector<std::string>({ "Col1", "Col2", "Col3", "Col4" }));
}

TEST_CASE("CSVField operator==", "[test_csv_field_equal]") {
auto row = make_numeric_row();
REQUIRE(row["A"] == 1);
REQUIRE(row["B"] == 2);
REQUIRE(row["C"] == 3);
Expand Down
Loading

0 comments on commit a26a6bc

Please sign in to comment.