From 9a708ce462c72b51755ba4bb24e408811b45a5c3 Mon Sep 17 00:00:00 2001 From: hirohira Date: Thu, 17 Oct 2024 01:07:34 +0900 Subject: [PATCH] perf: reduce execution time by pre-calculating header column count --- include/internal/csv_stat.cpp | 20 +++++++++++--------- include/internal/csv_stat.hpp | 1 + single_include/csv.hpp | 21 ++++++++++++--------- single_include_test/csv.hpp | 21 ++++++++++++--------- 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/include/internal/csv_stat.cpp b/include/internal/csv_stat.cpp index 874a6682..85d184e1 100644 --- a/include/internal/csv_stat.cpp +++ b/include/internal/csv_stat.cpp @@ -24,7 +24,7 @@ namespace csv { /** Return current means */ CSV_INLINE std::vector CSVStat::get_mean() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->rolling_means[i]); } return ret; @@ -33,7 +33,7 @@ namespace csv { /** Return current variances */ CSV_INLINE std::vector CSVStat::get_variance() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); } return ret; @@ -42,7 +42,7 @@ namespace csv { /** Return current mins */ CSV_INLINE std::vector CSVStat::get_mins() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->mins[i]); } return ret; @@ -51,7 +51,7 @@ namespace csv { /** Return current maxes */ CSV_INLINE std::vector CSVStat::get_maxes() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->maxes[i]); } return ret; @@ -60,7 +60,7 @@ namespace csv { /** Get counts for each column */ CSV_INLINE std::vector CSVStat::get_counts() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->counts[i]); } return ret; @@ -69,7 +69,7 @@ namespace csv { /** Get data type counts for each column */ CSV_INLINE std::vector CSVStat::get_dtypes() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->dtypes[i]); } return ret; @@ -79,7 +79,7 @@ namespace csv { /** Only create stats counters the first time **/ if (dtypes.empty()) { /** Go through all records and calculate specified statistics */ - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { dtypes.push_back({}); counts.push_back({}); rolling_means.push_back(0); @@ -92,7 +92,7 @@ namespace csv { // Start threads std::vector pool; - for (size_t i = 0; i < this->get_col_names().size(); i++) + for (size_t i = 0; i < this->n_cols; i++) pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); // Block until done @@ -105,6 +105,8 @@ namespace csv { CSV_INLINE void CSVStat::calc() { constexpr size_t CALC_CHUNK_SIZE = 5000; + this->n_cols = reader.get_col_names().size(); + for (auto& row : reader) { this->records.push_back(std::move(row)); @@ -128,7 +130,7 @@ namespace csv { auto current_record = this->records.begin(); for (size_t processed = 0; current_record != this->records.end(); processed++) { - if (current_record->size() == this->get_col_names().size()) { + if (current_record->size() == this->n_cols) { auto current_field = (*current_record)[i]; // Optimization: Don't count() if there's too many distinct values in the first 1000 rows diff --git a/include/internal/csv_stat.hpp b/include/internal/csv_stat.hpp index eab73434..0d054580 100644 --- a/include/internal/csv_stat.hpp +++ b/include/internal/csv_stat.hpp @@ -55,6 +55,7 @@ namespace csv { void calc_worker(const size_t&); CSVReader reader; + size_t n_cols; std::deque records = {}; }; } \ No newline at end of file diff --git a/single_include/csv.hpp b/single_include/csv.hpp index 811c8e14..1d45a709 100644 --- a/single_include/csv.hpp +++ b/single_include/csv.hpp @@ -6494,6 +6494,7 @@ namespace csv { void calc_worker(const size_t&); CSVReader reader; + size_t n_cols; std::deque records = {}; }; } @@ -8261,7 +8262,7 @@ namespace csv { /** Return current means */ CSV_INLINE std::vector CSVStat::get_mean() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->rolling_means[i]); } return ret; @@ -8270,7 +8271,7 @@ namespace csv { /** Return current variances */ CSV_INLINE std::vector CSVStat::get_variance() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); } return ret; @@ -8279,7 +8280,7 @@ namespace csv { /** Return current mins */ CSV_INLINE std::vector CSVStat::get_mins() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->mins[i]); } return ret; @@ -8288,7 +8289,7 @@ namespace csv { /** Return current maxes */ CSV_INLINE std::vector CSVStat::get_maxes() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->maxes[i]); } return ret; @@ -8297,7 +8298,7 @@ namespace csv { /** Get counts for each column */ CSV_INLINE std::vector CSVStat::get_counts() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->counts[i]); } return ret; @@ -8306,7 +8307,7 @@ namespace csv { /** Get data type counts for each column */ CSV_INLINE std::vector CSVStat::get_dtypes() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->dtypes[i]); } return ret; @@ -8316,7 +8317,7 @@ namespace csv { /** Only create stats counters the first time **/ if (dtypes.empty()) { /** Go through all records and calculate specified statistics */ - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { dtypes.push_back({}); counts.push_back({}); rolling_means.push_back(0); @@ -8329,7 +8330,7 @@ namespace csv { // Start threads std::vector pool; - for (size_t i = 0; i < this->get_col_names().size(); i++) + for (size_t i = 0; i < this->n_cols; i++) pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); // Block until done @@ -8342,6 +8343,8 @@ namespace csv { CSV_INLINE void CSVStat::calc() { constexpr size_t CALC_CHUNK_SIZE = 5000; + this->n_cols = reader.get_col_names().size(); + for (auto& row : reader) { this->records.push_back(std::move(row)); @@ -8365,7 +8368,7 @@ namespace csv { auto current_record = this->records.begin(); for (size_t processed = 0; current_record != this->records.end(); processed++) { - if (current_record->size() == this->get_col_names().size()) { + if (current_record->size() == this->n_cols) { auto current_field = (*current_record)[i]; // Optimization: Don't count() if there's too many distinct values in the first 1000 rows diff --git a/single_include_test/csv.hpp b/single_include_test/csv.hpp index 811c8e14..1d45a709 100644 --- a/single_include_test/csv.hpp +++ b/single_include_test/csv.hpp @@ -6494,6 +6494,7 @@ namespace csv { void calc_worker(const size_t&); CSVReader reader; + size_t n_cols; std::deque records = {}; }; } @@ -8261,7 +8262,7 @@ namespace csv { /** Return current means */ CSV_INLINE std::vector CSVStat::get_mean() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->rolling_means[i]); } return ret; @@ -8270,7 +8271,7 @@ namespace csv { /** Return current variances */ CSV_INLINE std::vector CSVStat::get_variance() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->rolling_vars[i]/(this->n[i] - 1)); } return ret; @@ -8279,7 +8280,7 @@ namespace csv { /** Return current mins */ CSV_INLINE std::vector CSVStat::get_mins() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->mins[i]); } return ret; @@ -8288,7 +8289,7 @@ namespace csv { /** Return current maxes */ CSV_INLINE std::vector CSVStat::get_maxes() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->maxes[i]); } return ret; @@ -8297,7 +8298,7 @@ namespace csv { /** Get counts for each column */ CSV_INLINE std::vector CSVStat::get_counts() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->counts[i]); } return ret; @@ -8306,7 +8307,7 @@ namespace csv { /** Get data type counts for each column */ CSV_INLINE std::vector CSVStat::get_dtypes() const { std::vector ret; - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { ret.push_back(this->dtypes[i]); } return ret; @@ -8316,7 +8317,7 @@ namespace csv { /** Only create stats counters the first time **/ if (dtypes.empty()) { /** Go through all records and calculate specified statistics */ - for (size_t i = 0; i < this->get_col_names().size(); i++) { + for (size_t i = 0; i < this->n_cols; i++) { dtypes.push_back({}); counts.push_back({}); rolling_means.push_back(0); @@ -8329,7 +8330,7 @@ namespace csv { // Start threads std::vector pool; - for (size_t i = 0; i < this->get_col_names().size(); i++) + for (size_t i = 0; i < this->n_cols; i++) pool.push_back(std::thread(&CSVStat::calc_worker, this, i)); // Block until done @@ -8342,6 +8343,8 @@ namespace csv { CSV_INLINE void CSVStat::calc() { constexpr size_t CALC_CHUNK_SIZE = 5000; + this->n_cols = reader.get_col_names().size(); + for (auto& row : reader) { this->records.push_back(std::move(row)); @@ -8365,7 +8368,7 @@ namespace csv { auto current_record = this->records.begin(); for (size_t processed = 0; current_record != this->records.end(); processed++) { - if (current_record->size() == this->get_col_names().size()) { + if (current_record->size() == this->n_cols) { auto current_field = (*current_record)[i]; // Optimization: Don't count() if there's too many distinct values in the first 1000 rows