Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dynamic Programming c++ implementation #9

Open
wants to merge 29 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
bb10165
added Dynp class
npkamath Jan 2, 2024
d13dfc7
reverted extra spaces
npkamath Jan 2, 2024
5bbbaf2
reverted spaces
npkamath Jan 2, 2024
930430a
removed read_input function
npkamath Jan 2, 2024
379897a
Merge remote-tracking branch 'refs/remotes/origin/main'
npkamath Jan 2, 2024
e35cff1
removed read_input function
npkamath Jan 2, 2024
e8ccb7c
Delete PULL_REQUEST_TEMPLATE.md
npkamath Jan 2, 2024
fccca35
refactor: Remove Dynp directory
b-butler Jan 9, 2024
0c64950
fixed namespace, naming, and cleaned up comments
npkamath Jan 12, 2024
fe6bac1
added dynp.py file, fixed constructors, added column_indices for fast…
npkamath Jan 14, 2024
245a92f
fixed index system
npkamath Jan 15, 2024
0af1b08
reorganized class variables and added dynp.py functions and fit
npkamath Jan 15, 2024
66e2104
fit function added with parameter input, removed whitespace with pre…
npkamath Jan 19, 2024
a4cff6f
Merge remote-tracking branch 'upstream/main'
b-butler Jan 24, 2024
9ccf1fe
build: Switch to scikit-build-core.
b-butler Jan 24, 2024
8da0fe0
ci: Add reusable workflow to install system packages
b-butler Jan 24, 2024
943a6c0
ci: Fix package installation by using composite action
b-butler Jan 24, 2024
98ee45c
ci: Correctly specify shell for custom action
b-butler Jan 24, 2024
744b928
ci: Fix action step name formatting
b-butler Jan 24, 2024
b8dd0f2
ci: Remove trailing ":"
b-butler Jan 24, 2024
c7e6920
ci: Update package manager caches before installing
b-butler Jan 24, 2024
7771157
ci: Fix apt-get package names
b-butler Jan 24, 2024
f7287f8
ci: Fix one last package name
b-butler Jan 24, 2024
47aedc0
ci: Yet another package name change
b-butler Jan 24, 2024
4fd3a58
documentation and renaming added
npkamath Feb 26, 2024
4dd27f8
upper triangular restructured and dynp restructured
npkamath Feb 26, 2024
f51ff1c
upper triangular restructured and dynp restructured
npkamath Feb 26, 2024
c0afe6f
conditionals
npkamath Feb 26, 2024
77e3e00
naming and other cpp changes
npkamath Feb 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
cmake_minimum_required(VERSION 3.8)
project(_dupin VERSION 0.0.1)

set(DEFAULT_BUILD_TYPE "Release")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE})
endif()

find_package(Eigen3 REQUIRED)
find_package(TBB REQUIRED)
find_package(pybind11 CONFIG REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/src)
add_subdirectory(src)
Empty file added dupin/detect/dynp.py
Empty file.
17 changes: 17 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
pybind11_add_module(_dupin dupininterface.cpp
dupin.h dupin.cpp
)

set_target_properties(_dupin PROPERTIES
CXX_STANDARD 17
CMAKE_CXX_STANDARD_REQUIRED True
)

target_include_directories(_dupin PRIVATE
${EIGEN3_INCLUDE_DIR}
${TBB_INCLUDE_DIRS}
)

target_link_libraries(_dupin PRIVATE TBB::tbb)
target_compile_definitions(_dupin PRIVATE VERSION_INFO=${PROJECT_VERSION})
target_compile_options(_dupin PRIVATE -O2 -march=native)
175 changes: 175 additions & 0 deletions src/dupin.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#include <iostream>
#include <iomanip>
#include <limits>
#include <unordered_map>
#include <vector>
#include <Eigen/Dense>
#include <tbb/blocked_range2d.h>
#include <tbb/global_control.h>
#include <tbb/parallel_for.h>
#include "dupin.h"

using namespace std;
using namespace Eigen;
Comment on lines +12 to +13
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't use the namespace, explicitly scope them. It makes the code more readable.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
using namespace std;
using namespace Eigen;

If you use std:: and Eigen:: then you don't need these lines.


DynamicProgramming::DynamicProgramming()
: num_bkps(1), num_parameters(0), num_timesteps(0), jump(1), min_size(3) {}

DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_,
int jump_, int min_size_)
: data(data), num_bkps(num_bkps_),
jump(jump_), min_size(min_size_) {
num_timesteps = data.rows();
num_parameters = data.cols();
}

void DynamicProgramming::scale_data() {
Eigen::VectorXd min_val = data.colwise().minCoeff();
Eigen::VectorXd max_val = data.colwise().maxCoeff();
Eigen::VectorXd range = max_val - min_val;

for (int j = 0; j < num_parameters; ++j) {
if (range(j) == 0.0) {
data.col(j).setZero();
} else {
data.col(j) = (data.col(j).array() - min_val(j)) / range(j);
}
}
}
void DynamicProgramming::regression_setup(linear_fit_struct &lfit) {
lfit.x = Eigen::VectorXd::LinSpaced(num_timesteps, 0, num_timesteps - 1) /
(num_timesteps - 1);
lfit.y = data;
}

Eigen::VectorXd DynamicProgramming::regression_line(int start, int end, int dim,
linear_fit_struct &lfit) {
int n = end - start;
Eigen::VectorXd x = lfit.x.segment(start, n);
Eigen::VectorXd y = lfit.y.col(dim).segment(start, n);

double x_mean = x.mean();
double y_mean = y.mean();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could probably save some time by saving and computing the cumulative sum and then computing the mean through ((sum_[end] - sum_[begin]) / (end - begin)).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What were your thoughts @npkamath?


Eigen::VectorXd x_centered = x.array() - x_mean;
Eigen::VectorXd y_centered = y.array() - y_mean;

double slope = x_centered.dot(y_centered) / x_centered.squaredNorm();
double intercept = y_mean - slope * x_mean;

return x.unaryExpr(
[slope, intercept](double xi) { return slope * xi + intercept; });
}

double DynamicProgramming::l2_cost(Eigen::MatrixXd &predicted_y, int start, int end) {
Eigen::MatrixXd diff = predicted_y.block(start, 0, end - start, num_parameters) -
data.block(start, 0, end - start, num_parameters);
return std::sqrt(diff.array().square().sum());
}

Eigen::MatrixXd DynamicProgramming::predicted(int start, int end,
linear_fit_struct &lfit) {
Eigen::MatrixXd predicted_y(num_timesteps, num_parameters);
for (int i = 0; i < num_parameters; ++i) {
predicted_y.block(start, i, end - start, 1) =
regression_line(start, end, i, lfit);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should compute the regression for all dimensions simultaneously; it should be faster. Also, is there a way to avoid the copying of data and have the prediction method take a reference to the data to store in?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had an attempt at it but it is not returning the right type to pass into the next function; I will try a different approach, maybe not with Eigen

}
return predicted_y;
}

double DynamicProgramming::cost_function(int start, int end) {
linear_fit_struct lfit;
regression_setup(lfit);
Comment on lines +86 to +87
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make the constructor for linear_fit_struct set up the values.

Eigen::MatrixXd predicted_y = predicted(start, end, lfit);
return l2_cost(predicted_y, start, end);
}

void DynamicProgramming::initialize_cost_matrix() {
scale_data();
cost_matrix.initialize(num_timesteps);

tbb::parallel_for(tbb::blocked_range<int>(0, num_timesteps),
[&](const tbb::blocked_range<int> &r) {
for (int i = r.begin(); i < r.end(); ++i) {
for (int j = i + min_size; j < num_timesteps; ++j) {
cost_matrix(i, j) = cost_function(i, j);
}
}
});
}

std::pair<double, std::vector<int>> DynamicProgramming::seg(int start, int end,
int num_bkps) {
MemoKey key = {start, end, num_bkps};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
MemoKey key = {start, end, num_bkps};
MemoKey key{start, end, num_bkps};

auto it = memo.find(key);
if (it != memo.end()) {
return it->second;
}
if (num_bkps == 0) {
return {cost_matrix(start, end), {end}};
}

std::pair<double, std::vector<int>> best = {std::numeric_limits<double>::infinity(), {}};

for (int bkp = start + min_size; bkp < end; bkp++) {
if ((bkp - start) >= min_size && (end - bkp) >= min_size) {
npkamath marked this conversation as resolved.
Show resolved Hide resolved
auto left = seg(start, bkp, num_bkps - 1);
auto right = seg(bkp, end, 0);
double cost = left.first + right.first;
if (cost < best.first) {
best.first = cost;
best.second = left.second;
best.second.push_back(bkp);
best.second.insert(best.second.end(), right.second.begin(),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't right always be one element long? If so, we could just push back.

right.second.end());
}
}
}

memo[key] = best;
return best;
}

std::vector<int> DynamicProgramming::return_breakpoints() {
npkamath marked this conversation as resolved.
Show resolved Hide resolved
auto result = seg(0, num_timesteps - 1, num_bkps);
std::vector<int> breakpoints = result.second;
std::sort(breakpoints.begin(), breakpoints.end());
npkamath marked this conversation as resolved.
Show resolved Hide resolved
breakpoints.erase(std::unique(breakpoints.begin(), breakpoints.end()),
breakpoints.end());
npkamath marked this conversation as resolved.
Show resolved Hide resolved
return breakpoints;
}

void set_parallelization(int num_threads) {
static tbb::global_control gc(tbb::global_control::max_allowed_parallelism,
num_threads);
}

int DynamicProgramming::get_num_timesteps() { return num_timesteps; }

int DynamicProgramming::get_num_parameters() { return num_parameters; }

int DynamicProgramming::get_num_bkps() { return num_bkps; }

Eigen::MatrixXd &DynamicProgramming::getDatum() { return data; }

DynamicProgramming::UpperTriangularMatrix &
DynamicProgramming::getCostMatrix() {
return cost_matrix;
}

void DynamicProgramming::set_num_timesteps(int value) { num_timesteps = value; }

void DynamicProgramming::set_num_parameters(int value) {
num_parameters = value;
}

void DynamicProgramming::setDatum(const Eigen::MatrixXd &value) {
data = value;
}

void DynamicProgramming::setCostMatrix(
const DynamicProgramming::UpperTriangularMatrix &value) {
cost_matrix = value;
}
Comment on lines +165 to +168
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should anyone be setting the full cost matrix?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope! honestly the setter functions aren't needed anymore. I can probably delete them and just set the getters as actual functions rather than properties.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Resolved but not deleted.


int main() { return 0; }
144 changes: 144 additions & 0 deletions src/dupin.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#pragma once

#include <algorithm>
#include <iostream>
#include <limits>
#include <unordered_map>
#include <vector>
#include <Eigen/Dense>


// DynamicProgramming class for dynamic programming based segmentation.
class DynamicProgramming {
npkamath marked this conversation as resolved.
Show resolved Hide resolved
private:
class UpperTriangularMatrix {
private:
std::vector<double> matrix;
std::vector<int> row_indices;
int length;

int index(int row, int col) const {
return row_indices[row] + col - row;
}

public:
UpperTriangularMatrix() : length(0) {}

UpperTriangularMatrix(int n) : length(n), matrix(n * (n + 1) / 2, 0.0),
row_indices(n) {
for (int row = 0; row < n; ++row) {
row_indices[row] = row * (2 * length - row + 1) / 2;
}
}

void initialize(int n) {
length = n;
matrix.resize(n * (n + 1) / 2, 0.0);
row_indices.resize(n);
for (int row = 0; row < n; ++row) {
row_indices[row] = row * (2 * length - row + 1) / 2;
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this function?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think yes because without a global cost_matrix variable, I will have to pass by reference the entire cost_matrix through the recursion

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean can't we just initialize the matrix and discard the initialization function. Why do we create it and then size it?


double &operator()(int row, int col) {
return matrix[index(row, col)];
}
int getSize() const { return length; }
};
UpperTriangularMatrix cost_matrix;

// Struct for memoization key, combining start, end, and number of
// breakpoints.
struct MemoKey {
int start;
int end;
int num_bkps;

// Comparison operator for MemoKey.
bool operator==(const MemoKey &other) const {
return start == other.start && end == other.end &&
num_bkps == other.num_bkps;
}
};

// Custom XOR-bit hash function for MemoKey, avoids clustering of data in
// unordered map to improve efficiency.
struct MemoKeyHash {
std::size_t operator()(const MemoKey &key) const {
return ((std::hash<int>()(key.start) ^
(std::hash<int>()(key.end) << 1)) >>
1) ^
std::hash<int>()(key.num_bkps);
}
};

// Memoization map to store the cost and partition for given parameters.
std::unordered_map<MemoKey, std::pair<double, std::vector<int>>, MemoKeyHash>
memo;

int num_bkps; // Number of breakpoints to detect.
int num_parameters; // Number of features in the dataset.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be known from the passed matrix no?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some of this was just for testing/setting back in ye old days of input.txt. I can just make it use data.size now

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still to be done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we keep this we should be more elaborate and use num_features and num_breakpoints.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, if possible, expand on the variable names throughout. It will help future developers work with the code.

int num_timesteps; // Number of data points (time steps).
b-butler marked this conversation as resolved.
Show resolved Hide resolved
int jump; // Interval for checking potential breakpoints.
int min_size; // Minimum size of a segment.
Eigen::MatrixXd data; // Matrix storing the dataset.

// Structure for storing linear regression parameters.
struct linear_fit_struct {
Comment on lines +80 to +81
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Structure for storing linear regression parameters.
struct linear_fit_struct {
// Structure for storing linear regression parameters.
struct linear_fit {

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also think this would work better as a fleshed out class that had methods to fit and such my thoughts (though by no means the only way to do this) would be to

Create a fit method which computes the slopes and intercepts in a line struct which has a call operator that returns y's for given x. The x's could be stored in the struct itself.

If you have other ideas that is fine. I just like the idea of separating the logic for line fitting from the DynP class. Ideally, the entire cost function would be abstracted out for future development.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this idea, I'm thinking maybe I just create a regression.h file and just make a separate class? the dupin.h file is getting a little messy

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this would be a good idea, and hopefully not too much work as the code is pretty modular already.

Eigen::MatrixXd y; // Dependent variable (labels).
Eigen::VectorXd x; // z Independent variable (time steps).
};

public:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of these should be private or protected.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Many of these methods either don't need to exist the n_timesteps n_parameters stuff or should be private/protected. The constructor, setters, getters, and anything exported to Python should remain here.

// Default constructor.
DynamicProgramming();

// Parameterized constructor.
DynamicProgramming(const Eigen::MatrixXd &data, int num_bkps_, int jump_,
int min_size_);

// Scales the dataset using min-max normalization.
void scale_data();

// Prepares data for linear regression.
void regression_setup(linear_fit_struct &lfit);

// Calculates the regression line for a given data segment.
Eigen::VectorXd regression_line(int start, int end, int dim,
linear_fit_struct &lfit);

// Generates predicted values based on the linear regression model.
Eigen::MatrixXd predicted(int start, int end, linear_fit_struct &lfit);

// Calculates L2 cost (Euclidean distance) between predicted and actual data.
double l2_cost(Eigen::MatrixXd &predicted_y, int start, int end);

// Computes the cost of a specific data segment using linear regression.
double cost_function(int start, int end);

// Initializes and fills the cost matrix for all data segments.
void initialize_cost_matrix();

// Recursive function for dynamic programming segmentation.
std::pair<double, std::vector<int>> seg(int start, int end, int num_bkps);

//sets number of threads for parallelization
void set_parallelization(int num_threads);

// Returns the optimal set of breakpoints after segmentation.
std::vector<int> return_breakpoints();

// Getter functions for accessing private class members.
int get_num_timesteps();
int get_num_parameters();
int get_num_bkps();
Eigen::MatrixXd &getDatum();
DynamicProgramming::UpperTriangularMatrix &getCostMatrix();

// Setter functions for modifying private class members.
void set_num_timesteps(int value);
void set_num_parameters(int value);
void setDatum(const Eigen::MatrixXd &value);
void
setCostMatrix(const DynamicProgramming::UpperTriangularMatrix &value);
};
25 changes: 25 additions & 0 deletions src/dupininterface.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#include "dupin.h"
npkamath marked this conversation as resolved.
Show resolved Hide resolved
#include <pybind11/eigen.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

namespace py = pybind11;

PYBIND11_MODULE(_dupin, m) {
py::class_<DynamicProgramming>(m, "DynamicProgramming")
.def(py::init<>())
.def_property("data", &DynamicProgramming::getDatum,
&DynamicProgramming::setDatum)
.def_property("cost_matrix", &DynamicProgramming::getCostMatrix,
&DynamicProgramming::setCostMatrix)
.def_property("num_bkps", &DynamicProgramming::get_num_bkps,
&DynamicProgramming::set_num_bkps)
.def_property("num_timesteps", &DynamicProgramming::get_num_timesteps,
&DynamicProgramming::set_num_timesteps)
.def_property("num_parameters", &DynamicProgramming::get_num_parameters,
&DynamicProgramming::set_num_parameters)
npkamath marked this conversation as resolved.
Show resolved Hide resolved
.def("initialize_cost_matrix",
&DynamicProgramming::initialize_cost_matrix)
npkamath marked this conversation as resolved.
Show resolved Hide resolved
.def("return_breakpoints", &DynamicProgramming::return_breakpoints)
npkamath marked this conversation as resolved.
Show resolved Hide resolved
.def("set_threads", &DynamicProgramming::set_parallelization);
}
Loading