diff --git a/.github/actions/install-nopython-dependencies/action.yml b/.github/actions/install-nopython-dependencies/action.yml new file mode 100644 index 0000000..67dbd04 --- /dev/null +++ b/.github/actions/install-nopython-dependencies/action.yml @@ -0,0 +1,21 @@ +name: Install Non-Python Dependencies +description: Installs system packages needed to build dupin. +runs: + using: "composite" + steps: + - name: install-dependencies-linux + if: runner.os == 'Linux' + shell: bash + run: | + sudo apt-get update + sudo apt-get install libtbb12 libtbb-dev libeigen3-dev ninja-build + - name: install-dependencies-macos + if: runner.os == 'macOS' + shell: bash + run: | + brew update + brew install tbb eigen ninja + - name: install-dependencies-windows + if: runner.os == 'Windows' + shell: bash + run: choco install tbb eigen3 ninja diff --git a/.github/workflows/publish-packages.yml b/.github/workflows/publish-packages.yml index 71ecc96..bf86484 100644 --- a/.github/workflows/publish-packages.yml +++ b/.github/workflows/publish-packages.yml @@ -34,10 +34,11 @@ jobs: -r requirements/requirements-test.txt \ -r requirements/requirements-jit.txt \ -r requirements/requirements-data.txt - - name: Install pypa/build run: python -m pip install build + - name: Install System Packages + uses: ./.github/actions/install-nopython-dependencies - name: Build a binary wheel and a source tarball run: python -m build --outdir dist/ . diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index d7526b1..4b2fa04 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -45,7 +45,7 @@ jobs: python-version: ${{ matrix.python }} - name: Update pip/build packages run: | - pip install setuptools --upgrade + pip install pip --upgrade - name: Install newest dependencies run: | pip install -r requirements/requirements-test.txt @@ -60,6 +60,8 @@ jobs: run: | pip install -r requirements/requirements-jit.txt if: ${{ matrix.python != '3.12' }} + - name: Install system packages + uses: ./.github/actions/install-nopython-dependencies - name: Install the package run: | pip install -e . diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..c3227aa --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,16 @@ +cmake_minimum_required(VERSION 3.8) +project(dupin VERSION 0.0.1 LANGUAGES CXX) + +set(DEFAULT_BUILD_TYPE "Release") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE}) +endif() + +find_package(Eigen3 REQUIRED) +find_package(TBB REQUIRED) +# Use modern method for Python binding +set(PYBIND11_NEWPYTHON ON) +find_package(pybind11 CONFIG REQUIRED) + +add_subdirectory(src) +add_subdirectory(dupin) diff --git a/dupin/CMakeLists.txt b/dupin/CMakeLists.txt new file mode 100644 index 0000000..37fcf95 --- /dev/null +++ b/dupin/CMakeLists.txt @@ -0,0 +1,2 @@ + # Defaults to site-packages so only need package name + install(DIRECTORY ./ DESTINATION dupin FILES_MATCHING PATTERN "*.py") diff --git a/dupin/detect/dynp.py b/dupin/detect/dynp.py new file mode 100644 index 0000000..201b6cb --- /dev/null +++ b/dupin/detect/dynp.py @@ -0,0 +1,76 @@ +"""Implements dynamic programming class for optimal segementation algorithm.""" +import _dupin +import numpy as np + + +class DynP: + """Detects the change points in a time series. + + Attributes + ---------- + data: np.ndarray + Matrix storing the time series data. + num_bkps: int + Number of change points to detect. + jump: int + Interval for checking potential change points. Changing will + not provide optimal detection, but will reduce runtime. + min_size: int + Minimum size of a segment. Changing will not provide optimal + detection, but will reduce runtime. + + + Methods + ------- + __init__(self, data: np.ndarray, num_bkps: int, jump: int, min_size: int) + Initializes the DynamicProgramming instance with the time series data + and parameters. + set_num_threads(self, num_threads: int) + Sets the number of threads to be used for parallel computation. + fit(self, num_bkps: int) -> list + Calculates the cost matrix and identifies the optimal breakpoints in + the time series data. + + Example Usage + ------------- + >>> import numpy as np + >>> from dynp import DynP + >>> data = np.random.rand(100, 1) # Simulated time series data + >>> num_bkps = 3 # Number of breakpoints to detect + >>> jump = 1 # Interval for checking potential breakpoints + >>> min_size = 3 # Minimum size of a segment + >>> model = Dynp(data, num_bkps, jump, min_size) + >>> breakpoints = model.fit(num_bkps) + >>> print(breakpoints) + """ + + def __init__( + self, data: np.ndarray, num_bkps: int, jump: int, min_size: int + ): + """Initialize the DynamicProgramming instance with given parameters.""" + self._dupin = _dupin.DynamicProgramming(data, num_bkps, jump, min_size) + + def set_num_threads(self, num_threads: int): + """Set the number of threads for parallelization. + + Parameters + ---------- + num_threads: int + The number of threads to use during computation. Default + is determined automatically. + """ + self._dupin.set_threads(num_threads) + + def fit(self, num_breakpoints: int) -> list[int]: + """Calculate the cost matrix and return the breakpoints. + + Parameters + ---------- + num_bkps: int + number of change points to detect. + + Returns + ------- + list: A list of integers representing the breakpoints. + """ + return self._dupin.fit(num_breakpoints) diff --git a/pyproject.toml b/pyproject.toml index f701a67..d1433f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -build-backend = "setuptools.build_meta" -requires = ["setuptools >= 64.0.0"] +requires = ["scikit-build-core>=0.7.0", "pybind11"] +build-backend = "scikit_build_core.build" [project] name = "dupin" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..7fda63a --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,19 @@ +list(APPEND dupin_cxx "module.cpp" "dupin.h" "dupin.cpp") + +pybind11_add_module(_dupin ${dupin_cxx}) + +set_target_properties(_dupin PROPERTIES + CXX_STANDARD 17 + CMAKE_CXX_STANDARD_REQUIRED True +) + +target_include_directories(_dupin PRIVATE + ${EIGEN3_INCLUDE_DIR} + ${TBB_INCLUDE_DIRS} +) + +target_link_libraries(_dupin PRIVATE TBB::tbb) +target_compile_definitions(_dupin PRIVATE VERSION_INFO=${PROJECT_VERSION}) +target_compile_options(_dupin PRIVATE -O2 -march=native) +# Installs C++ extension into the root of the Python package +install(TARGETS _dupin LIBRARY DESTINATION dupin) diff --git a/src/dupin.cpp b/src/dupin.cpp new file mode 100644 index 0000000..a5681c5 --- /dev/null +++ b/src/dupin.cpp @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dupin.h" + +using namespace std; +using namespace Eigen; + +DynamicProgramming::DynamicProgramming() + : num_features(0), num_timesteps(0), jump(1), min_size(3), cost_matrix(0) {} + + +DynamicProgramming::DynamicProgramming(const Eigen::MatrixXd &data, + int jump_, int min_size_) + : data(data), jump(jump_), min_size(min_size_), cost_matrix(data.rows()) { + num_timesteps = data.rows(); + num_features = data.cols(); +} + +void DynamicProgramming::scale_data() { + Eigen::VectorXd min_val = data.colwise().minCoeff(); + Eigen::VectorXd max_val = data.colwise().maxCoeff(); + Eigen::VectorXd range = max_val - min_val; + + for (int j = 0; j (0, num_timesteps), + [&](const tbb::blocked_range &r) { + for (int i = r.begin(); i < r.end(); ++i) { + for (int j = i + min_size; j < num_timesteps; ++j) { + cost_matrix(i, j) = cost_function(i, j); + } + } + }); + cost_computed = true; +} + +std::pair> DynamicProgramming::seg(int start, int end, + int num_bkps) { + MemoKey key = {start, end, num_bkps}; + auto it = memo.find(key); + if (it != memo.end()) { + return it->second; + } + if (num_bkps == 0) { + return {cost_matrix(start, end), {end}}; + } + + std::pair> best = {std::numeric_limits::infinity(), {}}; + + for (int bkp = start + min_size; bkp < end; bkp++) { + if ((bkp - start) < min_size || (end - bkp) < min_size) { + continue; + } + auto left = seg(start, bkp, num_bkps - 1); + auto right = seg(bkp, end, 0); + double cost = left.first + right.first; + if (cost < best.first) { + best.first = cost; + best.second = left.second; + best.second.push_back(bkp); + best.second.insert(best.second.end(), right.second.begin(), + right.second.end()); + } + } + } + + memo[key] = best; + return best; +} + +std::vector DynamicProgramming::compute_breakpoints(int num_bkps) { + auto result = seg(0, num_timesteps - 1, num_bkps); + std::vector breakpoints = result.second; + return breakpoints; +} + +std::vector DynamicProgramming::fit(int num_bkps){ + if (!cost_computed){ + initialize_cost_matrix(); + } + return compute_breakpoints(num_bkps); +} + +void set_parallelization(int num_threads) { + static tbb::global_control gc(tbb::global_control::max_allowed_parallelism, + num_threads); +} + +DynamicProgramming::UpperTriangularMatrix & +DynamicProgramming::getCostMatrix() { + return cost_matrix; +} + +void DynamicProgramming::setCostMatrix( + const DynamicProgramming::UpperTriangularMatrix &value) { + cost_matrix = value; +} + +int main() { return 0; } diff --git a/src/dupin.h b/src/dupin.h new file mode 100644 index 0000000..18df333 --- /dev/null +++ b/src/dupin.h @@ -0,0 +1,129 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + + +// Calculates optimal breakpoints in time-series data using memoization +class DynamicProgramming { +private: + + //stores upper triangular cost matrix efficiently + class UpperTriangularMatrix { + private: + std::vector matrix; + std::vector row_indices; + int length; + + // Helper function to compute the row_indices vector + void compute_row_indices() { + row_indices.resize(length); + for (int row = 0; row < length; ++row) { + row_indices[row] = row * (2 * length - row + 1) / 2; + } + } + + public: + // Constructor that initializes the matrix and row_indices + UpperTriangularMatrix(int n) : length(n), matrix(n * (n + 1) / 2, 0.0) { + compute_row_indices(); + } + + double &operator()(int row, int col) { + int idx = row_indices[row] + col - row; // Use precomputed index + return matrix[idx]; + } + + int getSize() const { return length; } +}; + // Struct for memoization key, combining start, end, and number of + // breakpoints. + struct MemoKey { + int start; + int end; + int num_bkps; + + // Comparison operator for MemoKey. + bool operator==(const MemoKey &other) const { + return start == other.start && end == other.end && + num_bkps == other.num_bkps; + } + }; + + // Custom XOR-bit hash function for MemoKey, avoids clustering of data in + // unordered map to improve efficiency. + struct MemoKeyHash { + std::size_t operator()(const MemoKey &key) const { + return ((std::hash()(key.start) ^ + (std::hash()(key.end) << 1)) >> + 1) ^ + std::hash()(key.num_bkps); + } + }; + + // Memoization map to store the cost and partition for given parameters. + std::unordered_map>, MemoKeyHash> + memo; + + int num_features; // Number of features in the dataset. + int num_timesteps; // Number of data points (time steps). + int jump; // Interval for checking potential breakpoints. + int min_size; // Minimum size of a segment. + + Eigen::MatrixXd data; // Matrix storing the dataset. + UpperTriangularMatrix cost_matrix; //Matrix storing costs + bool cost_computed = false; + // Structure for storing linear regression parameters. + struct linear_fit_struct { + Eigen::MatrixXd y; // Dependent variable (labels). + Eigen::VectorXd x; // z Independent variable (time steps). + }; + // Scales the dataset using min-max normalization. + void scale_data(); + + // Prepares data for linear regression, setting up the independent variable 'x'. + void regression_setup(linear_fit_struct &lfit); + + // Computes regression parameters (slope and intercept) for all dimensions simultaneously. + Eigen::MatrixXd regression_lines(int start, int end, linear_fit_struct &lfit); + + // Generates predicted values based on the linear regression model for all features. + void predicted(int start, int end, linear_fit_struct &lfit, Eigen::MatrixXd &predicted_y); + + // Calculates L2 cost (Euclidean distance) between predicted and actual data for a given segment. + double l2_cost(const Eigen::MatrixXd &predicted_y, int start, int end); + + // Computes the cost of a specific data segment using linear regression and L2 cost. + double cost_function(int start, int end); + + // Recursive function for dynamic programming segmentation. + std::pair> seg(int start, int end, int num_bkps); + +// Initializes and fills the cost matrix for all data segments. + void initialize_cost_matrix(); + + // Returns the optimal set of breakpoints after segmentation. + std::vector compute_breakpoints(int num_bkps); + +public: + // Default constructor. + DynamicProgramming(); + + // Parameterized constructor. + DynamicProgramming(const Eigen::MatrixXd &data, int jump_, + int min_size_); + + //Sets number of threads for parallelization + void set_parallelization(int num_threads); + + // Calculates optimal breakpoints with given number of points. + std::vector fit(int num_bkps_in); + + // Getter functions for cost matrix. + DynamicProgramming::UpperTriangularMatrix &getCostMatrix(); + void setCostMatrix(const DynamicProgramming::UpperTriangularMatrix &value); +}; diff --git a/src/module.cpp b/src/module.cpp new file mode 100644 index 0000000..4ec5843 --- /dev/null +++ b/src/module.cpp @@ -0,0 +1,15 @@ +#include "dupin.h" +#include +#include +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_dupin, m) { + py::class_(m, "DynamicProgramming") + .def(py::init<>()) + .def_property("cost_matrix", &DynamicProgramming::getCostMatrix, + &DynamicProgramming::setCostMatrix) + .def("fit", &DynamicProgramming::fit) + .def("set_threads", &DynamicProgramming::set_parallelization); +}