From 087c4f6920aea9a14d00d049306bb69180c11191 Mon Sep 17 00:00:00 2001
From: sronilsson
Date: Thu, 12 Dec 2024 10:22:29 -0500
Subject: [PATCH] egocentric align

---
 docs/nb/geometry_ex_3.ipynb                        |   2 +-
 docs/nb/geometry_example_3.ipynb                   |   2 +-
 docs/simba.model_mixin.rst                         |  14 ++
 simba/model/regression/model.py                    | 141 ++++++++++++------
 .../third_party_appender.py                        |  22 +++
 simba/utils/read_write.py                          |   4 +-
 6 files changed, 133 insertions(+), 52 deletions(-)

diff --git a/docs/nb/geometry_ex_3.ipynb b/docs/nb/geometry_ex_3.ipynb
index 89a7d927f..798eab64a 100644
--- a/docs/nb/geometry_ex_3.ipynb
+++ b/docs/nb/geometry_ex_3.ipynb
@@ -5,7 +5,7 @@
    "id": "3fe482b5",
    "metadata": {},
    "source": [
-    "# Geometry computations: Example 3"
+    "# Geometry computations Example 3: Animal paths"
    ]
   },
   {
diff --git a/docs/nb/geometry_example_3.ipynb b/docs/nb/geometry_example_3.ipynb
index f4785be1f..ed97be0ef 100644
--- a/docs/nb/geometry_example_3.ipynb
+++ b/docs/nb/geometry_example_3.ipynb
@@ -5,7 +5,7 @@
    "id": "536d575d",
    "metadata": {},
    "source": [
-    "# Geometry computations Example 3: Slice animal videos on CPU"
+    "# Geometry computations Example 4: Slice animal videos on CPU"
    ]
   },
   {
diff --git a/docs/simba.model_mixin.rst b/docs/simba.model_mixin.rst
index 142f5eaa9..1675bfc72 100644
--- a/docs/simba.model_mixin.rst
+++ b/docs/simba.model_mixin.rst
@@ -66,4 +66,18 @@ Ordinal classifier methods
    :undoc-members:
 
 
+Regression - metrics
+-------------------------------------------------
+
+.. automodule:: simba.model.regression.metrics
+   :members:
+   :show-inheritance:
+
+Regression - fit and transform
+-------------------------------------------------
+
+.. automodule:: simba.model.regression.model
+   :members:
+   :show-inheritance:
+
 
diff --git a/simba/model/regression/model.py b/simba/model/regression/model.py
index 6c518c618..fdf6542a9 100644
--- a/simba/model/regression/model.py
+++ b/simba/model/regression/model.py
@@ -1,10 +1,9 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
+from itertools import product
 
 import numpy as np
 import pandas as pd
 import xgboost as xgb
-from sklearn.model_selection import StratifiedKFold
-
 from simba.model.regression.metrics import (mean_absolute_error,
                                             mean_absolute_percentage_error,
                                             mean_squared_error, r2_score,
@@ -15,37 +14,44 @@
 from simba.utils.enums import Formats
 from simba.utils.errors import DataHeaderError
 
-
 def fit_xgb(x: pd.DataFrame,
             y: np.ndarray,
-            objective: Optional[str] = 'reg:squarederror',
-            n_estimators: Optional[int] = 100,
-            max_depth: Optional[int] = 6,
-            verbosity: Optional[int] = 1,
-            learning_rate: Optional[float] = 0.3,
-            tree_method: Optional[str] = 'auto'):
+            xgb_reg: xgb.XGBRegressor) -> xgb.XGBRegressor:
     """
+    Fits an XGBoost regressor to the given data.
+
+    :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric dtypes.
+    :param np.ndarray y: Target values; a 1-dimensional numeric array with the same number of rows as `x`.
+    :param xgb.XGBRegressor xgb_reg: A defined (unfitted) xgb.XGBRegressor, e.g., as returned by :func:`simba.model.regression.model.xgb_define`.
+    :return: Trained XGBoost regressor model.
+    :rtype: xgb.XGBRegressor
+
+    :example:
     >>> x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
     >>> y = np.random.randint(1, 6, (100,))
-    >>> mdl = fit_xgb(x=x, y=y)
+    >>> mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define())
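+    >>> # Or, with explicit (illustrative) hyperparameters passed to xgb_define:
+    >>> mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define(n_estimators=250, max_depth=8))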
     """
-    OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror')
-    TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist')
     check_valid_dataframe(df=x, source=f'{fit_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
     check_valid_array(data=y, source=f'{fit_xgb.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
-    check_str(name=f'{fit_xgb.__name__} objective', value=objective, options=OBJECTIVES)
-    check_str(name=f'{fit_xgb.__name__} tree_method', value=tree_method, options=TREE_METHODS)
-    check_int(name=f'{fit_xgb.__name__} n_estimators', value=n_estimators, min_value=1)
-    check_int(name=f'{fit_xgb.__name__} max_depth', value=max_depth, min_value=1)
-    check_int(name=f'{fit_xgb.__name__} verbosity', value=verbosity, min_value=0, max_value=3)
-    check_float(name=f'{fit_xgb.__name__} learning_rate', value=learning_rate, min_value=0.1, max_value=1.0)
-    xgb_reg = xgb.XGBRegressor(objective=objective, max_depth=max_depth, n_estimators=n_estimators, verbosity=verbosity)
-
+    check_instance(source=f'{fit_xgb.__name__} xgb_reg', instance=xgb_reg, accepted_types=(xgb.XGBRegressor,))
     return xgb_reg.fit(X=x, y=y)
 
-def transform_xgb(x: pd.DataFrame, model: xgb.XGBRegressor):
+def transform_xgb(x: pd.DataFrame, model: xgb.XGBRegressor) -> np.ndarray:
     """
+    Generates predictions for the input data using a trained XGBoost model.
+
+    :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric dtypes.
+    :param xgb.XGBRegressor model: Trained XGBoost model to use for making predictions.
+    :return: Predictions rounded to 2 decimal places.
+    :rtype: np.ndarray
+
     :example:
     >>> x, y = pd.DataFrame(np.random.randint(0, 500, (100, 20))), np.random.randint(1, 6, (100,))
-    >>> mdl = fit_xgb(x=x, y=y)
+    >>> mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define())
+    >>> new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
+    >>> results = transform_xgb(x=new_x, model=mdl)
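+    >>> # The fitted model can score any feature matrix with the same 20 columns, e.g. (illustrative):
+    >>> more_results = transform_xgb(x=pd.DataFrame(np.random.randint(0, 500, (50, 20))), model=mdl)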
     """
@@ -110,34 +116,61 @@ def evaluate_xgb(y_pred: np.ndarray,
     return results
 
+def xgb_define(objective: str = 'reg:squarederror',
+               n_estimators: int = 100,
+               max_depth: int = 6,
+               verbosity: int = 1,
+               learning_rate: float = 0.3,
+               eta: float = 0.3,
+               gamma: float = 0.0,
+               tree_method: str = 'auto') -> xgb.XGBRegressor:
+    """
+    Defines an (unfitted) XGBoost regressor from the passed hyperparameters, validating each argument before construction.
+
+    :return: Unfitted XGBoost regressor with the passed hyperparameters.
+    :rtype: xgb.XGBRegressor
+    """
+    OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror')
+    TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist')
+    check_str(name=f'{xgb_define.__name__} objective', value=objective, options=OBJECTIVES)
+    check_str(name=f'{xgb_define.__name__} tree_method', value=tree_method, options=TREE_METHODS)
+    check_int(name=f'{xgb_define.__name__} n_estimators', value=n_estimators, min_value=1)
+    check_int(name=f'{xgb_define.__name__} max_depth', value=max_depth, min_value=1)
+    check_int(name=f'{xgb_define.__name__} verbosity', value=verbosity, min_value=0, max_value=3)
+    check_float(name=f'{xgb_define.__name__} learning_rate', value=learning_rate, min_value=0.1, max_value=1.0)
+    check_float(name=f'{xgb_define.__name__} eta', value=eta, min_value=0.0, max_value=1.0)
+    check_float(name=f'{xgb_define.__name__} gamma', value=gamma, min_value=0.0)
+    return xgb.XGBRegressor(objective=objective, max_depth=max_depth, n_estimators=n_estimators, verbosity=verbosity, learning_rate=learning_rate, eta=eta, gamma=gamma, tree_method=tree_method)
 
-def kfold_fit_xgb(x: pd.DataFrame,
-                  y: np.ndarray,
-                  objective: Optional[str] = 'reg:squarederror',
-                  n_estimators: Optional[int] = 100,
-                  max_depth: Optional[int] = 6,
-                  verbosity: Optional[int] = 1,
-                  learning_rate: Optional[float] = 0.3,
-                  tree_method: Optional[str] = 'auto',
-                  k: Optional[int] = 5):
-    # USE xgb.cv
-    #
-    # OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror')
-    # TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist')
-    #
-    # check_valid_dataframe(df=x, source=f'{fit_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
-    # check_valid_array(data=y, source=f'{fit_xgb.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
-    # check_str(name=f'{fit_xgb.__name__} objective', value=objective, options=OBJECTIVES)
-    # check_str(name=f'{fit_xgb.__name__} tree_method', value=tree_method, options=TREE_METHODS)
-    # check_int(name=f'{fit_xgb.__name__} n_estimators', value=n_estimators, min_value=1)
-    # check_int(name=f'{fit_xgb.__name__} max_depth', value=max_depth, min_value=1)
-    # check_int(name=f'{fit_xgb.__name__} verbosity', value=verbosity, min_value=0, max_value=3)
-    # check_float(name=f'{fit_xgb.__name__} learning_rate', value=learning_rate, min_value=0.1, max_value=1.0)
-    # check_int(name=f'{fit_xgb.__name__} k', value=k, min_value=2)
-    # k_fold = StratifiedKFold(n_splits=k, shuffle=True)
-    # for fold_cnt, (train_index, test_index) in enumerate(k_fold.split(x, y)):
-    #     x_fold, y_fold = x.loc[train_index], y[test_index]
-
-    pass
+def xgb_grid_define(objective: Tuple[str] = ('reg:squarederror',),
+                    n_estimators: Tuple[int] = (100,),
+                    max_depth: Tuple[int] = (6,),
+                    verbosity: Tuple[int] = (1,),
+                    learning_rate: Tuple[float] = (0.3,),
+                    eta: Tuple[float] = (0.3,),
+                    gamma: Tuple[float] = (0.0,),
+                    tree_method: Tuple[str] = ('auto',)) -> List[xgb.XGBRegressor]:
+    """
+    Defines one unfitted XGBoost regressor for every combination (Cartesian product) of the passed hyperparameter values.
+    """
+    grid = list(product(objective, n_estimators, max_depth, verbosity, learning_rate, eta, gamma, tree_method))
+    mdls = []
+    for i in grid:
+        mdl = xgb_define(objective=i[0], n_estimators=i[1], max_depth=i[2], verbosity=i[3], learning_rate=i[4], eta=i[5], gamma=i[6], tree_method=i[7])
+        mdls.append(mdl)
+    return mdls
+
+def xgb_grid_fit(x: pd.DataFrame,
+                 y: np.ndarray,
+                 xgb_regs: List[xgb.XGBRegressor]) -> List[xgb.XGBRegressor]:
+    """
+    Fits each regressor in the passed list to the same data and returns the fitted models.
+    """
+    check_valid_lst(data=xgb_regs, source=xgb_grid_fit.__name__, valid_dtypes=(xgb.XGBRegressor,))
+    return [fit_xgb(x=x, y=y, xgb_reg=mdl) for mdl in xgb_regs]
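+
+# Example grid usage (an illustrative sketch, mirroring the commented examples below):
+# mdls = xgb_grid_define(max_depth=(6, 3), gamma=(0, 0.3))
+# fitted_mdls = xgb_grid_fit(x=x, y=y, xgb_regs=mdls)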
 
 # x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
 # y = np.random.randint(1, 6, (100,))
diff --git a/simba/third_party_label_appenders/third_party_appender.py b/simba/third_party_label_appenders/third_party_appender.py
index 79e67b780..208d13e11 100644
--- a/simba/third_party_label_appenders/third_party_appender.py
+++ b/simba/third_party_label_appenders/third_party_appender.py
@@ -193,6 +193,24 @@ def run(self):
 
 
 
+log = True
+file_format = '.csv'
+error_settings = {'INVALID annotations file data format': 'ERROR',
+                  'ADDITIONAL third-party behavior detected': 'NONE',
+                  'Annotations EVENT COUNT conflict': 'WARNING',
+                  'Annotations OVERLAP inaccuracy': 'WARNING',
+                  'ZERO third-party video behavior annotations found': 'WARNING',
+                  'Annotations and pose FRAME COUNT conflict': 'WARNING',
+                  'Annotations data file NOT FOUND': 'WARNING'}
+
+test = ThirdPartyLabelAppender(config_path=r"C:\troubleshooting\boris_test_2\project_folder\project_config.ini",
+                               data_dir=r"C:\troubleshooting\boris_test_2\project_folder\boris_files",
+                               app='BORIS',
+                               file_format=file_format,
+                               error_settings=error_settings,
+                               log=log)
+test.run()
+
 # log = True
 # file_format = 'xlsx'
 # error_settings = {'INVALID annotations file data format': 'WARNING',
diff --git a/simba/utils/read_write.py b/simba/utils/read_write.py
index 9b50be13d..3ee989333 100644
--- a/simba/utils/read_write.py
+++ b/simba/utils/read_write.py
@@ -2334,11 +2334,11 @@ def read_boris_file(file_path: Union[str, os.PathLike],
     expected_headers = [TIME, MEDIA_FILE_PATH, BEHAVIOR, STATUS]
     df = pd.read_csv(file_path)
     check_valid_dataframe(df=df, source=f'{read_boris_file.__name__} {file_path}', required_fields=expected_headers)
+    df = df.dropna(how='all').reset_index(drop=True)
     numeric_check = pd.to_numeric(df[TIME], errors='coerce').notnull().all()
     if not numeric_check:
         if raise_error:
-            raise InvalidInputError(
-                msg=f'SimBA found TIME DATA annotation in file {file_path} that could not be interpreted as numeric values (seconds or frame numbers)')
+            raise InvalidInputError(msg=f'SimBA found TIME DATA annotation in file {file_path} that could not be interpreted as numeric values (seconds or frame numbers)')
         else:
             ThirdPartyAnnotationsInvalidFileFormatWarning(annotation_app="BORIS", file_path=file_path, source=read_boris_file.__name__, log_status=log_setting)
             return {}
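
Putting the new regression API together, as a minimal end-to-end sketch using only functions added in this patch; the feature matrix and targets below are synthetic stand-ins for SimBA's frame-wise features:

    import numpy as np
    import pandas as pd
    from simba.model.regression.model import (fit_xgb, transform_xgb, xgb_define,
                                              xgb_grid_define, xgb_grid_fit)

    # Synthetic data: 100 frames, 20 numeric features, numeric targets.
    x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
    y = np.random.randint(1, 6, (100,))

    # Single model: define, fit, then predict on unseen data.
    mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define(objective='reg:squarederror', n_estimators=100))
    preds = transform_xgb(x=pd.DataFrame(np.random.randint(0, 500, (100, 20))), model=mdl)

    # Grid search: one unfitted regressor per hyperparameter combination (2 x 2 = 4 here),
    # each fitted to the same data.
    grid_mdls = xgb_grid_define(max_depth=(6, 3), gamma=(0, 0.3))
    fitted_mdls = xgb_grid_fit(x=x, y=y, xgb_regs=grid_mdls)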