From 087c4f6920aea9a14d00d049306bb69180c11191 Mon Sep 17 00:00:00 2001
From: sronilsson
Date: Thu, 12 Dec 2024 10:22:29 -0500
Subject: [PATCH] egocentric align

---
 docs/nb/geometry_ex_3.ipynb                        |   2 +-
 docs/nb/geometry_example_3.ipynb                   |   2 +-
 docs/simba.model_mixin.rst                         |  14 ++
 simba/model/regression/model.py                    | 141 ++++++++++++------
 .../third_party_appender.py                        |  22 +++
 simba/utils/read_write.py                          |   4 +-
 6 files changed, 133 insertions(+), 52 deletions(-)

diff --git a/docs/nb/geometry_ex_3.ipynb b/docs/nb/geometry_ex_3.ipynb
index 89a7d927f..798eab64a 100644
--- a/docs/nb/geometry_ex_3.ipynb
+++ b/docs/nb/geometry_ex_3.ipynb
@@ -5,7 +5,7 @@
    "id": "3fe482b5",
    "metadata": {},
    "source": [
-    "# Geometry computations: Example 3"
+    "# Geometry computations Example 3: Animal paths"
    ]
   },
   {
diff --git a/docs/nb/geometry_example_3.ipynb b/docs/nb/geometry_example_3.ipynb
index f4785be1f..ed97be0ef 100644
--- a/docs/nb/geometry_example_3.ipynb
+++ b/docs/nb/geometry_example_3.ipynb
@@ -5,7 +5,7 @@
    "id": "536d575d",
    "metadata": {},
    "source": [
-    "# Geometry computations Example 3: Slice animal videos on CPU"
+    "# Geometry computations Example 4: Slice animal videos on CPU"
    ]
   },
   {
diff --git a/docs/simba.model_mixin.rst b/docs/simba.model_mixin.rst
index 142f5eaa9..1675bfc72 100644
--- a/docs/simba.model_mixin.rst
+++ b/docs/simba.model_mixin.rst
@@ -66,4 +66,18 @@ Ordinal classifier methods
    :undoc-members:
 
 
+Regression - metrics
+-------------------------------------------------
+
+.. automodule:: simba.model.regression.metrics
+   :members:
+   :show-inheritance:
+
+Regression - fit and transform
+-------------------------------------------------
+
+.. automodule:: simba.model.regression.model
+   :members:
+   :show-inheritance:
+
 
diff --git a/simba/model/regression/model.py b/simba/model/regression/model.py
index 6c518c618..fdf6542a9 100644
--- a/simba/model/regression/model.py
+++ b/simba/model/regression/model.py
@@ -1,10 +1,9 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
+from itertools import product
 
 import numpy as np
 import pandas as pd
 import xgboost as xgb
-from sklearn.model_selection import StratifiedKFold
-
 from simba.model.regression.metrics import (mean_absolute_error,
                                             mean_absolute_percentage_error,
                                             mean_squared_error, r2_score,
@@ -15,37 +14,44 @@
 from simba.utils.enums import Formats
 from simba.utils.errors import DataHeaderError
 
-
 def fit_xgb(x: pd.DataFrame,
             y: np.ndarray,
-            objective: Optional[str] = 'reg:squarederror',
-            n_estimators: Optional[int] = 100,
-            max_depth: Optional[int] = 6,
-            verbosity: Optional[int] = 1,
-            learning_rate: Optional[float] = 0.3,
-            tree_method: Optional[str] = 'auto'):
+            xgb_reg: xgb.XGBRegressor) -> xgb.XGBRegressor:
     """
+    Fits an XGBoost regressor to the given data.
+
+    :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric dtypes.
+    :param np.ndarray y: Target values; a 1-dimensional numeric array with the same number of rows as `x`.
+    :param xgb.XGBRegressor xgb_reg: A defined (unfitted) xgb.XGBRegressor, e.g., as returned by :func:`simba.model.regression.model.xgb_define`.
+    :return: Trained XGBoost regressor model.
+    :rtype: xgb.XGBRegressor
+
+    :example:
     >>> x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
     >>> y = np.random.randint(1, 6, (100,))
-    >>> mdl = fit_xgb(x=x, y=y)
+    >>> mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define())
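+    >>> # Or, with explicit (illustrative) hyperparameters passed to xgb_define:
+    >>> mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define(n_estimators=250, max_depth=8))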
     """
-    OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror')
-    TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist')
     check_valid_dataframe(df=x, source=f'{fit_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
     check_valid_array(data=y, source=f'{fit_xgb.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
-    check_str(name=f'{fit_xgb.__name__} objective', value=objective, options=OBJECTIVES)
-    check_str(name=f'{fit_xgb.__name__} tree_method', value=tree_method, options=TREE_METHODS)
-    check_int(name=f'{fit_xgb.__name__} n_estimators', value=n_estimators, min_value=1)
-    check_int(name=f'{fit_xgb.__name__} max_depth', value=max_depth, min_value=1)
-    check_int(name=f'{fit_xgb.__name__} verbosity', value=verbosity, min_value=0, max_value=3)
-    check_float(name=f'{fit_xgb.__name__} learning_rate', value=learning_rate, min_value=0.1, max_value=1.0)
-    xgb_reg = xgb.XGBRegressor(objective=objective, max_depth=max_depth, n_estimators=n_estimators, verbosity=verbosity)
-
+    check_instance(source=f'{fit_xgb.__name__} xgb_reg', instance=xgb_reg, accepted_types=(xgb.XGBRegressor,))
     return xgb_reg.fit(X=x, y=y)
 
-def transform_xgb(x: pd.DataFrame, model: xgb.XGBRegressor):
+def transform_xgb(x: pd.DataFrame, model: xgb.XGBRegressor) -> np.ndarray:
     """
+    Generates predictions for the input data using a trained XGBoost model.
+
+    :param pd.DataFrame x: Input feature matrix where each row represents a sample and each column a feature. The data must have numeric dtypes.
+    :param xgb.XGBRegressor model: Trained XGBoost model to use for making predictions.
+    :return: Predictions rounded to 2 decimal places.
+    :rtype: np.ndarray
+
     :example:
     >>> x, y = pd.DataFrame(np.random.randint(0, 500, (100, 20))), np.random.randint(1, 6, (100,))
-    >>> mdl = fit_xgb(x=x, y=y)
+    >>> mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define())
+    >>> new_x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
+    >>> results = transform_xgb(x=new_x, model=mdl)
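+    >>> # The fitted model can score any feature matrix with the same 20 columns, e.g. (illustrative):
+    >>> more_results = transform_xgb(x=pd.DataFrame(np.random.randint(0, 500, (50, 20))), model=mdl)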
     """
@@ -110,34 +116,61 @@ def evaluate_xgb(y_pred: np.ndarray,
     return results
 
+def xgb_define(objective: str = 'reg:squarederror',
+               n_estimators: int = 100,
+               max_depth: int = 6,
+               verbosity: int = 1,
+               learning_rate: float = 0.3,
+               eta: float = 0.3,
+               gamma: float = 0.0,
+               tree_method: str = 'auto') -> xgb.XGBRegressor:
+    """
+    Defines an (unfitted) XGBoost regressor from the passed hyperparameters, validating each argument before construction.
+
+    :return: Unfitted XGBoost regressor with the passed hyperparameters.
+    :rtype: xgb.XGBRegressor
+    """
+    OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror')
+    TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist')
+    check_str(name=f'{xgb_define.__name__} objective', value=objective, options=OBJECTIVES)
+    check_str(name=f'{xgb_define.__name__} tree_method', value=tree_method, options=TREE_METHODS)
+    check_int(name=f'{xgb_define.__name__} n_estimators', value=n_estimators, min_value=1)
+    check_int(name=f'{xgb_define.__name__} max_depth', value=max_depth, min_value=1)
+    check_int(name=f'{xgb_define.__name__} verbosity', value=verbosity, min_value=0, max_value=3)
+    check_float(name=f'{xgb_define.__name__} learning_rate', value=learning_rate, min_value=0.1, max_value=1.0)
+    check_float(name=f'{xgb_define.__name__} eta', value=eta, min_value=0.0, max_value=1.0)
+    check_float(name=f'{xgb_define.__name__} gamma', value=gamma, min_value=0.0)
+    return xgb.XGBRegressor(objective=objective, max_depth=max_depth, n_estimators=n_estimators, verbosity=verbosity, learning_rate=learning_rate, eta=eta, gamma=gamma, tree_method=tree_method)
 
-def kfold_fit_xgb(x: pd.DataFrame,
-                  y: np.ndarray,
-                  objective: Optional[str] = 'reg:squarederror',
-                  n_estimators: Optional[int] = 100,
-                  max_depth: Optional[int] = 6,
-                  verbosity: Optional[int] = 1,
-                  learning_rate: Optional[float] = 0.3,
-                  tree_method: Optional[str] = 'auto',
-                  k: Optional[int] = 5):
-    # USE xgb.cv
-    #
-    # OBJECTIVES = ('reg:squarederror', 'reg:squaredlogerror', 'reg:logistic', 'reg:pseudohubererror')
-    # TREE_METHODS = ('auto', 'exact', 'approx', 'hist', 'gpu_hist')
-    #
-    # check_valid_dataframe(df=x, source=f'{fit_xgb.__name__} x', valid_dtypes=Formats.NUMERIC_DTYPES.value)
-    # check_valid_array(data=y, source=f'{fit_xgb.__name__} y', accepted_ndims=(1,), accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value)
-    # check_str(name=f'{fit_xgb.__name__} objective', value=objective, options=OBJECTIVES)
-    # check_str(name=f'{fit_xgb.__name__} tree_method', value=tree_method, options=TREE_METHODS)
-    # check_int(name=f'{fit_xgb.__name__} n_estimators', value=n_estimators, min_value=1)
-    # check_int(name=f'{fit_xgb.__name__} max_depth', value=max_depth, min_value=1)
-    # check_int(name=f'{fit_xgb.__name__} verbosity', value=verbosity, min_value=0, max_value=3)
-    # check_float(name=f'{fit_xgb.__name__} learning_rate', value=learning_rate, min_value=0.1, max_value=1.0)
-    # check_int(name=f'{fit_xgb.__name__} k', value=k, min_value=2)
-    # k_fold = StratifiedKFold(n_splits=k, shuffle=True)
-    # for fold_cnt, (train_index, test_index) in enumerate(k_fold.split(x, y)):
-    #     x_fold, y_fold = x.loc[train_index], y[test_index]
-
-    pass
+def xgb_grid_define(objective: Tuple[str] = ('reg:squarederror',),
+                    n_estimators: Tuple[int] = (100,),
+                    max_depth: Tuple[int] = (6,),
+                    verbosity: Tuple[int] = (1,),
+                    learning_rate: Tuple[float] = (0.3,),
+                    eta: Tuple[float] = (0.3,),
+                    gamma: Tuple[float] = (0.0,),
+                    tree_method: Tuple[str] = ('auto',)) -> List[xgb.XGBRegressor]:
+    """
+    Defines one unfitted XGBoost regressor for every combination (Cartesian product) of the passed hyperparameter values.
+    """
+    grid = list(product(objective, n_estimators, max_depth, verbosity, learning_rate, eta, gamma, tree_method))
+    mdls = []
+    for i in grid:
+        mdl = xgb_define(objective=i[0], n_estimators=i[1], max_depth=i[2], verbosity=i[3], learning_rate=i[4], eta=i[5], gamma=i[6], tree_method=i[7])
+        mdls.append(mdl)
+    return mdls
+
+def xgb_grid_fit(x: pd.DataFrame,
+                 y: np.ndarray,
+                 xgb_regs: List[xgb.XGBRegressor]) -> List[xgb.XGBRegressor]:
+    """
+    Fits each regressor in the passed list to the same data and returns the fitted models.
+    """
+    check_valid_lst(data=xgb_regs, source=xgb_grid_fit.__name__, valid_dtypes=(xgb.XGBRegressor,))
+    return [fit_xgb(x=x, y=y, xgb_reg=mdl) for mdl in xgb_regs]
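+
+# Example grid usage (an illustrative sketch, mirroring the commented examples below):
+# mdls = xgb_grid_define(max_depth=(6, 3), gamma=(0, 0.3))
+# fitted_mdls = xgb_grid_fit(x=x, y=y, xgb_regs=mdls)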
 
 # x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
 # y = np.random.randint(1, 6, (100,))
diff --git a/simba/third_party_label_appenders/third_party_appender.py b/simba/third_party_label_appenders/third_party_appender.py
index 79e67b780..208d13e11 100644
--- a/simba/third_party_label_appenders/third_party_appender.py
+++ b/simba/third_party_label_appenders/third_party_appender.py
@@ -193,6 +193,24 @@ def run(self):
 
 
 
+log = True
+file_format = '.csv'
+error_settings = {'INVALID annotations file data format': 'ERROR',
+                  'ADDITIONAL third-party behavior detected': 'NONE',
+                  'Annotations EVENT COUNT conflict': 'WARNING',
+                  'Annotations OVERLAP inaccuracy': 'WARNING',
+                  'ZERO third-party video behavior annotations found': 'WARNING',
+                  'Annotations and pose FRAME COUNT conflict': 'WARNING',
+                  'Annotations data file NOT FOUND': 'WARNING'}
+
+test = ThirdPartyLabelAppender(config_path=r"C:\troubleshooting\boris_test_2\project_folder\project_config.ini",
+                               data_dir=r"C:\troubleshooting\boris_test_2\project_folder\boris_files",
+                               app='BORIS',
+                               file_format=file_format,
+                               error_settings=error_settings,
+                               log=log)
+test.run()
+
 # log = True
 # file_format = 'xlsx'
 # error_settings = {'INVALID annotations file data format': 'WARNING',
diff --git a/simba/utils/read_write.py b/simba/utils/read_write.py
index 9b50be13d..3ee989333 100644
--- a/simba/utils/read_write.py
+++ b/simba/utils/read_write.py
@@ -2334,11 +2334,11 @@ def read_boris_file(file_path: Union[str, os.PathLike],
     expected_headers = [TIME, MEDIA_FILE_PATH, BEHAVIOR, STATUS]
     df = pd.read_csv(file_path)
     check_valid_dataframe(df=df, source=f'{read_boris_file.__name__} {file_path}', required_fields=expected_headers)
+    df = df.dropna(how='all').reset_index(drop=True)
     numeric_check = pd.to_numeric(df[TIME], errors='coerce').notnull().all()
     if not numeric_check:
         if raise_error:
-            raise InvalidInputError(
-                msg=f'SimBA found TIME DATA annotation in file {file_path} that could not be interpreted as numeric values (seconds or frame numbers)')
+            raise InvalidInputError(msg=f'SimBA found TIME DATA annotation in file {file_path} that could not be interpreted as numeric values (seconds or frame numbers)')
         else:
             ThirdPartyAnnotationsInvalidFileFormatWarning(annotation_app="BORIS", file_path=file_path, source=read_boris_file.__name__, log_status=log_setting)
             return {}
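
Putting the new regression API together, as a minimal end-to-end sketch using only functions added in this patch; the feature matrix and targets below are synthetic stand-ins for SimBA's frame-wise features:

    import numpy as np
    import pandas as pd
    from simba.model.regression.model import (fit_xgb, transform_xgb, xgb_define,
                                              xgb_grid_define, xgb_grid_fit)

    # Synthetic data: 100 frames, 20 numeric features, numeric targets.
    x = pd.DataFrame(np.random.randint(0, 500, (100, 20)))
    y = np.random.randint(1, 6, (100,))

    # Single model: define, fit, then predict on unseen data.
    mdl = fit_xgb(x=x, y=y, xgb_reg=xgb_define(objective='reg:squarederror', n_estimators=100))
    preds = transform_xgb(x=pd.DataFrame(np.random.randint(0, 500, (100, 20))), model=mdl)

    # Grid search: one unfitted regressor per hyperparameter combination (2 x 2 = 4 here),
    # each fitted to the same data.
    grid_mdls = xgb_grid_define(max_depth=(6, 3), gamma=(0, 0.3))
    fitted_mdls = xgb_grid_fit(x=x, y=y, xgb_regs=grid_mdls)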