From 0901665f34faf5e60f2ef12a7f18bfef9d90e6d2 Mon Sep 17 00:00:00 2001 From: sronilsson Date: Sat, 9 Nov 2024 09:43:22 -0500 Subject: [PATCH] outlier correction mp --- docs/simba.outlier_tools.rst | 20 +- simba/SimBA.py | 2 +- simba/mixins/statistics_mixin.py | 15 +- .../outlier_corrector_location.py | 266 +++++++----------- 4 files changed, 132 insertions(+), 171 deletions(-) diff --git a/docs/simba.outlier_tools.rst b/docs/simba.outlier_tools.rst index 40f527831..c74021b9c 100644 --- a/docs/simba.outlier_tools.rst +++ b/docs/simba.outlier_tools.rst @@ -1,17 +1,33 @@ -Location outlier methods +Location outlier remover -------------------------------------------------------- .. automodule:: simba.outlier_tools.outlier_corrector_location.OutlierCorrecterLocation :members: :show-inheritance: -Movement outlier methods +Movement outlier remover -------------------------------------------------------- .. automodule:: simba.outlier_tools.outlier_corrector_movement.OutlierCorrecterMovement :members: :show-inheritance: + +Movement outlier remover: multi-core +-------------------------------------------------------- + +.. automodule:: simba.outlier_tools.outlier_corrector_movement_mp.OutlierCorrecterMovementMultiProcess + :members: + :show-inheritance: + + +Location outlier remover: multi-core +-------------------------------------------------------- + +.. 
automodule:: simba.outlier_tools.outlier_corrector_location_mp.OutlierCorrecterLocationMultiprocess + :members: + :show-inheritance: + Advanced movement outlier correction -------------------------------------------------------- diff --git a/simba/SimBA.py b/simba/SimBA.py index 088771906..d992929a4 100644 --- a/simba/SimBA.py +++ b/simba/SimBA.py @@ -324,7 +324,7 @@ def __init__(self, config_path: str): label_outliercorrection = CreateLabelFrameWithIcon(parent=tab4, header="OUTLIER CORRECTION", icon_name=Keys.DOCUMENTATION.value, icon_link=Links.OUTLIERS_DOC.value) button_settings_outlier = SimbaButton(parent=label_outliercorrection, txt="SETTINGS", txt_clr='blue', img='settings', font=Formats.FONT_REGULAR.value, cmd=OutlierSettingsPopUp, cmd_kwargs={'config_path': lambda:self.config_path}) - button_outliercorrection = SimbaButton(parent=label_outliercorrection, txt="RUN OUTLIER CORRECTION", txt_clr='green', img='rocket', font=Formats.FONT_REGULAR.value, cmd=self.correct_outlier, thread=True) + button_outliercorrection = SimbaButton(parent=label_outliercorrection, txt="RUN OUTLIER CORRECTION", txt_clr='green', img='rocket', font=Formats.FONT_REGULAR.value, cmd=self.correct_outlier, thread=False) button_skipOC = SimbaButton(parent=label_outliercorrection, txt="SKIP OUTLIER CORRECTION (CAUTION)", txt_clr='red', img='skip_2', font=Formats.FONT_REGULAR.value, cmd=self.initiate_skip_outlier_correction, thread=True) label_extractfeatures = CreateLabelFrameWithIcon(parent=tab5, header="EXTRACT FEATURES", icon_name=Keys.DOCUMENTATION.value, icon_link=Links.EXTRACT_FEATURES.value) diff --git a/simba/mixins/statistics_mixin.py b/simba/mixins/statistics_mixin.py index f6178bea9..a3e4049d6 100644 --- a/simba/mixins/statistics_mixin.py +++ b/simba/mixins/statistics_mixin.py @@ -4192,6 +4192,13 @@ def symmetry_index(x: np.ndarray, y: np.ndarray, agg_type: Literal['mean', 'medi Zero indicates perfect symmetry. 
Positive values pepresent increasing asymmetry between the two measurements. + The Symmetry Index (SI) is calculated as: + + .. math:: + SI = \frac{|x_i - y_i|}{0.5 \times (x_i + y_i)} \times 100 + + where :math:`x_i` and :math:`y_i` are the values of the two measurements at each time point. + :param np.ndarray x: A 1-dimensional array of measurements from one side (e.g., left side), representing a time series or sequence of measurements. :param np.ndarray y: A 1-dimensional array of measurements from the other side (e.g., right side), of the same length as `x`. :param Literal['mean', 'median'] agg_type: The aggregation method used to summarize the Symmetry Index across all time points. @@ -4201,13 +4208,11 @@ def symmetry_index(x: np.ndarray, y: np.ndarray, agg_type: Literal['mean', 'medi :example: >>> x = np.random.randint(0, 155, (100,)) >>>y = np.random.randint(0, 155, (100,)) - >>> symmetry_index(x=x, y=y) + >>> Statistics.symmetry_index(x=x, y=y) """ - check_valid_array(data=x, source=f'{Statistics.symmetry_index.__name__} x', accepted_ndims=(1,), min_axis_0=1, - accepted_dtypes=Formats.NUMERIC_DTYPES.value) - check_valid_array(data=x, source=f'{Statistics.symmetry_index.__name__} y', accepted_ndims=(1,), min_axis_0=1, - accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=x, source=f'{Statistics.symmetry_index.__name__} x', accepted_ndims=(1,), min_axis_0=1, accepted_dtypes=Formats.NUMERIC_DTYPES.value) + check_valid_array(data=x, source=f'{Statistics.symmetry_index.__name__} y', accepted_ndims=(1,), min_axis_0=1, accepted_axis_0_shape=[x.shape[0]], accepted_dtypes=Formats.NUMERIC_DTYPES.value) check_str(name=f'{Statistics.symmetry_index.__name__} agg_type', value=agg_type, options=('mean', 'median')) si_values = np.abs(x - y) / (0.5 * (x + y)) * 100 if agg_type == 'mean': diff --git a/simba/outlier_tools/outlier_corrector_location.py b/simba/outlier_tools/outlier_corrector_location.py index 
b09d30ab2..3606f43dd 100644 --- a/simba/outlier_tools/outlier_corrector_location.py +++ b/simba/outlier_tools/outlier_corrector_location.py @@ -1,20 +1,22 @@ __author__ = "Simon Nilsson" +import functools +import multiprocessing import os -from typing import Union +from typing import Dict, Optional, Union import numpy as np import pandas as pd from simba.mixins.config_reader import ConfigReader +from simba.mixins.feature_extraction_mixin import FeatureExtractionMixin +from simba.utils.checks import check_float, check_if_dir_exists from simba.utils.enums import ConfigKey, Dtypes from simba.utils.printing import SimbaTimer, stdout_success -from simba.utils.read_write import (get_fn_ext, read_config_entry, read_df, - write_df) +from simba.utils.read_write import (find_files_of_filetypes_in_directory, get_fn_ext, read_config_entry, read_df, write_df) - -class OutlierCorrecterLocation(ConfigReader): +class OutlierCorrecterLocation(ConfigReader, FeatureExtractionMixin): """ Detect and amend outliers in pose-estimation data based in the location of the body-parts in the current frame relative to the location of the body-part in the preceding frame using heuristic rules. @@ -29,188 +31,126 @@ class OutlierCorrecterLocation(ConfigReader): :align: center :param Union[str, os.PathLike] config_path: path to SimBA project config file in Configparser format + :param Optional[Union[str, os.PathLike]] data_dir: The directory storing the input data. If None, then the ``outlier_corrected_movement`` directory of the SimBA project. + :param Optional[Union[str, os.PathLike]] save_dir: The directory to store the results. If None, then the ``outlier_corrected_movement_location`` directory of the SimBA project. + :param Optional[Dict[str, Dict[str, str]]] animal_dict: Dictionary holding the animal names, and the two body-parts to use to measure the mean or median size of the animals. If None, grabs the info from the SimBA project config. 
+ :param Optional[float] criterion: The criterion multiplier. If None, grabs the info from the SimBA project config. :example: >>> _ = OutlierCorrecterLocation(config_path='MyProjectConfig').run() """ def __init__(self, - config_path: Union[str, os.PathLike]): - - super().__init__(config_path=config_path) + config_path: Union[str, os.PathLike], + data_dir: Optional[Union[str, os.PathLike]] = None, + save_dir: Optional[Union[str, os.PathLike]] = None, + animal_dict: Optional[Dict[str, Dict[str, str]]] = None, + criterion: Optional[float] = None): + + ConfigReader.__init__(self, config_path=config_path, create_logger=False, read_video_info=False) + FeatureExtractionMixin.__init__(self) if not os.path.exists(self.outlier_corrected_dir): os.makedirs(self.outlier_corrected_dir) - if self.animal_cnt == 1: - self.animal_id = read_config_entry(self.config, - ConfigKey.MULTI_ANIMAL_ID_SETTING.value, - ConfigKey.MULTI_ANIMAL_IDS.value, - Dtypes.STR.value) - if self.animal_id != "None": - self.animal_bp_dict[self.animal_id] = self.animal_bp_dict.pop("Animal_1") - self.above_criterion_dict_dict = {} - self.below_criterion_dict_dict = {} - self.criterion = read_config_entry(self.config, ConfigKey.OUTLIER_SETTINGS.value, ConfigKey.LOCATION_CRITERION.value, Dtypes.FLOAT.value) - self.outlier_bp_dict = {} - for animal_name in self.animal_bp_dict.keys(): - self.outlier_bp_dict[animal_name] = {} - self.outlier_bp_dict[animal_name]["bp_1"] = read_config_entry(self.config, ConfigKey.OUTLIER_SETTINGS.value, "location_bodypart1_{}".format(animal_name.lower()),"str") - self.outlier_bp_dict[animal_name]["bp_2"] = read_config_entry(self.config, ConfigKey.OUTLIER_SETTINGS.value, "location_bodypart2_{}".format(animal_name.lower()), "str") - - def __find_location_outliers(self): - for animal_name, animal_data in self.bp_dict.items(): - animal_criterion = self.animal_criteria[animal_name] - self.above_criterion_dict_dict[self.video_name][animal_name] = {} - 
self.below_criterion_dict_dict[self.video_name][animal_name] = {} - for body_part_name, body_part_data in animal_data.items(): - self.above_criterion_dict_dict[self.video_name][animal_name][ - body_part_name - ] = [] - self.below_criterion_dict_dict[self.video_name][animal_name][ - body_part_name - ] = [] - for frame in range(body_part_data.shape[0]): - second_bp_names = [ - x for x in list(animal_data.keys()) if x != body_part_name - ] - first_bp_cord = body_part_data[frame] - distance_above_criterion_counter = 0 - for second_bp in second_bp_names: - second_bp_cord = animal_data[second_bp][frame] - distance = np.sqrt( - (first_bp_cord[0] - second_bp_cord[0]) ** 2 - + (first_bp_cord[1] - second_bp_cord[1]) ** 2 - ) - if distance > animal_criterion: - distance_above_criterion_counter += 1 - if distance_above_criterion_counter > 1: - self.above_criterion_dict_dict[self.video_name][animal_name][ - body_part_name - ].append(frame) - else: - self.below_criterion_dict_dict[self.video_name][animal_name][ - body_part_name - ].append(frame) - - def __correct_outliers(self): - above_citeria_dict = self.above_criterion_dict_dict[self.video_name] - for animal_name, animal_bp_data in above_citeria_dict.items(): - for bp_name, outlier_idx_lst in animal_bp_data.items(): - body_part_x, body_part_y = bp_name + "_x", bp_name + "_y" - for outlier_idx in outlier_idx_lst: - try: - closest_idx = max( - [ - i - for i in self.below_criterion_dict_dict[ - self.video_name - ][animal_name][bp_name] - if outlier_idx > i - ] - ) - except ValueError: - closest_idx = outlier_idx - self.data_df.loc[[outlier_idx], body_part_x] = self.data_df.loc[ - [closest_idx], body_part_x - ].values[0] - self.data_df.loc[[outlier_idx], body_part_y] = self.data_df.loc[ - [closest_idx], body_part_y - ].values[0] + if criterion is None: + self.criterion = read_config_entry(self.config, ConfigKey.OUTLIER_SETTINGS.value, ConfigKey.LOCATION_CRITERION.value, Dtypes.FLOAT.value) + else: + check_float(name=f'{criterion} 
criterion', value=criterion, min_value=10e-10) + self.criterion = criterion + if data_dir is not None: + check_if_dir_exists(in_dir=data_dir, source=self.__class__.__name__) + self.data_dir = data_dir + else: + self.data_dir = self.outlier_corrected_movement_dir + if save_dir is not None: + check_if_dir_exists(in_dir=save_dir, source=self.__class__.__name__) + self.save_dir = save_dir + else: + self.save_dir = self.outlier_corrected_dir + + self.above_criterion_dict_dict, self.below_criterion_dict_dict = {},{} + if animal_dict is None: + self.outlier_bp_dict = {} + if self.animal_cnt == 1: + self.animal_id = read_config_entry(self.config, ConfigKey.MULTI_ANIMAL_ID_SETTING.value, ConfigKey.MULTI_ANIMAL_IDS.value, Dtypes.STR.value) + if self.animal_id != "None": + self.animal_bp_dict[self.animal_id] = self.animal_bp_dict.pop("Animal_1") + + for animal_name in self.animal_bp_dict.keys(): + self.outlier_bp_dict[animal_name] = {} + self.outlier_bp_dict[animal_name]["bp_1"] = read_config_entry(self.config, ConfigKey.OUTLIER_SETTINGS.value, "location_bodypart1_{}".format(animal_name.lower()),"str") + self.outlier_bp_dict[animal_name]["bp_2"] = read_config_entry(self.config, ConfigKey.OUTLIER_SETTINGS.value, "location_bodypart2_{}".format(animal_name.lower()),"str") + else: + self.outlier_bp_dict = animal_dict + + def __find_location_outliers(self, bp_dict: dict, animal_criteria: dict): + above_criteria_dict, below_criteria_dict = {}, {} + for animal_name, animal_data in bp_dict.items(): + animal_criterion = animal_criteria[animal_name] + above_criteria_dict[animal_name]= {} + for first_bp_cnt, (first_body_part_name, first_bp_cords) in enumerate(animal_data.items()): + second_bp_names = [x for x in list(animal_data.keys()) if x != first_body_part_name] + above_criterion_frms = [] + for second_bp_cnt, second_bp in enumerate(second_bp_names): + second_bp_cords = animal_data[second_bp] + distances = self.framewise_euclidean_distance(location_1=first_bp_cords, 
location_2=second_bp_cords, px_per_mm=1.0, centimeter=False) + above_criterion_frms.extend(np.argwhere(distances > animal_criterion).flatten()) + unique, counts = np.unique(above_criterion_frms, return_counts=True) + above_criteria_dict[animal_name][first_body_part_name] = np.sort(unique[counts > 1]) + return above_criteria_dict + + + def __correct_outliers(self, df: pd.DataFrame, above_criteria_dict: dict): + for animal_name, animal_data in above_criteria_dict.items(): + for body_part_name, frm_idx in animal_data.items(): + col_names = [f'{body_part_name}_x', f'{body_part_name}_y'] + if len(frm_idx) > 0: + df.loc[frm_idx, col_names] = np.nan + return df.fillna(method='ffill', axis=1).fillna(0) def run(self): - """ - Runs outlier detection and correction. Results are stored in the - ``project_folder/csv/outlier_corrected_movement_location`` directory of the SimBA project. - """ - - for file_cnt, file_path in enumerate(self.outlier_corrected_movement_paths): + self.logs, self.frm_cnts = {}, {} + data_paths = find_files_of_filetypes_in_directory(directory=self.data_dir, extensions=[f'.{self.file_type}'], raise_error=True) + for file_cnt, data_path in enumerate(data_paths): video_timer = SimbaTimer(start=True) - _, self.video_name, _ = get_fn_ext(file_path) - print( - f"Processing video {self.video_name}. Video {file_cnt+1}/{len(self.outlier_corrected_movement_paths)}.." - ) - self.above_criterion_dict_dict[self.video_name] = {} - self.below_criterion_dict_dict[self.video_name] = {} - save_path = os.path.join( - self.outlier_corrected_dir, self.video_name + "." 
+ self.file_type - ) - self.data_df = read_df(file_path, self.file_type) - self.animal_criteria = {} + _, video_name, _ = get_fn_ext(data_path) + print(f"Processing video {video_name}..") + save_path = os.path.join(self.save_dir, f"{video_name}.{self.file_type}") + above_criterion_dict, below_criterion_dict, animal_criteria, bp_dict = {}, {}, {}, {} + df = read_df(data_path, self.file_type) for animal_name, animal_bps in self.outlier_bp_dict.items(): - animal_bp_distances = np.sqrt( - ( - self.data_df[animal_bps["bp_1"] + "_x"] - - self.data_df[animal_bps["bp_2"] + "_x"] - ) - ** 2 - + ( - self.data_df[animal_bps["bp_1"] + "_y"] - - self.data_df[animal_bps["bp_2"] + "_y"] - ) - ** 2 - ) - self.animal_criteria[animal_name] = ( - animal_bp_distances.mean() * self.criterion - ) - self.bp_dict = {} + animal_bp_distances = np.sqrt((df[animal_bps["bp_1"] + "_x"] - df[animal_bps["bp_2"] + "_x"]) ** 2 + (df[animal_bps["bp_1"] + "_y"] - df[animal_bps["bp_2"] + "_y"]) ** 2) + animal_criteria[animal_name] = (animal_bp_distances.mean() * self.criterion) for animal_name, animal_bps in self.animal_bp_dict.items(): - bp_col_names = np.array( - [[i, j] for i, j in zip(animal_bps["X_bps"], animal_bps["Y_bps"])] - ).ravel() - animal_arr = self.data_df[bp_col_names].to_numpy() - self.bp_dict[animal_name] = {} + bp_col_names = np.array([[i, j] for i, j in zip(animal_bps["X_bps"], animal_bps["Y_bps"])]).ravel() + animal_arr = df[bp_col_names].to_numpy() + bp_dict[animal_name] = {} for bp_cnt, bp_col_start in enumerate(range(0, animal_arr.shape[1], 2)): bp_name = animal_bps["X_bps"][bp_cnt][:-2] - self.bp_dict[animal_name][bp_name] = animal_arr[ - :, bp_col_start : bp_col_start + 2 - ] - self.__find_location_outliers() - self.__correct_outliers() - write_df(df=self.data_df, file_type=self.file_type, save_path=save_path) + bp_dict[animal_name][bp_name] = animal_arr[:, bp_col_start: bp_col_start + 2] + above_criteria_dict = self.__find_location_outliers(bp_dict=bp_dict, 
animal_criteria=animal_criteria) + + df = self.__correct_outliers(df=df, above_criteria_dict=above_criteria_dict) + write_df(df=df, file_type=self.file_type, save_path=save_path) + self.logs[video_name], self.frm_cnts[video_name] = above_criteria_dict, len(df) video_timer.stop_timer() - print( - f"Corrected location outliers for file {self.video_name} (elapsed time: {video_timer.elapsed_time_str}s)..." - ) + print(f"Corrected location outliers for file {video_name} (elapsed time: {video_timer.elapsed_time_str}s)...") self.__save_log_file() def __save_log_file(self): - out_df_lst = [] - for video_name, video_data in self.above_criterion_dict_dict.items(): + out_df = pd.DataFrame(columns=['VIDEO', 'ANIMAL', 'BODY-PART', 'CORRECTION COUNT', 'CORRECTION RATIO']) + for video_name, video_data in self.logs.items(): for animal_name, animal_data in video_data.items(): - for bp_name, vid_idx_lst in animal_data.items(): - correction_ratio = round(len(vid_idx_lst) / len(self.data_df), 6) - out_df_lst.append( - pd.DataFrame( - [ - [ - video_name, - animal_name, - bp_name, - len(vid_idx_lst), - correction_ratio, - ] - ], - columns=[ - "Video", - "Animal", - "Body-part", - "Corrections", - "Correction ratio (%)", - ], - ) - ) - out_df = pd.concat(out_df_lst, axis=0).reset_index(drop=True) - self.logs_path = os.path.join( - self.logs_path, f"Outliers_location_{self.datetime}.csv" - ) + for bp_name, bp_data in animal_data.items(): + correction_ratio = round(len(bp_data) / self.frm_cnts[video_name], 6) + out_df.loc[len(out_df)] = [video_name, animal_name, bp_name, len(bp_data), correction_ratio] + self.logs_path = os.path.join(self.logs_path, f"Outliers_location_{self.datetime}.csv") out_df.to_csv(self.logs_path) self.timer.stop_timer() - stdout_success( - msg='Log for corrected "location outliers" saved in project_folder/logs', - elapsed_time=self.timer.elapsed_time_str, - ) + stdout_success(msg='Log for corrected "location outliers" saved in project_folder/logs', 
elapsed_time=self.timer.elapsed_time_str) + -# test = OutlierCorrecterLocation(config_path='/Users/simon/Desktop/envs/troubleshooting/naresh/project_folder/project_config.ini') +# test = OutlierCorrecterLocation(config_path=r"C:\troubleshooting\two_black_animals_14bp\project_folder\project_config.ini") # test.run() # test = OutlierCorrecterLocation(config_path='/Users/simon/Desktop/envs/troubleshooting/two_black_animals_14bp/project_folder/project_config.ini')