Skip to content

Commit

Permalink
bento append
Browse files Browse the repository at this point in the history
  • Loading branch information
sronilsson committed Aug 28, 2024
1 parent dc0c5b6 commit 6b035d0
Show file tree
Hide file tree
Showing 8 changed files with 354 additions and 208 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# Setup configuration
setuptools.setup(
name="Simba-UW-tf-dev",
version="2.0.7",
version="2.0.8",
author="Simon Nilsson, Jia Jie Choong, Sophia Hwang",
author_email="[email protected]",
description="Toolkit for computer classification and analysis of behaviors in experimental animals",
Expand Down
2 changes: 0 additions & 2 deletions simba/data_processors/cuda/convex_hull.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
__email__ = "[email protected]"

from numba import cuda, njit
from copy import deepcopy
import numpy as np
import time

THREADS_PER_BLOCK = 128

Expand Down
40 changes: 40 additions & 0 deletions simba/data_processors/cuda/convex_hull_area.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from typing import Optional
import cupy as cp

from simba.utils.checks import check_float, check_valid_array
import numpy as np
from simba.utils.enums import Formats

def poly_area(data: np.ndarray,
              pixels_per_mm: Optional[float] = 1.0,
              batch_size: Optional[int] = int(0.5e+7)) -> np.ndarray:
    """
    Compute the area of each polygon in a stack of polygons on the GPU (shoelace formula).

    Each 2D slice along the first dimension of ``data`` is one polygon; each row of the slice is
    a vertex and the two columns are its x and y coordinates. Polygons are sent to the GPU in
    batches of ``batch_size`` so that large inputs do not have to fit in device memory at once.

    :param np.ndarray data: 3D numeric array of shape (N, M, 2): N polygons, M vertices per polygon, (x, y) per vertex.
    :param Optional[float] pixels_per_mm: Scaling factor the computed pixel area is divided by. Must be greater than 0. Default 1.0.
    :param Optional[int] batch_size: Number of polygons processed per GPU batch. Default 0.5e+7.
    :return: 1D float array of shape (N,) holding the scaled area of each polygon.
    :rtype: np.ndarray
    """

    check_valid_array(data=data, source=f'{poly_area.__name__} data', accepted_ndims=(3,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_float(name=f'{poly_area.__name__} pixels_per_mm', min_value=10e-16, value=pixels_per_mm)
    # Floating point buffer: an integer buffer cannot represent the NaN fill value and
    # would silently truncate fractional areas.
    results = cp.full((data.shape[0],), fill_value=cp.nan, dtype=cp.float64)
    for start in range(0, data.shape[0], batch_size):
        stop = start + batch_size
        x = cp.asarray(data[start:stop, :, 0])
        y = cp.asarray(data[start:stop, :, 1])
        # Rolling by one vertex pairs every vertex with its predecessor (wrapping around),
        # which is what the shoelace cross-products need.
        x_prev = cp.roll(x, shift=1, axis=1)
        y_prev = cp.roll(y, shift=1, axis=1)
        cross_a = cp.sum(x * y_prev, axis=1)
        cross_b = cp.sum(y * x_prev, axis=1)
        # NOTE(review): the area (pixels squared) is divided by pixels_per_mm once, not by its
        # square. Kept as-is because callers may rely on it - confirm the intended unit.
        results[start:stop] = (0.5 * cp.abs(cross_a - cross_b)) / pixels_per_mm

    return results.get()
2 changes: 1 addition & 1 deletion simba/data_processors/cuda/imgs_to_grayscale_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def img_stack_to_grayscale_cupy(imgs: np.ndarray,
check_if_valid_img(data=imgs[0], source=img_stack_to_grayscale_cupy.__name__)
if imgs.ndim != 4:
return imgs
results = cp.zeros((imgs.shape[0], imgs.shape[1], imgs.shape[2]), dtype=np.uint8)
results = cp.zeros((imgs.shape[0], imgs.shape[1], imgs.shape[2]), dtype=np.uint8)
n = int(np.ceil((imgs.shape[0] / batch_size)))
imgs = np.array_split(imgs, n)
start = 0
Expand Down
12 changes: 3 additions & 9 deletions simba/mixins/feature_extraction_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,17 +713,11 @@ def minimum_bounding_rectangle(points: np.ndarray) -> np.ndarray:
angles = np.arctan2(edges[:, 1], edges[:, 0])
angles = np.abs(np.mod(angles, pi2))
angles = np.unique(angles)
rotations = np.vstack(
[np.cos(angles), np.cos(angles - pi2), np.cos(angles + pi2), np.cos(angles)]
).T
rotations = np.vstack([np.cos(angles), np.cos(angles - pi2), np.cos(angles + pi2), np.cos(angles)]).T
rotations = rotations.reshape((-1, 2, 2))
rot_points = np.dot(rotations, hull_points.T)
min_x, max_x = np.nanmin(rot_points[:, 0], axis=1), np.nanmax(
rot_points[:, 0], axis=1
)
min_y, max_y = np.nanmin(rot_points[:, 1], axis=1), np.nanmax(
rot_points[:, 1], axis=1
)
min_x, max_x = np.nanmin(rot_points[:, 0], axis=1), np.nanmax(rot_points[:, 0], axis=1)
min_y, max_y = np.nanmin(rot_points[:, 1], axis=1), np.nanmax(rot_points[:, 1], axis=1)
areas = (max_x - min_x) * (max_y - min_y)
best_idx = np.argmin(areas)
x1, x2 = max_x[best_idx], min_x[best_idx]
Expand Down
265 changes: 144 additions & 121 deletions simba/third_party_label_appenders/BENTO_appender.py

Large diffs are not rendered by default.

134 changes: 62 additions & 72 deletions simba/third_party_label_appenders/tools.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,74 @@
from typing import Dict, List
from typing import Dict, List, Union, Optional
try:
from typing import Literal
except:
from typing_extensions import Literal

import numpy as np
import pandas as pd
import os

from simba.utils.data import detect_bouts
from simba.utils.enums import Methods
from simba.utils.errors import ColumnNotFoundError, InvalidFileTypeError
from simba.utils.read_write import get_fn_ext, read_video_info
from simba.utils.read_write import get_fn_ext, read_video_info, bento_file_reader, read_video_info_csv, find_files_of_filetypes_in_directory
from simba.utils.warnings import ThirdPartyAnnotationsInvalidFileFormatWarning
from simba.utils.checks import (check_valid_lst,
check_valid_dataframe,
check_all_file_names_are_represented_in_video_log,
check_str,
check_valid_boolean,
check_file_exist_and_readable,
check_if_dir_exists)

BENTO = "Bento"


def read_bento_files(data_paths: Union[List[str], str, os.PathLike],
                     video_info_df: Union[str, os.PathLike, pd.DataFrame],
                     error_setting: Optional[Literal[Methods.ERROR.value, Methods.WARNING.value]] = None,
                     log_setting: Optional[bool] = False) -> Dict[str, pd.DataFrame]:
    r"""
    Read one or more BENTO annotation files into a dictionary of dataframes, one per video.

    Every file is parsed with ``bento_file_reader`` (``orient='columns'``) and the per-behavior
    dataframes of a file are concatenated into a single dataframe. The FPS of each video is looked
    up in ``video_info_df`` using the annotation file name.

    :param Union[List[str], str, os.PathLike] data_paths: List of paths to BENTO annotation files, a path to a single annotation file, or a path to a directory. If a directory, all files with the ``.annot`` extension in it are read.
    :param Union[str, os.PathLike, pd.DataFrame] video_info_df: Path to the video info CSV file, or the already loaded dataframe.
    :param error_setting: If ``Methods.ERROR.value``, invalid files raise an exception. Otherwise (``Methods.WARNING.value`` or None) the reader warns about the invalid file and the file is left out of the results.
    :param Optional[bool] log_setting: Passed on to ``bento_file_reader`` as the log status used when warning about invalid files.
    :return: Dictionary with video names as keys and the combined annotation dataframe of each video as values. Files that yielded no annotations are not represented.
    :rtype: Dict[str, pd.DataFrame]

    :example:
    >>> dfs = read_bento_files(data_paths=r"C:\troubleshooting\bento_test\bento_files", error_setting='WARNING', log_setting=False, video_info_df=r"C:\troubleshooting\bento_test\project_folder\logs\video_info.csv")
    """

    if error_setting is not None:
        check_str(name=f'{read_bento_files.__name__} error_setting', value=error_setting, options=(Methods.ERROR.value, Methods.WARNING.value))
    check_valid_boolean(value=log_setting, source=f'{read_bento_files.__name__} log_setting')
    raise_error = error_setting == Methods.ERROR.value

    # Accept pathlib-style paths as well as plain strings, as the signature advertises.
    if isinstance(video_info_df, (str, os.PathLike)):
        check_file_exist_and_readable(file_path=video_info_df)
        video_info_df = read_video_info_csv(file_path=video_info_df)
    # Validate the dataframe before anything below reads from it.
    check_valid_dataframe(df=video_info_df, source=read_bento_files.__name__)

    if isinstance(data_paths, list):
        check_valid_lst(data=data_paths, source=f'{read_bento_files.__name__} data_paths', min_len=1, valid_dtypes=(str,))
    elif isinstance(data_paths, (str, os.PathLike)):
        if os.path.isfile(data_paths):
            # A single annotation file is treated as a one-element list.
            data_paths = [str(data_paths)]
        else:
            check_if_dir_exists(in_dir=data_paths, source=f'{read_bento_files.__name__} data_paths')
            data_paths = find_files_of_filetypes_in_directory(directory=data_paths, extensions=['.annot'], raise_error=True)
    check_all_file_names_are_represented_in_video_log(video_info_df=video_info_df, data_paths=data_paths)

    dfs = {}
    for file_path in data_paths:
        _, video_name, _ = get_fn_ext(filepath=file_path)
        _, _, fps = read_video_info(vid_info_df=video_info_df, video_name=video_name)
        bento_dict = bento_file_reader(file_path=file_path, fps=fps, orient='columns', save_path=None, raise_error=raise_error, log_setting=log_setting)
        if not bento_dict:
            # The reader returns an empty dict for files it could not parse when it is not
            # raising; pd.concat would fail on "no objects to concatenate", so skip the file.
            continue
        dfs[video_name] = pd.concat(list(bento_dict.values()), ignore_index=True)

    return dfs

def observer_timestamp_corrector(timestamps: List[str]) -> List[str]:
corrected_ts = []
Expand Down Expand Up @@ -314,76 +374,6 @@ def read_solomon_files(
# video_info_df=video_info_df)


def read_bento_files(
    data_paths: List[str],
    error_setting: str,
    video_info_df: pd.DataFrame,
    log_setting: bool = False,
) -> Dict[str, pd.DataFrame]:
    """
    Read BENTO annotation files into one event dataframe per video.

    Each file is read as a whitespace-delimited table. Behaviors are the rows below the
    channel marker row whose first field contains ">". The start/stop values of every behavior
    are multiplied by the video FPS and stored as START and STOP events.

    :param List[str] data_paths: Paths to the BENTO annotation files.
    :param str error_setting: If ``Methods.WARNING.value``, a file that cannot be parsed produces a warning. If ``Methods.ERROR.value``, it raises InvalidFileTypeError. For any other value the file is silently skipped.
    :param pd.DataFrame video_info_df: Video info dataframe used to look up the FPS of each video.
    :param bool log_setting: Passed as ``log_status`` to the invalid-file warning.
    :return: Dictionary with video names as keys and dataframes with the columns BEHAVIOR, EVENT and FRAME as values.
    :rtype: Dict[str, pd.DataFrame]
    :raises InvalidFileTypeError: If a file cannot be parsed and ``error_setting`` is ``Methods.ERROR.value``.
    """
    BENTO = "Bento"
    CHANNEL = "Ch1----------"

    dfs = {}
    for file_path in data_paths:
        _, video_name, _ = get_fn_ext(filepath=file_path)
        _, _, fps = read_video_info(vid_info_df=video_info_df, video_name=video_name)
        try:
            # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
            data_df = pd.read_csv(file_path, sep=r"\s+", index_col=False, low_memory=False)
            start_idx = data_df.index[data_df[BENTO] == CHANNEL].values[0]
            sliced_annot = data_df.iloc[start_idx + 1:]
            clfs = sliced_annot[sliced_annot[BENTO].str.contains(">", na=False)][BENTO].tolist()
            video_events = []
            for clf_name in clfs:
                start_idx = sliced_annot.index[sliced_annot[BENTO] == clf_name].values[0]
                # Skip the behavior row itself and the row directly below it.
                clf_df = sliced_annot.loc[start_idx + 2:, :]
                # The first row holding a missing value marks the end of this behavior's rows.
                # Test the number of such rows rather than the truth value of the array, which
                # raises as soon as more than one row matches.
                null_rows = clf_df.index[clf_df.isnull().any(axis=1)]
                if len(null_rows) > 0:
                    end_idx = null_rows[0]
                else:
                    end_idx = max(clf_df.index + 1)
                clf_df = (
                    clf_df.loc[: end_idx - 1, :]
                    .reset_index(drop=True)
                    .drop("file", axis=1)
                    .astype(float)
                )
                clf_df.columns = ["START", "STOP"]
                clf_df = clf_df * fps
                for obs in clf_df.values:
                    video_events.append([clf_name, "START", obs[0]])
                    video_events.append([clf_name, "STOP", obs[1]])
            video_df = pd.DataFrame(video_events, columns=["BEHAVIOR", "EVENT", "FRAME"])
            video_df["FRAME"] = video_df["FRAME"].astype(int)
            # Drop the leading ">" from the behavior names.
            video_df["BEHAVIOR"] = video_df["BEHAVIOR"].str[1:]
            dfs[video_name] = video_df
        except Exception as e:
            # Deliberately broad: any parsing failure is treated as "not a valid BENTO file".
            if error_setting == Methods.WARNING.value:
                ThirdPartyAnnotationsInvalidFileFormatWarning(
                    annotation_app="BENTO", file_path=file_path, log_status=log_setting
                )
            elif error_setting == Methods.ERROR.value:
                raise InvalidFileTypeError(
                    msg=f"{file_path} is not a valid BENTO file. See the docs for expected file format."
                ) from e
    return dfs


# video_info_df = read_video_info_csv(file_path='/Users/simon/Desktop/envs/troubleshooting/two_black_animals_14bp/project_folder/logs/video_info.csv')
#
# df = read_bento_files(data_paths=['/Users/simon/Desktop/envs/simba_dev/tests/test_data/bento_example/Together_1.annot'],
# error_setting='WARNING',
# log_setting=False,
# video_info_df=video_info_df)


def read_deepethogram_files(
Expand Down
105 changes: 103 additions & 2 deletions simba/utils/read_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
check_if_dir_exists,
check_if_filepath_list_is_empty,
check_if_string_value_is_valid_video_timestamp,
check_instance, check_int,
check_instance, check_int, check_str,
check_nvidea_gpu_available, check_valid_lst)
from simba.utils.enums import ConfigKey, Dtypes, Formats, Keys, Options
from simba.utils.errors import (DataHeaderError, DuplicationError,
Expand All @@ -50,7 +50,8 @@
ParametersFileError, PermissionError)
from simba.utils.printing import SimbaTimer, stdout_success
from simba.utils.warnings import (FileExistWarning, InvalidValueWarning,
NoDataFoundWarning, NoFileFoundWarning)
NoDataFoundWarning, NoFileFoundWarning,
ThirdPartyAnnotationsInvalidFileFormatWarning)

# from simba.utils.keyboard_listener import KeyboardListener

Expand Down Expand Up @@ -2091,3 +2092,103 @@ def find_largest_blob_location(imgs: dict, verbose: Optional[bool] = False, vide
print(e.args)
results[frm_idx] = np.array([np.nan, np.nan])
return results



def bento_file_reader(file_path: Union[str, os.PathLike],
                      fps: Optional[float] = None,
                      orient: Optional[Literal['index', 'columns']] = 'index',
                      save_path: Optional[Union[str, os.PathLike]] = None,
                      raise_error: Optional[bool] = False,
                      log_setting: Optional[bool] = False) -> Union[None, Dict[str, pd.DataFrame]]:
    r"""
    Read a BENTO annotation file into a dictionary of dataframes, one per annotated behavior.

    Behavior sections are located through the lines containing ">". If all values of a section
    are whole numbers they are used as frame numbers. Otherwise they are treated as seconds and
    multiplied by the FPS, which is read from the "Annotation framerate" line of the file when
    ``fps`` is not passed.

    :param Union[str, os.PathLike] file_path: Path to the BENTO annotation file.
    :param Optional[float] fps: Frames per second used to convert second-based annotations to frames. If None, the FPS is read from the file when it is needed.
    :param Optional[Literal['index', 'columns']] orient: If 'index', each dataframe has one row per bout with the columns START and STOP. If 'columns', each dataframe has one row per event with the columns BEHAVIOR, EVENT and FRAME, sorted by FRAME.
    :param Optional[Union[str, os.PathLike]] save_path: If not None, the results are written with ``write_pickle`` to this path and None is returned.
    :param Optional[bool] raise_error: If True, an invalid file raises an exception. If False, a warning is issued and the behaviors parsed so far are returned.
    :param Optional[bool] log_setting: Passed as ``log_status`` to the invalid-file warning.
    :return: Dictionary with behavior names as keys and dataframes as values, or None if ``save_path`` is passed.
    :rtype: Union[None, Dict[str, pd.DataFrame]]
    :raises NoDataError: If no behavior sections are found and ``raise_error`` is True.
    :raises InvalidFileTypeError: If a behavior section does not split into three tab-separated columns and ``raise_error`` is True.
    :raises InvalidInputError: If a behavior section holds non-numeric values and ``raise_error`` is True.
    :raises FrameRangeError: If annotations are in seconds, ``fps`` is None and the FPS cannot be read from the file.

    :example:
    >>> bento_file_reader(file_path=r"C:\troubleshooting\bento_test\bento_files\20240812_crumpling3.annot")
    """

    def _orient_columns_melt(df: pd.DataFrame, behavior: str) -> pd.DataFrame:
        # Reshape one-row-per-bout (START, STOP) into one-row-per-event (BEHAVIOR, EVENT, FRAME).
        df = df[['START', 'STOP']].astype(np.int32).reset_index()
        df = df.melt(id_vars='index', var_name=None).drop('index', axis=1)
        df["BEHAVIOR"] = behavior
        df.columns = ["EVENT", "FRAME", 'BEHAVIOR']
        return df.sort_values(by='FRAME', ascending=True)[['BEHAVIOR', "EVENT", "FRAME"]].reset_index(drop=True)

    check_file_exist_and_readable(file_path=file_path)
    check_str(name=f'{bento_file_reader.__name__} orient', value=orient, options=('index', 'columns'))
    if fps is not None:
        # FPS is a float (e.g. 29.97), so validate it as one rather than as an integer.
        from simba.utils.checks import check_float
        check_float(name=f'{bento_file_reader.__name__} fps', value=fps, min_value=10e-16)
    try:
        df = pd.read_csv(file_path, index_col=False, low_memory=False, header=None, encoding='utf-8').astype(str)
    except UnicodeDecodeError:
        # Anything that is not valid UTF-8 is not valid ASCII either, so fall back to latin-1,
        # which can decode any byte sequence.
        df = pd.read_csv(file_path, index_col=False, low_memory=False, header=None, encoding='latin-1').astype(str)

    marker_mask = df[0].str.contains(pat='>', regex=True)
    section_starts = list(marker_mask.index[marker_mask])
    results = {}
    if len(section_starts) == 0:
        if raise_error:
            raise NoDataError(f"{file_path} is not a valid BENTO file. See the docs for expected file format.", source=bento_file_reader.__name__)
        else:
            ThirdPartyAnnotationsInvalidFileFormatWarning(annotation_app="BENTO", file_path=file_path, source=bento_file_reader.__name__, log_status=log_setting)
            return results

    # Each behavior section runs from its ">" line up to the next ">" line (or the end of the file).
    bounds = section_starts + [len(df)]
    for begin, end in zip(bounds[:-1], bounds[1:]):
        section = df.iloc[begin:end].reset_index(drop=True)
        clf_name = section.iloc[0, 0][1:]
        # Row 0 is the ">name" line and row 1 the line below it; the annotations follow.
        rows = section.iloc[2:, 0].reset_index(drop=True)
        if len(rows) == 0:
            # Behavior present in the file but without annotations: keep it as an empty table.
            out_clf_df = pd.DataFrame(columns=['START', 'STOP', 'DURATION'], dtype=np.float32)
        else:
            out_clf_df = rows.str.split('\t', expand=True)
            # Anything other than exactly three columns cannot be labelled START/STOP/DURATION.
            if len(out_clf_df.columns) != 3:
                if raise_error:
                    raise InvalidFileTypeError(msg=f'SimBA found {len(out_clf_df.columns)} columns for file {file_path} and classifier {clf_name} when trying to split the data by tabs.')
                else:
                    ThirdPartyAnnotationsInvalidFileFormatWarning(annotation_app="BENTO", file_path=file_path, source=bento_file_reader.__name__, log_status=log_setting)
                    return results
            numeric_check = list(out_clf_df.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()))
            if not all(numeric_check):
                if raise_error:
                    raise InvalidInputError(msg=f'SimBA found values in the annotation data for behavior {clf_name} in file {file_path} that could not be interpreted as numeric values (seconds or frame numbers)')
                else:
                    ThirdPartyAnnotationsInvalidFileFormatWarning(annotation_app="BENTO", file_path=file_path, source=bento_file_reader.__name__, log_status=log_setting)
                    return results
            out_clf_df.columns = ['START', 'STOP', 'DURATION']
            out_clf_df = out_clf_df.astype(np.float32)

        # Whole numbers throughout are taken to be frame numbers; anything else is seconds.
        is_frame_based = np.array_equal(out_clf_df, out_clf_df.astype(int))
        if not is_frame_based:
            if fps is None:
                try:
                    fps_idx = df[0].str.contains(pat='Annotation framerate', regex=True)
                    fps_str = df.iloc[list(fps_idx.index[fps_idx])][0].values[0]
                    fps = float(fps_str.split(':')[1])
                except (IndexError, KeyError, ValueError) as e:
                    raise FrameRangeError(f'The annotations are in seconds and FPS was not passed. FPS could also not be read from the BENTO file', source=bento_file_reader.__name__) from e
            out_clf_df["START"] = out_clf_df["START"].astype(float) * fps
            out_clf_df["STOP"] = out_clf_df["STOP"].astype(float) * fps

        if orient == 'index':
            results[clf_name] = out_clf_df[['START', 'STOP']].astype(np.int32)
        else:
            results[clf_name] = _orient_columns_melt(df=out_clf_df, behavior=clf_name)

    if save_path is None:
        return results
    else:
        write_pickle(data=results, save_path=save_path)

0 comments on commit 6b035d0

Please sign in to comment.