Skip to content

Commit

Permalink
video processing
Browse files Browse the repository at this point in the history
  • Loading branch information
sronilsson committed Jun 27, 2024
1 parent 4eceaeb commit d7325a3
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 19 deletions.
115 changes: 103 additions & 12 deletions simba/mixins/statistics_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3969,15 +3969,106 @@ def sokal_michener(x: np.ndarray, y: np.ndarray, w: Optional[np.ndarray] = None)
unequal_cnt += 1 * w[i[0]]
return (2.0 * unequal_cnt) / (x.size + unequal_cnt)

# sample_1 = np.random.random_integers(low=1, high=2, size=(10, 50)).astype(np.float64)
# sample_2 = np.random.random_integers(low=7, high=20, size=(10, 50)).astype(np.float64)
# data = np.vstack([sample_1, sample_2])
# Statistics().hbos(data=data)

# sample_1 = np.random.normal(loc=10, scale=2, size=1000).astype(np.float64)
# sample_2 = np.random.normal(loc=12, scale=2, size=10000).astype(np.float64)

# sample_1 = np.random.randint(0, 100, (100, )).astype(np.float64)
# sample_2 = np.random.randint(110, 200, (100, )).astype(np.float64)
#
# Statistics().jensen_shannon_divergence(sample_1=sample_1, sample_2=sample_2)
@staticmethod
def kumar_hassebrook_similarity(x: np.ndarray, y: np.ndarray) -> float:
    """
    Compute the Kumar-Hassebrook similarity between two 1D vectors.

    The score is ``dot(x, y) / (||x||^2 + ||y||^2 - dot(x, y))``.

    .. note::
       A score of 1 indicates identical vectors and a score of 0 indicates no similarity
       (the dot product of the vectors is zero).

       If both vectors contain only zeros the denominator is zero. The vectors are identical
       in that case, so 1.0 is returned instead of NaN.

    :param np.ndarray x: 1D array representing the first feature values.
    :param np.ndarray y: 1D array representing the second feature values. Validated against the shape of ``x``.
    :return: Kumar-Hassebrook similarity between vectors x and y.
    :rtype: float

    :example:
    >>> x, y = np.random.randint(0, 500, (1000,)), np.random.randint(0, 500, (1000,))
    >>> Statistics.kumar_hassebrook_similarity(x=x, y=y)
    """
    check_valid_array(data=x, source=f'{Statistics.kumar_hassebrook_similarity.__name__} x', accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, source=f'{Statistics.kumar_hassebrook_similarity.__name__} y', accepted_ndims=(1,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Work in float64 so that integer inputs cannot overflow inside the dot products.
    x, y = x.astype(np.float64), y.astype(np.float64)
    dot_product = np.dot(x, y)
    # dot(v, v) is the squared L2 norm; computing it directly avoids a sqrt followed by a square.
    denominator = np.dot(x, x) + np.dot(y, y) - dot_product
    if denominator == 0:
        # The denominator equals (||x||^2 + ||y||^2 + ||x - y||^2) / 2, so it is only zero for two all-zero vectors.
        return 1.0
    return float(dot_product / denominator)

@staticmethod
def wave_hedges_distance(x: np.ndarray, y: np.ndarray) -> float:
    """
    Compute the Wave-Hedges distance between two 1-dimensional arrays ``x`` and ``y``.

    The distance is the sum over all positions of ``|x_i - y_i| / max(x_i, y_i)``. Positions where
    the two values are equal, or where the larger of the two values is zero, contribute 0.

    .. note::
       A Wave-Hedges distance of 0 indicates identical arrays. For non-negative inputs every
       position contributes a value between 0 and 1, so the distance cannot exceed the array length.

    :param np.ndarray x: 1D array representing the first feature values.
    :param np.ndarray y: 1D array representing the second feature values. Validated against the shape of ``x``.
    :return: Wave-Hedges distance between x and y.
    :rtype: float

    :example:
    >>> x = np.random.randint(0, 500, (1000,))
    >>> y = np.random.randint(0, 500, (1000,))
    >>> Statistics.wave_hedges_distance(x=x, y=y)
    """
    check_valid_array(data=x, source=f'{Statistics.wave_hedges_distance.__name__} x', accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, source=f'{Statistics.wave_hedges_distance.__name__} y', accepted_ndims=(1,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    # Cast to float64 so that subtracting unsigned integer arrays cannot wrap around.
    x, y = x.astype(np.float64), y.astype(np.float64)
    abs_diff = np.abs(x - y)
    pair_max = np.maximum(x, y)
    contributes = (abs_diff != 0) & (pair_max != 0)
    # Divide only where the term is defined; all other positions keep the 0 from `out`.
    # This avoids the divide-by-zero warnings raised when the division is evaluated everywhere.
    ratios = np.divide(abs_diff, pair_max, out=np.zeros_like(abs_diff), where=contributes)
    return float(np.sum(ratios))

@staticmethod
def gower_distance(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """
    Compute a Gower-like distance vector between corresponding rows of two numerical matrices.

    For every row, the absolute difference in each column is divided by that column's range and
    the scaled differences are averaged over all columns.

    .. note::
       This function assumes x and y have the same shape and only considers numerical attributes.

       The column ranges (max - min) are computed from ``x`` only. Columns with a range of zero
       contribute 0 to the distance but are still counted in the average.

       1D inputs are treated as ``m`` observations of a single feature, so the returned vector
       always has length ``x.shape[0]``.

    :param np.ndarray x: First numerical matrix with shape (m, n), or 1D array with shape (m,).
    :param np.ndarray y: Second numerical matrix with the same shape as ``x``.
    :return np.ndarray: Gower-like distance vector with shape (m,).

    :example:
    >>> x, y = np.random.randint(0, 500, (1000, 6000)), np.random.randint(0, 500, (1000, 6000))
    >>> Statistics.gower_distance(x=x, y=y)
    """
    check_valid_array(data=x, source=f'{Statistics.gower_distance.__name__} x', accepted_ndims=(1, 2), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, source=f'{Statistics.gower_distance.__name__} y', accepted_ndims=(x.ndim,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    if x.ndim == 1:
        # The validation above accepts 1D arrays; give them a single column so the row/column logic below applies.
        x, y = x.reshape(-1, 1), y.reshape(-1, 1)
    field_ranges = np.max(x, axis=0) - np.min(x, axis=0)
    has_range = field_ranges != 0
    # Float64 arithmetic prevents wrap-around when subtracting unsigned integer arrays.
    scaled_diff = np.abs(x.astype(np.float64) - y.astype(np.float64))
    # Scale in place, column by column, skipping constant columns to avoid division by zero ...
    np.divide(scaled_diff, field_ranges, out=scaled_diff, where=has_range)
    # ... and make those constant columns contribute nothing.
    scaled_diff[:, ~has_range] = 0.0
    return np.sum(scaled_diff, axis=1) / x.shape[1]

@staticmethod
def normalized_google_distance(x: np.ndarray, y: np.ndarray) -> float:
    """
    Compute the Normalized Google Distance (NGD) between two vectors or matrices.

    The numerator is the larger of the two array sums minus the sum of the element-wise minima.
    The denominator is the combined sum of both arrays minus the smaller of the two array sums.

    .. note::
       This function assumes x and y have the same shape. If the denominator is zero, -1.0 is returned.

    :param np.ndarray x: First numerical array or matrix with shape (m, n).
    :param np.ndarray y: Second numerical array or matrix with shape (m, n).
    :return float: Normalized Google Distance between x and y.

    :example:
    >>> x, y = np.random.randint(0, 500, (1000,200)), np.random.randint(0, 500, (1000,200))
    >>> Statistics.normalized_google_distance(x=y, y=x)
    """
    check_valid_array(data=x, source=f'{Statistics.normalized_google_distance.__name__} x', accepted_ndims=(1, 2), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
    check_valid_array(data=y, source=f'{Statistics.normalized_google_distance.__name__} y', accepted_ndims=(x.ndim,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)

    total_x = np.sum(x)
    total_y = np.sum(y)
    shared_total = np.sum(np.minimum(x, y))
    totals = [total_x, total_y]
    denominator = (total_x + total_y) - np.min(totals)
    # Guard clause: a zero denominator is reported with the -1.0 sentinel.
    if denominator == 0:
        return -1.0
    numerator = np.max(totals) - shared_total
    return numerator / denominator
5 changes: 1 addition & 4 deletions simba/utils/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,10 +813,7 @@ def check_valid_array(data: np.ndarray,
check_instance(source=source, instance=data, accepted_types=np.ndarray)
if accepted_ndims is not None:
if data.ndim not in accepted_ndims:
raise ArrayError(
msg=f"Array not of acceptable dimensions. Found {data.ndim}, accepted: {accepted_ndims}: {source}",
source=check_valid_array.__name__,
)
raise ArrayError(msg=f"Array not of acceptable dimensions. Found {data.ndim}, accepted: {accepted_ndims}: {source}", source=check_valid_array.__name__)
if accepted_sizes is not None:
if len(data.shape) not in accepted_sizes:
raise ArrayError(
Expand Down
3 changes: 1 addition & 2 deletions simba/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def plug_holes_shortest_bout(data_df: pd.DataFrame,
data_df[clf_name] = data[clf_name]

clf_bouts = detect_bouts(data_df=data_df, target_lst=[clf_name], fps=fps)
below_min_bouts = clf_bouts[clf_bouts['Bout_time'] <= shortest_bout_s]
below_min_bouts = clf_bouts[clf_bouts['Bout_time'] < shortest_bout_s]
if len(below_min_bouts) == 0:
return data_df

Expand All @@ -248,7 +248,6 @@ def plug_holes_shortest_bout(data_df: pd.DataFrame,

return data_df


def create_color_palettes(
no_animals: int, map_size: int, cmaps: Optional[List[str]] = None
) -> List[List[int]]:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_detect_bouts(data_path, target_lst, fps):
def test_plug_holes_shortest_bout():
    """Check the frame returned by ``plug_holes_shortest_bout`` for a five-frame classifier column."""
    data_df = pd.DataFrame(data=[1, 0, 1, 1, 1], columns=['target'])
    # NOTE(review): shortest_bout is presumably given in milliseconds -- confirm against plug_holes_shortest_bout.
    results = plug_holes_shortest_bout(data_df=data_df, clf_name='target', fps=10, shortest_bout=2000)
    # A single expectation only: asserting both an all-ones and an all-zeros frame can never pass.
    pd.testing.assert_frame_equal(results, pd.DataFrame(data=[0, 0, 0, 0, 0], columns=['target']))

def test_create_color_palettes():
results = create_color_palettes(no_animals=2, map_size=2)
Expand Down

0 comments on commit d7325a3

Please sign in to comment.