video processing

sgoldenlab · Jun 27, 2024 · d7325a3 · d7325a3
1 parent 4eceaeb
commit d7325a3
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 19 deletions.
diff --git a/simba/mixins/statistics_mixin.py b/simba/mixins/statistics_mixin.py
@@ -3969,15 +3969,106 @@ def sokal_michener(x: np.ndarray, y: np.ndarray, w: Optional[np.ndarray] = None)
                 unequal_cnt += 1 * w[i[0]]
         return (2.0 * unequal_cnt) / (x.size + unequal_cnt)
 
-# sample_1 = np.random.random_integers(low=1, high=2, size=(10, 50)).astype(np.float64)
-# sample_2 = np.random.random_integers(low=7, high=20, size=(10, 50)).astype(np.float64)
-# data = np.vstack([sample_1, sample_2])
-# Statistics().hbos(data=data)
-
-# sample_1 = np.random.normal(loc=10, scale=2, size=1000).astype(np.float64)
-# sample_2 = np.random.normal(loc=12, scale=2, size=10000).astype(np.float64)
-
-# sample_1 = np.random.randint(0, 100, (100, )).astype(np.float64)
-# sample_2 = np.random.randint(110, 200, (100, )).astype(np.float64)
-#
-# Statistics().jensen_shannon_divergence(sample_1=sample_1, sample_2=sample_2)
+    @staticmethod
+    def kumar_hassebrook_similarity(x: np.ndarray, y: np.ndarray) -> float:
+        """
+        Compute the Kumar-Hassebrook similarity between two 1D vectors.
+
+        The score is the dot product of the two vectors divided by the sum of their squared Euclidean
+        norms minus the dot product: ``x.y / (||x||^2 + ||y||^2 - x.y)``.
+
+        .. note::
+            A Kumar-Hassebrook similarity score of 1 indicates identical vectors and 0 indicates no similarity.
+            If both vectors contain only zeros the denominator is zero and the result is NaN.
+
+        :param np.ndarray x: 1D array representing the first feature values.
+        :param np.ndarray y: 1D array representing the second feature values. Validated against the shape of ``x``.
+        :return: Kumar-Hassebrook similarity between vectors x and y.
+        :rtype: float
+
+        :example:
+        >>> x, y = np.random.randint(0, 500, (1000,)), np.random.randint(0, 500, (1000,))
+        >>> Statistics.kumar_hassebrook_similarity(x=x, y=y)
+        """
+        check_valid_array(data=x, source=f'{Statistics.kumar_hassebrook_similarity.__name__} x', accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        check_valid_array(data=y, source=f'{Statistics.kumar_hassebrook_similarity.__name__} y', accepted_ndims=(1,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        # Work in float64 so the dot product of integer inputs cannot overflow the integer dtype.
+        x_f, y_f = x.astype(np.float64), y.astype(np.float64)
+        dot_product = np.dot(x_f, y_f)
+        norm_x = np.linalg.norm(x_f)
+        norm_y = np.linalg.norm(y_f)
+        return dot_product / (norm_x ** 2 + norm_y ** 2 - dot_product)
+
+    @staticmethod
+    def wave_hedges_distance(x: np.ndarray, y: np.ndarray) -> float:
+        """
+        Compute the Wave-Hedges distance between two 1-dimensional arrays ``x`` and ``y``.
+
+        The distance is the sum, over all positions, of ``|x - y| / max(x, y)``. Positions where the two
+        elements are equal, or where the larger of the two elements is zero, contribute 0 to the sum.
+
+        .. note::
+            A Wave-Hedges distance score of 0 indicates identical arrays. There is no upper bound.
+
+        :param np.ndarray x: 1D array representing the first feature values.
+        :param np.ndarray y: 1D array representing the second feature values. Validated against the shape of ``x``.
+        :return: Wave-Hedges distance between arrays x and y.
+        :rtype: float
+
+        :example:
+        >>> x = np.random.randint(0, 500, (1000,))
+        >>> y = np.random.randint(0, 500, (1000,))
+        >>> Statistics.wave_hedges_distance(x=x, y=y)
+        """
+
+        check_valid_array(data=x, source=f'{Statistics.wave_hedges_distance.__name__} x', accepted_ndims=(1,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        check_valid_array(data=y, source=f'{Statistics.wave_hedges_distance.__name__} y', accepted_ndims=(1,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        abs_diff = np.abs(x - y).astype(np.float64)
+        pair_max = np.maximum(x, y)
+        valid = (abs_diff != 0) & (pair_max != 0)
+        # Divide only where the term is defined and non-zero; masked positions keep the 0.0 from `out`.
+        # This avoids the divide-by-zero warnings raised when the full quotient is computed before masking.
+        ratios = np.divide(abs_diff, pair_max, out=np.zeros(x.shape, dtype=np.float64), where=valid)
+        return np.sum(ratios)
+
+    @staticmethod
+    def gower_distance(x: np.ndarray, y: np.ndarray) -> np.ndarray:
+        """
+        Compute Gower-like distance vector between corresponding rows of two numerical matrices.
+        Gower distance is a measure of dissimilarity between two vectors (or rows in this case).
+
+        .. note::
+            This function assumes x and y have the same shape and only considers numerical attributes.
+            Each observation in x is compared to the corresponding observation in y based on normalized
+            absolute differences across numerical columns.
+
+            The per-column ranges used for normalization are computed from ``x`` only. Columns whose range
+            is zero contribute 0 to the distance, and the summed differences are divided by the total
+            number of columns. A 1D input of shape (m,) is treated as m observations of a single column.
+
+        :param np.ndarray x: First numerical matrix with shape (m, n), or 1D array with shape (m,).
+        :param np.ndarray y: Second numerical matrix with the same shape as ``x``.
+        :return np.ndarray: Gower-like distance vector with shape (m,).
+
+        :example:
+        >>> x, y = np.random.randint(0, 500, (1000, 6000)), np.random.randint(0, 500, (1000, 6000))
+        >>> Statistics.gower_distance(x=x, y=y)
+
+        """
+        check_valid_array(data=x, source=f'{Statistics.gower_distance.__name__} x', accepted_ndims=(1, 2), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        check_valid_array(data=y, source=f'{Statistics.gower_distance.__name__} y', accepted_ndims=(x.ndim,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        # Float64 copies keep the subtraction safe for unsigned dtypes and fix the result dtype.
+        x_2d, y_2d = x.astype(np.float64), y.astype(np.float64)
+        if x_2d.ndim == 1:
+            # 1D input passes validation; view it as a single-column matrix so rows can be indexed.
+            x_2d, y_2d = x_2d[:, np.newaxis], y_2d[:, np.newaxis]
+        field_ranges = np.max(x_2d, axis=0) - np.min(x_2d, axis=0)
+        # Constant columns (zero range) cannot be normalized and are left out of the sum.
+        informative = field_ranges != 0
+        scaled_diffs = np.abs(x_2d[:, informative] - y_2d[:, informative]) / field_ranges[informative]
+        # Average over ALL columns, including the skipped constant ones.
+        return np.sum(scaled_diffs, axis=1) / x_2d.shape[1]
+
+    @staticmethod
+    def normalized_google_distance(x: np.ndarray, y: np.ndarray) -> float:
+        """
+        Compute the Normalized Google Distance (NGD) between two vectors or matrices.
+
+        .. note::
+           ``x`` and ``y`` are expected to share the same shape. The score is built from the total sum of
+           each input and from the sum of the element-wise minimum of the two inputs. When the denominator
+           evaluates to zero, ``-1.0`` is returned instead of a distance.
+
+        :param np.ndarray x: First numerical array or matrix with shape (m, n).
+        :param np.ndarray y: Second numerical array or matrix with shape (m, n).
+        :return float: Normalized Google Distance between x and y, or -1.0 if the denominator is zero.
+
+        :example:
+        >>> x, y = np.random.randint(0, 500, (1000,200)), np.random.randint(0, 500, (1000,200))
+        >>> Statistics.normalized_google_distance(x=y, y=x)
+        """
+        check_valid_array(data=x, source=f'{Statistics.normalized_google_distance.__name__} x', accepted_ndims=(1, 2), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+        check_valid_array(data=y, source=f'{Statistics.normalized_google_distance.__name__} y', accepted_ndims=(x.ndim,), accepted_shapes=(x.shape,), accepted_dtypes=Formats.NUMERIC_DTYPES.value)
+
+        total_x = np.sum(x)
+        total_y = np.sum(y)
+        overlap = np.sum(np.minimum(x, y))
+        smaller_total = np.min([total_x, total_y])
+        larger_total = np.max([total_x, total_y])
+        denominator = (total_x + total_y) - smaller_total
+        # Guard clause: a zero denominator yields the sentinel rather than a division.
+        if denominator == 0:
+            return -1.0
+        numerator = larger_total - overlap
+        return numerator / denominator
diff --git a/simba/utils/checks.py b/simba/utils/checks.py
@@ -813,10 +813,7 @@ def check_valid_array(data: np.ndarray,
     check_instance(source=source, instance=data, accepted_types=np.ndarray)
     if accepted_ndims is not None:
         if data.ndim not in accepted_ndims:
-            raise ArrayError(
-                msg=f"Array not of acceptable dimensions. Found {data.ndim}, accepted: {accepted_ndims}: {source}",
-                source=check_valid_array.__name__,
-            )
+            raise ArrayError(msg=f"Array not of acceptable dimensions. Found {data.ndim}, accepted: {accepted_ndims}: {source}", source=check_valid_array.__name__)
     if accepted_sizes is not None:
         if len(data.shape) not in accepted_sizes:
             raise ArrayError(

diff --git a/simba/utils/data.py b/simba/utils/data.py
@@ -236,7 +236,7 @@ def plug_holes_shortest_bout(data_df: pd.DataFrame,
         data_df[clf_name] = data[clf_name]
 
     clf_bouts = detect_bouts(data_df=data_df, target_lst=[clf_name], fps=fps)
-    below_min_bouts = clf_bouts[clf_bouts['Bout_time'] <= shortest_bout_s]
+    below_min_bouts = clf_bouts[clf_bouts['Bout_time'] < shortest_bout_s]
     if len(below_min_bouts) == 0:
         return data_df
 
@@ -248,7 +248,6 @@ def plug_holes_shortest_bout(data_df: pd.DataFrame,
 
     return data_df
 
-
 def create_color_palettes(
     no_animals: int, map_size: int, cmaps: Optional[List[str]] = None
 ) -> List[List[int]]:

diff --git a/tests/test_utils_data.py b/tests/test_utils_data.py
@@ -14,7 +14,7 @@ def test_detect_bouts(data_path, target_lst, fps):
 def test_plug_holes_shortest_bout():
     data_df = pd.DataFrame(data=[1, 0, 1, 1, 1], columns=['target'])
     results = plug_holes_shortest_bout(data_df=data_df, clf_name='target', fps=10, shortest_bout=2000)
-    pd.testing.assert_frame_equal(results, pd.DataFrame(data=[1, 1, 1, 1, 1], columns=['target']))
+    pd.testing.assert_frame_equal(results, pd.DataFrame(data=[0, 0, 0, 0, 0], columns=['target']))
 
 def test_create_color_palettes():
     results = create_color_palettes(no_animals=2, map_size=2)