Merge pull request #73 from vc1492a/dev

Dev
vc1492a · Nov 3, 2024 · 5d73be8 · 5d73be8
2 parents f7b79f1 + 06541c7
commit 5d73be8
Show file tree

Hide file tree

Showing 4 changed files with 65 additions and 3 deletions.
diff --git a/LICENSE.txt → LICENSE b/LICENSE.txt → LICENSE
diff --git a/readme.md b/readme.md
@@ -38,7 +38,7 @@ This Python 3 implementation uses Numpy and the formulas outlined in
 to calculate the Local Outlier Probability of each sample.
 
 ## Dependencies
-- Python 3.6 - 3.12
+- Python 3.6 - 3.13
 - numpy >= 1.16.3
 - python-utils >= 2.3.0
 - (optional) numba >= 0.45.1
@@ -281,7 +281,12 @@ PyNomaly provides the ability to specify a distance matrix so that any
 distance metric can be used (a neighbor index matrix must also be provided).
 This can be useful when wanting to use a distance other than the euclidean.
 
+Note that in order to maintain alignment with the LoOP definition of closest neighbors,
+an additional neighbor is requested when using [scikit-learn's NearestNeighbors](https://scikit-learn.org/1.5/modules/neighbors.html) since `NearestNeighbors`
+includes the point itself when calculating the closest neighbors (whereas the LoOP method does not include the distance from a point to itself).
+
 ```python
+import numpy as np
 from sklearn.neighbors import NearestNeighbors
 
 data = np.array([
@@ -293,11 +298,18 @@ data = np.array([
     [421.5, 90.3, 50.0]
 ])
 
-neigh = NearestNeighbors(n_neighbors=3, metric='hamming')
+# Generate distance and neighbor matrices
+n_neighbors = 3 # the number of neighbors according to the LoOP definition 
+neigh = NearestNeighbors(n_neighbors=n_neighbors+1, metric='hamming')
 neigh.fit(data)
 d, idx = neigh.kneighbors(data, return_distance=True)
 
-m = loop.LocalOutlierProbability(distance_matrix=d, neighbor_matrix=idx, n_neighbors=3).fit()
+# Remove self-distances - you MUST do this to preserve the same results as intended by the definition of LoOP
+idx = np.delete(idx, 0, 1)
+d = np.delete(d, 0, 1)
+
+# Fit and return scores
+m = loop.LocalOutlierProbability(distance_matrix=d, neighbor_matrix=idx, n_neighbors=n_neighbors+1).fit()
 scores = m.local_outlier_probabilities
 ```
 

diff --git a/setup.py b/setup.py
@@ -1,5 +1,9 @@
 from setuptools import setup
 
+from pathlib import Path
+this_directory = Path(__file__).parent
+long_description = (this_directory / "readme.md").read_text(encoding="utf-8")
+
 setup(
     name='PyNomaly',
     packages=['PyNomaly'],
@@ -9,6 +13,8 @@
                 'method providing an outlier score in the range of [0,1].',
     author='Valentino Constantinou',
     author_email='[email protected]',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
     url='https://github.com/vc1492a/PyNomaly',
     download_url='https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz',
     keywords=['outlier', 'anomaly', 'detection', 'machine', 'learning',

diff --git a/tests/test_loop.py b/tests/test_loop.py
@@ -790,3 +790,47 @@ def test_data_flipping() -> None:
         fit2.norm_prob_local_outlier_factor,
         decimal=6,
     )
+
+
+def test_distance_matrix_consistency(X_n120) -> None:
+    """
+    Test to ensure that the distance matrix is consistent with the neighbor
+    matrix and that the software is able to handle self-distances.
+    :return: None
+    """
+
+    neigh = NearestNeighbors(metric='euclidean')
+    neigh.fit(X_n120)
+    distances, indices = neigh.kneighbors(X_n120, n_neighbors=11, return_distance=True)
+
+    # remove the closest neighbor (it's the point itself) from each row in the indices matrix and distances matrix
+    indices = np.delete(indices, 0, 1)
+    distances = np.delete(distances, 0, 1)
+
+    # Fit LoOP with and without distance matrix
+    clf_data = loop.LocalOutlierProbability(X_n120, n_neighbors=10)
+    clf_dist = loop.LocalOutlierProbability(distance_matrix=distances, neighbor_matrix=indices, n_neighbors=11)
+
+    # Attempt to retrieve scores and check types
+    scores_data = clf_data.fit().local_outlier_probabilities
+    scores_dist = clf_dist.fit().local_outlier_probabilities
+
+    # Debugging prints to investigate types and contents
+    print("Type of scores_data:", type(scores_data))
+    print("Type of scores_dist:", type(scores_dist))
+    print("Value of scores_data:", scores_data)
+    print("Value of scores_dist:", scores_dist)
+    print("Shape of scores_data:", scores_data.shape)
+    print("Shape of scores_dist:", scores_dist.shape)
+
+    # Convert to arrays if they aren't already
+    scores_data = np.array(scores_data) if not isinstance(scores_data, np.ndarray) else scores_data
+    scores_dist = np.array(scores_dist) if not isinstance(scores_dist, np.ndarray) else scores_dist
+
+    # Check shapes and types before assertion
+    assert scores_data.shape == scores_dist.shape, "Score shapes mismatch"
+    assert isinstance(scores_data, np.ndarray), "Expected scores_data to be a numpy array"
+    assert isinstance(scores_dist, np.ndarray), "Expected scores_dist to be a numpy array"
+
+    # Compare scores allowing for minor floating-point differences
+    assert_array_almost_equal(scores_data, scores_dist, decimal=10, err_msg="Inconsistent LoOP scores due to self-distances")