Merge pull request NGO-Algorithm-Audit#3 from krstopro/master

Update CI, refactor source
krstopro · Mar 4, 2024 · b8e9a37
2 parents ccd880d + 2243453
commit b8e9a37
Show file tree

Hide file tree

Showing 9 changed files with 68 additions and 101 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -14,22 +14,28 @@ on:
 jobs:
   main:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['3.11']
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
 
       - name: Install poetry
         run: |
           pipx install poetry
-          poetry config virtualenvs.in-project true
+          poetry config virtualenvs.path .virtualenvs
 
-      - name: Set up Python
+      - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:
-          python-version: '3.11'
+          python-version: ${{ matrix.python-version }}
           cache: poetry
           cache-dependency-path: poetry.lock
 
+      - name: Set poetry environment
+        run: poetry env use ${{ matrix.python-version }}
+
       - name: Install dependencies
         run: poetry install --no-root --no-interaction
 

diff --git a/bias_scan/clustering/_bahc.py b/bias_scan/clustering/_bahc.py
@@ -10,22 +10,6 @@ class BiasAwareHierarchicalClustering(ABC, BaseEstimator, ClusterMixin):
 
     This abstract class specifies an interface for all bias-aware hierarchical
     clustering classes.
-
-    Parameters
-    ----------
-    max_iter : int
-        Maximum number of iterations.
-    min_cluster_size : int
-        Minimum size of a cluster.
-    
-    Attributes
-    ----------
-    n_cluster_ : int
-        Number of clusters.
-    labels_ : ndarray of shape (n_samples,)
-        Labels for each point.
-    biases_ : ndarray of shape (n_clusters,)
-        Biases for each cluster.
     """
 
     def __init__(self, max_iter, min_cluster_size):
@@ -50,19 +34,21 @@ def fit(self, X, y):
         """
         n_samples, _ = X.shape
         self.n_clusters_ = 1
-        self.labels_ = np.zeros(n_samples, dtype=np.uint16)
-        labels = []
+        labels = np.zeros(n_samples, dtype=np.uint32)
+        clusters = []
         biases = []
         label = 0
         bias = -np.mean(y)
         heap = [(None, label, bias)]
+        print(labels)
         for _ in range(self.max_iter):
             if not heap:
                 break
             _, label, bias = heapq.heappop(heap)
-            cluster_indices = np.nonzero(self.labels_ == label)[0]
+            cluster_indices = np.nonzero(labels == label)[0]
             cluster = X[cluster_indices]
-            cluster_labels = self.split(cluster)
+            cluster_labels = self._split(cluster)
+            # TODO: Maybe check if cluster_labels are 0s and 1s
             indices0 = cluster_indices[np.nonzero(cluster_labels == 0)[0]]
             indices1 = cluster_indices[np.nonzero(cluster_labels == 1)[0]]
             if (
@@ -80,25 +66,31 @@ def fit(self, X, y):
                     heapq.heappush(heap, (-std0, label, bias0))
                     std1 = np.std(y[indices1])
                     heapq.heappush(heap, (-std1, self.n_clusters_, bias1))
-                    self.labels_[indices1] = self.n_clusters_
+                    labels[indices1] = self.n_clusters_
                     self.n_clusters_ += 1
                 else:
-                    labels.append(label)
+                    clusters.append(label)
                     biases.append(bias)
             else:
-                labels.append(label)
+                clusters.append(label)
                 biases.append(bias)
-        labels = np.array(labels + [label for _, label, _ in heap])
+            print(labels)
+            print(heap)
+            print(clusters)
+        clusters = np.array(clusters + [label for _, label, _ in heap])
         biases = np.array(biases + [bias for _, _, bias in heap])
-        sorted_indices = np.argsort(-biases)
-        labels = labels[sorted_indices]
-        self.biases_ = biases[sorted_indices]
-        d = { label: index for label, index in zip(labels, range(n_samples))}
-        self.labels_ = np.array(d[label] for label in self.labels_)
+        print(clusters)
+        print(biases)
+        indices = np.argsort(-biases)
+        clusters = clusters[indices]
+        self.biases_ = biases[indices]
+        mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
+        mapping[clusters] = np.arange(self.n_clusters_, dtype=np.uint32)
+        self.labels_ = mapping[labels]
         return self
 
     @abstractmethod
-    def split(self, X):
+    def _split(self, X):
         """Splits the data into two clusters.
 
         Parameters

diff --git a/bias_scan/clustering/_kmeans.py b/bias_scan/clustering/_kmeans.py
@@ -11,34 +11,19 @@ class BiasAwareHierarchicalKMeans(BiasAwareHierarchicalClustering):
         Maximum number of iterations.
     min_cluster_size : int
         Minimum size of a cluster.
-    init : {'k-means++', 'random'}, callable or array-like of shape \
-            (n_clusters, n_features), default='k-means++'
-    n_init : 'auto' or int, default='auto'
-    kmeans_max_iter : int, default=300
-    tol : float, default=1e-4
+    kmeans_params : dict
+        k-means parameters
     """
 
     def __init__(
         self,
         max_iter,
         min_cluster_size,
-        init="k-means++",
-        n_init="auto",
-        kmeans_max_iter=300,
-        tol=1e-4,
+        kmeans_params={"n_clusters": 2, "n_init": "auto"},
     ):
         super().__init__(max_iter, min_cluster_size)
-        self.init = init
-        self.n_init = n_init
-        self.kmeans_max_iter = kmeans_max_iter
-        self.tol = tol
+        self.kmeans_params = kmeans_params
+        self.kmeans = KMeans(**kmeans_params)
 
-    def split(self, X):
-        kmeans = KMeans(
-            n_clusters=2,
-            init=self.init,
-            n_init=self.n_init,
-            max_iter=self.kmeans_max_iter,
-            tol=self.tol,
-        )
-        return kmeans.fit_predict(X)
+    def _split(self, X):
+        return self.kmeans.fit_predict(X)
diff --git a/bias_scan/clustering/_kmodes.py b/bias_scan/clustering/_kmodes.py
@@ -11,29 +11,14 @@ class BiasAwareHierarchicalKModes(BiasAwareHierarchicalClustering):
         Maximum number of iterations.
     min_cluster_size : int
         Minimum size of a cluster.
-    init : {'Huang', 'Cao', 'random'}, default='Cao'
-    n_init : int, default=10
-    kmodes_max_iter : int, default=100
+    kmodes_params : dict
+        k-modes parameters
     """
 
-    def __init__(
-        self,
-        max_iter,
-        min_cluster_size,
-        init="Cao",
-        n_init=10,
-        kmodes_max_iter=100,
-    ):
+    def __init__(self, max_iter, min_cluster_size, kmodes_params={"n_clusters": 2}):
         super().__init__(max_iter, min_cluster_size)
-        self.init = init
-        self.n_init = n_init
-        self.kmodes_max_iter = kmodes_max_iter
+        self.kmodes_params = kmodes_params
+        self.kmodes = KModes(**kmodes_params)
 
-    def split(self, X):
-        kmodes = KModes(
-            n_clusters=2,
-            init=self.init,
-            n_init=self.n_init,
-            max_iter=self.kmodes_max_iter,
-        )
-        return kmodes.fit_predict(X)
+    def _split(self, X):
+        return self.kmodes.fit_predict(X)
diff --git a/bias_scan/clustering/tests/__init__.py b/bias_scan/clustering/tests/__init__.py
diff --git a/bias_scan/clustering/tests/test_bahc.py b/bias_scan/clustering/tests/test_bahc.py
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,10 +13,12 @@ scikit-learn = "^1.4.1.post1"
 kmodes = "^0.12.2"
 
 [tool.poetry.dev-dependencies]
+numpy = "^1.26.4"
+scikit-learn = "^1.4.1.post1"
+kmodes = "^0.12.2"
 ruff = "^0.2.2"
 pytest = "^8.0.2"
 
-
 [build-system]
 requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+build-backend = "poetry.core.masonry.api"
diff --git a/tests/test_bahc.py b/tests/test_bahc.py
@@ -3,11 +3,21 @@
 from bias_scan.clustering import BiasAwareHierarchicalKMeans
 
 
+def test_shapes():
+    # Checks that labels and biases have the right shapes
+    rng = np.random.RandomState(12)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
+    algo = BiasAwareHierarchicalKMeans(max_iter=5, min_cluster_size=2)
+    algo.fit(X, y)
+    assert len(algo.labels_) == 20
+    assert len(algo.biases_) == algo.n_clusters_
+
 def test_clusters():
     # Checks that label values are between 0 and n_clusters
     rng = np.random.RandomState(12)
-    X = rng.rand(10, 5)
-    y = rng.rand(10)
-    algo = BiasAwareHierarchicalKMeans(max_iter=3, min_cluster_size=2)
+    X = rng.rand(20, 10)
+    y = rng.rand(20)
+    algo = BiasAwareHierarchicalKMeans(max_iter=5, min_cluster_size=2)
     algo.fit(X, y)
     assert np.array_equal(np.unique(algo.labels_), np.arange(algo.n_clusters_))