Skip to content

Commit

Permalink
Merge pull request NGO-Algorithm-Audit#3 from krstopro/master
Browse files Browse the repository at this point in the history
Update CI, refactor source
  • Loading branch information
krstopro authored Mar 4, 2024
2 parents ccd880d + 2243453 commit b8e9a37
Show file tree
Hide file tree
Showing 9 changed files with 68 additions and 101 deletions.
12 changes: 9 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,28 @@ on:
jobs:
main:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.11']
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install poetry
run: |
pipx install poetry
poetry config virtualenvs.in-project true
poetry config virtualenvs.path .virtualenvs
- name: Set up Python
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: '3.11'
python-version: ${{ matrix.python-version }}
cache: poetry
cache-dependency-path: poetry.lock

- name: Set poetry environment
run: poetry env use ${{ matrix.python-version }}

- name: Install dependencies
run: poetry install --no-root --no-interaction

Expand Down
52 changes: 22 additions & 30 deletions bias_scan/clustering/_bahc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,6 @@ class BiasAwareHierarchicalClustering(ABC, BaseEstimator, ClusterMixin):
This abstract class specifies an interface for all bias-aware hierarchical
clustering classes.
Parameters
----------
max_iter : int
Maximum number of iterations.
min_cluster_size : int
Minimum size of a cluster.
Attributes
----------
n_cluster_ : int
Number of clusters.
labels_ : ndarray of shape (n_samples,)
Labels for each point.
biases_ : ndarray of shape (n_clusters,)
Biases for each cluster.
"""

def __init__(self, max_iter, min_cluster_size):
Expand All @@ -50,19 +34,21 @@ def fit(self, X, y):
"""
n_samples, _ = X.shape
self.n_clusters_ = 1
self.labels_ = np.zeros(n_samples, dtype=np.uint16)
labels = []
labels = np.zeros(n_samples, dtype=np.uint32)
clusters = []
biases = []
label = 0
bias = -np.mean(y)
heap = [(None, label, bias)]
print(labels)
for _ in range(self.max_iter):
if not heap:
break
_, label, bias = heapq.heappop(heap)
cluster_indices = np.nonzero(self.labels_ == label)[0]
cluster_indices = np.nonzero(labels == label)[0]
cluster = X[cluster_indices]
cluster_labels = self.split(cluster)
cluster_labels = self._split(cluster)
# TODO: Maybe check if cluster_labels are 0s and 1s
indices0 = cluster_indices[np.nonzero(cluster_labels == 0)[0]]
indices1 = cluster_indices[np.nonzero(cluster_labels == 1)[0]]
if (
Expand All @@ -80,25 +66,31 @@ def fit(self, X, y):
heapq.heappush(heap, (-std0, label, bias0))
std1 = np.std(y[indices1])
heapq.heappush(heap, (-std1, self.n_clusters_, bias1))
self.labels_[indices1] = self.n_clusters_
labels[indices1] = self.n_clusters_
self.n_clusters_ += 1
else:
labels.append(label)
clusters.append(label)
biases.append(bias)
else:
labels.append(label)
clusters.append(label)
biases.append(bias)
labels = np.array(labels + [label for _, label, _ in heap])
print(labels)
print(heap)
print(clusters)
clusters = np.array(clusters + [label for _, label, _ in heap])
biases = np.array(biases + [bias for _, _, bias in heap])
sorted_indices = np.argsort(-biases)
labels = labels[sorted_indices]
self.biases_ = biases[sorted_indices]
d = { label: index for label, index in zip(labels, range(n_samples))}
self.labels_ = np.array(d[label] for label in self.labels_)
print(clusters)
print(biases)
indices = np.argsort(-biases)
clusters = clusters[indices]
self.biases_ = biases[indices]
mapping = np.zeros(self.n_clusters_, dtype=np.uint32)
mapping[clusters] = np.arange(self.n_clusters_, dtype=np.uint32)
self.labels_ = mapping[labels]
return self

@abstractmethod
def split(self, X):
def _split(self, X):
"""Splits the data into two clusters.
Parameters
Expand Down
29 changes: 7 additions & 22 deletions bias_scan/clustering/_kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,19 @@ class BiasAwareHierarchicalKMeans(BiasAwareHierarchicalClustering):
Maximum number of iterations.
min_cluster_size : int
Minimum size of a cluster.
init : {'k-means++', 'random'}, callable or array-like of shape \
(n_clusters, n_features), default='k-means++'
n_init : 'auto' or int, default='auto'
kmeans_max_iter : int, default=300
tol : float, default=1e-4
kmeans_params : dict
k-means parameters
"""

def __init__(
    self,
    max_iter,
    min_cluster_size,
    init="k-means++",
    n_init="auto",
    kmeans_max_iter=300,
    tol=1e-4,
    kmeans_params=None,
):
    """Store the hyper-parameters and build the k-means estimator.

    Parameters
    ----------
    max_iter : int
        Maximum number of iterations.
    min_cluster_size : int
        Minimum size of a cluster.
    init, n_init, kmeans_max_iter, tol
        Individual k-means settings. They are stored on the instance
        unchanged; they are not forwarded to the estimator built here.
    kmeans_params : dict, optional
        Keyword arguments used to construct the ``KMeans`` estimator kept
        in ``self.kmeans``. When omitted,
        ``{"n_clusters": 2, "n_init": "auto"}`` is used.
    """
    super().__init__(max_iter, min_cluster_size)
    # A dict literal in the signature is evaluated once and then shared by
    # every instance, so a mutation through one object would leak into all
    # others. Build a fresh default per instance instead.
    if kmeans_params is None:
        kmeans_params = {"n_clusters": 2, "n_init": "auto"}
    self.init = init
    self.n_init = n_init
    self.kmeans_max_iter = kmeans_max_iter
    self.tol = tol
    self.kmeans_params = kmeans_params
    self.kmeans = KMeans(**kmeans_params)

def split(self, X):
    """Fit a new two-cluster k-means model on ``X`` and return its labels.

    The estimator is configured from the ``init``, ``n_init``,
    ``kmeans_max_iter`` and ``tol`` attributes of this instance.
    """
    settings = {
        "n_clusters": 2,
        "init": self.init,
        "n_init": self.n_init,
        "max_iter": self.kmeans_max_iter,
        "tol": self.tol,
    }
    return KMeans(**settings).fit_predict(X)
def _split(self, X):
    """Split ``X`` with the k-means estimator stored in ``self.kmeans``."""
    splitter = self.kmeans
    return splitter.fit_predict(X)
29 changes: 7 additions & 22 deletions bias_scan/clustering/_kmodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,29 +11,14 @@ class BiasAwareHierarchicalKModes(BiasAwareHierarchicalClustering):
Maximum number of iterations.
min_cluster_size : int
Minimum size of a cluster.
init : {'Huang', 'Cao', 'random'}, default='Cao'
n_init : int, default=10
kmodes_max_iter : int, default=100
kmodes_params : dict
k-modes parameters
"""

def __init__(
    self,
    max_iter,
    min_cluster_size,
    init="Cao",
    n_init=10,
    kmodes_max_iter=100,
    kmodes_params=None,
):
    """Store the hyper-parameters and build the k-modes estimator.

    Parameters
    ----------
    max_iter : int
        Maximum number of iterations.
    min_cluster_size : int
        Minimum size of a cluster.
    init, n_init, kmodes_max_iter
        Individual k-modes settings. They are stored on the instance
        unchanged; they are not forwarded to the estimator built here.
    kmodes_params : dict, optional
        Keyword arguments used to construct the ``KModes`` estimator kept
        in ``self.kmodes``. Pass it by keyword. When omitted,
        ``{"n_clusters": 2}`` is used.
    """
    super().__init__(max_iter, min_cluster_size)
    # A dict literal in the signature is evaluated once and then shared by
    # every instance; create the default per instance instead.
    if kmodes_params is None:
        kmodes_params = {"n_clusters": 2}
    self.init = init
    self.n_init = n_init
    self.kmodes_max_iter = kmodes_max_iter
    self.kmodes_params = kmodes_params
    self.kmodes = KModes(**kmodes_params)

def split(self, X):
    """Fit a new two-cluster k-modes model on ``X`` and return its labels.

    The estimator is configured from the ``init``, ``n_init`` and
    ``kmodes_max_iter`` attributes of this instance.
    """
    settings = {
        "n_clusters": 2,
        "init": self.init,
        "n_init": self.n_init,
        "max_iter": self.kmodes_max_iter,
    }
    return KModes(**settings).fit_predict(X)
def _split(self, X):
    """Split ``X`` with the k-modes estimator stored in ``self.kmodes``."""
    splitter = self.kmodes
    return splitter.fit_predict(X)
Empty file.
13 changes: 0 additions & 13 deletions bias_scan/clustering/tests/test_bahc.py

This file was deleted.

12 changes: 6 additions & 6 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ scikit-learn = "^1.4.1.post1"
kmodes = "^0.12.2"

[tool.poetry.dev-dependencies]
numpy = "^1.26.4"
scikit-learn = "^1.4.1.post1"
kmodes = "^0.12.2"
ruff = "^0.2.2"
pytest = "^8.0.2"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"
16 changes: 13 additions & 3 deletions tests/test_bahc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,21 @@
from bias_scan.clustering import BiasAwareHierarchicalKMeans


def test_shapes():
    """Labels cover every sample and there is one bias per cluster."""
    n_samples, n_features = 20, 10
    random_state = np.random.RandomState(12)
    features = random_state.rand(n_samples, n_features)
    targets = random_state.rand(n_samples)
    model = BiasAwareHierarchicalKMeans(max_iter=5, min_cluster_size=2)
    model.fit(features, targets)
    assert len(model.labels_) == n_samples
    assert len(model.biases_) == model.n_clusters_

def test_clusters():
    """The distinct label values are exactly 0, 1, ..., n_clusters_ - 1."""
    rng = np.random.RandomState(12)
    # Only one fixture is built: the earlier 10x5 data set and the
    # max_iter=3 estimator were overwritten before use.
    X = rng.rand(20, 10)
    y = rng.rand(20)
    algo = BiasAwareHierarchicalKMeans(max_iter=5, min_cluster_size=2)
    algo.fit(X, y)
    # np.unique returns the sorted distinct labels, so equality with
    # arange(n_clusters_) means every cluster id is used and none is skipped.
    assert np.array_equal(np.unique(algo.labels_), np.arange(algo.n_clusters_))

0 comments on commit b8e9a37

Please sign in to comment.