Skip to content

Commit

Permalink
Fix hierarchical_topics(...) when the distances between three clust…
Browse files Browse the repository at this point in the history
…ers are the same (#1929)
  • Loading branch information
azikoss authored Jun 13, 2024
1 parent b544ba7 commit 0a28916
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
8 changes: 7 additions & 1 deletion bertopic/_bertopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@
from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
from bertopic._utils import (
MyLogger, check_documents_type, check_embeddings_shape,
check_is_fitted, validate_distance_matrix, select_topic_representation
check_is_fitted, validate_distance_matrix, select_topic_representation,
get_unique_distances
)
import bertopic._save_utils as save_utils

Expand Down Expand Up @@ -986,6 +987,11 @@ def hierarchical_topics(self,
# Use the 1-D condensed distance matrix as an input instead of the raw distance matrix
Z = linkage_function(X)

# Ensure that the distances between clusters are unique; otherwise flattening the hierarchy with
# `sch.fcluster(...)` would produce incorrect values for "Topics" for these clusters
if len(Z[:, 2]) != len(np.unique(Z[:, 2])):
Z[:, 2] = get_unique_distances(Z[:, 2])

# Calculate basic bag-of-words to be iteratively merged later
documents = pd.DataFrame({"Document": docs,
"ID": range(len(docs)),
Expand Down
25 changes: 25 additions & 0 deletions bertopic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,31 @@ def validate_distance_matrix(X, n_samples):
return X



def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array:
    """Make the values of a sorted distance array unique.

    Every run of consecutive equal distances keeps its first element unchanged, while the
    remaining elements of the run are shifted upwards by evenly spaced offsets. The offsets
    stay strictly below `noise_max` and strictly below the gap to the next distinct distance,
    so the increasing order of the array is preserved. The result is deterministic.

    The input is never modified; the result is always a floating point array, since small
    offsets cannot be represented in an integer array.

    Arguments:
        dists: distance array sorted in the increasing order.
        noise_max: the maximal magnitude of noise to be added.

    Returns:
        Unique distances sorted in the preserved increasing order.
    """
    # Work on a float copy: an integer array would silently truncate the added offsets
    dists_cp = np.array(dists, dtype=float)
    n_dists = dists_cp.shape[0]

    start = 0
    while start < n_dists:
        # Find the end (exclusive) of the run of values equal to the one at `start`;
        # values at or after `end` have not been altered yet, so comparisons use the original data
        end = start + 1
        while end < n_dists and dists_cp[end] == dists_cp[start]:
            end += 1

        run_length = end - start
        if run_length > 1:
            base = dists_cp[start]

            # the noise can never be larger than the difference between the next unique distance and the current one
            gap = noise_max if end == n_dists else min(noise_max, dists_cp[end] - base)

            # Offsets 0, gap/k, ..., (k-1)*gap/k: the first element is kept as is and the
            # last one stays strictly below `base + gap`, i.e. below the next unique distance
            dists_cp[start:end] = base + gap * np.arange(run_length) / run_length

        start = end
    return dists_cp


def select_topic_representation(
ctfidf_embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
Expand Down
19 changes: 18 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import pytest
import logging
import numpy as np
from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger, select_topic_representation
from typing import List
from bertopic._utils import check_documents_type, check_embeddings_shape, MyLogger, select_topic_representation, get_unique_distances
from scipy.sparse import csr_matrix


def test_logger():
logger = MyLogger()
logger.configure("DEBUG")
Expand Down Expand Up @@ -34,8 +36,23 @@ def test_check_embeddings_shape():
embeddings = np.array([[1, 2, 3],
[2, 3, 4]])
check_embeddings_shape(embeddings, docs)


def test_make_unique_distances():
    """Check that `get_unique_distances` returns unique, strictly increasing distances."""

    def check_dists(dists: List[float], noise_max: float):
        unique_dists = get_unique_distances(np.array(dists, dtype=float), noise_max=noise_max)
        assert len(unique_dists) == len(dists), "The number of elements must be the same"
        assert len(dists) == len(np.unique(unique_dists)), "The distances must be unique"
        # The comments below promise an ordering check, so verify it explicitly
        assert np.all(np.diff(unique_dists) > 0), "The distances must stay sorted in ascending order"

    check_dists([0, 0, 0.5, 0.75, 1, 1], noise_max=1e-7)

    # test whether the distances are sorted in ascending order when the noise is extremely high
    check_dists([0, 0, 0, 0.5, 0.75, 1, 1], noise_max=20)

    # test whether the distances are sorted in ascending order when the distances are all the same
    check_dists([0, 0, 0, 0, 0, 0, 0], noise_max=1e-7)


def test_select_topic_representation():
ctfidf_embeddings = np.array([[1, 1, 1]])
ctfidf_embeddings_sparse = csr_matrix(
Expand Down

0 comments on commit 0a28916

Please sign in to comment.