From c66ae414c6ca403bfd2acaef4938adb20d0241d5 Mon Sep 17 00:00:00 2001 From: Kasper Fyhn Date: Wed, 20 Nov 2024 10:23:25 +0100 Subject: [PATCH] Removing docformatter pre-commit hook + showing progress in clustering --- .pre-commit-config.yaml | 6 ------ src/conspiracies/corpusprocessing/clustering.py | 9 ++++++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b43aa97..40d3d76 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,12 +11,6 @@ repos: hooks: - id: add-trailing-comma - - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 - hooks: - - id: docformatter - args: [--in-place] - - repo: https://github.com/psf/black rev: 24.8.0 hooks: diff --git a/src/conspiracies/corpusprocessing/clustering.py b/src/conspiracies/corpusprocessing/clustering.py index cd7fbcf..05deb3e 100644 --- a/src/conspiracies/corpusprocessing/clustering.py +++ b/src/conspiracies/corpusprocessing/clustering.py @@ -102,7 +102,11 @@ def _cluster( fields: List[TripletField], ): model = self._get_embedding_model() - embeddings = model.encode([field.text for field in fields]) + print("Creating embeddings:") + embeddings = model.encode( + [field.text for field in fields], + show_progress_bar=True, + ) embeddings = StandardScaler().fit_transform(embeddings) if self.n_dimensions is not None: @@ -110,6 +114,7 @@ def _cluster( reducer = UMAP(n_components=self.n_dimensions, n_neighbors=self.n_neighbors) embeddings = reducer.fit_transform(embeddings) + print("Clustering ...") hdbscan_model = HDBSCAN( min_cluster_size=self.min_cluster_size, min_samples=self.min_samples, @@ -161,7 +166,9 @@ def create_mappings(self, triplets: List[Triplet]) -> Mappings: entities = subjects + objects predicates = [triplet.predicate for triplet in triplets] + print("Creating mappings for entities") entity_clusters = self._cluster(entities) + print("Creating mappings for predicates") predicate_clusters = self._cluster(predicates) mappings = Mappings(