From 791cfac986e38e37c8d29d78b6ead2c4154eaf57 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 1 Jan 2023 23:02:40 +0000
Subject: [PATCH 1/2] :arrow_up: Bump sphinx-notes/pages from 2 to 3

Bumps [sphinx-notes/pages](https://github.com/sphinx-notes/pages) from 2 to 3.
- [Release notes](https://github.com/sphinx-notes/pages/releases)
- [Commits](https://github.com/sphinx-notes/pages/compare/v2...v3)

---
updated-dependencies:
- dependency-name: sphinx-notes/pages
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/documentation.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml
index 11590fd..13aceba 100644
--- a/.github/workflows/documentation.yaml
+++ b/.github/workflows/documentation.yaml
@@ -16,7 +16,7 @@ jobs:
           python -m pip install --upgrade pip
           pip install -e .[dev]
       - name: Build and Commit
-        uses: sphinx-notes/pages@v2
+        uses: sphinx-notes/pages@v3
       - name: Push changes
         uses: ad-m/github-push-action@v0.6.0
         with:

From 034aba419247d1bd8da2cd8aec931aca483b43e2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 1 Jan 2023 23:03:08 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tweetopic/__init__.py |  2 +-
 tweetopic/_btm.py     | 44 +++++++++++++++++++++++++++++++-----------
 tweetopic/_dmm.py     |  8 ++++----
 tweetopic/_doc.py     |  6 ++----
 tweetopic/btm.py      | 18 +++++++++++-------
 tweetopic/pipeline.py |  6 ++++--
 6 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/tweetopic/__init__.py b/tweetopic/__init__.py
index 0327249..2419c51 100644
--- a/tweetopic/__init__.py
+++ b/tweetopic/__init__.py
@@ -1,3 +1,3 @@
-from tweetopic.dmm import DMM  # noqa: F401
 from tweetopic.btm import BTM  # noqa: F401
+from tweetopic.dmm import DMM  # noqa: F401
 from tweetopic.pipeline import TopicPipeline  # noqa: F401
diff --git a/tweetopic/_btm.py b/tweetopic/_btm.py
index 3c5795c..368105f 100644
--- a/tweetopic/_btm.py
+++ b/tweetopic/_btm.py
@@ -1,4 +1,4 @@
-"""Module for utility functions for fitting BTMs"""
+"""Module for utility functions for fitting BTMs."""
 
 import random
 from typing import Dict, Tuple, TypeVar
@@ -6,12 +6,14 @@
 import numba
 import numpy as np
 from numba import njit
+
 from tweetopic._prob import norm_prob, sample_categorical
 
 
 @njit
 def doc_unique_biterms(
-    doc_unique_words: np.ndarray, doc_unique_word_counts: np.ndarray
+    doc_unique_words: np.ndarray,
+    doc_unique_word_counts: np.ndarray,
 ) -> Dict[Tuple[int, int], int]:
     (n_max_unique_words,) = doc_unique_words.shape
     biterm_counts = dict()
@@ -42,7 +44,7 @@ def doc_unique_biterms(
 
 @njit
 def nb_add_counter(dest: Dict[T, int], source: Dict[T, int]):
-    """Adds one counter dict to another in place with Numba"""
+    """Adds one counter dict to another in place with Numba."""
     for key in source:
         if key in dest:
             dest[key] += source[key]
@@ -52,17 +54,20 @@ def nb_add_counter(dest: Dict[T, int], source: Dict[T, int]):
 
 @njit
 def corpus_unique_biterms(
-    doc_unique_words: np.ndarray, doc_unique_word_counts: np.ndarray
+    doc_unique_words: np.ndarray,
+    doc_unique_word_counts: np.ndarray,
 ) -> Dict[Tuple[int, int], int]:
     n_documents, _ = doc_unique_words.shape
     biterm_counts = doc_unique_biterms(
-        doc_unique_words[0], doc_unique_word_counts[0]
+        doc_unique_words[0],
+        doc_unique_word_counts[0],
     )
     for i_doc in range(1, n_documents):
         doc_unique_words_i = doc_unique_words[i_doc]
         doc_unique_word_counts_i = doc_unique_word_counts[i_doc]
         doc_biterms = doc_unique_biterms(
-            doc_unique_words_i, doc_unique_word_counts_i
+            doc_unique_words_i,
+            doc_unique_word_counts_i,
         )
         nb_add_counter(biterm_counts, doc_biterms)
     return biterm_counts
@@ -70,7 +75,7 @@ def corpus_unique_biterms(
 
 @njit
 def compute_biterm_set(
-    biterm_counts: Dict[Tuple[int, int], int]
+    biterm_counts: Dict[Tuple[int, int], int],
 ) -> np.ndarray:
     return np.array(list(biterm_counts.keys()))
 
@@ -115,7 +120,12 @@ def add_biterm(
     topic_biterm_count: np.ndarray,
 ) -> None:
     add_remove_biterm(
-        True, i_biterm, i_topic, biterms, topic_word_count, topic_biterm_count
+        True,
+        i_biterm,
+        i_topic,
+        biterms,
+        topic_word_count,
+        topic_biterm_count,
     )
 
 
@@ -128,7 +138,12 @@ def remove_biterm(
     topic_biterm_count: np.ndarray,
 ) -> None:
     add_remove_biterm(
-        False, i_biterm, i_topic, biterms, topic_word_count, topic_biterm_count
+        False,
+        i_biterm,
+        i_topic,
+        biterms,
+        topic_word_count,
+        topic_biterm_count,
     )
 
 
@@ -146,7 +161,11 @@ def init_components(
         i_topic = random.randint(0, n_components - 1)
         biterm_topic_assignments[i_biterm] = i_topic
         add_biterm(
-            i_biterm, i_topic, biterms, topic_word_count, topic_biterm_count
+            i_biterm,
+            i_topic,
+            biterms,
+            topic_word_count,
+            topic_biterm_count,
         )
     return biterm_topic_assignments, topic_word_count, topic_biterm_count
 
@@ -360,7 +379,10 @@ def predict_docs(
         )
         biterms = doc_unique_biterms(words, word_counts)
         prob_topic_given_document(
-            pred, biterms, topic_distribution, topic_word_distribution
+            pred,
+            biterms,
+            topic_distribution,
+            topic_word_distribution,
         )
         predictions[i_doc, :] = pred
     return predictions
diff --git a/tweetopic/_dmm.py b/tweetopic/_dmm.py
index 53e8e2a..d323276 100644
--- a/tweetopic/_dmm.py
+++ b/tweetopic/_dmm.py
@@ -1,4 +1,5 @@
-"""Module containing tools for fitting a Dirichlet Multinomial Mixture Model."""
+"""Module containing tools for fitting a Dirichlet Multinomial Mixture
+Model."""
 from __future__ import annotations
 
 from math import exp, log
@@ -6,7 +7,7 @@
 import numpy as np
 from numba import njit
 
-from tweetopic._prob import sample_categorical, norm_prob
+from tweetopic._prob import norm_prob, sample_categorical
 
 
 @njit
@@ -197,8 +198,7 @@ def _cond_prob(
     # I use logs instead of computing the products directly,
     # as it would quickly result in numerical overflow.
     log_norm_term = log(
-        (cluster_doc_count[i_cluster] + alpha)
-        / (n_docs - 1 + n_clusters * alpha),
+        (cluster_doc_count[i_cluster] + alpha) / (n_docs - 1 + n_clusters * alpha),
     )
     log_numerator = 0
     for i_unique in range(max_unique_words):
diff --git a/tweetopic/_doc.py b/tweetopic/_doc.py
index 657c6dc..fa60957 100644
--- a/tweetopic/_doc.py
+++ b/tweetopic/_doc.py
@@ -11,14 +11,12 @@ def init_doc_words(
     n_docs, _ = doc_term_matrix.shape
     doc_unique_words = np.zeros((n_docs, max_unique_words)).astype(np.uint32)
     doc_unique_word_counts = np.zeros((n_docs, max_unique_words)).astype(
-        np.uint32
+        np.uint32,
     )
     for i_doc in range(n_docs):
         unique_words = doc_term_matrix[i_doc].rows[0]  # type: ignore
         unique_word_counts = doc_term_matrix[i_doc].data[0]  # type: ignore
         for i_unique in range(len(unique_words)):
             doc_unique_words[i_doc, i_unique] = unique_words[i_unique]
-            doc_unique_word_counts[i_doc, i_unique] = unique_word_counts[
-                i_unique
-            ]
+            doc_unique_word_counts[i_doc, i_unique] = unique_word_counts[i_unique]
     return doc_unique_words, doc_unique_word_counts
diff --git a/tweetopic/btm.py b/tweetopic/btm.py
index a7a20ed..34df208 100644
--- a/tweetopic/btm.py
+++ b/tweetopic/btm.py
@@ -7,15 +7,19 @@
 import scipy.sparse as spr
 import sklearn
 from numpy.typing import ArrayLike
-from tweetopic._btm import (compute_biterm_set, corpus_unique_biterms,
-                            fit_model, predict_docs)
+
+from tweetopic._btm import (
+    compute_biterm_set,
+    corpus_unique_biterms,
+    fit_model,
+    predict_docs,
+)
 from tweetopic._doc import init_doc_words
 from tweetopic.exceptions import NotFittedException
 
 
 class BTM(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator):
-    """Implementation of the Biterm Topic Model with Gibbs Sampling
-    solver.
+    """Implementation of the Biterm Topic Model with Gibbs Sampling solver.
 
     Parameters
     ----------
@@ -136,7 +140,8 @@ def fit(self, X: Union[spr.spmatrix, ArrayLike], y: None = None):
             max_unique_words=max_unique_words,
         )
         biterms = corpus_unique_biterms(
-            doc_unique_words, doc_unique_word_counts
+            doc_unique_words,
+            doc_unique_word_counts,
         )
         biterm_set = compute_biterm_set(biterms)
         self.topic_distribution, self.components_ = fit_model(
@@ -152,8 +157,7 @@ def fit(self, X: Union[spr.spmatrix, ArrayLike], y: None = None):
 
     # TODO: Something goes terribly wrong here, fix this
     def transform(self, X: Union[spr.spmatrix, ArrayLike]) -> np.ndarray:
-        """Predicts probabilities for each document belonging to each
-        topic.
+        """Predicts probabilities for each document belonging to each topic.
 
         Parameters
         ----------
diff --git a/tweetopic/pipeline.py b/tweetopic/pipeline.py
index ba19fca..5cabacb 100644
--- a/tweetopic/pipeline.py
+++ b/tweetopic/pipeline.py
@@ -47,7 +47,8 @@ def fit(self, texts: Iterable[str]) -> TopicPipeline:
         return self
 
     def fit_transform(
-        self, texts: Iterable[str]
+        self,
+        texts: Iterable[str],
     ) -> Union[ArrayLike, spr.spmatrix]:
         """Fits vectorizer and topic model and transforms the given text.
@@ -65,7 +66,8 @@ def fit_transform(
         return self.topic_model.fit_transform(doc_term_matrix)
 
     def transform(
-        self, texts: Iterable[str]
+        self,
+        texts: Iterable[str],
     ) -> Union[ArrayLike, spr.spmatrix]:
         """Transforms given texts with the fitted pipeline.