diff --git a/algorithm/algorithm.html b/algorithm/algorithm.html index 57d1a9b4..df7481a5 100755 --- a/algorithm/algorithm.html +++ b/algorithm/algorithm.html @@ -2791,7 +2791,7 @@

Code Overview

hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics ctfidf_model=ctfidf_model, # Step 5 - Extract topic words - representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations + representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations )
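The hunk above only touches steps 3-6 of the modular pipeline. For context, a minimal sketch of how the full six-step pipeline is typically assembled — the specific models chosen here (`all-MiniLM-L6-v2`, `KeyBERTInspired`, the UMAP/HDBSCAN settings taken from the defaults shown later in this diff) are illustrative and interchangeable:

```python
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")           # Step 1 - Embed documents
umap_model = UMAP(n_neighbors=15, n_components=5,
                  min_dist=0.0, metric="cosine")                    # Step 2 - Reduce dimensionality
hdbscan_model = HDBSCAN(min_cluster_size=15, metric="euclidean",
                        cluster_selection_method="eom",
                        prediction_data=True)                       # Step 3 - Cluster reduced embeddings
vectorizer_model = CountVectorizer(stop_words="english")            # Step 4 - Tokenize topics
ctfidf_model = ClassTfidfTransformer()                              # Step 5 - Extract topic words
representation_model = KeyBERTInspired()                            # Step 6 - (Optional) Fine-tune topic representations

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)
```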

Detailed Overview

diff --git a/api/backends/base.html b/api/backends/base.html index b838e85f..a6fc2c5b 100755 --- a/api/backends/base.html +++ b/api/backends/base.html @@ -2660,7 +2660,7 @@

BaseEmbedder
-

The Base Embedder used for creating embedding models

+

The Base Embedder used for creating embedding models.

Parameters:

@@ -2694,7 +2694,7 @@

BaseEmbedder Source code in bertopic\backend\_base.py
class BaseEmbedder:
-    """ The Base Embedder used for creating embedding models
+    """The Base Embedder used for creating embedding models.
 
     Arguments:
         embedding_model: The main embedding model to be used for extracting
@@ -2704,17 +2704,14 @@ 

BaseEmbedder then the `embedding_model` is purely used for creating document embeddings. """ - def __init__(self, - embedding_model=None, - word_embedding_model=None): + + def __init__(self, embedding_model=None, word_embedding_model=None): self.embedding_model = embedding_model self.word_embedding_model = word_embedding_model - def embed(self, - documents: List[str], - verbose: bool = False) -> np.ndarray: - """ Embed a list of n documents/words into an n-dimensional - matrix of embeddings + def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + """Embed a list of n documents/words into an n-dimensional + matrix of embeddings. Arguments: documents: A list of documents or words to be embedded @@ -2726,11 +2723,9 @@

BaseEmbedder """ pass - def embed_words(self, - words: List[str], - verbose: bool = False) -> np.ndarray: - """ Embed a list of n words into an n-dimensional - matrix of embeddings + def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray: + """Embed a list of n words into an n-dimensional + matrix of embeddings. Arguments: words: A list of words to be embedded @@ -2743,11 +2738,9 @@

BaseEmbedder """ return self.embed(words, verbose) - def embed_documents(self, - document: List[str], - verbose: bool = False) -> np.ndarray: - """ Embed a list of n words into an n-dimensional - matrix of embeddings + def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray: + """Embed a list of n words into an n-dimensional + matrix of embeddings. Arguments: document: A list of documents to be embedded @@ -2787,7 +2780,7 @@

Embed a list of n documents/words into an n-dimensional -matrix of embeddings

+matrix of embeddings.

Parameters:

@@ -2832,11 +2825,9 @@

Source code in bertopic\backend\_base.py -
def embed(self,
-          documents: List[str],
-          verbose: bool = False) -> np.ndarray:
-    """ Embed a list of n documents/words into an n-dimensional
-    matrix of embeddings
+          
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+    """Embed a list of n documents/words into an n-dimensional
+    matrix of embeddings.
 
     Arguments:
         documents: A list of documents or words to be embedded
@@ -2868,7 +2859,7 @@ 

Embed a list of n words into an n-dimensional -matrix of embeddings

+matrix of embeddings.

Parameters:

@@ -2913,11 +2904,9 @@

Source code in bertopic\backend\_base.py -
def embed_documents(self,
-                    document: List[str],
-                    verbose: bool = False) -> np.ndarray:
-    """ Embed a list of n words into an n-dimensional
-    matrix of embeddings
+          
def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
+    """Embed a list of n words into an n-dimensional
+    matrix of embeddings.
 
     Arguments:
         document: A list of documents to be embedded
@@ -2949,7 +2938,7 @@ 

Embed a list of n words into an n-dimensional -matrix of embeddings

+matrix of embeddings.

Parameters:

@@ -2994,11 +2983,9 @@

Source code in bertopic\backend\_base.py -
def embed_words(self,
-                words: List[str],
-                verbose: bool = False) -> np.ndarray:
-    """ Embed a list of n words into an n-dimensional
-    matrix of embeddings
+          
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
+    """Embed a list of n words into an n-dimensional
+    matrix of embeddings.
 
     Arguments:
         words: A list of words to be embedded
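The hunks above reformat `BaseEmbedder`, whose `embed` method is deliberately left as `pass`. Custom backends therefore subclass it and override `embed`; a minimal sketch, assuming sentence-transformers is installed (`all-MiniLM-L6-v2` is an illustrative model choice):

```python
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer
from bertopic.backend import BaseEmbedder


class CustomEmbedder(BaseEmbedder):
    """Toy backend wrapping a sentence-transformers model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        super().__init__()
        self.embedding_model = SentenceTransformer(model_name)

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        # Return an (n_documents, n_dimensions) matrix, as the base class documents
        return self.embedding_model.encode(documents, show_progress_bar=verbose)
```

An instance can then be passed to `BERTopic(embedding_model=CustomEmbedder())` like any other backend.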
diff --git a/api/backends/cohere.html b/api/backends/cohere.html
index 123a2c48..6bd625bb 100755
--- a/api/backends/cohere.html
+++ b/api/backends/cohere.html
@@ -2632,7 +2632,7 @@ 

CohereBackend
-

Cohere Embedding Model

+

Cohere Embedding Model.

Parameters:

@@ -2701,7 +2701,7 @@

CohereBackend Source code in bertopic\backend\_cohere.py @@ -2816,7 +2816,7 @@

Embed a list of n documents/words into an n-dimensional -matrix of embeddings

+matrix of embeddings.

Parameters:

@@ -2861,11 +2861,9 @@

Source code in bertopic\backend\_cohere.py -
def embed(self,
-          documents: List[str],
-          verbose: bool = False) -> np.ndarray:
-    """ Embed a list of n documents/words into an n-dimensional
-    matrix of embeddings
+          
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+    """Embed a list of n documents/words into an n-dimensional
+    matrix of embeddings.
 
     Arguments:
         documents: A list of documents or words to be embedded
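For reference, a hedged usage sketch of this backend: the constructor is assumed to take a Cohere client plus optional keyword arguments (the default embedding model and any extra parameters may differ between BERTopic versions, so only the minimal call is shown):

```python
import cohere
from bertopic import BERTopic
from bertopic.backend import CohereBackend

client = cohere.Client("MY_API_KEY")      # placeholder key, illustrative only
cohere_embedder = CohereBackend(client)   # rely on the backend's default model

topic_model = BERTopic(embedding_model=cohere_embedder)
```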
diff --git a/api/backends/openai.html b/api/backends/openai.html
index 35da8cf4..b86a5d86 100755
--- a/api/backends/openai.html
+++ b/api/backends/openai.html
@@ -2632,7 +2632,7 @@ 

OpenAIBackend
-

OpenAI Embedding Model

+

OpenAI Embedding Model.

Parameters:

@@ -2694,7 +2694,7 @@

OpenAIBackend Source code in bertopic\backend\_openai.py
class OpenAIBackend(BaseEmbedder):
-    """ OpenAI Embedding Model
+    """OpenAI Embedding Model.
 
     Arguments:
         client: A `openai.OpenAI` client.
@@ -2709,7 +2709,6 @@ 

OpenAIBackend deployment_ids. Examples: - ```python import openai from bertopic.backend import OpenAIBackend @@ -2718,12 +2717,15 @@

OpenAIBackend openai_embedder = OpenAIBackend(client, "text-embedding-ada-002") ``` """ - def __init__(self, - client: openai.OpenAI, - embedding_model: str = "text-embedding-ada-002", - delay_in_seconds: float = None, - batch_size: int = None, - generator_kwargs: Mapping[str, Any] = {}): + + def __init__( + self, + client: openai.OpenAI, + embedding_model: str = "text-embedding-ada-002", + delay_in_seconds: float = None, + batch_size: int = None, + generator_kwargs: Mapping[str, Any] = {}, + ): super().__init__() self.client = client self.embedding_model = embedding_model @@ -2736,11 +2738,9 @@

OpenAIBackendelif not self.generator_kwargs.get("engine"): self.generator_kwargs["model"] = self.embedding_model - def embed(self, - documents: List[str], - verbose: bool = False) -> np.ndarray: - """ Embed a list of n documents/words into an n-dimensional - matrix of embeddings + def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray: + """Embed a list of n documents/words into an n-dimensional + matrix of embeddings. Arguments: documents: A list of documents or words to be embedded @@ -2772,7 +2772,7 @@

OpenAIBackenddef _chunks(self, documents): for i in range(0, len(documents), self.batch_size): - yield documents[i:i + self.batch_size] + yield documents[i : i + self.batch_size]

@@ -2802,7 +2802,7 @@

Embed a list of n documents/words into an n-dimensional -matrix of embeddings

+matrix of embeddings.

Parameters:

@@ -2847,11 +2847,9 @@

Source code in bertopic\backend\_openai.py -
def embed(self,
-          documents: List[str],
-          verbose: bool = False) -> np.ndarray:
-    """ Embed a list of n documents/words into an n-dimensional
-    matrix of embeddings
+          
def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
+    """Embed a list of n documents/words into an n-dimensional
+    matrix of embeddings.
 
     Arguments:
         documents: A list of documents or words to be embedded
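The example embedded in the class docstring above can be expanded into a runnable sketch; `delay_in_seconds` and `batch_size` come straight from the `__init__` signature in this diff, while the API key is a placeholder:

```python
import openai
from bertopic import BERTopic
from bertopic.backend import OpenAIBackend

client = openai.OpenAI(api_key="sk-...")  # placeholder key
embedder = OpenAIBackend(
    client,
    "text-embedding-ada-002",
    delay_in_seconds=0.5,  # optional throttling between requests
    batch_size=32,         # optional chunking of documents per request
)

topic_model = BERTopic(embedding_model=embedder)
```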
diff --git a/api/backends/word_doc.html b/api/backends/word_doc.html
index abe9d10e..ad41a2b9 100755
--- a/api/backends/word_doc.html
+++ b/api/backends/word_doc.html
@@ -2646,26 +2646,22 @@ 

WordDocEmbedder
-

Combine a document- and word-level embedder

+

Combine a document- and word-level embedder.

Source code in bertopic\backend\_word_doc.py
class WordDocEmbedder(BaseEmbedder):
-    """ Combine a document- and word-level embedder
-    """
-    def __init__(self,
-                 embedding_model,
-                 word_embedding_model):
+    """Combine a document- and word-level embedder."""
+
+    def __init__(self, embedding_model, word_embedding_model):
         super().__init__()
 
         self.embedding_model = select_backend(embedding_model)
         self.word_embedding_model = select_backend(word_embedding_model)
 
-    def embed_words(self,
-                    words: List[str],
-                    verbose: bool = False) -> np.ndarray:
-        """ Embed a list of n words into an n-dimensional
-        matrix of embeddings
+    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
+        """Embed a list of n words into an n-dimensional
+        matrix of embeddings.
 
         Arguments:
             words: A list of words to be embedded
@@ -2678,11 +2674,9 @@ 

WordDocEmbedder """ return self.word_embedding_model.embed(words, verbose) - def embed_documents(self, - document: List[str], - verbose: bool = False) -> np.ndarray: - """ Embed a list of n words into an n-dimensional - matrix of embeddings + def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray: + """Embed a list of n words into an n-dimensional + matrix of embeddings. Arguments: document: A list of documents to be embedded @@ -2722,7 +2716,7 @@

Embed a list of n words into an n-dimensional -matrix of embeddings

+matrix of embeddings.

Parameters:

@@ -2767,11 +2761,9 @@

Source code in bertopic\backend\_word_doc.py -
def embed_documents(self,
-                    document: List[str],
-                    verbose: bool = False) -> np.ndarray:
-    """ Embed a list of n words into an n-dimensional
-    matrix of embeddings
+          
def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:
+    """Embed a list of n words into an n-dimensional
+    matrix of embeddings.
 
     Arguments:
         document: A list of documents to be embedded
@@ -2803,7 +2795,7 @@ 

Embed a list of n words into an n-dimensional -matrix of embeddings

+matrix of embeddings.

Parameters:

@@ -2848,11 +2840,9 @@

Source code in bertopic\backend\_word_doc.py -
def embed_words(self,
-                words: List[str],
-                verbose: bool = False) -> np.ndarray:
-    """ Embed a list of n words into an n-dimensional
-    matrix of embeddings
+          
def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
+    """Embed a list of n words into an n-dimensional
+    matrix of embeddings.
 
     Arguments:
         words: A list of words to be embedded
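Because both constructor arguments are passed through `select_backend`, model names and pre-loaded models should both be accepted. A hedged usage sketch (the model names are illustrative; any supported document- and word-level embedders can be combined):

```python
from bertopic import BERTopic
from bertopic.backend import WordDocEmbedder

word_doc_embedder = WordDocEmbedder(
    embedding_model="all-MiniLM-L6-v2",       # document-level embeddings
    word_embedding_model="all-MiniLM-L6-v2",  # word-level embeddings
)

topic_model = BERTopic(embedding_model=word_doc_embedder)
```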
diff --git a/api/bertopic.html b/api/bertopic.html
index 2d222e33..9405f6e2 100755
--- a/api/bertopic.html
+++ b/api/bertopic.html
@@ -1600,6 +1600,13 @@
     bertopic._bertopic.BERTopic
   
   
+   • topic_labels_
  • @@ -2873,6 +2880,13 @@ bertopic._bertopic.BERTopic
+   • topic_labels_
  • @@ -3320,7 +3334,6 @@

    BERTopic representative_docs_ (Mapping[int, str]) : The representative documents for each topic. Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -3347,26 +3360,28 @@

    BERTopic try out BERTopic several times until you find the topics that suit you best. """ - def __init__(self, - language: str = "english", - top_n_words: int = 10, - n_gram_range: Tuple[int, int] = (1, 1), - min_topic_size: int = 10, - nr_topics: Union[int, str] = None, - low_memory: bool = False, - calculate_probabilities: bool = False, - seed_topic_list: List[List[str]] = None, - zeroshot_topic_list: List[str] = None, - zeroshot_min_similarity: float = .7, - embedding_model=None, - umap_model: UMAP = None, - hdbscan_model: hdbscan.HDBSCAN = None, - vectorizer_model: CountVectorizer = None, - ctfidf_model: TfidfTransformer = None, - representation_model: BaseRepresentation = None, - verbose: bool = False, - ): - """BERTopic initialization + + def __init__( + self, + language: str = "english", + top_n_words: int = 10, + n_gram_range: Tuple[int, int] = (1, 1), + min_topic_size: int = 10, + nr_topics: Union[int, str] = None, + low_memory: bool = False, + calculate_probabilities: bool = False, + seed_topic_list: List[List[str]] = None, + zeroshot_topic_list: List[str] = None, + zeroshot_min_similarity: float = 0.7, + embedding_model=None, + umap_model: UMAP = None, + hdbscan_model: hdbscan.HDBSCAN = None, + vectorizer_model: CountVectorizer = None, + ctfidf_model: TfidfTransformer = None, + representation_model: BaseRepresentation = None, + verbose: bool = False, + ): + """BERTopic initialization. Arguments: language: The main language used in your documents. The default sentence-transformers @@ -3384,7 +3399,7 @@

    BERTopic NOTE: This param will not be used if you pass in your own CountVectorizer. min_topic_size: The minimum size of the topic. Increasing this value will lead - to a lower number of clusters/topics and vice versa. + to a lower number of clusters/topics and vice versa. It is the same parameter as `min_cluster_size` in HDBSCAN. NOTE: This param will not be used if you are using `hdbscan_model`. nr_topics: Specifying the number of topics will reduce the initial @@ -3436,8 +3451,9 @@

    BERTopic """ # Topic-based parameters if top_n_words > 100: - logger.warning("Note that extracting more than 100 words from a sparse " - "can slow down computation quite a bit.") + logger.warning( + "Note that extracting more than 100 words from a sparse can slow down computation quite a bit." + ) self.top_n_words = top_n_words self.min_topic_size = min_topic_size @@ -3462,18 +3478,22 @@

    BERTopicself.representation_model = representation_model # UMAP or another algorithm that has .fit and .transform functions - self.umap_model = umap_model or UMAP(n_neighbors=15, - n_components=5, - min_dist=0.0, - metric='cosine', - low_memory=self.low_memory) + self.umap_model = umap_model or UMAP( + n_neighbors=15, + n_components=5, + min_dist=0.0, + metric="cosine", + low_memory=self.low_memory, + ) # HDBSCAN or another clustering algorithm that has .fit and .predict functions and # the .labels_ variable to extract the labels - self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(min_cluster_size=self.min_topic_size, - metric='euclidean', - cluster_selection_method='eom', - prediction_data=True) + self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN( + min_cluster_size=self.min_topic_size, + metric="euclidean", + cluster_selection_method="eom", + prediction_data=True, + ) # Public attributes self.topics_ = None @@ -3482,7 +3502,7 @@

    BERTopicself.topic_mapper_ = None self.topic_representations_ = None self.topic_embeddings_ = None - self.topic_labels_ = None + self._topic_id_to_zeroshot_topic_idx = {} self.custom_labels_ = None self.c_tf_idf_ = None self.representative_images_ = None @@ -3490,7 +3510,6 @@

    BERTopicself.topic_aspects_ = {} # Private attributes for internal tracking purposes - self._outliers = 1 self._merged_topics = None if verbose: @@ -3498,12 +3517,47 @@

    BERTopicelse: logger.set_level("WARNING") - def fit(self, - documents: List[str], - embeddings: np.ndarray = None, - images: List[str] = None, - y: Union[List[int], np.ndarray] = None): - """ Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics + @property + def _outliers(self): + """Some algorithms have outlier labels (-1) that can be tricky to work + with if you are slicing data based on that labels. Therefore, we + track if there are outlier labels and act accordingly when slicing. + + Returns: + An integer indicating whether outliers are present in the topic model + """ + return 1 if -1 in self.topic_sizes_ else 0 + + @property + def topic_labels_(self): + """Map topic IDs to their labels. + A label is the topic ID, along with the first four words of the topic representation, joined using '_'. + Zeroshot topic labels come from self.zeroshot_topic_list rather than the calculated representation. + + Returns: + topic_labels: a dict mapping a topic ID (int) to its label (str) + """ + topic_labels = { + key: f"{key}_" + "_".join([word[0] for word in values[:4]]) + for key, values in self.topic_representations_.items() + } + if self._is_zeroshot(): + # Need to correct labels from zero-shot topics + topic_id_to_zeroshot_label = { + topic_id: self.zeroshot_topic_list[zeroshot_topic_idx] + for topic_id, zeroshot_topic_idx in self._topic_id_to_zeroshot_topic_idx.items() + } + topic_labels.update(topic_id_to_zeroshot_label) + return topic_labels + + def fit( + self, + documents: List[str], + embeddings: np.ndarray = None, + images: List[str] = None, + y: Union[List[int], np.ndarray] = None, + ): + """Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics. Arguments: documents: A list of documents to fit on @@ -3514,7 +3568,6 @@

    BERTopic specific instance is specified. Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -3542,13 +3595,14 @@

    BERTopicself.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images) return self - def fit_transform(self, - documents: List[str], - embeddings: np.ndarray = None, - images: List[str] = None, - y: Union[List[int], np.ndarray] = None) -> Tuple[List[int], - Union[np.ndarray, None]]: - """ Fit the models on a collection of documents, generate topics, + def fit_transform( + self, + documents: List[str], + embeddings: np.ndarray = None, + images: List[str] = None, + y: Union[List[int], np.ndarray] = None, + ) -> Tuple[List[int], Union[np.ndarray, None]]: + """Fit the models on a collection of documents, generate topics, and return the probabilities and topic per document. Arguments: @@ -3568,7 +3622,6 @@

    BERTopic computation and may increase memory usage. Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -3600,41 +3653,52 @@

    BERTopiccheck_embeddings_shape(embeddings, documents) doc_ids = range(len(documents)) if documents is not None else range(len(images)) - documents = pd.DataFrame({"Document": documents, - "ID": doc_ids, - "Topic": None, - "Image": images}) + documents = pd.DataFrame({"Document": documents, "ID": doc_ids, "Topic": None, "Image": images}) # Extract embeddings if embeddings is None: logger.info("Embedding - Transforming documents to embeddings.") - self.embedding_model = select_backend(self.embedding_model, - language=self.language) - embeddings = self._extract_embeddings(documents.Document.values.tolist(), - images=images, - method="document", - verbose=self.verbose) + self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose) + embeddings = self._extract_embeddings( + documents.Document.values.tolist(), + images=images, + method="document", + verbose=self.verbose, + ) logger.info("Embedding - Completed \u2713") else: if self.embedding_model is not None: - self.embedding_model = select_backend(self.embedding_model, - language=self.language) + self.embedding_model = select_backend( + self.embedding_model, language=self.language, verbose=self.verbose + ) # Guided Topic Modeling if self.seed_topic_list is not None and self.embedding_model is not None: y, embeddings = self._guided_topic_modeling(embeddings) + # Reduce dimensionality and fit UMAP model + umap_embeddings = self._reduce_dimensionality(embeddings, y) + # Zero-shot Topic Modeling if self._is_zeroshot(): - documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(documents, embeddings) - if documents is None: - return self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings) - - # Reduce dimensionality - umap_embeddings = self._reduce_dimensionality(embeddings, y) + documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling( + documents, embeddings + ) + # Filter UMAP embeddings to only non-assigned embeddings to be used for clustering + umap_embeddings = self.umap_model.transform(embeddings) - # Cluster reduced embeddings - documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) + if len(documents) > 0: # No zero-shot topics matched + # Cluster reduced embeddings + documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) + if self._is_zeroshot() and len(assigned_documents) > 0: + documents, embeddings = self._combine_zeroshot_topics( + documents, embeddings, assigned_documents, assigned_embeddings + ) + else: + # All documents matches zero-shot topics + documents = assigned_documents + embeddings = assigned_embeddings + topics_before_reduction = self.topics_ # Sort and Map Topic IDs by their frequency if not self.nr_topics: @@ -3665,21 +3729,35 @@

    BERTopic# Save the top 3 most representative documents per topic self._save_representative_docs(documents) + # In the case of zero-shot topics, probability will come from cosine similarity, + # and the HDBSCAN model will be removed + if self._is_zeroshot() and len(assigned_documents) > 0: + self.hdbscan_model = BaseCluster() + sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_)) + + if self.calculate_probabilities: + probabilities = sim_matrix + else: + # Use `topics_before_reduction` because `self.topics_` may have already been updated from + # reducing topics, and the original probabilities are needed for `self._map_probabilities()` + probabilities = sim_matrix[ + np.arange(len(documents)), + np.array(topics_before_reduction) + self._outliers, + ] + # Resulting output self.probabilities_ = self._map_probabilities(probabilities, original_topics=True) predictions = documents.Topic.to_list() - # Combine Zero-shot with outliers - if self._is_zeroshot() and len(documents) != len(doc_ids): - predictions = self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings) - return predictions, self.probabilities_ - def transform(self, - documents: Union[str, List[str]], - embeddings: np.ndarray = None, - images: List[str] = None) -> Tuple[List[int], np.ndarray]: - """ After having fit a model, use transform to predict new instances + def transform( + self, + documents: Union[str, List[str]], + embeddings: np.ndarray = None, + images: List[str] = None, + ) -> Tuple[List[int], np.ndarray]: + """After having fit a model, use transform to predict new instances. Arguments: documents: A single document or a list of documents to predict on @@ -3695,7 +3773,6 @@

    BERTopic decrease memory usage. Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -3729,16 +3806,15 @@

    BERTopicdocuments = [documents] if embeddings is None: - embeddings = self._extract_embeddings(documents, - images=images, - method="document", - verbose=self.verbose) + embeddings = self._extract_embeddings(documents, images=images, method="document", verbose=self.verbose) # Check if an embedding model was found if embeddings is None: - raise ValueError("No embedding model was found to embed the documents." - "Make sure when loading in the model using BERTopic.load()" - "to also specify the embedding model.") + raise ValueError( + "No embedding model was found to embed the documents." + "Make sure when loading in the model using BERTopic.load()" + "to also specify the embedding model." + ) # Transform without hdbscan_model and umap_model using only cosine similarity elif type(self.hdbscan_model) == BaseCluster: @@ -3760,7 +3836,9 @@

    BERTopic# Extract predictions and probabilities if it is a HDBSCAN-like model logger.info("Clustering - Approximating new points with `hdbscan_model`") if is_supported_hdbscan(self.hdbscan_model): - predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings) + predictions, probabilities = hdbscan_delegator( + self.hdbscan_model, "approximate_predict", umap_embeddings + ) # Calculate probabilities if self.calculate_probabilities: @@ -3777,11 +3855,13 @@

    BERTopicpredictions = self._map_predictions(predictions) return predictions, probabilities - def partial_fit(self, - documents: List[str], - embeddings: np.ndarray = None, - y: Union[List[int], np.ndarray] = None): - """ Fit BERTopic on a subset of the data and perform online learning + def partial_fit( + self, + documents: List[str], + embeddings: np.ndarray = None, + y: Union[List[int], np.ndarray] = None, + ): + """Fit BERTopic on a subset of the data and perform online learning with batch-like data. Online topic modeling in BERTopic is performed by using dimensionality @@ -3798,7 +3878,7 @@

    BERTopic For each subset of the data: - 1. Generate embeddings with a pre-traing language model + 1. Generate embeddings with a pre-trained language model 2. Incrementally update the dimensionality reduction algorithm with `partial_fit` 3. Incrementally update the cluster algorithm with `partial_fit` 4. Incrementally update the OnlineCountVectorizer and apply some form of decay @@ -3814,7 +3894,6 @@

    BERTopic specific instance is specified. Examples: - ```python from sklearn.datasets import fetch_20newsgroups from sklearn.cluster import MiniBatchKMeans @@ -3842,28 +3921,31 @@

    BERTopic# Checks check_embeddings_shape(embeddings, documents) if not hasattr(self.hdbscan_model, "partial_fit"): - raise ValueError("In order to use `.partial_fit`, the cluster model should have " - "a `.partial_fit` function.") + raise ValueError( + "In order to use `.partial_fit`, the cluster model should have " "a `.partial_fit` function." + ) # Prepare documents if isinstance(documents, str): documents = [documents] - documents = pd.DataFrame({"Document": documents, - "ID": range(len(documents)), - "Topic": None}) + documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) # Extract embeddings if embeddings is None: if self.topic_representations_ is None: - self.embedding_model = select_backend(self.embedding_model, - language=self.language) - embeddings = self._extract_embeddings(documents.Document.values.tolist(), - method="document", - verbose=self.verbose) + self.embedding_model = select_backend( + self.embedding_model, language=self.language, verbose=self.verbose + ) + embeddings = self._extract_embeddings( + documents.Document.values.tolist(), + method="document", + verbose=self.verbose, + ) else: if self.embedding_model is not None and self.topic_representations_ is None: - self.embedding_model = select_backend(self.embedding_model, - language=self.language) + self.embedding_model = select_backend( + self.embedding_model, language=self.language, verbose=self.verbose + ) # Reduce dimensionality if self.seed_topic_list is not None and self.embedding_model is not None: @@ -3894,25 +3976,25 @@

    BERTopicmissing_topics = {} # Prepare documents - documents_per_topic = documents.sort_values("Topic").groupby(['Topic'], as_index=False) + documents_per_topic = documents.sort_values("Topic").groupby(["Topic"], as_index=False) updated_topics = documents_per_topic.first().Topic.astype(int) - documents_per_topic = documents_per_topic.agg({'Document': ' '.join}) + documents_per_topic = documents_per_topic.agg({"Document": " ".join}) # Update topic representations self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True) - self.topic_representations_ = self._extract_words_per_topic(updated_words, documents, self.c_tf_idf_, calculate_aspects=False) + self.topic_representations_ = self._extract_words_per_topic( + updated_words, documents, self.c_tf_idf_, calculate_aspects=False + ) self._create_topic_vectors() - self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]]) - for key, values in self.topic_representations_.items()} # Update topic sizes if len(missing_topics) > 0: - documents = documents.iloc[:-len(missing_topics)] + documents = documents.iloc[: -len(missing_topics)] if self.topic_sizes_ is None: self._update_topic_size(documents) else: - sizes = documents.groupby(['Topic'], as_index=False).count() + sizes = documents.groupby(["Topic"], as_index=False).count() for _, row in sizes.iterrows(): topic = int(row.Topic) if self.topic_sizes_.get(topic) is not None and topic not in missing_topics: @@ -3923,16 +4005,17 @@

    BERTopicreturn self - def topics_over_time(self, - docs: List[str], - timestamps: Union[List[str], - List[int]], - topics: List[int] = None, - nr_bins: int = None, - datetime_format: str = None, - evolution_tuning: bool = True, - global_tuning: bool = True) -> pd.DataFrame: - """ Create topics over time + def topics_over_time( + self, + docs: List[str], + timestamps: Union[List[str], List[int]], + topics: List[int] = None, + nr_bins: int = None, + datetime_format: str = None, + evolution_tuning: bool = True, + global_tuning: bool = True, + ) -> pd.DataFrame: + """Create topics over time. To create the topics over time, BERTopic needs to be already fitted once. From the fitted models, the c-TF-IDF representations are calculate at @@ -3940,7 +4023,7 @@

    BERTopic averaged with the global c-TF-IDF representations in order to fine-tune the local representations. - NOTE: + Note: Make sure to use a limited number of unique timestamps (<100) as the c-TF-IDF representation will be calculated at each single unique timestamp. Having a large number of unique timestamps can take some time to be calculated. @@ -3976,7 +4059,6 @@

    BERTopic at timestamp *t*. Examples: - The timestamps variable represents the timestamp of each document. If you have over 100 unique timestamps, it is advised to bin the timestamps as shown below: @@ -3991,16 +4073,18 @@

    BERTopiccheck_documents_type(docs) selected_topics = topics if topics else self.topics_ documents = pd.DataFrame({"Document": docs, "Topic": selected_topics, "Timestamps": timestamps}) - global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False) + global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False) all_topics = sorted(list(documents.Topic.unique())) all_topics_indices = {topic: index for index, topic in enumerate(all_topics)} if isinstance(timestamps[0], str): infer_datetime_format = True if not datetime_format else False - documents["Timestamps"] = pd.to_datetime(documents["Timestamps"], - infer_datetime_format=infer_datetime_format, - format=datetime_format) + documents["Timestamps"] = pd.to_datetime( + documents["Timestamps"], + infer_datetime_format=infer_datetime_format, + format=datetime_format, + ) if nr_bins: documents["Bins"] = pd.cut(documents.Timestamps, bins=nr_bins) @@ -4010,34 +4094,45 @@

    BERTopicdocuments = documents.sort_values("Timestamps") timestamps = documents.Timestamps.unique() if len(timestamps) > 100: - logger.warning(f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) " - "which significantly slows down the application. Consider setting `nr_bins` " - "to a value lower than 100 to speed up calculation. ") + logger.warning( + f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) " + "which significantly slows down the application. Consider setting `nr_bins` " + "to a value lower than 100 to speed up calculation. " + ) # For each unique timestamp, create topic representations topics_over_time = [] for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose): - # Calculate c-TF-IDF representation for a specific timestamp selection = documents.loc[documents.Timestamps == timestamp, :] - documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join, - "Timestamps": "count"}) + documents_per_topic = selection.groupby(["Topic"], as_index=False).agg( + {"Document": " ".join, "Timestamps": "count"} + ) c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False) if global_tuning or evolution_tuning: - c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False) + c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False) # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF # matrix at timestamp t-1 if evolution_tuning and index != 0: current_topics = sorted(list(documents_per_topic.Topic.values)) - overlapping_topics = sorted(list(set(previous_topics).intersection(set(current_topics)))) + overlapping_topics = sorted( + list(set(previous_topics).intersection(set(current_topics))) # noqa: F821 + ) current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics] - previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics] - - c_tf_idf.tolil()[current_overlap_idx] = ((c_tf_idf[current_overlap_idx] + - previous_c_tf_idf[previous_overlap_idx]) / 2.0).tolil() + previous_overlap_idx = [ + previous_topics.index(topic) # noqa: F821 + for topic in overlapping_topics + ] + + c_tf_idf.tolil()[current_overlap_idx] = ( + ( + c_tf_idf[current_overlap_idx] + previous_c_tf_idf[previous_overlap_idx] # noqa: F821 + ) + / 2.0 + ).tolil() # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation # by simply taking the average of the two @@ -4047,27 +4142,35 @@

    BERTopic# Extract the words per topic words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) - topic_frequency = pd.Series(documents_per_topic.Timestamps.values, - index=documents_per_topic.Topic).to_dict() + topic_frequency = pd.Series( + documents_per_topic.Timestamps.values, index=documents_per_topic.Topic + ).to_dict() # Fill dataframe with results - topics_at_timestamp = [(topic, - ", ".join([words[0] for words in values][:5]), - topic_frequency[topic], - timestamp) for topic, values in words_per_topic.items()] + topics_at_timestamp = [ + ( + topic, + ", ".join([words[0] for words in values][:5]), + topic_frequency[topic], + timestamp, + ) + for topic, values in words_per_topic.items() + ] topics_over_time.extend(topics_at_timestamp) if evolution_tuning: - previous_topics = sorted(list(documents_per_topic.Topic.values)) - previous_c_tf_idf = c_tf_idf.copy() + previous_topics = sorted(list(documents_per_topic.Topic.values)) # noqa: F841 + previous_c_tf_idf = c_tf_idf.copy() # noqa: F841 return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"]) - def topics_per_class(self, - docs: List[str], - classes: Union[List[int], List[str]], - global_tuning: bool = True) -> pd.DataFrame: - """ Create topics per class + def topics_per_class( + self, + docs: List[str], + classes: Union[List[int], List[str]], + global_tuning: bool = True, + ) -> pd.DataFrame: + """Create topics per class. To create the topics per class, BERTopic needs to be already fitted once. From the fitted models, the c-TF-IDF representations are calculated at @@ -4076,7 +4179,7 @@

    BERTopic local representations. This can be turned off if the pure representation is needed. - NOTE: + Note: Make sure to use a limited number of unique classes (<100) as the c-TF-IDF representation will be calculated at each single unique class. Having a large number of unique classes can take some time to be calculated. @@ -4093,7 +4196,6 @@

    BERTopic for each class. Examples: - ```python from bertopic import BERTopic topic_model = BERTopic() @@ -4103,48 +4205,55 @@

    BERTopic """ check_documents_type(docs) documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Class": classes}) - global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False) + global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False) # For each unique timestamp, create topic representations topics_per_class = [] for _, class_ in tqdm(enumerate(set(classes)), disable=not self.verbose): - # Calculate c-TF-IDF representation for a specific timestamp selection = documents.loc[documents.Class == class_, :] - documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join, - "Class": "count"}) + documents_per_topic = selection.groupby(["Topic"], as_index=False).agg( + {"Document": " ".join, "Class": "count"} + ) c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False) # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation # by simply taking the average of the two if global_tuning: - c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False) + c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False) c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0 # Extract the words per topic words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) - topic_frequency = pd.Series(documents_per_topic.Class.values, - index=documents_per_topic.Topic).to_dict() + topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict() # Fill dataframe with results - topics_at_class = [(topic, - ", ".join([words[0] for words in values][:5]), - topic_frequency[topic], - class_) for topic, values in words_per_topic.items()] + topics_at_class = [ + ( + topic, + ", ".join([words[0] for words in values][:5]), + topic_frequency[topic], + class_, + ) + for topic, values in words_per_topic.items() + ] topics_per_class.extend(topics_at_class) topics_per_class = pd.DataFrame(topics_per_class, columns=["Topic", "Words", "Frequency", "Class"]) return topics_per_class - def hierarchical_topics(self, - docs: List[str], - linkage_function: Callable[[csr_matrix], np.ndarray] = None, - distance_function: Callable[[csr_matrix], csr_matrix] = None) -> pd.DataFrame: - """ Create a hierarchy of topics + def hierarchical_topics( + self, + docs: List[str], + use_ctfidf: bool = True, + linkage_function: Callable[[csr_matrix], np.ndarray] = None, + distance_function: Callable[[csr_matrix], csr_matrix] = None, + ) -> pd.DataFrame: + """Create a hierarchy of topics. To create this hierarchy, BERTopic needs to be already fitted once. - Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF + Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings representation using `scipy.cluster.hierarchy.linkage`. Based on that hierarchy, we calculate the topic representation at each @@ -4154,12 +4263,14 @@

    BERTopic Arguments: docs: The documents you used when calling either `fit` or `fit_transform` + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. linkage_function: The linkage function to use. Default is: `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)` distance_function: The distance function to use on the c-TF-IDF matrix. Default is: `lambda x: 1 - cosine_similarity(x)`. - You can pass any function that returns either a square matrix of - shape (n_samples, n_samples) with zeros on the diagonal and + You can pass any function that returns either a square matrix of + shape (n_samples, n_samples) with zeros on the diagonal and non-negative values or condensed distance matrix of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix. @@ -4169,7 +4280,6 @@

    BERTopic represented by their parents and their children Examples: - ```python from bertopic import BERTopic topic_model = BERTopic() @@ -4195,21 +4305,26 @@

    BERTopicdistance_function = lambda x: 1 - cosine_similarity(x) if linkage_function is None: - linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True) + linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True) # Calculate distance - embeddings = self.c_tf_idf_[self._outliers:] + embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][ + self._outliers : + ] X = distance_function(embeddings) X = validate_distance_matrix(X, embeddings.shape[0]) # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix Z = linkage_function(X) + # Ensuring that the distances between clusters are unique otherwise the flatting of the hierarchy with + # `sch.fcluster(...)` would produce incorrect values for "Topics" for these clusters + if len(Z[:, 2]) != len(np.unique(Z[:, 2])): + Z[:, 2] = get_unique_distances(Z[:, 2]) + # Calculate basic bag-of-words to be iteratively merged later - documents = pd.DataFrame({"Document": docs, - "ID": range(len(docs)), - "Topic": self.topics_}) - documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + documents = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": self.topics_}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :] clean_documents = self._preprocess_text(documents_per_topic.Document.values) @@ -4223,13 +4338,20 @@

    BERTopicbow = self.vectorizer_model.transform(clean_documents) # Extract clusters - hier_topics = pd.DataFrame(columns=["Parent_ID", "Parent_Name", "Topics", - "Child_Left_ID", "Child_Left_Name", - "Child_Right_ID", "Child_Right_Name"]) + hier_topics = pd.DataFrame( + columns=[ + "Parent_ID", + "Parent_Name", + "Topics", + "Child_Left_ID", + "Child_Left_Name", + "Child_Right_ID", + "Child_Right_Name", + ] + ) for index in tqdm(range(len(Z))): - # Find clustered documents - clusters = sch.fcluster(Z, t=Z[index][2], criterion='distance') - self._outliers + clusters = sch.fcluster(Z, t=Z[index][2], criterion="distance") - self._outliers nr_clusters = len(clusters) # Extract first topic we find to get the set of topics in a merged topic @@ -4272,29 +4394,37 @@

    BERTopicchild_right_name = hier_topics.iloc[int(child_right_id)].Parent_Name # Save results - hier_topics.loc[len(hier_topics), :] = [parent_id, parent_name, - clustered_topics, - int(Z[index][0]), child_left_name, - int(Z[index][1]), child_right_name] + hier_topics.loc[len(hier_topics), :] = [ + parent_id, + parent_name, + clustered_topics, + int(Z[index][0]), + child_left_name, + int(Z[index][1]), + child_right_name, + ] hier_topics["Distance"] = Z[:, 2] hier_topics = hier_topics.sort_values("Parent_ID", ascending=False) - hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]] = hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]].astype(str) + hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]] = hier_topics[ + ["Parent_ID", "Child_Left_ID", "Child_Right_ID"] + ].astype(str) return hier_topics - def approximate_distribution(self, - documents: Union[str, List[str]], - window: int = 4, - stride: int = 1, - min_similarity: float = 0.1, - batch_size: int = 1000, - padding: bool = False, - use_embedding_model: bool = False, - calculate_tokens: bool = False, - separator: str = " ") -> Tuple[np.ndarray, - Union[List[np.ndarray], None]]: - """ A post-hoc approximation of topic distributions across documents. + def approximate_distribution( + self, + documents: Union[str, List[str]], + window: int = 4, + stride: int = 1, + min_similarity: float = 0.1, + batch_size: int = 1000, + padding: bool = False, + use_embedding_model: bool = False, + calculate_tokens: bool = False, + separator: str = " ", + ) -> Tuple[np.ndarray, Union[List[np.ndarray], None]]: + """A post-hoc approximation of topic distributions across documents. In order to perform this approximation, each document is split into tokens according to the provided tokenizer in the `CountVectorizer`. Then, a @@ -4352,7 +4482,6 @@

    BERTopic and `m` the topics. Examples: - After fitting the model, the topic distributions can be calculated regardless of the clustering model and regardless of whether the documents were previously seen or not: @@ -4380,13 +4509,13 @@

    BERTopicbatch_size = len(documents) batches = 1 else: - batches = math.ceil(len(documents)/batch_size) + batches = math.ceil(len(documents) / batch_size) topic_distributions = [] topic_token_distributions = [] for i in tqdm(range(batches), disable=not self.verbose): - doc_set = documents[i*batch_size: (i+1) * batch_size] + doc_set = documents[i * batch_size : (i + 1) * batch_size] # Extract tokens analyzer = self.vectorizer_model.build_tokenizer() @@ -4402,17 +4531,23 @@

    BERTopictoken_sets = [tokenset] token_sets_ids = [list(range(len(tokenset)))] else: - # Extract tokensets using window and stride parameters stride_indices = list(range(len(tokenset)))[::stride] token_sets = [] token_sets_ids = [] for stride_index in stride_indices: - selected_tokens = tokenset[stride_index: stride_index+window] + selected_tokens = tokenset[stride_index : stride_index + window] if padding or len(selected_tokens) == window: token_sets.append(selected_tokens) - token_sets_ids.append(list(range(stride_index, stride_index+len(selected_tokens)))) + token_sets_ids.append( + list( + range( + stride_index, + stride_index + len(selected_tokens), + ) + ) + ) # Add empty tokens at the beginning and end of a document if padding: @@ -4420,8 +4555,8 @@

    BERTopicpadded_ids = [] t = math.ceil(window / stride) - 1 for i in range(math.ceil(window / stride) - 1): - padded.append(tokenset[:window - ((t-i) * stride)]) - padded_ids.append(list(range(0, window - ((t-i) * stride)))) + padded.append(tokenset[: window - ((t - i) * stride)]) + padded_ids.append(list(range(0, window - ((t - i) * stride)))) token_sets = padded + token_sets token_sets_ids = padded_ids + token_sets_ids @@ -4435,13 +4570,13 @@

    BERTopic# Calculate similarity between embeddings of token sets and the topics if use_embedding_model: embeddings = self._extract_embeddings(all_sentences, method="document", verbose=True) - similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers:]) + similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers :]) # Calculate similarity between c-TF-IDF of token sets and the topics else: bow_doc = self.vectorizer_model.transform(all_sentences) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) - similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Only keep similarities that exceed the minimum similarity[similarity < min_similarity] = 0 @@ -4452,7 +4587,7 @@

    BERTopictopic_token_distribution = [] for index, token in enumerate(tokens): start = all_indices[index] - end = all_indices[index+1] + end = all_indices[index + 1] if start == end: end = end + 1 @@ -4477,20 +4612,20 @@

    BERTopictopic_token_distribution.append(np.array(matrix)) topic_distribution.append(np.add.reduce(matrix)) - topic_distribution = normalize(topic_distribution, norm='l1', axis=1) + topic_distribution = normalize(topic_distribution, norm="l1", axis=1) # Aggregate on a tokenset level indicated by the window and stride else: topic_distribution = [] - for index in range(len(all_indices)-1): + for index in range(len(all_indices) - 1): start = all_indices[index] - end = all_indices[index+1] + end = all_indices[index + 1] if start == end: end = end + 1 group = similarity[start:end].sum(axis=0) topic_distribution.append(group) - topic_distribution = normalize(np.array(topic_distribution), norm='l1', axis=1) + topic_distribution = normalize(np.array(topic_distribution), norm="l1", axis=1) topic_token_distribution = None # Combine results @@ -4504,22 +4639,22 @@

    BERTopicreturn topic_distributions, topic_token_distributions - def find_topics(self, - search_term: str = None, - image: str = None, - top_n: int = 5) -> Tuple[List[int], List[float]]: - """ Find topics most similar to a search_term + def find_topics(self, search_term: str = None, image: str = None, top_n: int = 5) -> Tuple[List[int], List[float]]: + """Find topics most similar to a search_term. - Creates an embedding for search_term and compares that with + Creates an embedding for a search query and compares that with the topic embeddings. The most similar topics are returned along with their similarity values. + The query is specified using search_term for text queries or image for image queries. + The search_term can be of any size but since it is compared with the topic representation it is advised to keep it below 5 words. Arguments: search_term: the term you want to use to search for topics. + image: path to the image you want to use to search for topics. top_n: the number of topics to return Returns: @@ -4527,7 +4662,6 @@

    BERTopic similarity: the similarity scores from high to low Examples: - You can use the underlying embedding model to find topics that best represent the search term: @@ -4546,14 +4680,11 @@

    BERTopic# Extract search_term embeddings and compare with topic embeddings if search_term is not None: - search_embedding = self._extract_embeddings([search_term], - method="word", - verbose=False).flatten() + search_embedding = self._extract_embeddings([search_term], method="word", verbose=False).flatten() elif image is not None: - search_embedding = self._extract_embeddings([None], - images=[image], - method="document", - verbose=False).flatten() + search_embedding = self._extract_embeddings( + [None], images=[image], method="document", verbose=False + ).flatten() sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten() # Extract topics most similar to search_term @@ -4563,16 +4694,18 @@

    BERTopicreturn similar_topics, similarity - def update_topics(self, - docs: List[str], - images: List[str] = None, - topics: List[int] = None, - top_n_words: int = 10, - n_gram_range: Tuple[int, int] = None, - vectorizer_model: CountVectorizer = None, - ctfidf_model: ClassTfidfTransformer = None, - representation_model: BaseRepresentation = None): - """ Updates the topic representation by recalculating c-TF-IDF with the new + def update_topics( + self, + docs: List[str], + images: List[str] = None, + topics: List[int] = None, + top_n_words: int = 10, + n_gram_range: Tuple[int, int] = None, + vectorizer_model: CountVectorizer = None, + ctfidf_model: ClassTfidfTransformer = None, + representation_model: BaseRepresentation = None, + ): + """Updates the topic representation by recalculating c-TF-IDF with the new parameters as defined in this function. When you have trained a model and viewed the topics and the words that represent them, @@ -4599,7 +4732,6 @@

    BERTopic are supported. Examples: - In order to update the topic representation, you will need to first fit the topic model and extract topics from them. Based on these, you can update the representation: @@ -4628,8 +4760,9 @@

    BERTopicn_gram_range = self.n_gram_range if top_n_words > 100: - logger.warning("Note that extracting more than 100 words from a sparse " - "can slow down computation quite a bit.") + logger.warning( + "Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit." + ) self.top_n_words = top_n_words self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range) self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() @@ -4638,38 +4771,41 @@

    BERTopicif topics is None: topics = self.topics_ else: - logger.warning("Using a custom list of topic assignments may lead to errors if " - "topic reduction techniques are used afterwards. Make sure that " - "manually assigning topics is the last step in the pipeline." - "Note that topic embeddings will also be created through weighted" - "c-TF-IDF embeddings instead of centroid embeddings.") - - self._outliers = 1 if -1 in set(topics) else 0 + logger.warning( + "Using a custom list of topic assignments may lead to errors if " + "topic reduction techniques are used afterwards. Make sure that " + "manually assigning topics is the last step in the pipeline." + "Note that topic embeddings will also be created through weighted" + "c-TF-IDF embeddings instead of centroid embeddings." + ) - # Extract words documents = pd.DataFrame({"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images}) - documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) + + # Update topic sizes and assignments + self._update_topic_size(documents) + + # Extract words and update topic labels self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) self.topic_representations_ = self._extract_words_per_topic(words, documents) # Update topic vectors if set(topics) != self.topics_: - # Remove outlier topic embedding if all that has changed is the outlier class - same_position = all([True if old_topic == new_topic else False for old_topic, new_topic in zip(self.topics_, topics) if old_topic != -1]) + same_position = all( + [ + True if old_topic == new_topic else False + for old_topic, new_topic in zip(self.topics_, topics) + if old_topic != -1 + ] + ) if same_position and -1 not in topics and -1 in self.topics_: self.topic_embeddings_ = self.topic_embeddings_[1:] else: self._create_topic_vectors() - # Update topic labels - self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]]) - for key, values in - self.topic_representations_.items()} - self._update_topic_size(documents) - def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]: - """ Return topics with top n words and their c-TF-IDF score + """Return topics with top n words and their c-TF-IDF score. Arguments: full: If True, returns all different forms of topic representations @@ -4679,7 +4815,6 @@

    BERTopic self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score Examples: - ```python all_topics = topic_model.get_topics() ``` @@ -4694,7 +4829,7 @@

    BERTopicreturn self.topic_representations_ def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]: - """ Return top n words for a specific topic and their c-TF-IDF scores + """Return top n words for a specific topic and their c-TF-IDF scores. Arguments: topic: A specific topic for which you want its representation @@ -4705,7 +4840,6 @@

    BERTopic The top n words for a specific word and its respective c-TF-IDF scores Examples: - ```python topic = topic_model.get_topic(12) ``` @@ -4723,7 +4857,7 @@

    BERTopicreturn False def get_topic_info(self, topic: int = None) -> pd.DataFrame: - """ Get information about each topic including its ID, frequency, and name. + """Get information about each topic including its ID, frequency, and name. Arguments: topic: A specific topic for which you want the frequency @@ -4732,7 +4866,6 @@

    BERTopic info: The information relating to either a single topic or all topics Examples: - ```python info_df = topic_model.get_topic_info() ``` @@ -4756,7 +4889,9 @@

    BERTopicif self.topic_aspects_: for aspect, values in self.topic_aspects_.items(): if isinstance(list(values.values())[-1], list): - if isinstance(list(values.values())[-1][0], tuple) or isinstance(list(values.values())[-1][0], list): + if isinstance(list(values.values())[-1][0], tuple) or isinstance( + list(values.values())[-1][0], list + ): values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()} elif isinstance(list(values.values())[-1][0], str): values = {topic: " ".join(value).strip() for topic, value in values.items()} @@ -4775,7 +4910,7 @@

    BERTopicreturn info.reset_index(drop=True) def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]: - """ Return the size of topics (descending order) + """Return the size of topics (descending order). Arguments: topic: A specific topic for which you want the frequency @@ -4785,7 +4920,6 @@

    BERTopic the frequencies of all topics Examples: - To extract the frequency of all topics: ```python @@ -4802,14 +4936,17 @@

    BERTopicif isinstance(topic, int): return self.topic_sizes_[topic] else: - return pd.DataFrame(self.topic_sizes_.items(), columns=['Topic', 'Count']).sort_values("Count", - ascending=False) - - def get_document_info(self, - docs: List[str], - df: pd.DataFrame = None, - metadata: Mapping[str, Any] = None) -> pd.DataFrame: - """ Get information about the documents on which the topic was trained + return pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values( + "Count", ascending=False + ) + + def get_document_info( + self, + docs: List[str], + df: pd.DataFrame = None, + metadata: Mapping[str, Any] = None, + ) -> pd.DataFrame: + """Get information about the documents on which the topic was trained including the documents themselves, their respective topics, the name of each topic, the top n words of each topic, whether it is a representative document, and probability of the clustering if the cluster @@ -4880,8 +5017,10 @@

    BERTopicif len(self.probabilities_.shape) == 1: document_info["Probability"] = self.probabilities_ else: - document_info["Probability"] = [max(probs) if topic != -1 else 1-sum(probs) - for topic, probs in zip(self.topics_, self.probabilities_)] + document_info["Probability"] = [ + max(probs) if topic != -1 else 1 - sum(probs) + for topic, probs in zip(self.topics_, self.probabilities_) + ] # Add representative document labels repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs] @@ -4895,9 +5034,9 @@
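The `Probability` column built above is derived differently for outliers: when the full topic-probability matrix is available, a document assigned to topic -1 receives one minus the probability mass spread over all topics. A standalone sketch of that rule with toy numbers:

```python
import numpy as np

# Toy probability matrix: one row per document, one column per (non-outlier) topic.
probabilities = np.array([
    [0.7, 0.2, 0.1],   # assigned to topic 0
    [0.1, 0.1, 0.05],  # outlier (-1): little mass captured by any topic
])
topics = [0, -1]

document_probability = [
    float(max(probs)) if topic != -1 else float(1 - sum(probs))
    for topic, probs in zip(topics, probabilities)
]
print(document_probability)  # [0.7, 0.75]
```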

    BERTopicreturn document_info def get_representative_docs(self, topic: int = None) -> List[str]: - """ Extract the best representing documents per topic. + """Extract the best representing documents per topic. - NOTE: + Note: This does not extract all documents per topic as all documents are not saved within BERTopic. To get all documents, please run the following: @@ -4918,7 +5057,6 @@

    BERTopic Representative documents of the chosen topic Examples: - To extract the representative docs of all topics: ```python @@ -4941,14 +5079,16 @@

    BERTopicreturn self.representative_docs_ @staticmethod - def get_topic_tree(hier_topics: pd.DataFrame, - max_distance: float = None, - tight_layout: bool = False) -> str: - """ Extract the topic tree such that it can be printed + def get_topic_tree( + hier_topics: pd.DataFrame, + max_distance: float = None, + tight_layout: bool = False, + ) -> str: + """Extract the topic tree such that it can be printed. Arguments: hier_topics: A dataframe containing the structure of the topic tree. - This is the output of `topic_model.hierachical_topics()` + This is the output of `topic_model.hierarchical_topics()` max_distance: The maximum distance between two topics. This value is based on the Distance column in `hier_topics`. tight_layout: Whether to use a tight layout (narrow width) for @@ -4970,7 +5110,6 @@

    BERTopic from `topic_model.get_topic`. In other words, they are the original un-grouped topics. Examples: - ```python # Train model from bertopic import BERTopic @@ -4995,17 +5134,23 @@

    BERTopictopic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()} # Create tree - tree = {str(row[1].Parent_ID): [str(row[1].Child_Left_ID), str(row[1].Child_Right_ID)] - for row in hier_topics.iterrows()} + tree = { + str(row[1].Parent_ID): [ + str(row[1].Child_Left_ID), + str(row[1].Child_Right_ID), + ] + for row in hier_topics.iterrows() + } def get_tree(start, tree): - """ Based on: https://stackoverflow.com/a/51920869/10532563 """ + """Based on: https://stackoverflow.com/a/51920869/10532563.""" def _tree(to_print, start, parent, tree, grandpa=None, indent=""): - # Get distance between merged topics - distance = hier_topics.loc[(hier_topics.Child_Left_ID == parent) | - (hier_topics.Child_Right_ID == parent), "Distance"] + distance = hier_topics.loc[ + (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent), + "Distance", + ] distance = distance.values[0] if len(distance) > 0 else 10 if parent != start: @@ -5013,7 +5158,6 @@

    BERTopicto_print += topic_to_name[parent] else: if int(parent) <= max_original_topic: - # Do not append topic ID if they are not merged if distance < max_distance: to_print += "■──" + topic_to_name[parent] + f" ── Topic: {parent}" + "\n" @@ -5031,7 +5175,7 @@

    BERTopicchild = tree[parent][-1] to_print += indent + "└" + "─" - to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width+1)) + to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width + 1)) return to_print @@ -5043,7 +5187,7 @@

    BERTopicreturn get_tree(start, tree) def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None: - """ Set custom topic labels in your fitted BERTopic model + """Set custom topic labels in your fitted BERTopic model. Arguments: topic_labels: If a list of topic labels, it should contain the same number @@ -5055,7 +5199,6 @@

    BERTopic in the dictionary. Examples: - First, we define our topic labels with `.generate_topic_labels` in which we can customize our topic labels: @@ -5091,24 +5234,30 @@

    BERTopicelse: info = self.get_topic_info() original_labels = dict(zip(info.Topic, info.Name)) - custom_labels = [topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] for topic in unique_topics] + custom_labels = [ + topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] + for topic in unique_topics + ] elif isinstance(topic_labels, list): if len(topic_labels) == len(unique_topics): custom_labels = topic_labels else: - raise ValueError("Make sure that `topic_labels` contains the same number " - "of labels as there are topics.") + raise ValueError( + "Make sure that `topic_labels` contains the same number " "of labels as there are topics." + ) self.custom_labels_ = custom_labels - def generate_topic_labels(self, - nr_words: int = 3, - topic_prefix: bool = True, - word_length: int = None, - separator: str = "_", - aspect: str = None) -> List[str]: - """ Get labels for each topic in a user-defined format + def generate_topic_labels( + self, + nr_words: int = 3, + topic_prefix: bool = True, + word_length: int = None, + separator: str = "_", + aspect: str = None, + ) -> List[str]: + """Get labels for each topic in a user-defined format. Arguments: nr_words: Top `n` words per topic to use @@ -5130,7 +5279,6 @@
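As the dictionary branch above shows, `set_topic_labels` also accepts a partial mapping: topics missing from the dictionary fall back to their automatically generated name. A small sketch, where the two label strings are purely illustrative:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic().fit(docs)

# Only topics 0 and 1 are renamed; every other topic keeps its generated label.
topic_model.set_topic_labels({0: "Space & Astronomy", 1: "Ice Hockey"})

# The resulting labels are stored on the model and picked up by visualizations
# whenever custom_labels=True is passed.
print(topic_model.custom_labels_[:5])
```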

    BERTopic otherwise it is 0. Examples: - To create our custom topic labels, usage is rather straightforward: ```python @@ -5160,12 +5308,13 @@

    BERTopicreturn topic_labels - def merge_topics(self, - docs: List[str], - topics_to_merge: List[Union[Iterable[int], int]], - images: List[str] = None) -> None: - """ - Arguments: + def merge_topics( + self, + docs: List[str], + topics_to_merge: List[Union[Iterable[int], int]], + images: List[str] = None, + ) -> None: + """Arguments: docs: The documents you used when calling either `fit` or `fit_transform` topics_to_merge: Either a list of topics or a list of list of topics to merge. For example: @@ -5173,10 +5322,9 @@

    BERTopic [[1, 2], [3, 4]] will merge topics 1 and 2, and separately merge topics 3 and 4. images: A list of paths to the images used when calling either - `fit` or `fit_transform` + `fit` or `fit_transform`. Examples: - If you want to merge topics 1, 2, and 3: ```python @@ -5195,7 +5343,14 @@

    BERTopic """ check_is_fitted(self) check_documents_type(docs) - documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))}) + documents = pd.DataFrame( + { + "Document": docs, + "Topic": self.topics_, + "Image": images, + "ID": range(len(docs)), + } + ) mapping = {topic: topic for topic in set(self.topics_)} if isinstance(topics_to_merge[0], int): @@ -5206,17 +5361,21 @@

    BERTopicfor topic in topic_group: mapping[topic] = topic_group[0] else: - raise ValueError("Make sure that `topics_to_merge` is either" - "a list of topics or a list of list of topics.") + raise ValueError( + "Make sure that `topics_to_merge` is either" "a list of topics or a list of list of topics." + ) # Track mappings and sizes of topics for merging topic embeddings mappings = defaultdict(list) for key, val in sorted(mapping.items()): mappings[val].append(key) - mappings = {topic_from: - {"topics_to": topics_to, - "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]} - for topic_from, topics_to in mappings.items()} + mappings = { + topic_to: { + "topics_from": topics_from, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], + } + for topic_to, topics_from in mappings.items() + } # Update topics documents.Topic = documents.Topic.map(mapping) @@ -5227,16 +5386,19 @@
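The restructured `mappings` above is keyed by the topic that survives the merge and records, for every topic folded into it, its size, so that the topic embeddings can later be combined (presumably as a size-weighted average). A standalone sketch of the same bookkeeping with toy sizes:

```python
from collections import defaultdict

# Toy example: topics 2 and 3 are merged into topic 1, topic 4 stays as-is.
mapping = {1: 1, 2: 1, 3: 1, 4: 4}
topic_sizes = {1: 50, 2: 30, 3: 20, 4: 10}

mappings = defaultdict(list)
for key, val in sorted(mapping.items()):
    mappings[val].append(key)

mappings = {
    topic_to: {
        "topics_from": topics_from,
        "topic_sizes": [topic_sizes[topic] for topic in topics_from],
    }
    for topic_to, topics_from in mappings.items()
}
print(mappings)
# {1: {'topics_from': [1, 2, 3], 'topic_sizes': [50, 30, 20]},
#  4: {'topics_from': [4], 'topic_sizes': [10]}}
```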

    BERTopicself._save_representative_docs(documents) self.probabilities_ = self._map_probabilities(self.probabilities_) - def reduce_topics(self, - docs: List[str], - nr_topics: Union[int, str] = 20, - images: List[str] = None) -> None: - """ Reduce the number of topics to a fixed number of topics + def reduce_topics( + self, + docs: List[str], + nr_topics: Union[int, str] = 20, + images: List[str] = None, + use_ctfidf: bool = False, + ) -> None: + """Reduce the number of topics to a fixed number of topics or automatically. If nr_topics is an integer, then the number of topics is reduced to nr_topics using `AgglomerativeClustering` on the cosine distance matrix - of the topic embeddings. + of the topic c-TF-IDF or semantic embeddings. If nr_topics is `"auto"`, then HDBSCAN is used to automatically reduce the number of topics by running it on the topic embeddings. @@ -5248,13 +5410,14 @@

    BERTopic nr_topics: The number of topics you want reduced to images: A list of paths to the images used when calling either `fit` or `fit_transform` + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. Updates: topics_ : Assigns topics to their merged representations. probabilities_ : Assigns probabilities to their merged representations. Examples: - You can further reduce the topics by passing the documents with their topics and probabilities (if they were calculated): @@ -5273,26 +5436,35 @@
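A short usage sketch for the new `use_ctfidf` flag on topic reduction, again with 20 newsgroups as stand-in data; with `use_ctfidf=True` the topic-to-topic distances come from the c-TF-IDF representations rather than from the semantic topic embeddings:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

# Reduce to roughly 30 topics using c-TF-IDF-based distances between topics.
topic_model.reduce_topics(docs, nr_topics=30, use_ctfidf=True)

# The updated assignments live on the model itself.
new_topics = topic_model.topics_
```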

    BERTopiccheck_documents_type(docs) self.nr_topics = nr_topics - documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))}) + documents = pd.DataFrame( + { + "Document": docs, + "Topic": self.topics_, + "Image": images, + "ID": range(len(docs)), + } + ) # Reduce number of topics - documents = self._reduce_topics(documents) + documents = self._reduce_topics(documents, use_ctfidf) self._merged_topics = None self._save_representative_docs(documents) self.probabilities_ = self._map_probabilities(self.probabilities_) return self - def reduce_outliers(self, - documents: List[str], - topics: List[int], - images: List[str] = None, - strategy: str = "distributions", - probabilities: np.ndarray = None, - threshold: float = 0, - embeddings: np.ndarray = None, - distributions_params: Mapping[str, Any] = {}) -> List[int]: - """ Reduce outliers by merging them with their nearest topic according + def reduce_outliers( + self, + documents: List[str], + topics: List[int], + images: List[str] = None, + strategy: str = "distributions", + probabilities: np.ndarray = None, + threshold: float = 0, + embeddings: np.ndarray = None, + distributions_params: Mapping[str, Any] = {}, + ) -> List[int]: + """Reduce outliers by merging them with their nearest topic according to one of several strategies. When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created @@ -5339,6 +5511,7 @@

    BERTopic * "embeddings" Calculate the embeddings for outlier documents and find the best matching topic embedding. + probabilities: Probabilities generated by HDBSCAN for each document when using the strategy `"probabilities"`. threshold: The threshold for assigning topics to outlier documents. This value represents the minimum probability when `strategy="probabilities"`. For all other strategies, it represents the minimum similarity. @@ -5369,6 +5542,9 @@

    BERTopic new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities") ``` """ + if not self._outliers: + raise ValueError("No outliers to reduce.") + if images is not None: strategy = "embeddings" @@ -5378,14 +5554,18 @@
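Since `reduce_outliers` only returns a new list of assignments, they still have to be written back to the model; a sketch using the c-TF-IDF strategy, assuming `update_topics` is used to persist the result and that the 0.1 threshold is purely illustrative:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

# Re-assign outlier documents (-1) to their closest topic based on c-TF-IDF
# similarity; documents below the similarity threshold remain outliers.
new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold=0.1)

# reduce_outliers only returns the new assignments; write them back to make
# them part of the fitted model.
topic_model.update_topics(docs, topics=new_topics)
```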

    BERTopic# Reduce outliers by extracting most likely topics through the topic-term probability matrix if strategy.lower() == "probabilities": - new_topics = [np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic - for topic, prob in zip(topics, probabilities)] + new_topics = [ + np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic + for topic, prob in zip(topics, probabilities) + ] # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions elif strategy.lower() == "distributions": outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] outlier_docs = [documents[index] for index in outlier_ids] - topic_distr, _ = self.approximate_distribution(outlier_docs, min_similarity=threshold, **distributions_params) + topic_distr, _ = self.approximate_distribution( + outlier_docs, min_similarity=threshold, **distributions_params + ) outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr]) new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] @@ -5397,7 +5577,7 @@

    BERTopic# Calculate c-TF-IDF of outlier documents with all topics bow_doc = self.vectorizer_model.transform(outlier_docs) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) - similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 @@ -5407,8 +5587,10 @@

    BERTopic# Reduce outliers by finding the most similar topic embeddings elif strategy.lower() == "embeddings": if self.embedding_model is None and embeddings is None: - raise ValueError("To use this strategy, you will need to pass a model to `embedding_model`" - "when instantiating BERTopic.") + raise ValueError( + "To use this strategy, you will need to pass a model to `embedding_model`" + "when instantiating BERTopic." + ) outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] if images is not None: outlier_docs = [images[index] for index in outlier_ids] @@ -5423,7 +5605,7 @@

    BERTopicoutlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose) else: outlier_embeddings = self.embedding_model.embed_documents(outlier_docs) - similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers:]) + similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 @@ -5432,14 +5614,17 @@

    BERTopicreturn new_topics - def visualize_topics(self, - topics: List[int] = None, - top_n_topics: int = None, - custom_labels: bool = False, - title: str = "<b>Intertopic Distance Map</b>", - width: int = 650, - height: int = 650) -> go.Figure: - """ Visualize topics, their sizes, and their corresponding words + def visualize_topics( + self, + topics: List[int] = None, + top_n_topics: int = None, + use_ctfidf: bool = False, + custom_labels: bool = False, + title: str = "<b>Intertopic Distance Map</b>", + width: int = 650, + height: int = 650, + ) -> go.Figure: + """Visualize topics, their sizes, and their corresponding words. This visualization is highly inspired by LDAvis, a great visualization technique typically reserved for LDA. @@ -5450,6 +5635,7 @@

    BERTopic For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`. top_n_topics: Only select the top n most frequent topics + use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. title: Title of the plot. @@ -5457,7 +5643,6 @@

    BERTopic height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -5472,27 +5657,32 @@

    BERTopic ``` """ check_is_fitted(self) - return plotting.visualize_topics(self, - topics=topics, - top_n_topics=top_n_topics, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_documents(self, - docs: List[str], - topics: List[int] = None, - embeddings: np.ndarray = None, - reduced_embeddings: np.ndarray = None, - sample: float = None, - hide_annotations: bool = False, - hide_document_hover: bool = False, - custom_labels: bool = False, - title: str = "<b>Documents and Topics</b>", - width: int = 1200, - height: int = 750) -> go.Figure: - """ Visualize documents and their topics in 2D + return plotting.visualize_topics( + self, + topics=topics, + top_n_topics=top_n_topics, + use_ctfidf=use_ctfidf, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_documents( + self, + docs: List[str], + topics: List[int] = None, + embeddings: np.ndarray = None, + reduced_embeddings: np.ndarray = None, + sample: float = None, + hide_annotations: bool = False, + hide_document_hover: bool = False, + custom_labels: bool = False, + title: str = "<b>Documents and Topics</b>", + width: int = 1200, + height: int = 750, + ) -> go.Figure: + """Visualize documents and their topics in 2D. Arguments: topic_model: A fitted BERTopic instance. @@ -5517,7 +5707,6 @@

    BERTopic height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -5563,37 +5752,43 @@
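Because `visualize_documents` would otherwise embed and reduce the documents on every call, the `embeddings` / `reduced_embeddings` arguments let you precompute both once and reuse them. A sketch assuming sentence-transformers and UMAP are installed and that `all-MiniLM-L6-v2` is just an example model:

```python
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_20newsgroups
from umap import UMAP

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]

# Precompute document embeddings once and reuse them for fitting and plotting.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=False)

topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs, embeddings)

# Reduce the embeddings to 2D ourselves so the plotting call does not have to.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit_transform(embeddings)
fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig.write_html("documents.html")
```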

    BERTopic """ check_is_fitted(self) check_documents_type(docs) - return plotting.visualize_documents(self, - docs=docs, - topics=topics, - embeddings=embeddings, - reduced_embeddings=reduced_embeddings, - sample=sample, - hide_annotations=hide_annotations, - hide_document_hover=hide_document_hover, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_document_datamap(self, - docs: List[str], - topics: List[int] = None, - embeddings: np.ndarray = None, - reduced_embeddings: np.ndarray = None, - custom_labels: Union[bool, str] = False, - title: str = "Documents and Topics", - sub_title: Union[str, None] = None, - width: int = 1200, - height: int = 1200, - **datamap_kwds): - """ Visualize documents and their topics in 2D as a static plot for publication using + return plotting.visualize_documents( + self, + docs=docs, + topics=topics, + embeddings=embeddings, + reduced_embeddings=reduced_embeddings, + sample=sample, + hide_annotations=hide_annotations, + hide_document_hover=hide_document_hover, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_document_datamap( + self, + docs: List[str], + topics: List[int] = None, + embeddings: np.ndarray = None, + reduced_embeddings: np.ndarray = None, + custom_labels: Union[bool, str] = False, + title: str = "Documents and Topics", + sub_title: Union[str, None] = None, + width: int = 1200, + height: int = 1200, + **datamap_kwds, + ): + """Visualize documents and their topics in 2D as a static plot for publication using DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best to use a sufficiently large `min_topic_size` or set `nr_topics` when building the model. Arguments: topic_model: A fitted BERTopic instance. docs: The documents you used when calling either `fit` or `fit_transform` + topics: A selection of topics to visualize. + Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. Documents not in these topics will be shown as noise points. embeddings: The embeddings of all documents in `docs`. reduced_embeddings: The 2D reduced embeddings of all documents in `docs`. custom_labels: If bool, whether to use custom topic labels that were defined using @@ -5611,7 +5806,6 @@

    BERTopic figure: A Matplotlib Figure object. Examples: - To visualize the topics simply run: ```python @@ -5654,33 +5848,38 @@

    BERTopic """ check_is_fitted(self) check_documents_type(docs) - return plotting.visualize_document_datamap(self, - docs, - topics, - embeddings, - reduced_embeddings, - custom_labels, - title, - sub_title, - width, - height, - **datamap_kwds) - def visualize_hierarchical_documents(self, - docs: List[str], - hierarchical_topics: pd.DataFrame, - topics: List[int] = None, - embeddings: np.ndarray = None, - reduced_embeddings: np.ndarray = None, - sample: Union[float, int] = None, - hide_annotations: bool = False, - hide_document_hover: bool = True, - nr_levels: int = 10, - level_scale: str = 'linear', - custom_labels: bool = False, - title: str = "<b>Hierarchical Documents and Topics</b>", - width: int = 1200, - height: int = 750) -> go.Figure: - """ Visualize documents and their topics in 2D at different levels of hierarchy + return plotting.visualize_document_datamap( + self, + docs, + topics, + embeddings, + reduced_embeddings, + custom_labels, + title, + sub_title, + width, + height, + **datamap_kwds, + ) + + def visualize_hierarchical_documents( + self, + docs: List[str], + hierarchical_topics: pd.DataFrame, + topics: List[int] = None, + embeddings: np.ndarray = None, + reduced_embeddings: np.ndarray = None, + sample: Union[float, int] = None, + hide_annotations: bool = False, + hide_document_hover: bool = True, + nr_levels: int = 10, + level_scale: str = "linear", + custom_labels: bool = False, + title: str = "<b>Hierarchical Documents and Topics</b>", + width: int = 1200, + height: int = 750, + ) -> go.Figure: + """Visualize documents and their topics in 2D at different levels of hierarchy. Arguments: docs: The documents you used when calling either `fit` or `fit_transform` @@ -5701,7 +5900,7 @@

    BERTopic specific points. Helps to speed up generation of visualizations. nr_levels: The number of levels to be visualized in the hierarchy. First, the distances in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with - equal length. Then, for each list of distances, the merged topics, that have + equal length. Then, for each list of distances, the merged topics, that have a distance less or equal to the maximum distance of the selected list of distances, are selected. NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to the length of `hierarchical_topics`. @@ -5719,7 +5918,6 @@

    BERTopic height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -5766,30 +5964,34 @@

    BERTopic """ check_is_fitted(self) check_documents_type(docs) - return plotting.visualize_hierarchical_documents(self, - docs=docs, - hierarchical_topics=hierarchical_topics, - topics=topics, - embeddings=embeddings, - reduced_embeddings=reduced_embeddings, - sample=sample, - hide_annotations=hide_annotations, - hide_document_hover=hide_document_hover, - nr_levels=nr_levels, - level_scale=level_scale, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_term_rank(self, - topics: List[int] = None, - log_scale: bool = False, - custom_labels: bool = False, - title: str = "<b>Term score decline per Topic</b>", - width: int = 800, - height: int = 500) -> go.Figure: - """ Visualize the ranks of all terms across all topics + return plotting.visualize_hierarchical_documents( + self, + docs=docs, + hierarchical_topics=hierarchical_topics, + topics=topics, + embeddings=embeddings, + reduced_embeddings=reduced_embeddings, + sample=sample, + hide_annotations=hide_annotations, + hide_document_hover=hide_document_hover, + nr_levels=nr_levels, + level_scale=level_scale, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_term_rank( + self, + topics: List[int] = None, + log_scale: bool = False, + custom_labels: bool = False, + title: str = "<b>Term score decline per Topic</b>", + width: int = 800, + height: int = 500, + ) -> go.Figure: + """Visualize the ranks of all terms across all topics. Each topic is represented by a set of words. These words, however, do not all equally represent the topic. This visualization shows @@ -5810,7 +6012,6 @@

    BERTopic fig: A plotly figure Examples: - To visualize the ranks of all words across all topics simply run: @@ -5834,24 +6035,28 @@

    BERTopic [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html). """ check_is_fitted(self) - return plotting.visualize_term_rank(self, - topics=topics, - log_scale=log_scale, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_topics_over_time(self, - topics_over_time: pd.DataFrame, - top_n_topics: int = None, - topics: List[int] = None, - normalize_frequency: bool = False, - custom_labels: bool = False, - title: str = "<b>Topics over Time</b>", - width: int = 1250, - height: int = 450) -> go.Figure: - """ Visualize topics over time + return plotting.visualize_term_rank( + self, + topics=topics, + log_scale=log_scale, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_topics_over_time( + self, + topics_over_time: pd.DataFrame, + top_n_topics: int = None, + topics: List[int] = None, + normalize_frequency: bool = False, + custom_labels: bool = False, + title: str = "<b>Topics over Time</b>", + width: int = 1250, + height: int = 450, + ) -> go.Figure: + """Visualize topics over time. Arguments: topics_over_time: The topics you would like to be visualized with the @@ -5869,7 +6074,6 @@

    BERTopic A plotly.graph_objects.Figure including all traces Examples: - To visualize the topics over time, simply run: ```python @@ -5885,26 +6089,30 @@

    BERTopic ``` """ check_is_fitted(self) - return plotting.visualize_topics_over_time(self, - topics_over_time=topics_over_time, - top_n_topics=top_n_topics, - topics=topics, - normalize_frequency=normalize_frequency, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_topics_per_class(self, - topics_per_class: pd.DataFrame, - top_n_topics: int = 10, - topics: List[int] = None, - normalize_frequency: bool = False, - custom_labels: bool = False, - title: str = "<b>Topics per Class</b>", - width: int = 1250, - height: int = 900) -> go.Figure: - """ Visualize topics per class + return plotting.visualize_topics_over_time( + self, + topics_over_time=topics_over_time, + top_n_topics=top_n_topics, + topics=topics, + normalize_frequency=normalize_frequency, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_topics_per_class( + self, + topics_per_class: pd.DataFrame, + top_n_topics: int = 10, + topics: List[int] = None, + normalize_frequency: bool = False, + custom_labels: bool = False, + title: str = "<b>Topics per Class</b>", + width: int = 1250, + height: int = 900, + ) -> go.Figure: + """Visualize topics per class. Arguments: topics_per_class: The topics you would like to be visualized with the @@ -5922,7 +6130,6 @@

    BERTopic A plotly.graph_objects.Figure including all traces Examples: - To visualize the topics per class, simply run: ```python @@ -5938,24 +6145,28 @@

    BERTopic ``` """ check_is_fitted(self) - return plotting.visualize_topics_per_class(self, - topics_per_class=topics_per_class, - top_n_topics=top_n_topics, - topics=topics, - normalize_frequency=normalize_frequency, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_distribution(self, - probabilities: np.ndarray, - min_probability: float = 0.015, - custom_labels: bool = False, - title: str = "<b>Topic Probability Distribution</b>", - width: int = 800, - height: int = 600) -> go.Figure: - """ Visualize the distribution of topic probabilities + return plotting.visualize_topics_per_class( + self, + topics_per_class=topics_per_class, + top_n_topics=top_n_topics, + topics=topics, + normalize_frequency=normalize_frequency, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_distribution( + self, + probabilities: np.ndarray, + min_probability: float = 0.015, + custom_labels: bool = False, + title: str = "<b>Topic Probability Distribution</b>", + width: int = 800, + height: int = 600, + ) -> go.Figure: + """Visualize the distribution of topic probabilities. Arguments: probabilities: An array of probability scores @@ -5968,7 +6179,6 @@

    BERTopic height: The height of the figure. Examples: - Make sure to fit the model before and only input the probabilities of a single document: @@ -5984,19 +6194,23 @@

    BERTopic ``` """ check_is_fitted(self) - return plotting.visualize_distribution(self, - probabilities=probabilities, - min_probability=min_probability, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_approximate_distribution(self, - document: str, - topic_token_distribution: np.ndarray, - normalize: bool = False): - """ Visualize the topic distribution calculated by `.approximate_topic_distribution` + return plotting.visualize_distribution( + self, + probabilities=probabilities, + min_probability=min_probability, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_approximate_distribution( + self, + document: str, + topic_token_distribution: np.ndarray, + normalize: bool = False, + ): + """Visualize the topic distribution calculated by `.approximate_topic_distribution` on a token level. Thereby indicating the extent to which a certain word or phrase belongs to a specific topic. The assumption here is that a single word can belong to multiple similar topics and as such can give information about the broader set of topics within @@ -6016,7 +6230,6 @@

    BERTopic for each token. Examples: - ```python # Calculate the topic distributions on a token level # Note that we need to have `calculate_token_level=True` @@ -6038,28 +6251,33 @@

    BERTopic ``` """ check_is_fitted(self) - return plotting.visualize_approximate_distribution(self, - document=document, - topic_token_distribution=topic_token_distribution, - normalize=normalize) - - def visualize_hierarchy(self, - orientation: str = "left", - topics: List[int] = None, - top_n_topics: int = None, - custom_labels: bool = False, - title: str = "<b>Hierarchical Clustering</b>", - width: int = 1000, - height: int = 600, - hierarchical_topics: pd.DataFrame = None, - linkage_function: Callable[[csr_matrix], np.ndarray] = None, - distance_function: Callable[[csr_matrix], csr_matrix] = None, - color_threshold: int = 1) -> go.Figure: - """ Visualize a hierarchical structure of the topics + return plotting.visualize_approximate_distribution( + self, + document=document, + topic_token_distribution=topic_token_distribution, + normalize=normalize, + ) + + def visualize_hierarchy( + self, + orientation: str = "left", + topics: List[int] = None, + top_n_topics: int = None, + use_ctfidf: bool = True, + custom_labels: bool = False, + title: str = "<b>Hierarchical Clustering</b>", + width: int = 1000, + height: int = 600, + hierarchical_topics: pd.DataFrame = None, + linkage_function: Callable[[csr_matrix], np.ndarray] = None, + distance_function: Callable[[csr_matrix], csr_matrix] = None, + color_threshold: int = 1, + ) -> go.Figure: + """Visualize a hierarchical structure of the topics. A ward linkage function is used to perform the hierarchical clustering based on the cosine distance - matrix between topic embeddings. + matrix between c-TF-IDF or semantic embeddings of the topics. Arguments: topic_model: A fitted BERTopic instance. @@ -6067,6 +6285,8 @@

    BERTopic Either 'left' or 'bottom' topics: A selection of topics to visualize top_n_topics: Only select the top n most frequent topics + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. NOTE: Custom labels are only generated for the original @@ -6094,7 +6314,6 @@

    BERTopic fig: A plotly figure Examples: - To visualize the hierarchical structure of topics simply run: @@ -6123,31 +6342,36 @@

    BERTopic style="width:1000px; height: 680px; border: 0px;""></iframe> """ check_is_fitted(self) - return plotting.visualize_hierarchy(self, - orientation=orientation, - topics=topics, - top_n_topics=top_n_topics, - custom_labels=custom_labels, - title=title, - width=width, - height=height, - hierarchical_topics=hierarchical_topics, - linkage_function=linkage_function, - distance_function=distance_function, - color_threshold=color_threshold - ) - - def visualize_heatmap(self, - topics: List[int] = None, - top_n_topics: int = None, - n_clusters: int = None, - custom_labels: bool = False, - title: str = "<b>Similarity Matrix</b>", - width: int = 800, - height: int = 800) -> go.Figure: - """ Visualize a heatmap of the topic's similarity matrix - - Based on the cosine similarity matrix between topic embeddings, + return plotting.visualize_hierarchy( + self, + orientation=orientation, + topics=topics, + top_n_topics=top_n_topics, + use_ctfidf=use_ctfidf, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + hierarchical_topics=hierarchical_topics, + linkage_function=linkage_function, + distance_function=distance_function, + color_threshold=color_threshold, + ) + + def visualize_heatmap( + self, + topics: List[int] = None, + top_n_topics: int = None, + n_clusters: int = None, + use_ctfidf: bool = False, + custom_labels: bool = False, + title: str = "<b>Similarity Matrix</b>", + width: int = 800, + height: int = 800, + ) -> go.Figure: + """Visualize a heatmap of the topic's similarity matrix. + + Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics, a heatmap is created showing the similarity between topics. Arguments: @@ -6155,6 +6379,8 @@

    BERTopic top_n_topics: Only select the top n most frequent topics. n_clusters: Create n clusters and order the similarity matrix by those clusters. + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. title: Title of the plot. @@ -6165,7 +6391,6 @@
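The new `use_ctfidf` flag appears on several of these visualizations; when enabled, topic similarity is computed from the c-TF-IDF matrix instead of the embedding-based topic embeddings. A brief sketch, assuming a fitted model and plotly's `write_html` for saving:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic().fit(docs)

# Compare topics on their c-TF-IDF representations instead of semantic embeddings.
heatmap_fig = topic_model.visualize_heatmap(use_ctfidf=True, n_clusters=10)
hierarchy_fig = topic_model.visualize_hierarchy(use_ctfidf=True)

heatmap_fig.write_html("heatmap.html")
hierarchy_fig.write_html("hierarchy.html")
```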

    BERTopic fig: A plotly figure Examples: - To visualize the similarity matrix of topics simply run: @@ -6181,25 +6406,30 @@

    BERTopic ``` """ check_is_fitted(self) - return plotting.visualize_heatmap(self, - topics=topics, - top_n_topics=top_n_topics, - n_clusters=n_clusters, - custom_labels=custom_labels, - title=title, - width=width, - height=height) - - def visualize_barchart(self, - topics: List[int] = None, - top_n_topics: int = 8, - n_words: int = 5, - custom_labels: bool = False, - title: str = "Topic Word Scores", - width: int = 250, - height: int = 250, - autoscale: bool=False) -> go.Figure: - """ Visualize a barchart of selected topics + return plotting.visualize_heatmap( + self, + topics=topics, + top_n_topics=top_n_topics, + n_clusters=n_clusters, + use_ctfidf=use_ctfidf, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + ) + + def visualize_barchart( + self, + topics: List[int] = None, + top_n_topics: int = 8, + n_words: int = 5, + custom_labels: bool = False, + title: str = "Topic Word Scores", + width: int = 250, + height: int = 250, + autoscale: bool = False, + ) -> go.Figure: + """Visualize a barchart of selected topics. Arguments: topics: A selection of topics to visualize. @@ -6216,7 +6446,6 @@

    BERTopic fig: A plotly figure Examples: - To visualize the barchart of selected topics simply run: @@ -6232,22 +6461,26 @@

    BERTopic ``` """ check_is_fitted(self) - return plotting.visualize_barchart(self, - topics=topics, - top_n_topics=top_n_topics, - n_words=n_words, - custom_labels=custom_labels, - title=title, - width=width, - height=height, - autoscale=autoscale) - - def save(self, - path, - serialization: Literal["safetensors", "pickle", "pytorch"] = "pickle", - save_embedding_model: Union[bool, str] = True, - save_ctfidf: bool = False): - """ Saves the model to the specified path or folder + return plotting.visualize_barchart( + self, + topics=topics, + top_n_topics=top_n_topics, + n_words=n_words, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + autoscale=autoscale, + ) + + def save( + self, + path, + serialization: Literal["safetensors", "pickle", "pytorch"] = "pickle", + save_embedding_model: Union[bool, str] = True, + save_ctfidf: bool = False, + ): + """Saves the model to the specified path or folder. When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should @@ -6269,7 +6502,6 @@

    BERTopic or `pytorch` Examples: - To save the model in an efficient and safe format (safetensors) with c-TF-IDF information: ```python @@ -6294,13 +6526,14 @@

    BERTopic safetensors. """ if serialization == "pickle": - logger.warning("When you use `pickle` to save/load a BERTopic model," - "please make sure that the environments in which you save" - "and load the model are **exactly** the same. The version of BERTopic," - "its dependencies, and python need to remain the same.") - - with open(path, 'wb') as file: + logger.warning( + "When you use `pickle` to save/load a BERTopic model," + "please make sure that the environments in which you save" + "and load the model are **exactly** the same. The version of BERTopic," + "its dependencies, and python need to remain the same." + ) + with open(path, "wb") as file: # This prevents the vectorizer from being too large in size if `min_df` was # set to a value higher than 1 self.vectorizer_model.stop_words_ = None @@ -6313,36 +6546,47 @@

    BERTopicelse: joblib.dump(self, file) elif serialization == "safetensors" or serialization == "pytorch": - # Directory save_directory = Path(path) save_directory.mkdir(exist_ok=True, parents=True) # Check embedding model - if save_embedding_model and hasattr(self.embedding_model, '_hf_model') and not isinstance(save_embedding_model, str): + if ( + save_embedding_model + and hasattr(self.embedding_model, "_hf_model") + and not isinstance(save_embedding_model, str) + ): save_embedding_model = self.embedding_model._hf_model elif not save_embedding_model: - logger.warning("You are saving a BERTopic model without explicitly defining an embedding model." - "If you are using a sentence-transformers model or a HuggingFace model supported" - "by sentence-transformers, please save the model by using a pointer towards that model." - "For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`") + logger.warning( + "You are saving a BERTopic model without explicitly defining an embedding model." + "If you are using a sentence-transformers model or a HuggingFace model supported" + "by sentence-transformers, please save the model by using a pointer towards that model." + "For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`" + ) # Minimal save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization) save_utils.save_topics(model=self, path=save_directory / "topics.json") save_utils.save_images(model=self, path=save_directory / "images") - save_utils.save_config(model=self, path=save_directory / 'config.json', embedding_model=save_embedding_model) + save_utils.save_config( + model=self, + path=save_directory / "config.json", + embedding_model=save_embedding_model, + ) # Additional if save_ctfidf: - save_utils.save_ctfidf(model=self, save_directory=save_directory, serialization=serialization) - save_utils.save_ctfidf_config(model=self, path=save_directory / 'ctfidf_config.json') + save_utils.save_ctfidf( + model=self, + save_directory=save_directory, + serialization=serialization, + ) + save_utils.save_ctfidf_config(model=self, path=save_directory / "ctfidf_config.json") @classmethod - def load(cls, - path: str, - embedding_model=None): - """ Loads the model from the specified path or directory + def load(cls, path: str, embedding_model=None): + """Loads the model from the specified path or directory. Arguments: path: Either load a BERTopic model from a file (`.pickle`) or a folder containing @@ -6351,7 +6595,6 @@

    BERTopic in the BERTopic model file or directory. Examples: - ```python BERTopic.load("model_dir") ``` @@ -6366,10 +6609,10 @@

    BERTopic# Load from Pickle if file_or_dir.is_file(): - with open(file_or_dir, 'rb') as file: + with open(file_or_dir, "rb") as file: if embedding_model: topic_model = joblib.load(file) - topic_model.embedding_model = select_backend(embedding_model) + topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose) else: topic_model = joblib.load(file) return topic_model @@ -6381,18 +6624,25 @@
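A sketch of the safetensors round trip described above; the directory name and the embedding-model pointer are only examples, and pointing `save_embedding_model` at a model name keeps the saved artifact small:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic(embedding_model="sentence-transformers/all-MiniLM-L6-v2").fit(docs)

# Save a minimal safetensors model plus c-TF-IDF, referencing the embedding
# model by name instead of serializing its weights.
topic_model.save(
    "my_bertopic_model",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
)

# Restore it later; the embedding model is resolved from the saved pointer,
# or it can be overridden explicitly via the embedding_model argument.
loaded_model = BERTopic.load("my_bertopic_model")
```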

    BERTopictopics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path) else: raise ValueError("Make sure to either pass a valid directory or HF model.") - topic_model = _create_model_from_files(topics, params, tensors, ctfidf_tensors, ctfidf_config, images, - warn_no_backend=(embedding_model is None)) + topic_model = _create_model_from_files( + topics, + params, + tensors, + ctfidf_tensors, + ctfidf_config, + images, + warn_no_backend=(embedding_model is None), + ) # Replace embedding model if one is specifically chosen if embedding_model is not None: - topic_model.embedding_model = select_backend(embedding_model) + topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose) return topic_model @classmethod - def merge_models(cls, models, min_similarity: float = .7, embedding_model=None): - """ Merge multiple pre-trained BERTopic models into a single model. + def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None): + """Merge multiple pre-trained BERTopic models into a single model. The models are merged as if they were all saved using pytorch or safetensors, so a minimal version without c-TF-IDF. @@ -6419,7 +6669,6 @@

    BERTopic loading a model from the HuggingFace Hub without c-TF-IDF Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -6439,7 +6688,6 @@

    BERTopic# Temporarily save model and push to HF with TemporaryDirectory() as tmpdir: - # Save model weights and config. all_topics, all_params, all_tensors = [], [], [] for index, model in enumerate(models): @@ -6462,7 +6710,9 @@

    BERTopicsims = np.max(sim_matrix, axis=1) # Extract new topics - new_topics = sorted([index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity]) + new_topics = sorted( + [index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity] + ) max_topic = max(set(merged_topics["topics"])) # Merge Topic Representations @@ -6471,7 +6721,9 @@

    BERTopicif new_topic != -1: max_topic += 1 new_topics_dict[new_topic] = max_topic - merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][str(new_topic)] + merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][ + str(new_topic) + ] merged_topics["topic_labels"][str(max_topic)] = selected_topics["topic_labels"][str(new_topic)] # Add new aspects @@ -6497,14 +6749,14 @@

    BERTopicmerged_tensors = np.vstack([merged_tensors, new_tensors]) # Topic Mapper - merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic+1, 1))).mappings_ + merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic + 1, 1))).mappings_ # Find similar topics and re-assign those from the new models sims_idx = np.argmax(sim_matrix, axis=1) sims = np.max(sim_matrix, axis=1) to_merge = { - a - selected_topics["_outliers"]: - b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims)) + a - selected_topics["_outliers"]: b - merged_topics["_outliers"] + for a, (b, val) in enumerate(zip(sims_idx, sims)) if val >= min_similarity } to_merge.update(new_topics_dict) @@ -6515,28 +6767,37 @@
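The re-assignment above is driven by `min_similarity`: topics from later models whose embedding is at least that similar to an existing topic are mapped onto it, while anything less similar is appended as a new topic. A sketch with two models fit on different halves of example data, where the 0.9 threshold is illustrative:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]

# Fit two independent models on different halves of the corpus.
model_a = BERTopic(min_topic_size=20).fit(docs[:5000])
model_b = BERTopic(min_topic_size=20).fit(docs[5000:10000])

# Topics in model_b that are at least 0.9 similar (cosine) to a topic in
# model_a are mapped onto that topic; less similar topics are added as new ones.
merged_model = BERTopic.merge_models([model_a, model_b], min_similarity=0.9)
print(merged_model.get_topic_info().head())
```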

    BERTopic# Create a new model from the merged parameters merged_tensors = {"topic_embeddings": torch.from_numpy(merged_tensors)} - merged_model = _create_model_from_files(merged_topics, merged_params, merged_tensors, None, None, None, warn_no_backend=False) + merged_model = _create_model_from_files( + merged_topics, + merged_params, + merged_tensors, + None, + None, + None, + warn_no_backend=False, + ) merged_model.embedding_model = models[0].embedding_model # Replace embedding model if one is specifically chosen + verbose = any([model.verbose for model in models]) if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder: - merged_model.embedding_model = select_backend(embedding_model) + merged_model.embedding_model = select_backend(embedding_model, verbose=verbose) return merged_model def push_to_hf_hub( - self, - repo_id: str, - commit_message: str = 'Add BERTopic model', - token: str = None, - revision: str = None, - private: bool = False, - create_pr: bool = False, - model_card: bool = True, - serialization: str = "safetensors", - save_embedding_model: Union[str, bool] = True, - save_ctfidf: bool = False, - ): - """ Push your BERTopic model to a HuggingFace Hub + self, + repo_id: str, + commit_message: str = "Add BERTopic model", + token: str = None, + revision: str = None, + private: bool = False, + create_pr: bool = False, + model_card: bool = True, + serialization: str = "safetensors", + save_embedding_model: Union[str, bool] = True, + save_ctfidf: bool = False, + ): + """Push your BERTopic model to a HuggingFace Hub. Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account: @@ -6571,7 +6832,6 @@

    BERTopic Examples: - ```python topic_model.push_to_hf_hub( repo_id="ArXiv", @@ -6580,13 +6840,22 @@

    BERTopic ) ``` """ - return save_utils.push_to_hf_hub(model=self, repo_id=repo_id, commit_message=commit_message, - token=token, revision=revision, private=private, create_pr=create_pr, - model_card=model_card, serialization=serialization, - save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf) + return save_utils.push_to_hf_hub( + model=self, + repo_id=repo_id, + commit_message=commit_message, + token=token, + revision=revision, + private=private, + create_pr=create_pr, + model_card=model_card, + serialization=serialization, + save_embedding_model=save_embedding_model, + save_ctfidf=save_ctfidf, + ) def get_params(self, deep: bool = False) -> Mapping[str, Any]: - """ Get parameters for this estimator. + """Get parameters for this estimator. Adapted from: https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178 @@ -6602,19 +6871,21 @@

    BERTopicout = dict() for key in self._get_param_names(): value = getattr(self, key) - if deep and hasattr(value, 'get_params'): + if deep and hasattr(value, "get_params"): deep_items = value.get_params().items() - out.update((key + '__' + k, val) for k, val in deep_items) + out.update((key + "__" + k, val) for k, val in deep_items) out[key] = value return out - def _extract_embeddings(self, - documents: Union[List[str], str], - images: List[str] = None, - method: str = "document", - verbose: bool = None) -> np.ndarray: - """ Extract sentence/document embeddings through pre-trained embeddings - For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html + def _extract_embeddings( + self, + documents: Union[List[str], str], + images: List[str] = None, + method: str = "document", + verbose: bool = None, + ) -> np.ndarray: + """Extract sentence/document embeddings through pre-trained embeddings + For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html. Arguments: documents: Dataframe with documents and their corresponding IDs @@ -6635,44 +6906,47 @@
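Mirroring scikit-learn's convention as the code above notes, `deep=True` flattens the parameters of nested sub-models into `component__parameter` keys. A short sketch (the exact keys printed depend on the default sub-models in use):

```python
from bertopic import BERTopic

topic_model = BERTopic(min_topic_size=25)

# Shallow parameters of the BERTopic estimator itself.
params = topic_model.get_params()
print(params["min_topic_size"])  # 25

# With deep=True, parameters of nested sub-models are flattened into
# "<component>__<parameter>" keys, e.g. for the default UMAP model.
deep_params = topic_model.get_params(deep=True)
print([key for key in deep_params if key.startswith("umap_model__")][:3])
```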

    BERTopicelif method == "document": embeddings = self.embedding_model.embed_documents(documents, verbose=verbose) elif documents[0] is None and images is None: - raise ValueError("Make sure to use an embedding model that can either embed documents" - "or images depending on which you want to embed.") + raise ValueError( + "Make sure to use an embedding model that can either embed documents" + "or images depending on which you want to embed." + ) else: - raise ValueError("Wrong method for extracting document/word embeddings. " - "Either choose 'word' or 'document' as the method. ") + raise ValueError( + "Wrong method for extracting document/word embeddings. " + "Either choose 'word' or 'document' as the method. " + ) return embeddings def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame: - """ Convert images to text """ + """Convert images to text.""" logger.info("Images - Converting images to text. This might take a while.") if isinstance(self.representation_model, dict): for tuner in self.representation_model.values(): - if getattr(tuner, 'image_to_text_model', False): + if getattr(tuner, "image_to_text_model", False): documents = tuner.image_to_text(documents, embeddings) elif isinstance(self.representation_model, list): for tuner in self.representation_model: - if getattr(tuner, 'image_to_text_model', False): + if getattr(tuner, "image_to_text_model", False): documents = tuner.image_to_text(documents, embeddings) elif isinstance(self.representation_model, BaseRepresentation): - if getattr(self.representation_model, 'image_to_text_model', False): + if getattr(self.representation_model, "image_to_text_model", False): documents = self.representation_model.image_to_text(documents, embeddings) logger.info("Images - Completed \u2713") return documents def _map_predictions(self, predictions: List[int]) -> List[int]: - """ Map predictions to the correct topics if topics were reduced """ + """Map predictions to the correct topics if topics were reduced.""" mappings = self.topic_mapper_.get_mappings(original_topics=True) - mapped_predictions = [mappings[prediction] - if prediction in mappings - else -1 - for prediction in predictions] + mapped_predictions = [mappings[prediction] if prediction in mappings else -1 for prediction in predictions] return mapped_predictions - def _reduce_dimensionality(self, - embeddings: Union[np.ndarray, csr_matrix], - y: Union[List[int], np.ndarray] = None, - partial_fit: bool = False) -> np.ndarray: - """ Reduce dimensionality of embeddings using UMAP and train a UMAP model + def _reduce_dimensionality( + self, + embeddings: Union[np.ndarray, csr_matrix], + y: Union[List[int], np.ndarray] = None, + partial_fit: bool = False, + ) -> np.ndarray: + """Reduce dimensionality of embeddings using UMAP and train a UMAP model. Arguments: embeddings: The extracted embeddings using the sentence transformer module. @@ -6697,25 +6971,26 @@

    BERTopicy = np.array(y) if y is not None else None self.umap_model.fit(embeddings, y=y) except TypeError: - self.umap_model.fit(embeddings) umap_embeddings = self.umap_model.transform(embeddings) logger.info("Dimensionality - Completed \u2713") return np.nan_to_num(umap_embeddings) - def _cluster_embeddings(self, - umap_embeddings: np.ndarray, - documents: pd.DataFrame, - partial_fit: bool = False, - y: np.ndarray = None) -> Tuple[pd.DataFrame, - np.ndarray]: - """ Cluster UMAP embeddings with HDBSCAN + def _cluster_embeddings( + self, + umap_embeddings: np.ndarray, + documents: pd.DataFrame, + partial_fit: bool = False, + y: np.ndarray = None, + ) -> Tuple[pd.DataFrame, np.ndarray]: + """Cluster UMAP embeddings with HDBSCAN. Arguments: umap_embeddings: The reduced sentence embeddings with UMAP documents: Dataframe with documents and their corresponding IDs partial_fit: Whether to run `partial_fit` for online learning + y: Array of topics to use Returns: documents: Updated dataframe with documents and their corresponding IDs @@ -6726,7 +7001,7 @@

    BERTopicif partial_fit: self.hdbscan_model = self.hdbscan_model.partial_fit(umap_embeddings) labels = self.hdbscan_model.labels_ - documents['Topic'] = labels + documents["Topic"] = labels self.topics_ = labels else: try: @@ -6738,14 +7013,9 @@

    BERTopiclabels = self.hdbscan_model.labels_ except AttributeError: labels = y - documents['Topic'] = labels + documents["Topic"] = labels self._update_topic_size(documents) - # Some algorithms have outlier labels (-1) that can be tricky to work - # with if you are slicing data based on that labels. Therefore, we - # track if there are outlier labels and act accordingly when slicing. - self._outliers = 1 if -1 in set(labels) else 0 - # Extract probabilities probabilities = None if hasattr(self.hdbscan_model, "probabilities_"): @@ -6759,9 +7029,10 @@

    BERTopiclogger.info("Cluster - Completed \u2713") return documents, probabilities - def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array, - pd.DataFrame, np.array]: - """ Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list + def _zeroshot_topic_modeling( + self, documents: pd.DataFrame, embeddings: np.ndarray + ) -> Tuple[pd.DataFrame, np.array, pd.DataFrame, np.array]: + """Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list. We transform the topics in `self.zeroshot_topic_list` to embeddings and compare them through cosine similarity with the document embeddings. @@ -6782,7 +7053,9 @@

    BERTopicassignment = np.argmax(cosine_similarities, 1) assignment_vals = np.max(cosine_similarities, 1) assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity] - non_assigned_ids = [index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity] + non_assigned_ids = [ + index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity + ] # Assign topics assigned_documents = documents.iloc[assigned_ids] @@ -6791,171 +7064,94 @@
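For context, the assignment above compares document embeddings against embeddings of the predefined topic names and keeps everything below `zeroshot_min_similarity` for regular clustering. A usage sketch, where the topic list, embedding model, and 0.85 threshold are all illustrative choices:

```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]

# Predefined topics we expect to find; everything else is clustered as usual.
zeroshot_topic_list = ["religion", "space and astronomy", "computer hardware"]

topic_model = BERTopic(
    embedding_model="thenlper/gte-small",
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.85,
    min_topic_size=15,
)
topics, _ = topic_model.fit_transform(docs)
```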

    BERTopicassigned_documents["ID"] = range(len(assigned_documents)) assigned_embeddings = embeddings[assigned_ids] + # Check that if a number of topics was specified, it exceeds the number of zeroshot topics matched + num_zeroshot_topics = len(assigned_documents["Topic"].unique()) + if self.nr_topics and not self.nr_topics > num_zeroshot_topics: + raise ValueError( + f"The set nr_topics ({self.nr_topics}) must exceed the number of matched zero-shot topics " + f"({num_zeroshot_topics}). Consider raising nr_topics or raising the " + f"zeroshot_min_similarity ({self.zeroshot_min_similarity})." + ) + # Select non-assigned topics to be clustered documents = documents.iloc[non_assigned_ids] documents["Old_ID"] = documents["ID"].copy() documents["ID"] = range(len(documents)) embeddings = embeddings[non_assigned_ids] - # If only matches were found - if len(non_assigned_ids) == 0: - return None, None, assigned_documents, assigned_embeddings logger.info("Zeroshot Step 1 - Completed \u2713") return documents, embeddings, assigned_documents, assigned_embeddings def _is_zeroshot(self): - """ Check whether zero-shot topic modeling is possible + """Check whether zero-shot topic modeling is possible. - * There should be a cluster model used * Embedding model is necessary to convert zero-shot topics to embeddings * Zero-shot topics should be defined """ - if self.zeroshot_topic_list is not None and self.embedding_model is not None and type(self.hdbscan_model) != BaseCluster: + if self.zeroshot_topic_list is not None and self.embedding_model is not None: return True return False - def _combine_zeroshot_topics(self, - documents: pd.DataFrame, - assigned_documents: pd.DataFrame, - embeddings: np.ndarray) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]: - """ Combine the zero-shot topics with the clustered topics + def _combine_zeroshot_topics( + self, + documents: pd.DataFrame, + embeddings: np.ndarray, + assigned_documents: pd.DataFrame, + assigned_embeddings: np.ndarray, + ) -> Tuple[pd.DataFrame, np.ndarray]: + """Combine the zero-shot topics with the clustered topics. + + The zero-shot topics will be inserted between the outlier topic (that may or may not exist) and the rest of the + topics from clustering. The rest of the topics from clustering will be given new IDs to correspond to topics + after zero-shot topics. - There are three cases considered: - * Only zero-shot topics were found which will only return the zero-shot topic model - * Only clustered topics were found which will only return the clustered topic model - * Both zero-shot and clustered topics were found which will return a merged model - * This merged model is created using the `merge_models` function which will ignore - the underlying UMAP and HDBSCAN models + Documents and embeddings used in zero-shot topic modeling and clustering and re-merged. 
Arguments: - documents: Dataframe with documents and their corresponding IDs - assigned_documents: Dataframe with documents and their corresponding IDs + documents: DataFrame with clustered documents and their corresponding IDs + embeddings: The document embeddings for clustered documents + assigned_documents: DataFrame with documents and their corresponding IDs that were assigned to a zero-shot topic - embeddings: The document embeddings + assigned_embeddings: The document embeddings for documents that were assigned to a zero-shot topic Returns: - topics: The topics for each document - probabilities: The probabilities for each document + documents: DataFrame with all the original documents with their topic assignments + embeddings: np.ndarray of embeddings aligned with the documents """ - logger.info("Zeroshot Step 2 - Clustering documents that were not found in the zero-shot model...") - - # Fit BERTopic without actually performing any clustering - docs = assigned_documents.Document.tolist() - y = assigned_documents.Topic.tolist() - empty_dimensionality_model = BaseDimensionalityReduction() - empty_cluster_model = BaseCluster() - zeroshot_model = BERTopic( - n_gram_range=self.n_gram_range, - low_memory=self.low_memory, - calculate_probabilities=self.calculate_probabilities, - embedding_model=self.embedding_model, - umap_model=empty_dimensionality_model, - hdbscan_model=empty_cluster_model, - vectorizer_model=self.vectorizer_model, - ctfidf_model=self.ctfidf_model, - representation_model=self.representation_model, - verbose=self.verbose - ).fit(docs, embeddings=embeddings, y=y) - logger.info("Zeroshot Step 2 - Completed \u2713") - logger.info("Zeroshot Step 3 - Combining clustered topics with the zeroshot model") + logger.info("Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...") + # Combine Zero-shot topics with topics from clustering + zeroshot_topic_idx_to_topic_id = { + zeroshot_topic_id: new_topic_id + for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic)) + } + self._topic_id_to_zeroshot_topic_idx = { + new_topic_id: zeroshot_topic_id + for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic)) + } + assigned_documents.Topic = assigned_documents.Topic.map(zeroshot_topic_idx_to_topic_id) + num_zeroshot_topics = len(zeroshot_topic_idx_to_topic_id) - # Update model - self.umap_model = BaseDimensionalityReduction() - self.hdbscan_model = BaseCluster() + # Insert zeroshot topics between outlier cluster and other clusters + documents.Topic = documents.Topic.apply( + lambda topic_id: topic_id + num_zeroshot_topics if topic_id != -1 else topic_id + ) - # Update topic label - assigned_topics = assigned_documents.groupby("Topic").first().reset_index() - indices, topics = assigned_topics.ID.values, assigned_topics.Topic.values - labels = [zeroshot_model.topic_labels_[zeroshot_model.topics_[index]] for index in indices] - labels = {label: self.zeroshot_topic_list[topic] for label, topic in zip(labels, topics)} - - # If only zero-shot matches were found and clustering was not performed - if documents is None: - for topic in range(len(set(y))): - if zeroshot_model.topic_labels_.get(topic): - if labels.get(zeroshot_model.topic_labels_[topic]): - zeroshot_model.topic_labels_[topic] = labels[zeroshot_model.topic_labels_[topic]] - self.__dict__.clear() - self.__dict__.update(zeroshot_model.__dict__) - return self.topics_, self.probabilities_ - - # Merge the two topic models - merged_model = 
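The rewritten combination step keeps the matched zero-shot topics at the front (right after the optional outlier topic) and shifts every clustered topic ID up by the number of zero-shot topics. A toy illustration of just that re-numbering:

```python
# Toy illustration of the re-numbering performed above.
num_zeroshot_topics = 2           # zero-shot topics become IDs 0 and 1
clustered_topics = [-1, 0, 1, 2]  # labels coming out of clustering

# Outliers stay -1; every real cluster is shifted past the zero-shot topics.
combined = [t + num_zeroshot_topics if t != -1 else t for t in clustered_topics]
print(combined)  # [-1, 2, 3, 4]
```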
BERTopic.merge_models([zeroshot_model, self], min_similarity=1) - - # Update topic labels and representative docs of the zero-shot model - for topic in range(len(set(y))): - if merged_model.topic_labels_.get(topic): - if labels.get(merged_model.topic_labels_[topic]): - label = labels[merged_model.topic_labels_[topic]] - merged_model.topic_labels_[topic] = label - merged_model.representative_docs_[topic] = zeroshot_model.representative_docs_[topic] - - # Add representative docs of the clustered model - for topic in set(self.topics_): - merged_model.representative_docs_[topic + self._outliers + len(set(y))] = self.representative_docs_[topic] - - if self._outliers and merged_model.topic_sizes_.get(-1): - merged_model.topic_sizes_[len(set(y))] = merged_model.topic_sizes_[-1] - del merged_model.topic_sizes_[-1] - - # Update topic assignment by finding the documents with the - # correct updated topics - zeroshot_indices = list(assigned_documents.Old_ID.values) - zeroshot_topics = [self.zeroshot_topic_list[topic] for topic in assigned_documents.Topic.values] - - cluster_indices = list(documents.Old_ID.values) - cluster_names = list(merged_model.topic_labels_.values())[len(set(y)):] - if self._outliers: - cluster_topics = [cluster_names[topic] if topic != -1 else "Outliers" for topic in documents.Topic.values] - else: - cluster_topics = [cluster_names[topic] for topic in documents.Topic.values] - - df = pd.DataFrame({ - "Indices": zeroshot_indices + cluster_indices, - "Label": zeroshot_topics + cluster_topics} - ).sort_values("Indices") - reverse_topic_labels = dict((v, k) for k, v in merged_model.topic_labels_.items()) - if self._outliers: - reverse_topic_labels["Outliers"] = -1 - df.Label = df.Label.map(reverse_topic_labels) - merged_model.topics_ = df.Label.astype(int).tolist() - - # Update the class internally - has_outliers = bool(self._outliers) - self.__dict__.clear() - self.__dict__.update(merged_model.__dict__) - logger.info("Zeroshot Step 3 - Completed \u2713") - - # Move -1 topic back to position 0 if it exists - if has_outliers: - nr_zeroshot_topics = len(set(y)) - - # Re-map the topics such that the -1 topic is at position 0 - new_mappings = {} - for topic in self.topics_: - if topic < nr_zeroshot_topics: - new_mappings[topic] = topic - elif topic == nr_zeroshot_topics: - new_mappings[topic] = -1 - else: - new_mappings[topic] = topic - 1 - - # Re-map the topics including all representations (labels, sizes, embeddings, etc.) 
- self.topics_ = [new_mappings[topic] for topic in self.topics_] - self.topic_representations_ = {new_mappings[topic]: repr for topic, repr in self.topic_representations_.items()} - self.topic_labels_ = {new_mappings[topic]: label for topic, label in self.topic_labels_.items()} - self.topic_sizes_ = collections.Counter(self.topics_) - self.topic_embeddings_ = np.vstack([ - self.topic_embeddings_[nr_zeroshot_topics], - self.topic_embeddings_[:nr_zeroshot_topics], - self.topic_embeddings_[nr_zeroshot_topics+1:] - ]) - self._outliers = 1 - - return self.topics_ + # Combine the clustered documents/embeddings with assigned documents/embeddings in the original order + documents = pd.concat([documents, assigned_documents]) + embeddings = np.vstack([embeddings, assigned_embeddings]) + sorted_indices = documents.Old_ID.argsort() + documents = documents.iloc[sorted_indices] + embeddings = embeddings[sorted_indices] + + # Update topic sizes and topic mapper + self._update_topic_size(documents) + self.topic_mapper_ = TopicMapper(self.topics_) + + logger.info("Zeroshot Step 2 - Completed \u2713") + return documents, embeddings def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]: - """ Apply Guided Topic Modeling + """Apply Guided Topic Modeling. We transform the seeded topics to embeddings using the same embedder as used for generating document embeddings. @@ -6971,7 +7167,7 @@
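A minimal sketch (made-up numbers, not the library's internals) of the re-numbering described in the new `_combine_zeroshot_topics` docstring above: zero-shot topics occupy the IDs directly after the outlier topic, and clustered topics are shifted so they follow them.

```python
# Hypothetical example of the topic-ID shift: zero-shot topics take IDs 0..k-1,
# outliers stay -1, and clustered topics move up by k.
num_zeroshot_topics = 3                      # k = 3 zero-shot topics -> IDs 0, 1, 2
clustered_topic_ids = [-1, 0, 1, 2]          # topics produced by clustering the rest

shifted = [t + num_zeroshot_topics if t != -1 else t for t in clustered_topic_ids]
print(shifted)  # [-1, 3, 4, 5]
```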

    BERTopic Arguments: embeddings: The document embeddings - Returns + Returns: y: The labels for each seeded topic embeddings: Updated embeddings """ @@ -6994,8 +7190,14 @@

    BERTopiclogger.info("Guided - Completed \u2713") return y, embeddings - def _extract_topics(self, documents: pd.DataFrame, embeddings: np.ndarray = None, mappings=None, verbose: bool = False): - """ Extract topics from the clusters using a class-based TF-IDF + def _extract_topics( + self, + documents: pd.DataFrame, + embeddings: np.ndarray = None, + mappings=None, + verbose: bool = False, + ): + """Extract topics from the clusters using a class-based TF-IDF. Arguments: documents: Dataframe with documents and their corresponding IDs @@ -7008,18 +7210,15 @@

    BERTopic """ if verbose: logger.info("Representation - Extracting topics from clusters using representation models.") - documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) self.topic_representations_ = self._extract_words_per_topic(words, documents) self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings) - self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]]) - for key, values in - self.topic_representations_.items()} if verbose: logger.info("Representation - Completed \u2713") def _save_representative_docs(self, documents: pd.DataFrame): - """ Save the 3 most representative docs per topic + """Save the 3 most representative docs per topic. Arguments: documents: Dataframe with documents and their corresponding IDs @@ -7032,21 +7231,22 @@

    BERTopicdocuments, self.topic_representations_, nr_samples=500, - nr_repr_docs=3 + nr_repr_docs=3, ) self.representative_docs_ = repr_docs - def _extract_representative_docs(self, - c_tf_idf: csr_matrix, - documents: pd.DataFrame, - topics: Mapping[str, List[Tuple[str, float]]], - nr_samples: int = 500, - nr_repr_docs: int = 5, - diversity: float = None - ) -> Union[List[str], List[List[int]]]: - """ Approximate most representative documents per topic by sampling + def _extract_representative_docs( + self, + c_tf_idf: csr_matrix, + documents: pd.DataFrame, + topics: Mapping[str, List[Tuple[str, float]]], + nr_samples: int = 500, + nr_repr_docs: int = 5, + diversity: float = None, + ) -> Union[List[str], List[List[int]]]: + """Approximate most representative documents per topic by sampling a subset of the documents in each topic and calculating which are - most represenative to their topic based on the cosine similarity between + most representative to their topic based on the cosine similarity between c-TF-IDF representations. Arguments: @@ -7069,9 +7269,9 @@

    BERTopic# Sample documents per topic documents_per_topic = ( documents.drop("Image", axis=1, errors="ignore") - .groupby('Topic') - .sample(n=nr_samples, replace=True, random_state=42) - .drop_duplicates() + .groupby("Topic") + .sample(n=nr_samples, replace=True, random_state=42) + .drop_duplicates() ) # Find and extract documents that are most similar to the topic @@ -7081,7 +7281,6 @@

    BERTopicrepr_docs_ids = [] labels = sorted(list(topics.keys())) for index, topic in enumerate(labels): - # Slice data selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :] selected_docs = selection["Document"].values @@ -7095,7 +7294,13 @@

    BERTopic# Use MMR to find representative but diverse documents if diversity: - docs = mmr(c_tf_idf[index], ctfidf, selected_docs, top_n=nr_docs, diversity=diversity) + docs = mmr( + c_tf_idf[index], + ctfidf, + selected_docs, + top_n=nr_docs, + diversity=diversity, + ) # Extract top n most representative documents else: @@ -7106,12 +7311,17 @@

    BERTopicrepr_docs_ids.append(doc_ids) repr_docs.extend(docs) repr_docs_indices.append([repr_docs_indices[-1][-1] + i + 1 if index != 0 else i for i in range(nr_docs)]) - repr_docs_mappings = {topic: repr_docs[i[0]:i[-1]+1] for topic, i in zip(topics.keys(), repr_docs_indices)} + repr_docs_mappings = {topic: repr_docs[i[0] : i[-1] + 1] for topic, i in zip(topics.keys(), repr_docs_indices)} return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids - def _create_topic_vectors(self, documents: pd.DataFrame = None, embeddings: np.ndarray = None, mappings=None): - """ Creates embeddings per topics based on their topic representation + def _create_topic_vectors( + self, + documents: pd.DataFrame = None, + embeddings: np.ndarray = None, + mappings=None, + ): + """Creates embeddings per topics based on their topic representation. As a default, topic vectors (topic embeddings) are created by taking the average of all document embeddings within a topic. If topics are @@ -7138,16 +7348,18 @@
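A hedged sketch of the idea behind `_extract_representative_docs` (toy arrays, not the exact code path): sample documents from a topic, compare their c-TF-IDF vectors with the topic's own c-TF-IDF vector, and keep the most similar ones.

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy data: one topic's c-TF-IDF vector and the vectors of 20 sampled documents
topic_ctfidf = np.random.rand(1, 50)
doc_ctfidf = np.random.rand(20, 50)
docs = [f"doc_{i}" for i in range(20)]

sims = cosine_similarity(topic_ctfidf, doc_ctfidf).flatten()
nr_repr_docs = 3
repr_docs = [docs[i] for i in np.argsort(sims)[-nr_repr_docs:][::-1]]
print(repr_docs)  # the 3 sampled documents most similar to the topic representation
```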

    BERTopic# Topic embeddings when merging topics elif self.topic_embeddings_ is not None and mappings is not None: topic_embeddings_dict = {} - for topic_from, topics_to in mappings.items(): - topic_ids = topics_to["topics_to"] - topic_sizes = topics_to["topic_sizes"] + for topic_to, topics_from in mappings.items(): + topic_ids = topics_from["topics_from"] + topic_sizes = topics_from["topic_sizes"] if topic_ids: embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers] topic_embedding = np.average(embds, axis=0, weights=topic_sizes) - topic_embeddings_dict[topic_from] = topic_embedding + topic_embeddings_dict[topic_to] = topic_embedding # Re-order topic embeddings - topics_to_map = {topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]} + topics_to_map = { + topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:] + } topic_embeddings = {} for topic, embds in topic_embeddings_dict.items(): topic_embeddings[topics_to_map[topic]] = embds @@ -7167,11 +7379,7 @@

    BERTopic# Extract embeddings for all words in all topics topic_words = [self.get_topic(topic) for topic in topic_list] topic_words = [word[0] for topic in topic_words for word in topic] - word_embeddings = self._extract_embeddings( - topic_words, - method="word", - verbose=False - ) + word_embeddings = self._extract_embeddings(topic_words, method="word", verbose=False) # Take the weighted average of word embeddings in a topic based on their c-TF-IDF value # The embeddings var is a single numpy matrix and therefore slicing is necessary to @@ -7181,16 +7389,22 @@

    BERTopicword_importance = [val[1] for val in self.get_topic(topic)] if sum(word_importance) == 0: word_importance = [1 for _ in range(len(self.get_topic(topic)))] - topic_embedding = np.average(word_embeddings[i * n: n + (i * n)], weights=word_importance, axis=0) + topic_embedding = np.average( + word_embeddings[i * n : n + (i * n)], + weights=word_importance, + axis=0, + ) topic_embeddings.append(topic_embedding) self.topic_embeddings_ = np.array(topic_embeddings) - def _c_tf_idf(self, - documents_per_topic: pd.DataFrame, - fit: bool = True, - partial_fit: bool = False) -> Tuple[csr_matrix, List[str]]: - """ Calculate a class-based TF-IDF where m is the number of total documents. + def _c_tf_idf( + self, + documents_per_topic: pd.DataFrame, + fit: bool = True, + partial_fit: bool = False, + ) -> Tuple[csr_matrix, List[str]]: + """Calculate a class-based TF-IDF where m is the number of total documents. Arguments: documents_per_topic: The joined documents per topic such that each topic has a single @@ -7222,10 +7436,14 @@

    BERTopicmultiplier = None if self.ctfidf_model.seed_words and self.seed_topic_list: seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds] - multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words]) + multiplier = np.array( + [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words] + ) multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)]) elif self.ctfidf_model.seed_words: - multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words]) + multiplier = np.array( + [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words] + ) elif self.seed_topic_list: seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds] multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words]) @@ -7238,7 +7456,7 @@
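The seed-word boosting shown above can be illustrated with a small hedged example (illustrative words and values only): words found in the seed list get their weight multiplied before the c-TF-IDF matrix is scaled.

```python
import numpy as np

words = ["climate", "energy", "football", "match"]   # hypothetical vocabulary
seed_words = {"climate", "energy"}
seed_multiplier = 1.2

# One multiplier per vocabulary word; seed words are boosted, all others stay at 1
multiplier = np.array([seed_multiplier if word in seed_words else 1.0 for word in words])
print(multiplier)  # [1.2 1.2 1.  1. ]
```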

    BERTopicreturn c_tf_idf, words def _update_topic_size(self, documents: pd.DataFrame): - """ Calculate the topic sizes + """Calculate the topic sizes. Arguments: documents: Updated dataframe with documents and their corresponding IDs and newly added Topics @@ -7246,13 +7464,14 @@

    BERTopicself.topic_sizes_ = collections.Counter(documents.Topic.values.tolist()) self.topics_ = documents.Topic.astype(int).tolist() - def _extract_words_per_topic(self, - words: List[str], - documents: pd.DataFrame, - c_tf_idf: csr_matrix = None, - calculate_aspects: bool = True) -> Mapping[str, - List[Tuple[str, float]]]: - """ Based on tf_idf scores per topic, extract the top n words per topic + def _extract_words_per_topic( + self, + words: List[str], + documents: pd.DataFrame, + c_tf_idf: csr_matrix = None, + calculate_aspects: bool = True, + ) -> Mapping[str, List[Tuple[str, float]]]: + """Based on tf_idf scores per topic, extract the top n words per topic. If the top words per topic need to be extracted, then only the `words` parameter needs to be passed. If the top words per topic in a specific timestamp, then it @@ -7263,6 +7482,7 @@

    BERTopic words: List of all words (sorted according to tf_idf matrix position) documents: DataFrame with documents and their topic IDs c_tf_idf: A c-TF-IDF matrix from which to calculate the top words + calculate_aspects: Whether to calculate additional topic aspects Returns: topics: The top words per topic @@ -7282,43 +7502,68 @@

    BERTopicscores = np.take_along_axis(scores, sorted_indices, axis=1) # Get top 30 words per topic based on c-TF-IDF score - topics = {label: [(words[word_index], score) - if word_index is not None and score > 0 - else ("", 0.00001) - for word_index, score in zip(indices[index][::-1], scores[index][::-1]) - ] - for index, label in enumerate(labels)} + base_topics = { + label: [ + (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001) + for word_index, score in zip(indices[index][::-1], scores[index][::-1]) + ] + for index, label in enumerate(labels) + } # Fine-tune the topic representations - if isinstance(self.representation_model, list): + topics = base_topics.copy() + if not self.representation_model: + # Default representation: c_tf_idf + top_n_words + topics = {label: values[: self.top_n_words] for label, values in topics.items()} + elif isinstance(self.representation_model, list): for tuner in self.representation_model: topics = tuner.extract_topics(self, documents, c_tf_idf, topics) elif isinstance(self.representation_model, BaseRepresentation): topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics) elif isinstance(self.representation_model, dict): if self.representation_model.get("Main"): - topics = self.representation_model["Main"].extract_topics(self, documents, c_tf_idf, topics) - topics = {label: values[:self.top_n_words] for label, values in topics.items()} + main_model = self.representation_model["Main"] + if isinstance(main_model, BaseRepresentation): + topics = main_model.extract_topics(self, documents, c_tf_idf, topics) + elif isinstance(main_model, list): + for tuner in main_model: + topics = tuner.extract_topics(self, documents, c_tf_idf, topics) + else: + raise TypeError(f"unsupported type {type(main_model).__name__} for representation_model['Main']") + else: + # Default representation: c_tf_idf + top_n_words + topics = {label: values[: self.top_n_words] for label, values in topics.items()} + else: + raise TypeError(f"unsupported type {type(self.representation_model).__name__} for representation_model") # Extract additional topic aspects if calculate_aspects and isinstance(self.representation_model, dict): for aspect, aspect_model in self.representation_model.items(): - aspects = topics.copy() if aspect != "Main": + aspects = base_topics.copy() + if not aspect_model: + # Default representation: c_tf_idf + top_n_words + aspects = {label: values[: self.top_n_words] for label, values in aspects.items()} if isinstance(aspect_model, list): for tuner in aspect_model: aspects = tuner.extract_topics(self, documents, c_tf_idf, aspects) - self.topic_aspects_[aspect] = aspects elif isinstance(aspect_model, BaseRepresentation): - self.topic_aspects_[aspect] = aspect_model.extract_topics(self, documents, c_tf_idf, aspects) + aspects = aspect_model.extract_topics(self, documents, c_tf_idf, aspects) + else: + raise TypeError( + f"unsupported type {type(aspect_model).__name__} for representation_model[{repr(aspect)}]" + ) + self.topic_aspects_[aspect] = aspects return topics - def _reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame: - """ Reduce topics to self.nr_topics + def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: + """Reduce topics to self.nr_topics. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic + embeddings are used. 
Returns: documents: Updated dataframe with documents and the reduced number of Topics @@ -7328,20 +7573,24 @@

    BERTopicif isinstance(self.nr_topics, int): if self.nr_topics < initial_nr_topics: - documents = self._reduce_to_n_topics(documents) + documents = self._reduce_to_n_topics(documents, use_ctfidf) elif isinstance(self.nr_topics, str): - documents = self._auto_reduce_topics(documents) + documents = self._auto_reduce_topics(documents, use_ctfidf) else: raise ValueError("nr_topics needs to be an int or 'auto'! ") - logger.info(f"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}") + logger.info( + f"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}" + ) return documents - def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame: - """ Reduce topics to self.nr_topics + def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: + """Reduce topics to self.nr_topics. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic + embedding are used. Returns: documents: Updated dataframe with documents and the reduced number of Topics @@ -7349,30 +7598,36 @@

    BERTopictopics = documents.Topic.tolist().copy() # Create topic distance matrix - if self.topic_embeddings_ is not None: - topic_embeddings = self.topic_embeddings_[self._outliers:, ] - else: - topic_embeddings = self.c_tf_idf_[self._outliers:, ].toarray() - distance_matrix = 1-cosine_similarity(topic_embeddings) + topic_embeddings = select_topic_representation( + self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True + )[0][self._outliers :] + distance_matrix = 1 - cosine_similarity(topic_embeddings) np.fill_diagonal(distance_matrix, 0) # Cluster the topic embeddings using AgglomerativeClustering if version.parse(sklearn_version) >= version.parse("1.4.0"): cluster = AgglomerativeClustering(self.nr_topics - self._outliers, metric="precomputed", linkage="average") else: - cluster = AgglomerativeClustering(self.nr_topics - self._outliers, affinity="precomputed", linkage="average") + cluster = AgglomerativeClustering( + self.nr_topics - self._outliers, + affinity="precomputed", + linkage="average", + ) cluster.fit(distance_matrix) new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics] # Track mappings and sizes of topics for merging topic embeddings mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)} - mappings = defaultdict(list) + basic_mappings = defaultdict(list) for key, val in sorted(mapped_topics.items()): - mappings[val].append(key) - mappings = {topic_from: - {"topics_to": topics_to, - "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]} - for topic_from, topics_to in mappings.items()} + basic_mappings[val].append(key) + mappings = { + topic_to: { + "topics_from": topics_from, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], + } + for topic_to, topics_from in basic_mappings.items() + } # Map topics documents.Topic = new_topics @@ -7382,37 +7637,81 @@
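A minimal sketch of the reduction step shown above (random embeddings; assumes a scikit-learn release where the parameter is named `metric`, older releases use `affinity` as the version check indicates): topics are merged by clustering their embeddings on a precomputed cosine-distance matrix.

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

topic_embeddings = np.random.rand(8, 384)          # one embedding per non-outlier topic
distance_matrix = 1 - cosine_similarity(topic_embeddings)
np.fill_diagonal(distance_matrix, 0)

nr_topics = 4
cluster = AgglomerativeClustering(n_clusters=nr_topics, metric="precomputed", linkage="average")
new_topics = cluster.fit_predict(distance_matrix)  # maps each old topic to a merged topic
print(new_topics)
```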

    BERTopic# Update representations documents = self._sort_mappings_by_frequency(documents) self._extract_topics(documents, mappings=mappings) + + # When zero-shot topic(s) are present in the topics to merge, + # determine whether to take one of the zero-shot topic labels + # or use a calculated representation. + if self._is_zeroshot(): + new_topic_id_to_zeroshot_topic_idx = {} + topics_to_map = { + topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:] + } + + for topic_to, topics_from in basic_mappings.items(): + # When extracting topics, the reduced topics were reordered. + # Must get the updated topic_to. + topic_to = topics_to_map[topic_to] + + # which of the original topics are zero-shot + zeroshot_topic_ids = [ + topic_id for topic_id in topics_from if topic_id in self._topic_id_to_zeroshot_topic_idx + ] + if len(zeroshot_topic_ids) == 0: + continue + + # If any of the original topics are zero-shot, take the best fitting zero-shot label + # if the cosine similarity with the new topic exceeds the zero-shot threshold + zeroshot_labels = [ + self.zeroshot_topic_list[self._topic_id_to_zeroshot_topic_idx[topic_id]] + for topic_id in zeroshot_topic_ids + ] + zeroshot_embeddings = self._extract_embeddings(zeroshot_labels) + cosine_similarities = cosine_similarity( + zeroshot_embeddings, [self.topic_embeddings_[topic_to]] + ).flatten() + best_zeroshot_topic_idx = np.argmax(cosine_similarities) + best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx] + if best_cosine_similarity >= self.zeroshot_min_similarity: + new_topic_id_to_zeroshot_topic_idx[topic_to] = zeroshot_topic_ids[best_zeroshot_topic_idx] + + self._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx + self._update_topic_size(documents) return documents - def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame: - """ Reduce the number of topics automatically using HDBSCAN + def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: + """Reduce the number of topics automatically using HDBSCAN. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. 
Returns: documents: Updated dataframe with documents and the reduced number of Topics """ topics = documents.Topic.tolist().copy() - unique_topics = sorted(list(documents.Topic.unique()))[self._outliers:] + unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :] max_topic = unique_topics[-1] # Find similar topics - if self.topic_embeddings_ is not None: - embeddings = np.array(self.topic_embeddings_) - else: - embeddings = self.c_tf_idf_.toarray() - norm_data = normalize(embeddings, norm='l2') - predictions = hdbscan.HDBSCAN(min_cluster_size=2, - metric='euclidean', - cluster_selection_method='eom', - prediction_data=True).fit_predict(norm_data[self._outliers:]) + embeddings = select_topic_representation( + self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True + )[0] + norm_data = normalize(embeddings, norm="l2") + predictions = hdbscan.HDBSCAN( + min_cluster_size=2, + metric="euclidean", + cluster_selection_method="eom", + prediction_data=True, + ).fit_predict(norm_data[self._outliers :]) # Map similar topics - mapped_topics = {unique_topics[index]: prediction + max_topic - for index, prediction in enumerate(predictions) - if prediction != -1} + mapped_topics = { + unique_topics[index]: prediction + max_topic + for index, prediction in enumerate(predictions) + if prediction != -1 + } documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int) mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())} @@ -7420,10 +7719,13 @@

    BERTopicmappings = defaultdict(list) for key, val in sorted(mapped_topics.items()): mappings[val].append(key) - mappings = {topic_from: - {"topics_to": topics_to, - "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]} - for topic_from, topics_to in mappings.items()} + mappings = { + topic_from: { + "topics_to": topics_to, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to], + } + for topic_from, topics_to in mappings.items() + } # Update documents and topics self.topic_mapper_.add_mappings(mapped_topics) @@ -7433,7 +7735,7 @@

    BERTopicreturn documents def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame: - """ Reorder mappings by their frequency. + """Reorder mappings by their frequency. For example, if topic 88 was mapped to topic 5 and topic 5 turns out to be the largest topic, @@ -7469,10 +7771,10 @@

    BERTopicself._update_topic_size(documents) return documents - def _map_probabilities(self, - probabilities: Union[np.ndarray, None], - original_topics: bool = False) -> Union[np.ndarray, None]: - """ Map the probabilities to the reduced topics. + def _map_probabilities( + self, probabilities: Union[np.ndarray, None], original_topics: bool = False + ) -> Union[np.ndarray, None]: + """Map the probabilities to the reduced topics. This is achieved by adding together the probabilities of all topics that are mapped to the same topic. Then, the topics that were mapped from are set to 0 as they @@ -7492,8 +7794,12 @@

    BERTopic# Map array of probabilities (probability for assigned topic per document) if probabilities is not None: if len(probabilities.shape) == 2: - mapped_probabilities = np.zeros((probabilities.shape[0], - len(set(mappings.values())) - self._outliers)) + mapped_probabilities = np.zeros( + ( + probabilities.shape[0], + len(set(mappings.values())) - self._outliers, + ) + ) for from_topic, to_topic in mappings.items(): if to_topic != -1 and from_topic != -1: mapped_probabilities[:, to_topic] += probabilities[:, from_topic] @@ -7503,7 +7809,7 @@

    BERTopicreturn probabilities def _preprocess_text(self, documents: np.ndarray) -> List[str]: - """ Basic preprocessing of text + r"""Basic preprocessing of text. Steps: * Replace \n and \t with whitespace @@ -7512,13 +7818,13 @@

    BERTopiccleaned_documents = [doc.replace("\n", " ") for doc in documents] cleaned_documents = [doc.replace("\t", " ") for doc in cleaned_documents] if self.language == "english": - cleaned_documents = [re.sub(r'[^A-Za-z0-9 ]+', '', doc) for doc in cleaned_documents] + cleaned_documents = [re.sub(r"[^A-Za-z0-9 ]+", "", doc) for doc in cleaned_documents] cleaned_documents = [doc if doc != "" else "emptydoc" for doc in cleaned_documents] return cleaned_documents @staticmethod def _top_n_idx_sparse(matrix: csr_matrix, n: int) -> np.ndarray: - """ Return indices of top n values in each row of a sparse matrix + """Return indices of top n values in each row of a sparse matrix. Retrieved from: https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix @@ -7540,7 +7846,7 @@

    BERTopic@staticmethod def _top_n_values_sparse(matrix: csr_matrix, indices: np.ndarray) -> np.ndarray: - """ Return the top n values for each row in a sparse matrix + """Return the top n values for each row in a sparse matrix. Arguments: matrix: The sparse matrix from which to get the top n indices per row @@ -7557,14 +7863,15 @@

    BERTopic@classmethod def _get_param_names(cls): - """Get parameter names for the estimator + """Get parameter names for the estimator. Adapted from: https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178 """ init_signature = inspect.signature(cls.__init__) - parameters = sorted([p.name for p in init_signature.parameters.values() - if p.name != 'self' and p.kind != p.VAR_KEYWORD]) + parameters = sorted( + [p.name for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD] + ) return parameters def __str__(self): @@ -7594,6 +7901,46 @@

    BERTopic + + + +

+topic_labels_ (property, readonly)
+
+Map topic IDs to their labels.
+A label is the topic ID, along with the first four words of the topic representation, joined using '_'.
+Zeroshot topic labels come from self.zeroshot_topic_list rather than the calculated representation.
+
+Returns:
+
+    topic_labels: a dict mapping a topic ID (int) to its label (str)
    + + + @@ -7612,7 +7959,7 @@
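A small hedged sketch (made-up topic representations) of the label format described above: the topic ID followed by the first four words of its representation, joined with '_'.

```python
# Hypothetical topic representations: {topic_id: [(word, weight), ...]}
topic_representations = {
    0: [("climate", 0.12), ("warming", 0.10), ("carbon", 0.08), ("emissions", 0.07), ("energy", 0.05)],
    1: [("football", 0.15), ("league", 0.11), ("match", 0.09), ("goal", 0.08)],
}

topic_labels = {
    topic_id: f"{topic_id}_" + "_".join(word for word, _ in words[:4])
    for topic_id, words in topic_representations.items()
}
print(topic_labels)
# {0: '0_climate_warming_carbon_emissions', 1: '1_football_league_match_goal'}
```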

    -

    BERTopic initialization

    +

    BERTopic initialization.

    Parameters:

    @@ -7658,7 +8005,7 @@

    @@ -7778,26 +8125,27 @@

    min_topic_size int

    The minimum size of the topic. Increasing this value will lead - to a lower number of clusters/topics and vice versa. + to a lower number of clusters/topics and vice versa. It is the same parameter as min_cluster_size in HDBSCAN. NOTE: This param will not be used if you are using hdbscan_model.

    10
    Source code in bertopic\_bertopic.py -
    def __init__(self,
    -             language: str = "english",
    -             top_n_words: int = 10,
    -             n_gram_range: Tuple[int, int] = (1, 1),
    -             min_topic_size: int = 10,
    -             nr_topics: Union[int, str] = None,
    -             low_memory: bool = False,
    -             calculate_probabilities: bool = False,
    -             seed_topic_list: List[List[str]] = None,
    -             zeroshot_topic_list: List[str] = None,
    -             zeroshot_min_similarity: float = .7,
    -             embedding_model=None,
    -             umap_model: UMAP = None,
    -             hdbscan_model: hdbscan.HDBSCAN = None,
    -             vectorizer_model: CountVectorizer = None,
    -             ctfidf_model: TfidfTransformer = None,
    -             representation_model: BaseRepresentation = None,
    -             verbose: bool = False,
    -             ):
    -    """BERTopic initialization
    +          
    def __init__(
    +    self,
    +    language: str = "english",
    +    top_n_words: int = 10,
    +    n_gram_range: Tuple[int, int] = (1, 1),
    +    min_topic_size: int = 10,
    +    nr_topics: Union[int, str] = None,
    +    low_memory: bool = False,
    +    calculate_probabilities: bool = False,
    +    seed_topic_list: List[List[str]] = None,
    +    zeroshot_topic_list: List[str] = None,
    +    zeroshot_min_similarity: float = 0.7,
    +    embedding_model=None,
    +    umap_model: UMAP = None,
    +    hdbscan_model: hdbscan.HDBSCAN = None,
    +    vectorizer_model: CountVectorizer = None,
    +    ctfidf_model: TfidfTransformer = None,
    +    representation_model: BaseRepresentation = None,
    +    verbose: bool = False,
    +):
    +    """BERTopic initialization.
     
         Arguments:
             language: The main language used in your documents. The default sentence-transformers
    @@ -7815,7 +8163,7 @@ 

    NOTE: This param will not be used if you pass in your own CountVectorizer. min_topic_size: The minimum size of the topic. Increasing this value will lead - to a lower number of clusters/topics and vice versa. + to a lower number of clusters/topics and vice versa. It is the same parameter as `min_cluster_size` in HDBSCAN. NOTE: This param will not be used if you are using `hdbscan_model`. nr_topics: Specifying the number of topics will reduce the initial @@ -7867,8 +8215,9 @@

    """ # Topic-based parameters if top_n_words > 100: - logger.warning("Note that extracting more than 100 words from a sparse " - "can slow down computation quite a bit.") + logger.warning( + "Note that extracting more than 100 words from a sparse can slow down computation quite a bit." + ) self.top_n_words = top_n_words self.min_topic_size = min_topic_size @@ -7893,18 +8242,22 @@

    self.representation_model = representation_model # UMAP or another algorithm that has .fit and .transform functions - self.umap_model = umap_model or UMAP(n_neighbors=15, - n_components=5, - min_dist=0.0, - metric='cosine', - low_memory=self.low_memory) + self.umap_model = umap_model or UMAP( + n_neighbors=15, + n_components=5, + min_dist=0.0, + metric="cosine", + low_memory=self.low_memory, + ) # HDBSCAN or another clustering algorithm that has .fit and .predict functions and # the .labels_ variable to extract the labels - self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(min_cluster_size=self.min_topic_size, - metric='euclidean', - cluster_selection_method='eom', - prediction_data=True) + self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN( + min_cluster_size=self.min_topic_size, + metric="euclidean", + cluster_selection_method="eom", + prediction_data=True, + ) # Public attributes self.topics_ = None @@ -7913,7 +8266,7 @@

    self.topic_mapper_ = None self.topic_representations_ = None self.topic_embeddings_ = None - self.topic_labels_ = None + self._topic_id_to_zeroshot_topic_idx = {} self.custom_labels_ = None self.c_tf_idf_ = None self.representative_images_ = None @@ -7921,7 +8274,6 @@

    self.topic_aspects_ = {} # Private attributes for internal tracking purposes - self._outliers = 1 self._merged_topics = None if verbose: @@ -8143,18 +8495,19 @@

    Source code in bertopic\_bertopic.py -
    def approximate_distribution(self,
    -                             documents: Union[str, List[str]],
    -                             window: int = 4,
    -                             stride: int = 1,
    -                             min_similarity: float = 0.1,
    -                             batch_size: int = 1000,
    -                             padding: bool = False,
    -                             use_embedding_model: bool = False,
    -                             calculate_tokens: bool = False,
    -                             separator: str = " ") -> Tuple[np.ndarray,
    -                                                            Union[List[np.ndarray], None]]:
    -    """ A post-hoc approximation of topic distributions across documents.
    +          
    def approximate_distribution(
    +    self,
    +    documents: Union[str, List[str]],
    +    window: int = 4,
    +    stride: int = 1,
    +    min_similarity: float = 0.1,
    +    batch_size: int = 1000,
    +    padding: bool = False,
    +    use_embedding_model: bool = False,
    +    calculate_tokens: bool = False,
    +    separator: str = " ",
    +) -> Tuple[np.ndarray, Union[List[np.ndarray], None]]:
    +    """A post-hoc approximation of topic distributions across documents.
     
         In order to perform this approximation, each document is split into tokens
         according to the provided tokenizer in the `CountVectorizer`. Then, a
    @@ -8212,7 +8565,6 @@ 

    and `m` the topics. Examples: - After fitting the model, the topic distributions can be calculated regardless of the clustering model and regardless of whether the documents were previously seen or not: @@ -8240,13 +8592,13 @@

    batch_size = len(documents) batches = 1 else: - batches = math.ceil(len(documents)/batch_size) + batches = math.ceil(len(documents) / batch_size) topic_distributions = [] topic_token_distributions = [] for i in tqdm(range(batches), disable=not self.verbose): - doc_set = documents[i*batch_size: (i+1) * batch_size] + doc_set = documents[i * batch_size : (i + 1) * batch_size] # Extract tokens analyzer = self.vectorizer_model.build_tokenizer() @@ -8262,17 +8614,23 @@

    token_sets = [tokenset] token_sets_ids = [list(range(len(tokenset)))] else: - # Extract tokensets using window and stride parameters stride_indices = list(range(len(tokenset)))[::stride] token_sets = [] token_sets_ids = [] for stride_index in stride_indices: - selected_tokens = tokenset[stride_index: stride_index+window] + selected_tokens = tokenset[stride_index : stride_index + window] if padding or len(selected_tokens) == window: token_sets.append(selected_tokens) - token_sets_ids.append(list(range(stride_index, stride_index+len(selected_tokens)))) + token_sets_ids.append( + list( + range( + stride_index, + stride_index + len(selected_tokens), + ) + ) + ) # Add empty tokens at the beginning and end of a document if padding: @@ -8280,8 +8638,8 @@

    padded_ids = [] t = math.ceil(window / stride) - 1 for i in range(math.ceil(window / stride) - 1): - padded.append(tokenset[:window - ((t-i) * stride)]) - padded_ids.append(list(range(0, window - ((t-i) * stride)))) + padded.append(tokenset[: window - ((t - i) * stride)]) + padded_ids.append(list(range(0, window - ((t - i) * stride)))) token_sets = padded + token_sets token_sets_ids = padded_ids + token_sets_ids @@ -8295,13 +8653,13 @@

    # Calculate similarity between embeddings of token sets and the topics if use_embedding_model: embeddings = self._extract_embeddings(all_sentences, method="document", verbose=True) - similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers:]) + similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers :]) # Calculate similarity between c-TF-IDF of token sets and the topics else: bow_doc = self.vectorizer_model.transform(all_sentences) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) - similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Only keep similarities that exceed the minimum similarity[similarity < min_similarity] = 0 @@ -8312,7 +8670,7 @@

    topic_token_distribution = [] for index, token in enumerate(tokens): start = all_indices[index] - end = all_indices[index+1] + end = all_indices[index + 1] if start == end: end = end + 1 @@ -8337,20 +8695,20 @@

    topic_token_distribution.append(np.array(matrix)) topic_distribution.append(np.add.reduce(matrix)) - topic_distribution = normalize(topic_distribution, norm='l1', axis=1) + topic_distribution = normalize(topic_distribution, norm="l1", axis=1) # Aggregate on a tokenset level indicated by the window and stride else: topic_distribution = [] - for index in range(len(all_indices)-1): + for index in range(len(all_indices) - 1): start = all_indices[index] - end = all_indices[index+1] + end = all_indices[index + 1] if start == end: end = end + 1 group = similarity[start:end].sum(axis=0) topic_distribution.append(group) - topic_distribution = normalize(np.array(topic_distribution), norm='l1', axis=1) + topic_distribution = normalize(np.array(topic_distribution), norm="l1", axis=1) topic_token_distribution = None # Combine results @@ -8383,10 +8741,11 @@
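The sliding-window/stride token sets used by approximate_distribution can be illustrated with a toy tokenised document (hedged sketch, not the library's exact code):

```python
tokenset = ["topic", "models", "cluster", "documents", "into", "themes"]
window, stride, padding = 4, 1, False

token_sets = []
for start in range(0, len(tokenset), stride):
    selected = tokenset[start:start + window]
    if padding or len(selected) == window:      # keep only full windows unless padding is enabled
        token_sets.append(selected)

print(token_sets)
# [['topic', 'models', 'cluster', 'documents'],
#  ['models', 'cluster', 'documents', 'into'],
#  ['cluster', 'documents', 'into', 'themes']]
# Each token set is then compared with the topic representations and the
# per-window similarities are aggregated into a topic distribution.
```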

    -

    Find topics most similar to a search_term

    -

    Creates an embedding for search_term and compares that with +

    Find topics most similar to a search_term.

    +

    Creates an embedding for a search query and compares that with the topic embeddings. The most similar topics are returned along with their similarity values.

    +

    The query is specified using search_term for text queries or image for image queries.

    The search_term can be of any size but since it is compared with the topic representation it is advised to keep it below 5 words.

    @@ -8408,6 +8767,12 @@

    the term you want to use to search for topics.

    None + + image + str +

    path to the image you want to use to search for topics.

    + None + top_n int @@ -8443,22 +8808,22 @@

    Source code in bertopic\_bertopic.py -
    def find_topics(self,
    -                search_term: str = None,
    -                image: str = None,
    -                top_n: int = 5) -> Tuple[List[int], List[float]]:
    -    """ Find topics most similar to a search_term
    +          
    def find_topics(self, search_term: str = None, image: str = None, top_n: int = 5) -> Tuple[List[int], List[float]]:
    +    """Find topics most similar to a search_term.
     
    -    Creates an embedding for search_term and compares that with
    +    Creates an embedding for a search query and compares that with
         the topic embeddings. The most similar topics are returned
         along with their similarity values.
     
    +    The query is specified using search_term for text queries or image for image queries.
    +
         The search_term can be of any size but since it is compared
         with the topic representation it is advised to keep it
         below 5 words.
     
         Arguments:
             search_term: the term you want to use to search for topics.
    +        image: path to the image you want to use to search for topics.
             top_n: the number of topics to return
     
         Returns:
    @@ -8466,7 +8831,6 @@ 

    similarity: the similarity scores from high to low Examples: - You can use the underlying embedding model to find topics that best represent the search term: @@ -8485,14 +8849,11 @@

    # Extract search_term embeddings and compare with topic embeddings if search_term is not None: - search_embedding = self._extract_embeddings([search_term], - method="word", - verbose=False).flatten() + search_embedding = self._extract_embeddings([search_term], method="word", verbose=False).flatten() elif image is not None: - search_embedding = self._extract_embeddings([None], - images=[image], - method="document", - verbose=False).flatten() + search_embedding = self._extract_embeddings( + [None], images=[image], method="document", verbose=False + ).flatten() sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten() # Extract topics most similar to search_term @@ -8521,7 +8882,7 @@
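The search shown above boils down to a cosine-similarity lookup against the topic embeddings; a hedged sketch with random vectors (the 384-dimensional size is only an assumption):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

topic_embeddings = np.random.rand(10, 384)   # one embedding per topic
search_embedding = np.random.rand(1, 384)    # embedding of the search term or image

sims = cosine_similarity(search_embedding, topic_embeddings).flatten()
top_n = 5
similar_topics = np.argsort(sims)[-top_n:][::-1]   # most similar topic IDs, best first
similarity = sims[similar_topics]
print(similar_topics, similarity)
```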

    -

    Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics

    +

    Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics.

    Parameters:

    @@ -8586,12 +8947,14 @@

    Source code in bertopic\_bertopic.py -
    def fit(self,
    -        documents: List[str],
    -        embeddings: np.ndarray = None,
    -        images: List[str] = None,
    -        y: Union[List[int], np.ndarray] = None):
    -    """ Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics
    +          
    def fit(
    +    self,
    +    documents: List[str],
    +    embeddings: np.ndarray = None,
    +    images: List[str] = None,
    +    y: Union[List[int], np.ndarray] = None,
    +):
    +    """Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics.
     
         Arguments:
             documents: A list of documents to fit on
    @@ -8602,7 +8965,6 @@ 

    specific instance is specified. Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -8737,13 +9099,14 @@

    Source code in bertopic\_bertopic.py -
    def fit_transform(self,
    -                  documents: List[str],
    -                  embeddings: np.ndarray = None,
    -                  images: List[str] = None,
    -                  y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],
    -                                                                   Union[np.ndarray, None]]:
    -    """ Fit the models on a collection of documents, generate topics,
    +          
    def fit_transform(
    +    self,
    +    documents: List[str],
    +    embeddings: np.ndarray = None,
    +    images: List[str] = None,
    +    y: Union[List[int], np.ndarray] = None,
    +) -> Tuple[List[int], Union[np.ndarray, None]]:
    +    """Fit the models on a collection of documents, generate topics,
         and return the probabilities and topic per document.
     
         Arguments:
    @@ -8763,7 +9126,6 @@ 

    computation and may increase memory usage. Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -8795,41 +9157,52 @@

    check_embeddings_shape(embeddings, documents) doc_ids = range(len(documents)) if documents is not None else range(len(images)) - documents = pd.DataFrame({"Document": documents, - "ID": doc_ids, - "Topic": None, - "Image": images}) + documents = pd.DataFrame({"Document": documents, "ID": doc_ids, "Topic": None, "Image": images}) # Extract embeddings if embeddings is None: logger.info("Embedding - Transforming documents to embeddings.") - self.embedding_model = select_backend(self.embedding_model, - language=self.language) - embeddings = self._extract_embeddings(documents.Document.values.tolist(), - images=images, - method="document", - verbose=self.verbose) + self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose) + embeddings = self._extract_embeddings( + documents.Document.values.tolist(), + images=images, + method="document", + verbose=self.verbose, + ) logger.info("Embedding - Completed \u2713") else: if self.embedding_model is not None: - self.embedding_model = select_backend(self.embedding_model, - language=self.language) + self.embedding_model = select_backend( + self.embedding_model, language=self.language, verbose=self.verbose + ) # Guided Topic Modeling if self.seed_topic_list is not None and self.embedding_model is not None: y, embeddings = self._guided_topic_modeling(embeddings) + # Reduce dimensionality and fit UMAP model + umap_embeddings = self._reduce_dimensionality(embeddings, y) + # Zero-shot Topic Modeling if self._is_zeroshot(): - documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(documents, embeddings) - if documents is None: - return self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings) - - # Reduce dimensionality - umap_embeddings = self._reduce_dimensionality(embeddings, y) + documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling( + documents, embeddings + ) + # Filter UMAP embeddings to only non-assigned embeddings to be used for clustering + umap_embeddings = self.umap_model.transform(embeddings) - # Cluster reduced embeddings - documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) + if len(documents) > 0: # No zero-shot topics matched + # Cluster reduced embeddings + documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y) + if self._is_zeroshot() and len(assigned_documents) > 0: + documents, embeddings = self._combine_zeroshot_topics( + documents, embeddings, assigned_documents, assigned_embeddings + ) + else: + # All documents matches zero-shot topics + documents = assigned_documents + embeddings = assigned_embeddings + topics_before_reduction = self.topics_ # Sort and Map Topic IDs by their frequency if not self.nr_topics: @@ -8860,14 +9233,26 @@

    # Save the top 3 most representative documents per topic self._save_representative_docs(documents) + # In the case of zero-shot topics, probability will come from cosine similarity, + # and the HDBSCAN model will be removed + if self._is_zeroshot() and len(assigned_documents) > 0: + self.hdbscan_model = BaseCluster() + sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_)) + + if self.calculate_probabilities: + probabilities = sim_matrix + else: + # Use `topics_before_reduction` because `self.topics_` may have already been updated from + # reducing topics, and the original probabilities are needed for `self._map_probabilities()` + probabilities = sim_matrix[ + np.arange(len(documents)), + np.array(topics_before_reduction) + self._outliers, + ] + # Resulting output self.probabilities_ = self._map_probabilities(probabilities, original_topics=True) predictions = documents.Topic.to_list() - # Combine Zero-shot with outliers - if self._is_zeroshot() and len(documents) != len(doc_ids): - predictions = self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings) - return predictions, self.probabilities_

    @@ -8889,7 +9274,7 @@

    -

    Get labels for each topic in a user-defined format

    +

    Get labels for each topic in a user-defined format.

    Parameters:

    @@ -8966,13 +9351,15 @@

    Source code in bertopic\_bertopic.py -
    def generate_topic_labels(self,
    -                          nr_words: int = 3,
    -                          topic_prefix: bool = True,
    -                          word_length: int = None,
    -                          separator: str = "_",
    -                          aspect: str = None) -> List[str]:
    -    """ Get labels for each topic in a user-defined format
    +          
    def generate_topic_labels(
    +    self,
    +    nr_words: int = 3,
    +    topic_prefix: bool = True,
    +    word_length: int = None,
    +    separator: str = "_",
    +    aspect: str = None,
    +) -> List[str]:
    +    """Get labels for each topic in a user-defined format.
     
         Arguments:
             nr_words: Top `n` words per topic to use
    @@ -8994,7 +9381,6 @@ 

    otherwise it is 0. Examples: - To create our custom topic labels, usage is rather straightforward: ```python @@ -9125,11 +9511,13 @@

    Source code in bertopic\_bertopic.py -
    def get_document_info(self,
    -                      docs: List[str],
    -                      df: pd.DataFrame = None,
    -                      metadata: Mapping[str, Any] = None) -> pd.DataFrame:
    -    """ Get information about the documents on which the topic was trained
    +          
    def get_document_info(
    +    self,
    +    docs: List[str],
    +    df: pd.DataFrame = None,
    +    metadata: Mapping[str, Any] = None,
    +) -> pd.DataFrame:
    +    """Get information about the documents on which the topic was trained
         including the documents themselves, their respective topics, the name
         of each topic, the top n words of each topic, whether it is a
         representative document, and probability of the clustering if the cluster
    @@ -9200,8 +9588,10 @@ 

    if len(self.probabilities_.shape) == 1: document_info["Probability"] = self.probabilities_ else: - document_info["Probability"] = [max(probs) if topic != -1 else 1-sum(probs) - for topic, probs in zip(self.topics_, self.probabilities_)] + document_info["Probability"] = [ + max(probs) if topic != -1 else 1 - sum(probs) + for topic, probs in zip(self.topics_, self.probabilities_) + ] # Add representative document labels repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs] @@ -9276,7 +9666,7 @@

    Source code in bertopic\_bertopic.py
    def get_params(self, deep: bool = False) -> Mapping[str, Any]:
    -    """ Get parameters for this estimator.
    +    """Get parameters for this estimator.
     
         Adapted from:
             https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178
    @@ -9292,9 +9682,9 @@ 

    out = dict() for key in self._get_param_names(): value = getattr(self, key) - if deep and hasattr(value, 'get_params'): + if deep and hasattr(value, "get_params"): deep_items = value.get_params().items() - out.update((key + '__' + k, val) for k, val in deep_items) + out.update((key + "__" + k, val) for k, val in deep_items) out[key] = value return out

    @@ -9378,9 +9768,9 @@

    Source code in bertopic\_bertopic.py
    def get_representative_docs(self, topic: int = None) -> List[str]:
    -    """ Extract the best representing documents per topic.
    +    """Extract the best representing documents per topic.
     
    -    NOTE:
    +    Note:
             This does not extract all documents per topic as all documents
             are not saved within BERTopic. To get all documents, please
             run the following:
    @@ -9401,7 +9791,6 @@ 

    Representative documents of the chosen topic Examples: - To extract the representative docs of all topics: ```python @@ -9442,7 +9831,7 @@

    -

    Return top n words for a specific topic and their c-TF-IDF scores

    +

    Return top n words for a specific topic and their c-TF-IDF scores.

    Parameters:

    @@ -9493,7 +9882,7 @@

    Source code in bertopic\_bertopic.py
    def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]:
    -    """ Return top n words for a specific topic and their c-TF-IDF scores
    +    """Return top n words for a specific topic and their c-TF-IDF scores.
     
         Arguments:
             topic: A specific topic for which you want its representation
    @@ -9504,7 +9893,6 @@ 

    The top n words for a specific word and its respective c-TF-IDF scores Examples: - ```python topic = topic_model.get_topic(12) ``` @@ -9540,7 +9928,7 @@

    -

    Return the size of topics (descending order)

    +

    Return the size of topics (descending order).

    Parameters:

    @@ -9589,7 +9977,7 @@

    Source code in bertopic\_bertopic.py
    def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:
    -    """ Return the size of topics (descending order)
    +    """Return the size of topics (descending order).
     
         Arguments:
             topic: A specific topic for which you want the frequency
    @@ -9599,7 +9987,6 @@ 

    the frequencies of all topics Examples: - To extract the frequency of all topics: ```python @@ -9616,8 +10003,9 @@

    if isinstance(topic, int): return self.topic_sizes_[topic] else: - return pd.DataFrame(self.topic_sizes_.items(), columns=['Topic', 'Count']).sort_values("Count", - ascending=False) + return pd.DataFrame(self.topic_sizes_.items(), columns=["Topic", "Count"]).sort_values( + "Count", ascending=False + )

    @@ -9682,7 +10070,7 @@

    Source code in bertopic\_bertopic.py
    def get_topic_info(self, topic: int = None) -> pd.DataFrame:
    -    """ Get information about each topic including its ID, frequency, and name.
    +    """Get information about each topic including its ID, frequency, and name.
     
         Arguments:
             topic: A specific topic for which you want the frequency
    @@ -9691,7 +10079,6 @@ 

    info: The information relating to either a single topic or all topics Examples: - ```python info_df = topic_model.get_topic_info() ``` @@ -9715,7 +10102,9 @@

    if self.topic_aspects_: for aspect, values in self.topic_aspects_.items(): if isinstance(list(values.values())[-1], list): - if isinstance(list(values.values())[-1][0], tuple) or isinstance(list(values.values())[-1][0], list): + if isinstance(list(values.values())[-1][0], tuple) or isinstance( + list(values.values())[-1][0], list + ): values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()} elif isinstance(list(values.values())[-1][0], str): values = {topic: " ".join(value).strip() for topic, value in values.items()} @@ -9755,7 +10144,7 @@

    -

    Extract the topic tree such that it can be printed

    +

    Extract the topic tree such that it can be printed.

    Parameters:

    @@ -9772,7 +10161,7 @@

    + This is the output of topic_model.hierarchical_topics()

    @@ -9832,14 +10221,16 @@

    Source code in bertopic\_bertopic.py
    @staticmethod
    -def get_topic_tree(hier_topics: pd.DataFrame,
    -                   max_distance: float = None,
    -                   tight_layout: bool = False) -> str:
    -    """ Extract the topic tree such that it can be printed
    +def get_topic_tree(
    +    hier_topics: pd.DataFrame,
    +    max_distance: float = None,
    +    tight_layout: bool = False,
    +) -> str:
    +    """Extract the topic tree such that it can be printed.
     
         Arguments:
             hier_topics: A dataframe containing the structure of the topic tree.
    -                     This is the output of `topic_model.hierachical_topics()`
    +                     This is the output of `topic_model.hierarchical_topics()`
             max_distance: The maximum distance between two topics. This value is
                           based on the Distance column in `hier_topics`.
             tight_layout: Whether to use a tight layout (narrow width) for
    @@ -9861,7 +10252,6 @@ 

    from `topic_model.get_topic`. In other words, they are the original un-grouped topics. Examples: - ```python # Train model from bertopic import BERTopic @@ -9886,17 +10276,23 @@

    topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()} # Create tree - tree = {str(row[1].Parent_ID): [str(row[1].Child_Left_ID), str(row[1].Child_Right_ID)] - for row in hier_topics.iterrows()} + tree = { + str(row[1].Parent_ID): [ + str(row[1].Child_Left_ID), + str(row[1].Child_Right_ID), + ] + for row in hier_topics.iterrows() + } def get_tree(start, tree): - """ Based on: https://stackoverflow.com/a/51920869/10532563 """ + """Based on: https://stackoverflow.com/a/51920869/10532563.""" def _tree(to_print, start, parent, tree, grandpa=None, indent=""): - # Get distance between merged topics - distance = hier_topics.loc[(hier_topics.Child_Left_ID == parent) | - (hier_topics.Child_Right_ID == parent), "Distance"] + distance = hier_topics.loc[ + (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent), + "Distance", + ] distance = distance.values[0] if len(distance) > 0 else 10 if parent != start: @@ -9904,7 +10300,6 @@

    to_print += topic_to_name[parent] else: if int(parent) <= max_original_topic: - # Do not append topic ID if they are not merged if distance < max_distance: to_print += "■──" + topic_to_name[parent] + f" ── Topic: {parent}" + "\n" @@ -9922,7 +10317,7 @@

    child = tree[parent][-1] to_print += indent + "└" + "─" - to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width+1)) + to_print = _tree(to_print, start, child, tree, parent, indent + " " * (width + 1)) return to_print @@ -9952,7 +10347,7 @@
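A sketch combining `hierarchical_topics` and `get_topic_tree`, assuming a fitted `topic_model` and the original `docs` list:

```python
# Build the hierarchy first, then render it as a printable tree
hierarchical_topics = topic_model.hierarchical_topics(docs)
tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout=False)
print(tree)
```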

-Return topics with top n words and their c-TF-IDF score
+Return topics with top n words and their c-TF-IDF score.

    Parameters:

hier_topics (DataFrame, required): A dataframe containing the structure of the topic tree.
-                     This is the output of `topic_model.hierachical_topics()`
    @@ -9997,7 +10392,7 @@

    Source code in bertopic\_bertopic.py
    def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:
    -    """ Return topics with top n words and their c-TF-IDF score
    +    """Return topics with top n words and their c-TF-IDF score.
     
         Arguments:
             full: If True, returns all different forms of topic representations
    @@ -10007,7 +10402,6 @@ 

    self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score Examples: - ```python all_topics = topic_model.get_topics() ``` @@ -10033,16 +10427,16 @@
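A short sketch of `get_topics`, again assuming a fitted `topic_model`:

```python
# All topic representations as {topic_id: [(word, c-TF-IDF score), ...]}
all_topics = topic_model.get_topics()

# With full=True, all saved forms of the topic representations are returned
all_representations = topic_model.get_topics(full=True)
```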

-hierarchical_topics(self, docs, linkage_function=None, distance_function=None)
+hierarchical_topics(self, docs, use_ctfidf=True, linkage_function=None, distance_function=None)

-Create a hierarchy of topics
+Create a hierarchy of topics.

To create this hierarchy, BERTopic needs to be already fitted once.
-Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF
+Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings
representation using scipy.cluster.hierarchy.linkage.

Based on that hierarchy, we calculate the topic representation at each
merged step. This is a local representation, as we only assume that the

@@ -10066,6 +10460,13 @@

    The documents you used when calling either fit or fit_transform

+use_ctfidf (bool, default True): Whether to calculate distances between topics based on c-TF-IDF
+embeddings. If False, the embeddings from the embedding model are used.

@@ -10078,8 +10479,8 @@

    Callable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix]

    @@ -10123,14 +10524,17 @@

    Source code in bertopic\_bertopic.py -
    def hierarchical_topics(self,
    -                        docs: List[str],
    -                        linkage_function: Callable[[csr_matrix], np.ndarray] = None,
    -                        distance_function: Callable[[csr_matrix], csr_matrix] = None) -> pd.DataFrame:
    -    """ Create a hierarchy of topics
    +          
    def hierarchical_topics(
    +    self,
    +    docs: List[str],
    +    use_ctfidf: bool = True,
    +    linkage_function: Callable[[csr_matrix], np.ndarray] = None,
    +    distance_function: Callable[[csr_matrix], csr_matrix] = None,
    +) -> pd.DataFrame:
    +    """Create a hierarchy of topics.
     
         To create this hierarchy, BERTopic needs to be already fitted once.
    -    Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF
    +    Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings
         representation using `scipy.cluster.hierarchy.linkage`.
     
         Based on that hierarchy, we calculate the topic representation at each
    @@ -10140,12 +10544,14 @@ 

    Arguments: docs: The documents you used when calling either `fit` or `fit_transform` + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. linkage_function: The linkage function to use. Default is: `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)` distance_function: The distance function to use on the c-TF-IDF matrix. Default is: `lambda x: 1 - cosine_similarity(x)`. - You can pass any function that returns either a square matrix of - shape (n_samples, n_samples) with zeros on the diagonal and + You can pass any function that returns either a square matrix of + shape (n_samples, n_samples) with zeros on the diagonal and non-negative values or condensed distance matrix of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix. @@ -10155,7 +10561,6 @@

    represented by their parents and their children Examples: - ```python from bertopic import BERTopic topic_model = BERTopic() @@ -10181,21 +10586,26 @@

    distance_function = lambda x: 1 - cosine_similarity(x) if linkage_function is None: - linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True) + linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True) # Calculate distance - embeddings = self.c_tf_idf_[self._outliers:] + embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][ + self._outliers : + ] X = distance_function(embeddings) X = validate_distance_matrix(X, embeddings.shape[0]) # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix Z = linkage_function(X) + # Ensuring that the distances between clusters are unique otherwise the flatting of the hierarchy with + # `sch.fcluster(...)` would produce incorrect values for "Topics" for these clusters + if len(Z[:, 2]) != len(np.unique(Z[:, 2])): + Z[:, 2] = get_unique_distances(Z[:, 2]) + # Calculate basic bag-of-words to be iteratively merged later - documents = pd.DataFrame({"Document": docs, - "ID": range(len(docs)), - "Topic": self.topics_}) - documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + documents = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": self.topics_}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :] clean_documents = self._preprocess_text(documents_per_topic.Document.values) @@ -10209,13 +10619,20 @@

    bow = self.vectorizer_model.transform(clean_documents) # Extract clusters - hier_topics = pd.DataFrame(columns=["Parent_ID", "Parent_Name", "Topics", - "Child_Left_ID", "Child_Left_Name", - "Child_Right_ID", "Child_Right_Name"]) + hier_topics = pd.DataFrame( + columns=[ + "Parent_ID", + "Parent_Name", + "Topics", + "Child_Left_ID", + "Child_Left_Name", + "Child_Right_ID", + "Child_Right_Name", + ] + ) for index in tqdm(range(len(Z))): - # Find clustered documents - clusters = sch.fcluster(Z, t=Z[index][2], criterion='distance') - self._outliers + clusters = sch.fcluster(Z, t=Z[index][2], criterion="distance") - self._outliers nr_clusters = len(clusters) # Extract first topic we find to get the set of topics in a merged topic @@ -10258,14 +10675,21 @@

    child_right_name = hier_topics.iloc[int(child_right_id)].Parent_Name # Save results - hier_topics.loc[len(hier_topics), :] = [parent_id, parent_name, - clustered_topics, - int(Z[index][0]), child_left_name, - int(Z[index][1]), child_right_name] + hier_topics.loc[len(hier_topics), :] = [ + parent_id, + parent_name, + clustered_topics, + int(Z[index][0]), + child_left_name, + int(Z[index][1]), + child_right_name, + ] hier_topics["Distance"] = Z[:, 2] hier_topics = hier_topics.sort_values("Parent_ID", ascending=False) - hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]] = hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]].astype(str) + hier_topics[["Parent_ID", "Child_Left_ID", "Child_Right_ID"]] = hier_topics[ + ["Parent_ID", "Child_Left_ID", "Child_Right_ID"] + ].astype(str) return hier_topics
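A sketch of `hierarchical_topics` with the new `use_ctfidf` flag and a custom linkage function; the "single" linkage choice is only illustrative:

```python
import scipy.cluster.hierarchy as sch

# Distances based on topic embeddings instead of c-TF-IDF,
# with a custom (illustrative) linkage function
linkage_function = lambda x: sch.linkage(x, "single", optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(
    docs,
    use_ctfidf=False,
    linkage_function=linkage_function,
)
```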

    @@ -10291,7 +10715,7 @@

-Loads the model from the specified path or directory
+Loads the model from the specified path or directory.

    Parameters:

+use_ctfidf (bool, default True): Whether to calculate distances between topics based on c-TF-IDF
+embeddings. If False, the embeddings from the embedding model are used.

linkage_function (Callable[[scipy.sparse._csr.csr_matrix], numpy.ndarray])

The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x).
-You can pass any function that returns either a square matrix of
-shape (n_samples, n_samples) with zeros on the diagonal and
+You can pass any function that returns either a square matrix of
+shape (n_samples, n_samples) with zeros on the diagonal and
non-negative values or condensed distance matrix of shape
(n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix.
    @@ -10331,10 +10755,8 @@

    Source code in bertopic\_bertopic.py
    @classmethod
    -def load(cls,
    -         path: str,
    -         embedding_model=None):
    -    """ Loads the model from the specified path or directory
    +def load(cls, path: str, embedding_model=None):
    +    """Loads the model from the specified path or directory.
     
         Arguments:
             path: Either load a BERTopic model from a file (`.pickle`) or a folder containing
    @@ -10343,7 +10765,6 @@ 

    in the BERTopic model file or directory. Examples: - ```python BERTopic.load("model_dir") ``` @@ -10358,10 +10779,10 @@

    # Load from Pickle if file_or_dir.is_file(): - with open(file_or_dir, 'rb') as file: + with open(file_or_dir, "rb") as file: if embedding_model: topic_model = joblib.load(file) - topic_model.embedding_model = select_backend(embedding_model) + topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose) else: topic_model = joblib.load(file) return topic_model @@ -10373,12 +10794,19 @@

    topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path) else: raise ValueError("Make sure to either pass a valid directory or HF model.") - topic_model = _create_model_from_files(topics, params, tensors, ctfidf_tensors, ctfidf_config, images, - warn_no_backend=(embedding_model is None)) + topic_model = _create_model_from_files( + topics, + params, + tensors, + ctfidf_tensors, + ctfidf_config, + images, + warn_no_backend=(embedding_model is None), + ) # Replace embedding model if one is specifically chosen if embedding_model is not None: - topic_model.embedding_model = select_backend(embedding_model) + topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose) return topic_model
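A `load` sketch; the directory name and the sentence-transformers pointer are placeholders:

```python
from bertopic import BERTopic

# Load from a directory saved with safetensors/pytorch serialization,
# re-attaching a (placeholder) sentence-transformers embedding model
topic_model = BERTopic.load(
    "model_dir",
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
)
```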

    @@ -10484,8 +10912,8 @@

    Source code in bertopic\_bertopic.py
    @classmethod
    -def merge_models(cls, models, min_similarity: float = .7, embedding_model=None):
    -    """ Merge multiple pre-trained BERTopic models into a single model.
    +def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None):
    +    """Merge multiple pre-trained BERTopic models into a single model.
     
         The models are merged as if they were all saved using pytorch or
         safetensors, so a minimal version without c-TF-IDF.
    @@ -10512,7 +10940,6 @@ 

    loading a model from the HuggingFace Hub without c-TF-IDF Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -10532,7 +10959,6 @@

    # Temporarily save model and push to HF with TemporaryDirectory() as tmpdir: - # Save model weights and config. all_topics, all_params, all_tensors = [], [], [] for index, model in enumerate(models): @@ -10555,7 +10981,9 @@

    sims = np.max(sim_matrix, axis=1) # Extract new topics - new_topics = sorted([index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity]) + new_topics = sorted( + [index - selected_topics["_outliers"] for index, sim in enumerate(sims) if sim < min_similarity] + ) max_topic = max(set(merged_topics["topics"])) # Merge Topic Representations @@ -10564,7 +10992,9 @@

    if new_topic != -1: max_topic += 1 new_topics_dict[new_topic] = max_topic - merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][str(new_topic)] + merged_topics["topic_representations"][str(max_topic)] = selected_topics["topic_representations"][ + str(new_topic) + ] merged_topics["topic_labels"][str(max_topic)] = selected_topics["topic_labels"][str(new_topic)] # Add new aspects @@ -10590,14 +11020,14 @@

    merged_tensors = np.vstack([merged_tensors, new_tensors]) # Topic Mapper - merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic+1, 1))).mappings_ + merged_topics["topic_mapper"] = TopicMapper(list(range(-1, max_topic + 1, 1))).mappings_ # Find similar topics and re-assign those from the new models sims_idx = np.argmax(sim_matrix, axis=1) sims = np.max(sim_matrix, axis=1) to_merge = { - a - selected_topics["_outliers"]: - b - merged_topics["_outliers"] for a, (b, val) in enumerate(zip(sims_idx, sims)) + a - selected_topics["_outliers"]: b - merged_topics["_outliers"] + for a, (b, val) in enumerate(zip(sims_idx, sims)) if val >= min_similarity } to_merge.update(new_topics_dict) @@ -10608,12 +11038,21 @@

    # Create a new model from the merged parameters merged_tensors = {"topic_embeddings": torch.from_numpy(merged_tensors)} - merged_model = _create_model_from_files(merged_topics, merged_params, merged_tensors, None, None, None, warn_no_backend=False) + merged_model = _create_model_from_files( + merged_topics, + merged_params, + merged_tensors, + None, + None, + None, + warn_no_backend=False, + ) merged_model.embedding_model = models[0].embedding_model # Replace embedding model if one is specifically chosen + verbose = any([model.verbose for model in models]) if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder: - merged_model.embedding_model = select_backend(embedding_model) + merged_model.embedding_model = select_backend(embedding_model, verbose=verbose) return merged_model
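A `merge_models` sketch; `model_en` and `model_nl` stand in for any two previously fitted BERTopic models:

```python
from bertopic import BERTopic

# Topics from the second model that are not similar enough to topics
# in the first model are added as new topics
merged_model = BERTopic.merge_models([model_en, model_nl], min_similarity=0.7)
```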

    @@ -10667,7 +11106,7 @@

    + fit or fit_transform.

    @@ -10687,12 +11126,13 @@

    Source code in bertopic\_bertopic.py -
    def merge_topics(self,
    -                 docs: List[str],
    -                 topics_to_merge: List[Union[Iterable[int], int]],
    -                 images: List[str] = None) -> None:
    -    """
    -    Arguments:
    +          
    def merge_topics(
    +    self,
    +    docs: List[str],
    +    topics_to_merge: List[Union[Iterable[int], int]],
    +    images: List[str] = None,
    +) -> None:
    +    """Arguments:
             docs: The documents you used when calling either `fit` or `fit_transform`
             topics_to_merge: Either a list of topics or a list of list of topics
                              to merge. For example:
    @@ -10700,10 +11140,9 @@ 

    [[1, 2], [3, 4]] will merge topics 1 and 2, and separately merge topics 3 and 4. images: A list of paths to the images used when calling either - `fit` or `fit_transform` + `fit` or `fit_transform`. Examples: - If you want to merge topics 1, 2, and 3: ```python @@ -10722,7 +11161,14 @@

    """ check_is_fitted(self) check_documents_type(docs) - documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))}) + documents = pd.DataFrame( + { + "Document": docs, + "Topic": self.topics_, + "Image": images, + "ID": range(len(docs)), + } + ) mapping = {topic: topic for topic in set(self.topics_)} if isinstance(topics_to_merge[0], int): @@ -10733,17 +11179,21 @@

    for topic in topic_group: mapping[topic] = topic_group[0] else: - raise ValueError("Make sure that `topics_to_merge` is either" - "a list of topics or a list of list of topics.") + raise ValueError( + "Make sure that `topics_to_merge` is either" "a list of topics or a list of list of topics." + ) # Track mappings and sizes of topics for merging topic embeddings mappings = defaultdict(list) for key, val in sorted(mapping.items()): mappings[val].append(key) - mappings = {topic_from: - {"topics_to": topics_to, - "topic_sizes": [self.topic_sizes_[topic] for topic in topics_to]} - for topic_from, topics_to in mappings.items()} + mappings = { + topic_to: { + "topics_from": topics_from, + "topic_sizes": [self.topic_sizes_[topic] for topic in topics_from], + } + for topic_to, topics_from in mappings.items() + } # Update topics documents.Topic = documents.Topic.map(mapping) @@ -10786,7 +11236,7 @@
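A `merge_topics` sketch on a fitted `topic_model`; the topic IDs are arbitrary:

```python
# Merge topics 1 and 2 into one topic, and topics 3 and 4 into another
topic_model.merge_topics(docs, topics_to_merge=[[1, 2], [3, 4]])

# Or merge a single group of topics
topic_model.merge_topics(docs, topics_to_merge=[1, 2, 3])
```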

    procedure now works as follows:

    For each subset of the data:

-1. Generate embeddings with a pre-traing language model
+1. Generate embeddings with a pre-trained language model
2. Incrementally update the dimensionality reduction algorithm with partial_fit
3. Incrementally update the cluster algorithm with partial_fit
4. Incrementally update the OnlineCountVectorizer and apply some form of decay

@@ -10854,11 +11304,13 @@

      Source code in bertopic\_bertopic.py -
      def partial_fit(self,
      -                documents: List[str],
      -                embeddings: np.ndarray = None,
      -                y: Union[List[int], np.ndarray] = None):
      -    """ Fit BERTopic on a subset of the data and perform online learning
      +          
      def partial_fit(
      +    self,
      +    documents: List[str],
      +    embeddings: np.ndarray = None,
      +    y: Union[List[int], np.ndarray] = None,
      +):
      +    """Fit BERTopic on a subset of the data and perform online learning
           with batch-like data.
       
           Online topic modeling in BERTopic is performed by using dimensionality
      @@ -10875,7 +11327,7 @@ 

      For each subset of the data: - 1. Generate embeddings with a pre-traing language model + 1. Generate embeddings with a pre-trained language model 2. Incrementally update the dimensionality reduction algorithm with `partial_fit` 3. Incrementally update the cluster algorithm with `partial_fit` 4. Incrementally update the OnlineCountVectorizer and apply some form of decay @@ -10891,7 +11343,6 @@

      specific instance is specified. Examples: - ```python from sklearn.datasets import fetch_20newsgroups from sklearn.cluster import MiniBatchKMeans @@ -10919,28 +11370,31 @@

      # Checks check_embeddings_shape(embeddings, documents) if not hasattr(self.hdbscan_model, "partial_fit"): - raise ValueError("In order to use `.partial_fit`, the cluster model should have " - "a `.partial_fit` function.") + raise ValueError( + "In order to use `.partial_fit`, the cluster model should have " "a `.partial_fit` function." + ) # Prepare documents if isinstance(documents, str): documents = [documents] - documents = pd.DataFrame({"Document": documents, - "ID": range(len(documents)), - "Topic": None}) + documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": None}) # Extract embeddings if embeddings is None: if self.topic_representations_ is None: - self.embedding_model = select_backend(self.embedding_model, - language=self.language) - embeddings = self._extract_embeddings(documents.Document.values.tolist(), - method="document", - verbose=self.verbose) + self.embedding_model = select_backend( + self.embedding_model, language=self.language, verbose=self.verbose + ) + embeddings = self._extract_embeddings( + documents.Document.values.tolist(), + method="document", + verbose=self.verbose, + ) else: if self.embedding_model is not None and self.topic_representations_ is None: - self.embedding_model = select_backend(self.embedding_model, - language=self.language) + self.embedding_model = select_backend( + self.embedding_model, language=self.language, verbose=self.verbose + ) # Reduce dimensionality if self.seed_topic_list is not None and self.embedding_model is not None: @@ -10971,25 +11425,25 @@

      missing_topics = {} # Prepare documents - documents_per_topic = documents.sort_values("Topic").groupby(['Topic'], as_index=False) + documents_per_topic = documents.sort_values("Topic").groupby(["Topic"], as_index=False) updated_topics = documents_per_topic.first().Topic.astype(int) - documents_per_topic = documents_per_topic.agg({'Document': ' '.join}) + documents_per_topic = documents_per_topic.agg({"Document": " ".join}) # Update topic representations self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True) - self.topic_representations_ = self._extract_words_per_topic(updated_words, documents, self.c_tf_idf_, calculate_aspects=False) + self.topic_representations_ = self._extract_words_per_topic( + updated_words, documents, self.c_tf_idf_, calculate_aspects=False + ) self._create_topic_vectors() - self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]]) - for key, values in self.topic_representations_.items()} # Update topic sizes if len(missing_topics) > 0: - documents = documents.iloc[:-len(missing_topics)] + documents = documents.iloc[: -len(missing_topics)] if self.topic_sizes_ is None: self._update_topic_size(documents) else: - sizes = documents.groupby(['Topic'], as_index=False).count() + sizes = documents.groupby(["Topic"], as_index=False).count() for _, row in sizes.iterrows(): topic = int(row.Topic) if self.topic_sizes_.get(topic) is not None and topic not in missing_topics: @@ -11019,7 +11473,7 @@
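A minimal online-learning sketch close to the docstring above; the batch size and cluster count are illustrative:

```python
from bertopic import BERTopic
from bertopic.vectorizers import OnlineCountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]

topic_model = BERTopic(
    umap_model=IncrementalPCA(n_components=5),
    hdbscan_model=MiniBatchKMeans(n_clusters=50, random_state=0),
    vectorizer_model=OnlineCountVectorizer(stop_words="english", decay=0.01),
)

# Feed the data in batches of 1000 documents
for i in range(0, len(docs), 1000):
    topic_model.partial_fit(docs[i : i + 1000])
```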

-Push your BERTopic model to a HuggingFace Hub
+Push your BERTopic model to a HuggingFace Hub.

      Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:

• Log in to your HuggingFace account with the following command:

@@ -11123,19 +11577,19 @@

        Source code in bertopic\_bertopic.py
        def push_to_hf_hub(
        -        self,
        -        repo_id: str,
        -        commit_message: str = 'Add BERTopic model',
        -        token: str = None,
        -        revision: str = None,
        -        private: bool = False,
        -        create_pr: bool = False,
        -        model_card: bool = True,
        -        serialization: str = "safetensors",
        -        save_embedding_model: Union[str, bool] = True,
        -        save_ctfidf: bool = False,
        -        ):
        -    """ Push your BERTopic model to a HuggingFace Hub
        +    self,
        +    repo_id: str,
        +    commit_message: str = "Add BERTopic model",
        +    token: str = None,
        +    revision: str = None,
        +    private: bool = False,
        +    create_pr: bool = False,
        +    model_card: bool = True,
        +    serialization: str = "safetensors",
        +    save_embedding_model: Union[str, bool] = True,
        +    save_ctfidf: bool = False,
        +):
        +    """Push your BERTopic model to a HuggingFace Hub.
         
             Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:
         
        @@ -11170,7 +11624,6 @@ 

        Examples: - ```python topic_model.push_to_hf_hub( repo_id="ArXiv", @@ -11179,10 +11632,19 @@

        ) ``` """ - return save_utils.push_to_hf_hub(model=self, repo_id=repo_id, commit_message=commit_message, - token=token, revision=revision, private=private, create_pr=create_pr, - model_card=model_card, serialization=serialization, - save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf) + return save_utils.push_to_hf_hub( + model=self, + repo_id=repo_id, + commit_message=commit_message, + token=token, + revision=revision, + private=private, + create_pr=create_pr, + model_card=model_card, + serialization=serialization, + save_embedding_model=save_embedding_model, + save_ctfidf=save_ctfidf, + )
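A `push_to_hf_hub` sketch; the repo_id is a placeholder and the call assumes you are already logged in to the Hub:

```python
# Requires being logged in, e.g. via `huggingface-cli login`
topic_model.push_to_hf_hub(
    repo_id="my-username/my-bertopic-model",  # placeholder repository
    serialization="safetensors",
    save_ctfidf=True,
)
```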

      @@ -11279,6 +11741,12 @@

+probabilities (ndarray, default None): Probabilities generated by HDBSCAN for each document when
+using the strategy "probabilities".

@@ -11332,16 +11800,18 @@

    Source code in bertopic\_bertopic.py -
    def reduce_outliers(self,
    -                    documents: List[str],
    -                    topics: List[int],
    -                    images: List[str] = None,
    -                    strategy: str = "distributions",
    -                    probabilities: np.ndarray = None,
    -                    threshold: float = 0,
    -                    embeddings: np.ndarray = None,
    -                    distributions_params: Mapping[str, Any] = {}) -> List[int]:
    -    """ Reduce outliers by merging them with their nearest topic according
    +          
    def reduce_outliers(
    +    self,
    +    documents: List[str],
    +    topics: List[int],
    +    images: List[str] = None,
    +    strategy: str = "distributions",
    +    probabilities: np.ndarray = None,
    +    threshold: float = 0,
    +    embeddings: np.ndarray = None,
    +    distributions_params: Mapping[str, Any] = {},
    +) -> List[int]:
    +    """Reduce outliers by merging them with their nearest topic according
         to one of several strategies.
     
         When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created
    @@ -11388,6 +11858,7 @@ 

    * "embeddings" Calculate the embeddings for outlier documents and find the best matching topic embedding. + probabilities: Probabilities generated by HDBSCAN for each document when using the strategy `"probabilities"`. threshold: The threshold for assigning topics to outlier documents. This value represents the minimum probability when `strategy="probabilities"`. For all other strategies, it represents the minimum similarity. @@ -11418,6 +11889,9 @@

    new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities") ``` """ + if not self._outliers: + raise ValueError("No outliers to reduce.") + if images is not None: strategy = "embeddings" @@ -11427,14 +11901,18 @@

    # Reduce outliers by extracting most likely topics through the topic-term probability matrix if strategy.lower() == "probabilities": - new_topics = [np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic - for topic, prob in zip(topics, probabilities)] + new_topics = [ + np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic + for topic, prob in zip(topics, probabilities) + ] # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions elif strategy.lower() == "distributions": outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] outlier_docs = [documents[index] for index in outlier_ids] - topic_distr, _ = self.approximate_distribution(outlier_docs, min_similarity=threshold, **distributions_params) + topic_distr, _ = self.approximate_distribution( + outlier_docs, min_similarity=threshold, **distributions_params + ) outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr]) new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics] @@ -11446,7 +11924,7 @@

    # Calculate c-TF-IDF of outlier documents with all topics bow_doc = self.vectorizer_model.transform(outlier_docs) c_tf_idf_doc = self.ctfidf_model.transform(bow_doc) - similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:]) + similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 @@ -11456,8 +11934,10 @@

    # Reduce outliers by finding the most similar topic embeddings elif strategy.lower() == "embeddings": if self.embedding_model is None and embeddings is None: - raise ValueError("To use this strategy, you will need to pass a model to `embedding_model`" - "when instantiating BERTopic.") + raise ValueError( + "To use this strategy, you will need to pass a model to `embedding_model`" + "when instantiating BERTopic." + ) outlier_ids = [index for index, topic in enumerate(topics) if topic == -1] if images is not None: outlier_docs = [images[index] for index in outlier_ids] @@ -11472,7 +11952,7 @@

    outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose) else: outlier_embeddings = self.embedding_model.embed_documents(outlier_docs) - similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers:]) + similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers :]) # Update topics similarity[similarity < threshold] = 0 @@ -11493,7 +11973,7 @@
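A `reduce_outliers` sketch; the "c-tf-idf" strategy and threshold are illustrative choices, and `topics` are the assignments returned by `fit_transform`:

```python
# Reassign outlier (-1) documents to their nearest topic via c-TF-IDF similarity
new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold=0.1)

# Optionally persist the new assignments in the model
topic_model.update_topics(docs, topics=new_topics)
```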

-reduce_topics(self, docs, nr_topics=20, images=None)
+reduce_topics(self, docs, nr_topics=20, images=None, use_ctfidf=False)

    @@ -11504,7 +11984,7 @@

    or automatically.

If nr_topics is an integer, then the number of topics is reduced
to nr_topics using AgglomerativeClustering on the cosine distance matrix
-of the topic embeddings.
+of the topic c-TF-IDF or semantic embeddings.

    If nr_topics is "auto", then HDBSCAN is used to automatically reduce the number of topics by running it on the topic embeddings.

    The topics, their sizes, and representations are updated.

    @@ -11539,6 +12019,13 @@

fit or fit_transform

images (List[str], default None): A list of paths to the images used when calling either
-fit or fit_transform

strategy (default 'distributions')

+probabilities (ndarray, default None): Probabilities generated by HDBSCAN for each document when
+using the strategy "probabilities".

threshold (float, default None)

+use_ctfidf (bool, default False): Whether to calculate distances between topics based on c-TF-IDF
+embeddings. If False, the embeddings from the embedding model are used.

    Updates

    @@ -11559,16 +12046,19 @@

    Source code in bertopic\_bertopic.py -
    def reduce_topics(self,
    -                  docs: List[str],
    -                  nr_topics: Union[int, str] = 20,
    -                  images: List[str] = None) -> None:
    -    """ Reduce the number of topics to a fixed number of topics
    +          
    def reduce_topics(
    +    self,
    +    docs: List[str],
    +    nr_topics: Union[int, str] = 20,
    +    images: List[str] = None,
    +    use_ctfidf: bool = False,
    +) -> None:
    +    """Reduce the number of topics to a fixed number of topics
         or automatically.
     
         If nr_topics is an integer, then the number of topics is reduced
         to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
    -    of the topic embeddings.
    +    of the topic c-TF-IDF or semantic embeddings.
     
         If nr_topics is `"auto"`, then HDBSCAN is used to automatically
         reduce the number of topics by running it on the topic embeddings.
    @@ -11580,13 +12070,14 @@ 

    nr_topics: The number of topics you want reduced to images: A list of paths to the images used when calling either `fit` or `fit_transform` + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. Updates: topics_ : Assigns topics to their merged representations. probabilities_ : Assigns probabilities to their merged representations. Examples: - You can further reduce the topics by passing the documents with their topics and probabilities (if they were calculated): @@ -11605,10 +12096,17 @@

    check_documents_type(docs) self.nr_topics = nr_topics - documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))}) + documents = pd.DataFrame( + { + "Document": docs, + "Topic": self.topics_, + "Image": images, + "ID": range(len(docs)), + } + ) # Reduce number of topics - documents = self._reduce_topics(documents) + documents = self._reduce_topics(documents, use_ctfidf) self._merged_topics = None self._save_representative_docs(documents) self.probabilities_ = self._map_probabilities(self.probabilities_) @@ -11634,7 +12132,7 @@
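A `reduce_topics` sketch showing both a fixed and an automatic reduction; the numbers are illustrative:

```python
# Reduce to roughly 30 topics using the semantic topic embeddings
topic_model.reduce_topics(docs, nr_topics=30, use_ctfidf=False)

# Or let HDBSCAN decide the number of topics automatically
topic_model.reduce_topics(docs, nr_topics="auto")
```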

-Saves the model to the specified path or folder
+Saves the model to the specified path or folder.

When saving the model, make sure to also keep track of the versions
of dependencies and Python used. Loading and saving the model should
be done using the same dependencies and Python. Moreover, models

@@ -11703,12 +12201,14 @@

    Source code in bertopic\_bertopic.py -
    def save(self,
    -         path,
    -         serialization: Literal["safetensors", "pickle", "pytorch"] = "pickle",
    -         save_embedding_model: Union[bool, str] = True,
    -         save_ctfidf: bool = False):
    -    """ Saves the model to the specified path or folder
    +          
    def save(
    +    self,
    +    path,
    +    serialization: Literal["safetensors", "pickle", "pytorch"] = "pickle",
    +    save_embedding_model: Union[bool, str] = True,
    +    save_ctfidf: bool = False,
    +):
    +    """Saves the model to the specified path or folder.
     
         When saving the model, make sure to also keep track of the versions
         of dependencies and Python used. Loading and saving the model should
    @@ -11730,7 +12230,6 @@ 

    or `pytorch` Examples: - To save the model in an efficient and safe format (safetensors) with c-TF-IDF information: ```python @@ -11755,13 +12254,14 @@

    safetensors. """ if serialization == "pickle": - logger.warning("When you use `pickle` to save/load a BERTopic model," - "please make sure that the environments in which you save" - "and load the model are **exactly** the same. The version of BERTopic," - "its dependencies, and python need to remain the same.") - - with open(path, 'wb') as file: + logger.warning( + "When you use `pickle` to save/load a BERTopic model," + "please make sure that the environments in which you save" + "and load the model are **exactly** the same. The version of BERTopic," + "its dependencies, and python need to remain the same." + ) + with open(path, "wb") as file: # This prevents the vectorizer from being too large in size if `min_df` was # set to a value higher than 1 self.vectorizer_model.stop_words_ = None @@ -11774,30 +12274,43 @@

    else: joblib.dump(self, file) elif serialization == "safetensors" or serialization == "pytorch": - # Directory save_directory = Path(path) save_directory.mkdir(exist_ok=True, parents=True) # Check embedding model - if save_embedding_model and hasattr(self.embedding_model, '_hf_model') and not isinstance(save_embedding_model, str): + if ( + save_embedding_model + and hasattr(self.embedding_model, "_hf_model") + and not isinstance(save_embedding_model, str) + ): save_embedding_model = self.embedding_model._hf_model elif not save_embedding_model: - logger.warning("You are saving a BERTopic model without explicitly defining an embedding model." - "If you are using a sentence-transformers model or a HuggingFace model supported" - "by sentence-transformers, please save the model by using a pointer towards that model." - "For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`") + logger.warning( + "You are saving a BERTopic model without explicitly defining an embedding model." + "If you are using a sentence-transformers model or a HuggingFace model supported" + "by sentence-transformers, please save the model by using a pointer towards that model." + "For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`" + ) # Minimal save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization) save_utils.save_topics(model=self, path=save_directory / "topics.json") save_utils.save_images(model=self, path=save_directory / "images") - save_utils.save_config(model=self, path=save_directory / 'config.json', embedding_model=save_embedding_model) + save_utils.save_config( + model=self, + path=save_directory / "config.json", + embedding_model=save_embedding_model, + ) # Additional if save_ctfidf: - save_utils.save_ctfidf(model=self, save_directory=save_directory, serialization=serialization) - save_utils.save_ctfidf_config(model=self, path=save_directory / 'ctfidf_config.json') + save_utils.save_ctfidf( + model=self, + save_directory=save_directory, + serialization=serialization, + ) + save_utils.save_ctfidf_config(model=self, path=save_directory / "ctfidf_config.json")
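A `save` sketch using safetensors; the directory name and the embedding-model pointer are placeholders:

```python
# Safe, environment-independent serialization with c-TF-IDF included;
# the embedding model is stored as a pointer to a hub model
topic_model.save(
    "my_model_dir",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
)
```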

    @@ -11818,7 +12331,7 @@

-Set custom topic labels in your fitted BERTopic model
+Set custom topic labels in your fitted BERTopic model.

    Parameters:

    @@ -11869,7 +12382,7 @@

    Source code in bertopic\_bertopic.py
    def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None:
    -    """ Set custom topic labels in your fitted BERTopic model
    +    """Set custom topic labels in your fitted BERTopic model.
     
         Arguments:
             topic_labels: If a list of topic labels, it should contain the same number
    @@ -11881,7 +12394,6 @@ 

    in the dictionary. Examples: - First, we define our topic labels with `.generate_topic_labels` in which we can customize our topic labels: @@ -11917,14 +12429,18 @@

    else: info = self.get_topic_info() original_labels = dict(zip(info.Topic, info.Name)) - custom_labels = [topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] for topic in unique_topics] + custom_labels = [ + topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] + for topic in unique_topics + ] elif isinstance(topic_labels, list): if len(topic_labels) == len(unique_topics): custom_labels = topic_labels else: - raise ValueError("Make sure that `topic_labels` contains the same number " - "of labels as there are topics.") + raise ValueError( + "Make sure that `topic_labels` contains the same number " "of labels as there are topics." + ) self.custom_labels_ = custom_labels
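A `set_topic_labels` sketch, first generating labels from the top words and then overriding two of them manually (the label strings are made up):

```python
# Generate labels from the top 3 words per topic, then set them
topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=", ")
topic_model.set_topic_labels(topic_labels)

# Override individual topics with a dictionary
topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"})
```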

    @@ -11947,7 +12463,7 @@

-Create topics over time
+Create topics over time.

To create the topics over time, BERTopic needs to be already fitted once.
From the fitted models, the c-TF-IDF representations are calculated at
each timestamp t. Then, the c-TF-IDF representations at timestamp t are

@@ -12060,16 +12576,17 @@

    Source code in bertopic\_bertopic.py -
    def topics_over_time(self,
    -                     docs: List[str],
    -                     timestamps: Union[List[str],
    -                                       List[int]],
    -                     topics: List[int] = None,
    -                     nr_bins: int = None,
    -                     datetime_format: str = None,
    -                     evolution_tuning: bool = True,
    -                     global_tuning: bool = True) -> pd.DataFrame:
    -    """ Create topics over time
    +          
    def topics_over_time(
    +    self,
    +    docs: List[str],
    +    timestamps: Union[List[str], List[int]],
    +    topics: List[int] = None,
    +    nr_bins: int = None,
    +    datetime_format: str = None,
    +    evolution_tuning: bool = True,
    +    global_tuning: bool = True,
    +) -> pd.DataFrame:
    +    """Create topics over time.
     
         To create the topics over time, BERTopic needs to be already fitted once.
    From the fitted models, the c-TF-IDF representations are calculated at
    @@ -12077,7 +12594,7 @@ 

    averaged with the global c-TF-IDF representations in order to fine-tune the local representations. - NOTE: + Note: Make sure to use a limited number of unique timestamps (<100) as the c-TF-IDF representation will be calculated at each single unique timestamp. Having a large number of unique timestamps can take some time to be calculated. @@ -12113,7 +12630,6 @@

    at timestamp *t*. Examples: - The timestamps variable represents the timestamp of each document. If you have over 100 unique timestamps, it is advised to bin the timestamps as shown below: @@ -12128,16 +12644,18 @@

    check_documents_type(docs) selected_topics = topics if topics else self.topics_ documents = pd.DataFrame({"Document": docs, "Topic": selected_topics, "Timestamps": timestamps}) - global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False) + global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False) all_topics = sorted(list(documents.Topic.unique())) all_topics_indices = {topic: index for index, topic in enumerate(all_topics)} if isinstance(timestamps[0], str): infer_datetime_format = True if not datetime_format else False - documents["Timestamps"] = pd.to_datetime(documents["Timestamps"], - infer_datetime_format=infer_datetime_format, - format=datetime_format) + documents["Timestamps"] = pd.to_datetime( + documents["Timestamps"], + infer_datetime_format=infer_datetime_format, + format=datetime_format, + ) if nr_bins: documents["Bins"] = pd.cut(documents.Timestamps, bins=nr_bins) @@ -12147,34 +12665,45 @@

    documents = documents.sort_values("Timestamps") timestamps = documents.Timestamps.unique() if len(timestamps) > 100: - logger.warning(f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) " - "which significantly slows down the application. Consider setting `nr_bins` " - "to a value lower than 100 to speed up calculation. ") + logger.warning( + f"There are more than 100 unique timestamps (i.e., {len(timestamps)}) " + "which significantly slows down the application. Consider setting `nr_bins` " + "to a value lower than 100 to speed up calculation. " + ) # For each unique timestamp, create topic representations topics_over_time = [] for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose): - # Calculate c-TF-IDF representation for a specific timestamp selection = documents.loc[documents.Timestamps == timestamp, :] - documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join, - "Timestamps": "count"}) + documents_per_topic = selection.groupby(["Topic"], as_index=False).agg( + {"Document": " ".join, "Timestamps": "count"} + ) c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False) if global_tuning or evolution_tuning: - c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False) + c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False) # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF # matrix at timestamp t-1 if evolution_tuning and index != 0: current_topics = sorted(list(documents_per_topic.Topic.values)) - overlapping_topics = sorted(list(set(previous_topics).intersection(set(current_topics)))) + overlapping_topics = sorted( + list(set(previous_topics).intersection(set(current_topics))) # noqa: F821 + ) current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics] - previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics] - - c_tf_idf.tolil()[current_overlap_idx] = ((c_tf_idf[current_overlap_idx] + - previous_c_tf_idf[previous_overlap_idx]) / 2.0).tolil() + previous_overlap_idx = [ + previous_topics.index(topic) # noqa: F821 + for topic in overlapping_topics + ] + + c_tf_idf.tolil()[current_overlap_idx] = ( + ( + c_tf_idf[current_overlap_idx] + previous_c_tf_idf[previous_overlap_idx] # noqa: F821 + ) + / 2.0 + ).tolil() # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation # by simply taking the average of the two @@ -12184,19 +12713,25 @@

    # Extract the words per topic words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) - topic_frequency = pd.Series(documents_per_topic.Timestamps.values, - index=documents_per_topic.Topic).to_dict() + topic_frequency = pd.Series( + documents_per_topic.Timestamps.values, index=documents_per_topic.Topic + ).to_dict() # Fill dataframe with results - topics_at_timestamp = [(topic, - ", ".join([words[0] for words in values][:5]), - topic_frequency[topic], - timestamp) for topic, values in words_per_topic.items()] + topics_at_timestamp = [ + ( + topic, + ", ".join([words[0] for words in values][:5]), + topic_frequency[topic], + timestamp, + ) + for topic, values in words_per_topic.items() + ] topics_over_time.extend(topics_at_timestamp) if evolution_tuning: - previous_topics = sorted(list(documents_per_topic.Topic.values)) - previous_c_tf_idf = c_tf_idf.copy() + previous_topics = sorted(list(documents_per_topic.Topic.values)) # noqa: F841 + previous_c_tf_idf = c_tf_idf.copy() # noqa: F841 return pd.DataFrame(topics_over_time, columns=["Topic", "Words", "Frequency", "Timestamp"])
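A `topics_over_time` sketch; `timestamps` is assumed to hold one timestamp per document:

```python
# Bin timestamps so that fewer than 100 unique timestamps remain
topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)

# Inspect the evolution of the largest topics
fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
```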

    @@ -12219,7 +12754,7 @@

-Create topics per class
+Create topics per class.

To create the topics per class, BERTopic needs to be already fitted once.
From the fitted models, the c-TF-IDF representations are calculated at
each class c. Then, the c-TF-IDF representations at class c are

@@ -12292,11 +12827,13 @@

    Source code in bertopic\_bertopic.py -
    def topics_per_class(self,
    -                     docs: List[str],
    -                     classes: Union[List[int], List[str]],
    -                     global_tuning: bool = True) -> pd.DataFrame:
    -    """ Create topics per class
    +          
    def topics_per_class(
    +    self,
    +    docs: List[str],
    +    classes: Union[List[int], List[str]],
    +    global_tuning: bool = True,
    +) -> pd.DataFrame:
    +    """Create topics per class.
     
         To create the topics per class, BERTopic needs to be already fitted once.
         From the fitted models, the c-TF-IDF representations are calculated at
    @@ -12305,7 +12842,7 @@ 

    local representations. This can be turned off if the pure representation is needed. - NOTE: + Note: Make sure to use a limited number of unique classes (<100) as the c-TF-IDF representation will be calculated at each single unique class. Having a large number of unique classes can take some time to be calculated. @@ -12322,7 +12859,6 @@

    for each class. Examples: - ```python from bertopic import BERTopic topic_model = BERTopic() @@ -12332,34 +12868,38 @@

    """ check_documents_type(docs) documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Class": classes}) - global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False) + global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm="l1", copy=False) # For each unique timestamp, create topic representations topics_per_class = [] for _, class_ in tqdm(enumerate(set(classes)), disable=not self.verbose): - # Calculate c-TF-IDF representation for a specific timestamp selection = documents.loc[documents.Class == class_, :] - documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join, - "Class": "count"}) + documents_per_topic = selection.groupby(["Topic"], as_index=False).agg( + {"Document": " ".join, "Class": "count"} + ) c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False) # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation # by simply taking the average of the two if global_tuning: - c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False) + c_tf_idf = normalize(c_tf_idf, axis=1, norm="l1", copy=False) c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0 # Extract the words per topic words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False) - topic_frequency = pd.Series(documents_per_topic.Class.values, - index=documents_per_topic.Topic).to_dict() + topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict() # Fill dataframe with results - topics_at_class = [(topic, - ", ".join([words[0] for words in values][:5]), - topic_frequency[topic], - class_) for topic, values in words_per_topic.items()] + topics_at_class = [ + ( + topic, + ", ".join([words[0] for words in values][:5]), + topic_frequency[topic], + class_, + ) + for topic, values in words_per_topic.items() + ] topics_per_class.extend(topics_at_class) topics_per_class = pd.DataFrame(topics_per_class, columns=["Topic", "Words", "Frequency", "Class"]) @@ -12385,7 +12925,7 @@

-After having fit a model, use transform to predict new instances
+After having fit a model, use transform to predict new instances.

    Parameters:

    @@ -12464,11 +13004,13 @@

    Source code in bertopic\_bertopic.py -
    def transform(self,
    -              documents: Union[str, List[str]],
    -              embeddings: np.ndarray = None,
    -              images: List[str] = None) -> Tuple[List[int], np.ndarray]:
    -    """ After having fit a model, use transform to predict new instances
    +          
    def transform(
    +    self,
    +    documents: Union[str, List[str]],
    +    embeddings: np.ndarray = None,
    +    images: List[str] = None,
    +) -> Tuple[List[int], np.ndarray]:
    +    """After having fit a model, use transform to predict new instances.
     
         Arguments:
             documents: A single document or a list of documents to predict on
    @@ -12484,7 +13026,6 @@ 

    decrease memory usage. Examples: - ```python from bertopic import BERTopic from sklearn.datasets import fetch_20newsgroups @@ -12518,16 +13059,15 @@

    documents = [documents] if embeddings is None: - embeddings = self._extract_embeddings(documents, - images=images, - method="document", - verbose=self.verbose) + embeddings = self._extract_embeddings(documents, images=images, method="document", verbose=self.verbose) # Check if an embedding model was found if embeddings is None: - raise ValueError("No embedding model was found to embed the documents." - "Make sure when loading in the model using BERTopic.load()" - "to also specify the embedding model.") + raise ValueError( + "No embedding model was found to embed the documents." + "Make sure when loading in the model using BERTopic.load()" + "to also specify the embedding model." + ) # Transform without hdbscan_model and umap_model using only cosine similarity elif type(self.hdbscan_model) == BaseCluster: @@ -12549,7 +13089,9 @@

    # Extract predictions and probabilities if it is a HDBSCAN-like model logger.info("Clustering - Approximating new points with `hdbscan_model`") if is_supported_hdbscan(self.hdbscan_model): - predictions, probabilities = hdbscan_delegator(self.hdbscan_model, "approximate_predict", umap_embeddings) + predictions, probabilities = hdbscan_delegator( + self.hdbscan_model, "approximate_predict", umap_embeddings + ) # Calculate probabilities if self.calculate_probabilities: @@ -12679,16 +13221,18 @@
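A `transform` sketch with pre-computed embeddings; the embedding model must match the one used during fitting (the model name here is illustrative):

```python
from sentence_transformers import SentenceTransformer

new_docs = ["The national park has great hiking trails."]

# Pre-computing embeddings is optional but avoids re-embedding on every call
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(new_docs)

topics, probs = topic_model.transform(new_docs, embeddings=embeddings)
```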

    Source code in bertopic\_bertopic.py -
    def update_topics(self,
    -                  docs: List[str],
    -                  images: List[str] = None,
    -                  topics: List[int] = None,
    -                  top_n_words: int = 10,
    -                  n_gram_range: Tuple[int, int] = None,
    -                  vectorizer_model: CountVectorizer = None,
    -                  ctfidf_model: ClassTfidfTransformer = None,
    -                  representation_model: BaseRepresentation = None):
    -    """ Updates the topic representation by recalculating c-TF-IDF with the new
    +          
    def update_topics(
    +    self,
    +    docs: List[str],
    +    images: List[str] = None,
    +    topics: List[int] = None,
    +    top_n_words: int = 10,
    +    n_gram_range: Tuple[int, int] = None,
    +    vectorizer_model: CountVectorizer = None,
    +    ctfidf_model: ClassTfidfTransformer = None,
    +    representation_model: BaseRepresentation = None,
    +):
    +    """Updates the topic representation by recalculating c-TF-IDF with the new
         parameters as defined in this function.
     
         When you have trained a model and viewed the topics and the words that represent them,
    @@ -12715,7 +13259,6 @@ 

    are supported. Examples: - In order to update the topic representation, you will need to first fit the topic model and extract topics from them. Based on these, you can update the representation: @@ -12744,8 +13287,9 @@

    n_gram_range = self.n_gram_range if top_n_words > 100: - logger.warning("Note that extracting more than 100 words from a sparse " - "can slow down computation quite a bit.") + logger.warning( + "Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit." + ) self.top_n_words = top_n_words self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range) self.ctfidf_model = ctfidf_model or ClassTfidfTransformer() @@ -12754,35 +13298,38 @@

    if topics is None: topics = self.topics_ else: - logger.warning("Using a custom list of topic assignments may lead to errors if " - "topic reduction techniques are used afterwards. Make sure that " - "manually assigning topics is the last step in the pipeline." - "Note that topic embeddings will also be created through weighted" - "c-TF-IDF embeddings instead of centroid embeddings.") - - self._outliers = 1 if -1 in set(topics) else 0 + logger.warning( + "Using a custom list of topic assignments may lead to errors if " + "topic reduction techniques are used afterwards. Make sure that " + "manually assigning topics is the last step in the pipeline." + "Note that topic embeddings will also be created through weighted" + "c-TF-IDF embeddings instead of centroid embeddings." + ) - # Extract words documents = pd.DataFrame({"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images}) - documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join}) + documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join}) + + # Update topic sizes and assignments + self._update_topic_size(documents) + + # Extract words and update topic labels self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) self.topic_representations_ = self._extract_words_per_topic(words, documents) # Update topic vectors if set(topics) != self.topics_: - # Remove outlier topic embedding if all that has changed is the outlier class - same_position = all([True if old_topic == new_topic else False for old_topic, new_topic in zip(self.topics_, topics) if old_topic != -1]) + same_position = all( + [ + True if old_topic == new_topic else False + for old_topic, new_topic in zip(self.topics_, topics) + if old_topic != -1 + ] + ) if same_position and -1 not in topics and -1 in self.topics_: self.topic_embeddings_ = self.topic_embeddings_[1:] else: self._create_topic_vectors() - - # Update topic labels - self.topic_labels_ = {key: f"{key}_" + "_".join([word[0] for word in values[:4]]) - for key, values in - self.topic_representations_.items()} - self._update_topic_size(documents)
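An `update_topics` sketch that swaps in a bigram vectorizer after fitting; the parameters are illustrative:

```python
from sklearn.feature_extraction.text import CountVectorizer

# Re-extract topic words with uni- and bigrams, removing English stop words
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model.update_topics(docs, vectorizer_model=vectorizer_model, top_n_words=10)
```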

    @@ -12885,11 +13432,13 @@

    Source code in bertopic\_bertopic.py -
    def visualize_approximate_distribution(self,
    -                                       document: str,
    -                                       topic_token_distribution: np.ndarray,
    -                                       normalize: bool = False):
    -    """ Visualize the topic distribution calculated by `.approximate_topic_distribution`
    +          
    def visualize_approximate_distribution(
    +    self,
    +    document: str,
    +    topic_token_distribution: np.ndarray,
    +    normalize: bool = False,
    +):
    +    """Visualize the topic distribution calculated by `.approximate_topic_distribution`
         on a token level. Thereby indicating the extent to which a certain word or phrase belongs
         to a specific topic. The assumption here is that a single word can belong to multiple
         similar topics and as such can give information about the broader set of topics within
    @@ -12909,7 +13458,6 @@ 

    for each token. Examples: - ```python # Calculate the topic distributions on a token level # Note that we need to have `calculate_token_level=True` @@ -12931,10 +13479,12 @@

    ``` """ check_is_fitted(self) - return plotting.visualize_approximate_distribution(self, - document=document, - topic_token_distribution=topic_token_distribution, - normalize=normalize) + return plotting.visualize_approximate_distribution( + self, + document=document, + topic_token_distribution=topic_token_distribution, + normalize=normalize, + )

    @@ -12955,7 +13505,7 @@

    -

    Visualize a barchart of selected topics

    +

    Visualize a barchart of selected topics.

    Parameters:

    @@ -13047,16 +13597,18 @@

    Source code in bertopic\_bertopic.py -
    def visualize_barchart(self,
    -                       topics: List[int] = None,
    -                       top_n_topics: int = 8,
    -                       n_words: int = 5,
    -                       custom_labels: bool = False,
    -                       title: str = "Topic Word Scores",
    -                       width: int = 250,
    -                       height: int = 250,
    -                       autoscale: bool=False) -> go.Figure:
    -    """ Visualize a barchart of selected topics
    +          
    def visualize_barchart(
    +    self,
    +    topics: List[int] = None,
    +    top_n_topics: int = 8,
    +    n_words: int = 5,
    +    custom_labels: bool = False,
    +    title: str = "Topic Word Scores",
    +    width: int = 250,
    +    height: int = 250,
    +    autoscale: bool = False,
    +) -> go.Figure:
    +    """Visualize a barchart of selected topics.
     
         Arguments:
             topics: A selection of topics to visualize.
    @@ -13073,7 +13625,6 @@ 

    fig: A plotly figure Examples: - To visualize the barchart of selected topics simply run: @@ -13089,15 +13640,17 @@

    ``` """ check_is_fitted(self) - return plotting.visualize_barchart(self, - topics=topics, - top_n_topics=top_n_topics, - n_words=n_words, - custom_labels=custom_labels, - title=title, - width=width, - height=height, - autoscale=autoscale) + return plotting.visualize_barchart( + self, + topics=topics, + top_n_topics=top_n_topics, + n_words=n_words, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + autoscale=autoscale, + )
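A short sketch for the signature above (fitted `topic_model` assumed; the output file name is arbitrary):

```python
fig = topic_model.visualize_barchart(top_n_topics=8, n_words=5, autoscale=False)
fig.write_html("barchart.html")  # or fig.show() in a notebook
```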

    @@ -13118,7 +13671,7 @@

    -

    Visualize the distribution of topic probabilities

    +

    Visualize the distribution of topic probabilities.

    Parameters:

    @@ -13184,14 +13737,16 @@

    Source code in bertopic\_bertopic.py -
    def visualize_distribution(self,
    -                           probabilities: np.ndarray,
    -                           min_probability: float = 0.015,
    -                           custom_labels: bool = False,
    -                           title: str = "<b>Topic Probability Distribution</b>",
    -                           width: int = 800,
    -                           height: int = 600) -> go.Figure:
    -    """ Visualize the distribution of topic probabilities
    +          
    def visualize_distribution(
    +    self,
    +    probabilities: np.ndarray,
    +    min_probability: float = 0.015,
    +    custom_labels: bool = False,
    +    title: str = "<b>Topic Probability Distribution</b>",
    +    width: int = 800,
    +    height: int = 600,
    +) -> go.Figure:
    +    """Visualize the distribution of topic probabilities.
     
         Arguments:
             probabilities: An array of probability scores
    @@ -13204,7 +13759,6 @@ 

    height: The height of the figure. Examples: - Make sure to fit the model before and only input the probabilities of a single document: @@ -13220,13 +13774,15 @@

    ``` """ check_is_fitted(self) - return plotting.visualize_distribution(self, - probabilities=probabilities, - min_probability=min_probability, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_distribution( + self, + probabilities=probabilities, + min_probability=min_probability, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )
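A sketch for the call above. It needs per-document probability vectors, so the model has to be fitted with `calculate_probabilities=True` (re-fitting on the `docs` from the earlier sketch):

```python
prob_model = BERTopic(calculate_probabilities=True)
topics, probs = prob_model.fit_transform(docs)

# Probabilities of a single document across all topics
fig = prob_model.visualize_distribution(probs[0], min_probability=0.015)
fig.show()
```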

    @@ -13274,6 +13830,18 @@

    Source code in bertopic\_bertopic.py -
    def visualize_document_datamap(self,
    -                               docs: List[str],
    -                               topics: List[int] = None,
    -                               embeddings: np.ndarray = None,
    -                               reduced_embeddings: np.ndarray = None,
    -                               custom_labels: Union[bool, str] = False,
    -                               title: str = "Documents and Topics",
    -                               sub_title: Union[str, None] = None,
    -                               width: int = 1200,
    -                               height: int = 1200,
    -                               **datamap_kwds):
    -    """ Visualize documents and their topics in 2D as a static plot for publication using
    +          
    def visualize_document_datamap(
    +    self,
    +    docs: List[str],
    +    topics: List[int] = None,
    +    embeddings: np.ndarray = None,
    +    reduced_embeddings: np.ndarray = None,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "Documents and Topics",
    +    sub_title: Union[str, None] = None,
    +    width: int = 1200,
    +    height: int = 1200,
    +    **datamap_kwds,
    +):
    +    """Visualize documents and their topics in 2D as a static plot for publication using
         DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best
         to use a sufficiently large `min_topic_size` or set `nr_topics` when building the model.
     
         Arguments:
             topic_model:  A fitted BERTopic instance.
             docs: The documents you used when calling either `fit` or `fit_transform`
    +        topics: A selection of topics to visualize.
+                Not to be confused with the topics that you get from `.fit_transform`.
+                For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`.
+                Documents not in these topics will be shown as noise points.
             embeddings:  The embeddings of all documents in `docs`.
             reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.
             custom_labels:  If bool, whether to use custom topic labels that were defined using
    @@ -13414,7 +13986,6 @@ 

    figure: A Matplotlib Figure object. Examples: - To visualize the topics simply run: ```python @@ -13457,17 +14028,19 @@

    """ check_is_fitted(self) check_documents_type(docs) - return plotting.visualize_document_datamap(self, - docs, - topics, - embeddings, - reduced_embeddings, - custom_labels, - title, - sub_title, - width, - height, - **datamap_kwds) + return plotting.visualize_document_datamap( + self, + docs, + topics, + embeddings, + reduced_embeddings, + custom_labels, + title, + sub_title, + width, + height, + **datamap_kwds, + )

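A sketch for the DataMapPlot variant above; it assumes the optional `datamapplot` dependency is installed and reuses precomputed embeddings (the sentence-transformers model name is an assumption, not prescribed by this page):

```python
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=False)

fig = topic_model.visualize_document_datamap(docs, embeddings=embeddings)
fig.savefig("datamap.png", bbox_inches="tight")  # returns a Matplotlib Figure
```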
    @@ -13488,7 +14061,7 @@

    -

    Visualize documents and their topics in 2D

    +

    Visualize documents and their topics in 2D.

    Parameters:

    @@ -13621,19 +14194,21 @@

    Source code in bertopic\_bertopic.py -
    def visualize_documents(self,
    -                        docs: List[str],
    -                        topics: List[int] = None,
    -                        embeddings: np.ndarray = None,
    -                        reduced_embeddings: np.ndarray = None,
    -                        sample: float = None,
    -                        hide_annotations: bool = False,
    -                        hide_document_hover: bool = False,
    -                        custom_labels: bool = False,
    -                        title: str = "<b>Documents and Topics</b>",
    -                        width: int = 1200,
    -                        height: int = 750) -> go.Figure:
    -    """ Visualize documents and their topics in 2D
    +          
    def visualize_documents(
    +    self,
    +    docs: List[str],
    +    topics: List[int] = None,
    +    embeddings: np.ndarray = None,
    +    reduced_embeddings: np.ndarray = None,
    +    sample: float = None,
    +    hide_annotations: bool = False,
    +    hide_document_hover: bool = False,
    +    custom_labels: bool = False,
    +    title: str = "<b>Documents and Topics</b>",
    +    width: int = 1200,
    +    height: int = 750,
    +) -> go.Figure:
    +    """Visualize documents and their topics in 2D.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
    @@ -13658,7 +14233,6 @@ 

    height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -13704,18 +14278,20 @@

    """ check_is_fitted(self) check_documents_type(docs) - return plotting.visualize_documents(self, - docs=docs, - topics=topics, - embeddings=embeddings, - reduced_embeddings=reduced_embeddings, - sample=sample, - hide_annotations=hide_annotations, - hide_document_hover=hide_document_hover, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_documents( + self, + docs=docs, + topics=topics, + embeddings=embeddings, + reduced_embeddings=reduced_embeddings, + sample=sample, + hide_annotations=hide_annotations, + hide_document_hover=hide_document_hover, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )

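A sketch for the interactive 2D plot above, reusing the `embeddings` from the previous sketch and pre-reducing them so repeated plotting stays cheap; the UMAP settings mirror those shown in the listing:

```python
from umap import UMAP

reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit_transform(embeddings)
fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig.write_html("documents.html")
```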
    @@ -13729,15 +14305,15 @@

    -visualize_heatmap(self, topics=None, top_n_topics=None, n_clusters=None, custom_labels=False, title='<b>Similarity Matrix</b>', width=800, height=800) +visualize_heatmap(self, topics=None, top_n_topics=None, n_clusters=None, use_ctfidf=False, custom_labels=False, title='<b>Similarity Matrix</b>', width=800, height=800)

    -

    Visualize a heatmap of the topic's similarity matrix

    -

    Based on the cosine similarity matrix between topic embeddings, +

    Visualize a heatmap of the topic's similarity matrix.

    +

    Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics, a heatmap is created showing the similarity between topics.

    Parameters:

    @@ -13770,6 +14346,13 @@

    matrix by those clusters.

    + + + + + + @@ -13825,17 +14408,20 @@

    Source code in bertopic\_bertopic.py -
    def visualize_heatmap(self,
    -                      topics: List[int] = None,
    -                      top_n_topics: int = None,
    -                      n_clusters: int = None,
    -                      custom_labels: bool = False,
    -                      title: str = "<b>Similarity Matrix</b>",
    -                      width: int = 800,
    -                      height: int = 800) -> go.Figure:
    -    """ Visualize a heatmap of the topic's similarity matrix
    -
    -    Based on the cosine similarity matrix between topic embeddings,
    +          
    def visualize_heatmap(
    +    self,
    +    topics: List[int] = None,
    +    top_n_topics: int = None,
    +    n_clusters: int = None,
    +    use_ctfidf: bool = False,
    +    custom_labels: bool = False,
    +    title: str = "<b>Similarity Matrix</b>",
    +    width: int = 800,
    +    height: int = 800,
    +) -> go.Figure:
    +    """Visualize a heatmap of the topic's similarity matrix.
    +
    +    Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics,
         a heatmap is created showing the similarity between topics.
     
         Arguments:
    @@ -13843,6 +14429,8 @@ 

    top_n_topics: Only select the top n most frequent topics. n_clusters: Create n clusters and order the similarity matrix by those clusters. + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. title: Title of the plot. @@ -13853,7 +14441,6 @@

    fig: A plotly figure Examples: - To visualize the similarity matrix of topics simply run: @@ -13869,14 +14456,17 @@

    ``` """ check_is_fitted(self) - return plotting.visualize_heatmap(self, - topics=topics, - top_n_topics=top_n_topics, - n_clusters=n_clusters, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_heatmap( + self, + topics=topics, + top_n_topics=top_n_topics, + n_clusters=n_clusters, + use_ctfidf=use_ctfidf, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )
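A sketch of the heatmap call above, exercising the newly documented `use_ctfidf` switch (it assumes the model produced enough topics for `n_clusters`):

```python
fig = topic_model.visualize_heatmap(n_clusters=5, use_ctfidf=True)  # c-TF-IDF based similarities
fig.write_html("heatmap.html")
```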

    @@ -13897,7 +14487,7 @@

    -

    Visualize documents and their topics in 2D at different levels of hierarchy

    +

    Visualize documents and their topics in 2D at different levels of hierarchy.

    Parameters:

    None
use_ctfidf bool

Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used.

False
    custom_labels bool
    @@ -13971,7 +14561,7 @@

    Source code in bertopic\_bertopic.py -
    def visualize_hierarchical_documents(self,
    -                                     docs: List[str],
    -                                     hierarchical_topics: pd.DataFrame,
    -                                     topics: List[int] = None,
    -                                     embeddings: np.ndarray = None,
    -                                     reduced_embeddings: np.ndarray = None,
    -                                     sample: Union[float, int] = None,
    -                                     hide_annotations: bool = False,
    -                                     hide_document_hover: bool = True,
    -                                     nr_levels: int = 10,
    -                                     level_scale: str = 'linear',
    -                                     custom_labels: bool = False,
    -                                     title: str = "<b>Hierarchical Documents and Topics</b>",
    -                                     width: int = 1200,
    -                                     height: int = 750) -> go.Figure:
    -    """ Visualize documents and their topics in 2D at different levels of hierarchy
    +          
    def visualize_hierarchical_documents(
    +    self,
    +    docs: List[str],
    +    hierarchical_topics: pd.DataFrame,
    +    topics: List[int] = None,
    +    embeddings: np.ndarray = None,
    +    reduced_embeddings: np.ndarray = None,
    +    sample: Union[float, int] = None,
    +    hide_annotations: bool = False,
    +    hide_document_hover: bool = True,
    +    nr_levels: int = 10,
    +    level_scale: str = "linear",
    +    custom_labels: bool = False,
    +    title: str = "<b>Hierarchical Documents and Topics</b>",
    +    width: int = 1200,
    +    height: int = 750,
    +) -> go.Figure:
    +    """Visualize documents and their topics in 2D at different levels of hierarchy.
     
         Arguments:
             docs: The documents you used when calling either `fit` or `fit_transform`
    @@ -14091,7 +14683,7 @@ 

    specific points. Helps to speed up generation of visualizations. nr_levels: The number of levels to be visualized in the hierarchy. First, the distances in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with - equal length. Then, for each list of distances, the merged topics, that have + equal length. Then, for each list of distances, the merged topics, that have a distance less or equal to the maximum distance of the selected list of distances, are selected. NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to the length of `hierarchical_topics`. @@ -14109,7 +14701,6 @@

    height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -14156,21 +14747,23 @@

    """ check_is_fitted(self) check_documents_type(docs) - return plotting.visualize_hierarchical_documents(self, - docs=docs, - hierarchical_topics=hierarchical_topics, - topics=topics, - embeddings=embeddings, - reduced_embeddings=reduced_embeddings, - sample=sample, - hide_annotations=hide_annotations, - hide_document_hover=hide_document_hover, - nr_levels=nr_levels, - level_scale=level_scale, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_hierarchical_documents( + self, + docs=docs, + hierarchical_topics=hierarchical_topics, + topics=topics, + embeddings=embeddings, + reduced_embeddings=reduced_embeddings, + sample=sample, + hide_annotations=hide_annotations, + hide_document_hover=hide_document_hover, + nr_levels=nr_levels, + level_scale=level_scale, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )

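A sketch for the call above; it needs the merge hierarchy from `hierarchical_topics` and reuses the `reduced_embeddings` computed earlier:

```python
hierarchical_topics = topic_model.hierarchical_topics(docs)
fig = topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    reduced_embeddings=reduced_embeddings,
    nr_levels=10,
)
fig.write_html("hierarchical_documents.html")
```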
    @@ -14184,17 +14777,17 @@

    -visualize_hierarchy(self, orientation='left', topics=None, top_n_topics=None, custom_labels=False, title='<b>Hierarchical Clustering</b>', width=1000, height=600, hierarchical_topics=None, linkage_function=None, distance_function=None, color_threshold=1) +visualize_hierarchy(self, orientation='left', topics=None, top_n_topics=None, use_ctfidf=True, custom_labels=False, title='<b>Hierarchical Clustering</b>', width=1000, height=600, hierarchical_topics=None, linkage_function=None, distance_function=None, color_threshold=1)

    -

    Visualize a hierarchical structure of the topics

    +

    Visualize a hierarchical structure of the topics.

    A ward linkage function is used to perform the hierarchical clustering based on the cosine distance -matrix between topic embeddings.

    +matrix between c-TF-IDF or semantic embeddings of the topics.

    Parameters:

    @@ -14232,6 +14825,13 @@

    Only select the top n most frequent topics

    + + + + + + @@ -14334,23 +14934,26 @@

    Source code in bertopic\_bertopic.py -
    def visualize_hierarchy(self,
    -                        orientation: str = "left",
    -                        topics: List[int] = None,
    -                        top_n_topics: int = None,
    -                        custom_labels: bool = False,
    -                        title: str = "<b>Hierarchical Clustering</b>",
    -                        width: int = 1000,
    -                        height: int = 600,
    -                        hierarchical_topics: pd.DataFrame = None,
    -                        linkage_function: Callable[[csr_matrix], np.ndarray] = None,
    -                        distance_function: Callable[[csr_matrix], csr_matrix] = None,
    -                        color_threshold: int = 1) -> go.Figure:
    -    """ Visualize a hierarchical structure of the topics
    +          
    def visualize_hierarchy(
    +    self,
    +    orientation: str = "left",
    +    topics: List[int] = None,
    +    top_n_topics: int = None,
    +    use_ctfidf: bool = True,
    +    custom_labels: bool = False,
    +    title: str = "<b>Hierarchical Clustering</b>",
    +    width: int = 1000,
    +    height: int = 600,
    +    hierarchical_topics: pd.DataFrame = None,
    +    linkage_function: Callable[[csr_matrix], np.ndarray] = None,
    +    distance_function: Callable[[csr_matrix], csr_matrix] = None,
    +    color_threshold: int = 1,
    +) -> go.Figure:
    +    """Visualize a hierarchical structure of the topics.
     
         A ward linkage function is used to perform the
         hierarchical clustering based on the cosine distance
    -    matrix between topic embeddings.
    +    matrix between c-TF-IDF or semantic embeddings of the topics.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
    @@ -14358,6 +14961,8 @@ 

    Either 'left' or 'bottom' topics: A selection of topics to visualize top_n_topics: Only select the top n most frequent topics + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the + embeddings from the embedding model are used. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. NOTE: Custom labels are only generated for the original @@ -14385,7 +14990,6 @@

    fig: A plotly figure Examples: - To visualize the hierarchical structure of topics simply run: @@ -14414,19 +15018,21 @@

    style="width:1000px; height: 680px; border: 0px;""></iframe> """ check_is_fitted(self) - return plotting.visualize_hierarchy(self, - orientation=orientation, - topics=topics, - top_n_topics=top_n_topics, - custom_labels=custom_labels, - title=title, - width=width, - height=height, - hierarchical_topics=hierarchical_topics, - linkage_function=linkage_function, - distance_function=distance_function, - color_threshold=color_threshold - ) + return plotting.visualize_hierarchy( + self, + orientation=orientation, + topics=topics, + top_n_topics=top_n_topics, + use_ctfidf=use_ctfidf, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + hierarchical_topics=hierarchical_topics, + linkage_function=linkage_function, + distance_function=distance_function, + color_threshold=color_threshold, + )
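A sketch for the dendrogram above, passing the same `hierarchical_topics` DataFrame and the new `use_ctfidf` flag:

```python
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, use_ctfidf=True)
fig.write_html("hierarchy.html")
```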

    @@ -14447,7 +15053,7 @@

    -

    Visualize the ranks of all terms across all topics

    +

    Visualize the ranks of all terms across all topics.

    Each topic is represented by a set of words. These words, however, do not all equally represent the topic. This visualization shows how many words are needed to represent a topic and at which point @@ -14538,14 +15144,16 @@

    Source code in bertopic\_bertopic.py -
    def visualize_term_rank(self,
    -                        topics: List[int] = None,
    -                        log_scale: bool = False,
    -                        custom_labels: bool = False,
    -                        title: str = "<b>Term score decline per Topic</b>",
    -                        width: int = 800,
    -                        height: int = 500) -> go.Figure:
    -    """ Visualize the ranks of all terms across all topics
    +          
    def visualize_term_rank(
    +    self,
    +    topics: List[int] = None,
    +    log_scale: bool = False,
    +    custom_labels: bool = False,
    +    title: str = "<b>Term score decline per Topic</b>",
    +    width: int = 800,
    +    height: int = 500,
    +) -> go.Figure:
    +    """Visualize the ranks of all terms across all topics.
     
         Each topic is represented by a set of words. These words, however,
         do not all equally represent the topic. This visualization shows
    @@ -14566,7 +15174,6 @@ 

    fig: A plotly figure Examples: - To visualize the ranks of all words across all topics simply run: @@ -14590,13 +15197,15 @@

    [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html). """ check_is_fitted(self) - return plotting.visualize_term_rank(self, - topics=topics, - log_scale=log_scale, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_term_rank( + self, + topics=topics, + log_scale=log_scale, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )
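A sketch for the term-rank plot above; the log scale is optional and simply makes the decline easier to compare across topics:

```python
fig = topic_model.visualize_term_rank(log_scale=True)
fig.write_html("term_rank.html")
```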

    @@ -14610,14 +15219,14 @@

    -visualize_topics(self, topics=None, top_n_topics=None, custom_labels=False, title='<b>Intertopic Distance Map</b>', width=650, height=650) +visualize_topics(self, topics=None, top_n_topics=None, use_ctfidf=False, custom_labels=False, title='<b>Intertopic Distance Map</b>', width=650, height=650)

    -

    Visualize topics, their sizes, and their corresponding words

    +

    Visualize topics, their sizes, and their corresponding words.

    This visualization is highly inspired by LDAvis, a great visualization technique typically reserved for LDA.

    @@ -14647,6 +15256,12 @@

    + + + + + + @@ -14686,14 +15301,17 @@

    Source code in bertopic\_bertopic.py -
    def visualize_topics(self,
    -                     topics: List[int] = None,
    -                     top_n_topics: int = None,
    -                     custom_labels: bool = False,
    -                     title: str = "<b>Intertopic Distance Map</b>",
    -                     width: int = 650,
    -                     height: int = 650) -> go.Figure:
    -    """ Visualize topics, their sizes, and their corresponding words
    +          
    def visualize_topics(
    +    self,
    +    topics: List[int] = None,
    +    top_n_topics: int = None,
    +    use_ctfidf: bool = False,
    +    custom_labels: bool = False,
    +    title: str = "<b>Intertopic Distance Map</b>",
    +    width: int = 650,
    +    height: int = 650,
    +) -> go.Figure:
    +    """Visualize topics, their sizes, and their corresponding words.
     
         This visualization is highly inspired by LDAvis, a great visualization
         technique typically reserved for LDA.
    @@ -14704,6 +15322,7 @@ 

    For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`. top_n_topics: Only select the top n most frequent topics + use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. title: Title of the plot. @@ -14711,7 +15330,6 @@

    height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -14726,13 +15344,16 @@

    ``` """ check_is_fitted(self) - return plotting.visualize_topics(self, - topics=topics, - top_n_topics=top_n_topics, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_topics( + self, + topics=topics, + top_n_topics=top_n_topics, + use_ctfidf=use_ctfidf, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )
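A sketch for the intertopic distance map above with the newly documented `use_ctfidf` option:

```python
fig = topic_model.visualize_topics(use_ctfidf=False)  # False keeps the embedding-model representations
fig.write_html("intertopic_distance_map.html")
```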

    @@ -14753,7 +15374,7 @@

    -

    Visualize topics over time

    +

    Visualize topics over time.

    Parameters:

    None
use_ctfidf bool

Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used.

True
    custom_labels bool

    Only select the top n most frequent topics

    None
use_ctfidf bool

    Whether to use c-TF-IDF representations instead of the embeddings from the embedding model.

    False
    custom_labels bool
    @@ -14846,16 +15467,18 @@

    Source code in bertopic\_bertopic.py -
    def visualize_topics_over_time(self,
    -                               topics_over_time: pd.DataFrame,
    -                               top_n_topics: int = None,
    -                               topics: List[int] = None,
    -                               normalize_frequency: bool = False,
    -                               custom_labels: bool = False,
    -                               title: str = "<b>Topics over Time</b>",
    -                               width: int = 1250,
    -                               height: int = 450) -> go.Figure:
    -    """ Visualize topics over time
    +          
    def visualize_topics_over_time(
    +    self,
    +    topics_over_time: pd.DataFrame,
    +    top_n_topics: int = None,
    +    topics: List[int] = None,
    +    normalize_frequency: bool = False,
    +    custom_labels: bool = False,
    +    title: str = "<b>Topics over Time</b>",
    +    width: int = 1250,
    +    height: int = 450,
    +) -> go.Figure:
    +    """Visualize topics over time.
     
         Arguments:
             topics_over_time: The topics you would like to be visualized with the
    @@ -14873,7 +15496,6 @@ 

    A plotly.graph_objects.Figure including all traces Examples: - To visualize the topics over time, simply run: ```python @@ -14889,15 +15511,17 @@

    ``` """ check_is_fitted(self) - return plotting.visualize_topics_over_time(self, - topics_over_time=topics_over_time, - top_n_topics=top_n_topics, - topics=topics, - normalize_frequency=normalize_frequency, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_topics_over_time( + self, + topics_over_time=topics_over_time, + top_n_topics=top_n_topics, + topics=topics, + normalize_frequency=normalize_frequency, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )
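A sketch of the dynamic-topic-modeling flow that produces the input for this plot; the `timestamps` list is hypothetical (one value per document):

```python
# Hypothetical timestamps: one label per document
timestamps = ["2020"] * (len(docs) // 2) + ["2021"] * (len(docs) - len(docs) // 2)

topics_over_time = topic_model.topics_over_time(docs, timestamps)
fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
fig.write_html("topics_over_time.html")
```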

    @@ -14918,7 +15542,7 @@

    -

    Visualize topics per class

    +

    Visualize topics per class.

    Parameters:

    @@ -15011,16 +15635,18 @@

    Source code in bertopic\_bertopic.py -
    def visualize_topics_per_class(self,
    -                               topics_per_class: pd.DataFrame,
    -                               top_n_topics: int = 10,
    -                               topics: List[int] = None,
    -                               normalize_frequency: bool = False,
    -                               custom_labels: bool = False,
    -                               title: str = "<b>Topics per Class</b>",
    -                               width: int = 1250,
    -                               height: int = 900) -> go.Figure:
    -    """ Visualize topics per class
    +          
    def visualize_topics_per_class(
    +    self,
    +    topics_per_class: pd.DataFrame,
    +    top_n_topics: int = 10,
    +    topics: List[int] = None,
    +    normalize_frequency: bool = False,
    +    custom_labels: bool = False,
    +    title: str = "<b>Topics per Class</b>",
    +    width: int = 1250,
    +    height: int = 900,
    +) -> go.Figure:
    +    """Visualize topics per class.
     
         Arguments:
             topics_per_class: The topics you would like to be visualized with the
    @@ -15038,7 +15664,6 @@ 

    A plotly.graph_objects.Figure including all traces Examples: - To visualize the topics per class, simply run: ```python @@ -15054,15 +15679,17 @@

    ``` """ check_is_fitted(self) - return plotting.visualize_topics_per_class(self, - topics_per_class=topics_per_class, - top_n_topics=top_n_topics, - topics=topics, - normalize_frequency=normalize_frequency, - custom_labels=custom_labels, - title=title, - width=width, - height=height) + return plotting.visualize_topics_per_class( + self, + topics_per_class=topics_per_class, + top_n_topics=top_n_topics, + topics=topics, + normalize_frequency=normalize_frequency, + custom_labels=custom_labels, + title=title, + width=width, + height=height, + )
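A sketch for the per-class variant; `classes` is a hypothetical list with one class label per document:

```python
# Hypothetical class labels: one per document
classes = ["group_a" if i % 2 == 0 else "group_b" for i in range(len(docs))]

topics_per_class = topic_model.topics_per_class(docs, classes=classes)
fig = topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)
fig.write_html("topics_per_class.html")
```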

    diff --git a/api/cluster/base.html b/api/cluster/base.html index 25ab75ea..b972d594 100755 --- a/api/cluster/base.html +++ b/api/cluster/base.html @@ -2618,13 +2618,15 @@

    BaseCluster
    -

    The Base Cluster class

    +

    The Base Cluster class.

    Using this class directly in BERTopic will make it skip -over the cluster step. As a result, topics need to be passed -to BERTopic in the form of its y parameter in order to create -topic representations.

    -

    Examples:

    -

    This will skip over the cluster step in BERTopic:

    +over the cluster step. As a result, topics need to be passed +to BERTopic in the form of its y parameter in order to create +topic representations.

    + +

    Examples:

    + +

    This will skip over the cluster step in BERTopic:

    from bertopic import BERTopic
     from bertopic.dimensionality import BaseCluster
     
    @@ -2632,8 +2634,8 @@ 

    BaseClustertopic_model = BERTopic(hdbscan_model=empty_cluster_model)

    -

    Then, this class can be used to perform manual topic modeling. -That is, topic modeling on a topics that were already generated before +

Then, this class can be used to perform manual topic modeling. +That is, topic modeling on topics that were already generated before without the need to learn them:

    topic_model.fit(docs, y=y)
     
    @@ -2641,15 +2643,14 @@

    BaseCluster Source code in bertopic\cluster\_base.py

    - @@ -2707,8 +2707,7 @@

    c-TF-IDF Source code in bertopic\vectorizers\_ctfidf.py
    class ClassTfidfTransformer(TfidfTransformer):
    -    """
    -    A Class-based TF-IDF procedure using scikit-learns TfidfTransformer as a base.
    +    """A Class-based TF-IDF procedure using scikit-learns TfidfTransformer as a base.
     
         ![](../algorithm/c-TF-IDF.svg)
     
    @@ -2727,24 +2726,25 @@ 

    c-TF-IDF `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))` reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix. Helps to reduce the impact of words that appear too frequently. - seed_words: Specific words that will have their idf value increased by - the value of `seed_multiplier`. + seed_words: Specific words that will have their idf value increased by + the value of `seed_multiplier`. NOTE: This will only increase the value of words that have an exact match. seed_multiplier: The value with which the idf values of the words in `seed_words` are multiplied. Examples: - ```python transformer = ClassTfidfTransformer() ``` """ - def __init__(self, - bm25_weighting: bool = False, - reduce_frequent_words: bool = False, - seed_words: List[str] = None, - seed_multiplier: float = 2 - ): + + def __init__( + self, + bm25_weighting: bool = False, + reduce_frequent_words: bool = False, + seed_words: List[str] = None, + seed_multiplier: float = 2, + ): self.bm25_weighting = bm25_weighting self.reduce_frequent_words = reduce_frequent_words self.seed_words = seed_words @@ -2758,7 +2758,7 @@

    c-TF-IDF X: A matrix of term/token counts. multiplier: A multiplier for increasing/decreasing certain IDF scores """ - X = check_array(X, accept_sparse=('csr', 'csc')) + X = check_array(X, accept_sparse=("csr", "csc")) if not sp.issparse(X): X = sp.csr_matrix(X) dtype = np.float64 @@ -2774,26 +2774,29 @@

    c-TF-IDF# BM25-inspired weighting procedure if self.bm25_weighting: - idf = np.log(1+((avg_nr_samples - df + 0.5) / (df+0.5))) + idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5))) # Divide the average number of samples by the word frequency # +1 is added to force values to be positive else: - idf = np.log((avg_nr_samples / df)+1) + idf = np.log((avg_nr_samples / df) + 1) # Multiplier to increase/decrease certain idf scores if multiplier is not None: idf = idf * multiplier - self._idf_diag = sp.diags(idf, offsets=0, - shape=(n_features, n_features), - format='csr', - dtype=dtype) + self._idf_diag = sp.diags( + idf, + offsets=0, + shape=(n_features, n_features), + format="csr", + dtype=dtype, + ) return self def transform(self, X: sp.csr_matrix): - """Transform a count-based matrix to c-TF-IDF + """Transform a count-based matrix to c-TF-IDF. Arguments: X (sparse matrix): A matrix of term/token counts. @@ -2802,7 +2805,7 @@

    c-TF-IDF X (sparse matrix): A c-TF-IDF matrix """ if self.use_idf: - X = normalize(X, axis=1, norm='l1', copy=False) + X = normalize(X, axis=1, norm="l1", copy=False) if self.reduce_frequent_words: X.data = np.sqrt(X.data) @@ -2874,7 +2877,7 @@

    X: A matrix of term/token counts. multiplier: A multiplier for increasing/decreasing certain IDF scores """ - X = check_array(X, accept_sparse=('csr', 'csc')) + X = check_array(X, accept_sparse=("csr", "csc")) if not sp.issparse(X): X = sp.csr_matrix(X) dtype = np.float64 @@ -2890,21 +2893,24 @@

    # BM25-inspired weighting procedure if self.bm25_weighting: - idf = np.log(1+((avg_nr_samples - df + 0.5) / (df+0.5))) + idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5))) # Divide the average number of samples by the word frequency # +1 is added to force values to be positive else: - idf = np.log((avg_nr_samples / df)+1) + idf = np.log((avg_nr_samples / df) + 1) # Multiplier to increase/decrease certain idf scores if multiplier is not None: idf = idf * multiplier - self._idf_diag = sp.diags(idf, offsets=0, - shape=(n_features, n_features), - format='csr', - dtype=dtype) + self._idf_diag = sp.diags( + idf, + offsets=0, + shape=(n_features, n_features), + format="csr", + dtype=dtype, + ) return self
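A sketch of plugging the transformer above into BERTopic, including the `seed_words` boosting described in the docstring (the seed words themselves are arbitrary examples):

```python
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=True,
    reduce_frequent_words=True,
    seed_words=["nasa", "orbit"],  # exact matches only, per the NOTE above
    seed_multiplier=2,
)
topic_model = BERTopic(ctfidf_model=ctfidf_model)
```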

    @@ -2927,7 +2933,7 @@

    -

    Transform a count-based matrix to c-TF-IDF

    +

    Transform a count-based matrix to c-TF-IDF.

    Parameters:

    seed_words List[str]

    Specific words that will have their idf value increased by - the value of seed_multiplier. +

    Specific words that will have their idf value increased by + the value of seed_multiplier. NOTE: This will only increase the value of words that have an exact match.

    None
    @@ -2966,7 +2972,7 @@

    Source code in bertopic\vectorizers\_ctfidf.py
    def transform(self, X: sp.csr_matrix):
    -    """Transform a count-based matrix to c-TF-IDF
    +    """Transform a count-based matrix to c-TF-IDF.
     
         Arguments:
             X (sparse matrix): A matrix of term/token counts.
    @@ -2975,7 +2981,7 @@ 

    X (sparse matrix): A c-TF-IDF matrix """ if self.use_idf: - X = normalize(X, axis=1, norm='l1', copy=False) + X = normalize(X, axis=1, norm="l1", copy=False) if self.reduce_frequent_words: X.data = np.sqrt(X.data) diff --git a/api/dimensionality/base.html b/api/dimensionality/base.html index c9b89d1f..83f86e86 100755 --- a/api/dimensionality/base.html +++ b/api/dimensionality/base.html @@ -2618,7 +2618,7 @@

    BaseDimensionalityReduction
    -

    The Base Dimensionality Reduction class

    +

    The Base Dimensionality Reduction class.

    You can use this to skip over the dimensionality reduction step in BERTopic.

    Examples:

    @@ -2635,12 +2635,11 @@

    BaseDimensionalityReduction Source code in bertopic\dimensionality\_base.py
    class BaseDimensionalityReduction:
    -    """ The Base Dimensionality Reduction class
    +    """The Base Dimensionality Reduction class.
     
         You can use this to skip over the dimensionality reduction step in BERTopic.
     
         Examples:
    -
         This will skip over the reduction step in BERTopic:
     
         ```python
    @@ -2652,6 +2651,7 @@ 

    BaseDimensionalityReduction topic_model = BERTopic(umap_model=empty_reduction_model) ``` """ + def fit(self, X: np.ndarray = None): return self diff --git a/api/onlinecv.html b/api/onlinecv.html index b237d177..1f1a61ac 100755 --- a/api/onlinecv.html +++ b/api/onlinecv.html @@ -2743,7 +2743,7 @@

    OnlineCountVectorizer Source code in bertopic\vectorizers\_online_cv.py
    class OnlineCountVectorizer(CountVectorizer):
    -    """ An online variant of the CountVectorizer with updating vocabulary.
    +    """An online variant of the CountVectorizer with updating vocabulary.
     
         At each `.partial_fit`, its vocabulary is updated based on any OOV words
         it might find. Then, `.update_bow` can be used to track and update
    @@ -2776,7 +2776,6 @@ 

    OnlineCountVectorizer X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation Examples: - ```python from bertopic.vectorizers import OnlineCountVectorizer vectorizer = OnlineCountVectorizer(stop_words="english") @@ -2802,21 +2801,19 @@

    OnlineCountVectorizer References: Adapted from: https://github.com/idoshlomo/online_vectorizers """ - def __init__(self, - decay: float = None, - delete_min_df: float = None, - **kwargs): + + def __init__(self, decay: float = None, delete_min_df: float = None, **kwargs): self.decay = decay self.delete_min_df = delete_min_df super(OnlineCountVectorizer, self).__init__(**kwargs) def partial_fit(self, raw_documents: List[str]) -> None: - """ Perform a partial fit and update vocabulary with OOV tokens + """Perform a partial fit and update vocabulary with OOV tokens. Arguments: raw_documents: A list of documents """ - if not hasattr(self, 'vocabulary_'): + if not hasattr(self, "vocabulary_"): return self.fit(raw_documents) analyzer = self.build_analyzer() @@ -2826,13 +2823,18 @@

    OnlineCountVectorizerif oov_tokens: max_index = max(self.vocabulary_.values()) - oov_vocabulary = dict(zip(oov_tokens, list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)))) + oov_vocabulary = dict( + zip( + oov_tokens, + list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)), + ) + ) self.vocabulary_.update(oov_vocabulary) return self def update_bow(self, raw_documents: List[str]) -> csr_matrix: - """ Create or update the bag-of-words matrix + """Create or update the bag-of-words matrix. Update the bag-of-words matrix by adding the newly transformed documents. This may add empty columns if new words are found and/or @@ -2874,7 +2876,7 @@

    OnlineCountVectorizerreturn self.X_ def _clean_bow(self) -> None: - """ Remove words that do not exceed `self.delete_min_df` """ + """Remove words that do not exceed `self.delete_min_df`.""" # Only keep words with a minimum frequency indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1] indices_dict = {index: index for index in indices} @@ -2916,7 +2918,7 @@
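A sketch of streaming documents through the vectorizer above; `decay` and `delete_min_df` mainly take effect when BERTopic calls `update_bow` during online training, and the values here are arbitrary (reusing `docs` from the earlier sketch):

```python
from bertopic.vectorizers import OnlineCountVectorizer

vectorizer = OnlineCountVectorizer(stop_words="english", decay=0.01, delete_min_df=5)
for chunk in (docs[:1000], docs[1000:2000]):  # stream the corpus in chunks
    vectorizer.partial_fit(chunk)             # extends the vocabulary with OOV tokens
print(len(vectorizer.vocabulary_), "terms in the running vocabulary")
```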

    -

    Perform a partial fit and update vocabulary with OOV tokens

    +

    Perform a partial fit and update vocabulary with OOV tokens.

    Parameters:

    @@ -2940,12 +2942,12 @@

    Source code in bertopic\vectorizers\_online_cv.py
    def partial_fit(self, raw_documents: List[str]) -> None:
    -    """ Perform a partial fit and update vocabulary with OOV tokens
    +    """Perform a partial fit and update vocabulary with OOV tokens.
     
         Arguments:
             raw_documents: A list of documents
         """
    -    if not hasattr(self, 'vocabulary_'):
    +    if not hasattr(self, "vocabulary_"):
             return self.fit(raw_documents)
     
         analyzer = self.build_analyzer()
    @@ -2955,7 +2957,12 @@ 

    if oov_tokens: max_index = max(self.vocabulary_.values()) - oov_vocabulary = dict(zip(oov_tokens, list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)))) + oov_vocabulary = dict( + zip( + oov_tokens, + list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)), + ) + ) self.vocabulary_.update(oov_vocabulary) return self @@ -2979,7 +2986,7 @@

    -

    Create or update the bag-of-words matrix

    +

    Create or update the bag-of-words matrix.

    Update the bag-of-words matrix by adding the newly transformed documents. This may add empty columns if new words are found and/or add empty rows if new topics are found.

    @@ -3025,7 +3032,7 @@

    Source code in bertopic\vectorizers\_online_cv.py

    @@ -2656,7 +2656,7 @@

    Barchart

    - @@ -2717,23 +2717,25 @@

    Barchart Source code in bertopic\plotting\_barchart.py -
    def visualize_barchart(topic_model,
    -                       topics: List[int] = None,
    -                       top_n_topics: int = 8,
    -                       n_words: int = 5,
    -                       custom_labels: Union[bool, str] = False,
    -                       title: str = "<b>Topic Word Scores</b>",
    -                       width: int = 250,
    -                       height: int = 250,
    -                       autoscale: bool=False) -> go.Figure:
    -    """ Visualize a barchart of selected topics
    +          
    def visualize_barchart(
    +    topic_model,
    +    topics: List[int] = None,
    +    top_n_topics: int = 8,
    +    n_words: int = 5,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Topic Word Scores</b>",
    +    width: int = 250,
    +    height: int = 250,
    +    autoscale: bool = False,
    +) -> go.Figure:
    +    """Visualize a barchart of selected topics.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
             topics: A selection of topics to visualize.
             top_n_topics: Only select the top n most frequent topics.
             n_words: Number of words to show in a topic
    -        custom_labels: If bool, whether to use custom topic labels that were defined using 
    +        custom_labels: If bool, whether to use custom topic labels that were defined using
                            `topic_model.set_topic_labels`.
                            If `str`, it uses labels from other aspects, e.g., "Aspect1".
             title: Title of the plot.
    @@ -2745,7 +2747,6 @@ 

    Barchart fig: A plotly figure Examples: - To visualize the barchart of selected topics simply run: @@ -2785,12 +2786,14 @@

    Barchartsubplot_titles = [f"Topic {topic}" for topic in topics] columns = 4 rows = int(np.ceil(len(topics) / columns)) - fig = make_subplots(rows=rows, - cols=columns, - shared_xaxes=False, - horizontal_spacing=.1, - vertical_spacing=.4 / rows if rows > 1 else 0, - subplot_titles=subplot_titles) + fig = make_subplots( + rows=rows, + cols=columns, + shared_xaxes=False, + horizontal_spacing=0.1, + vertical_spacing=0.4 / rows if rows > 1 else 0, + subplot_titles=subplot_titles, + ) # Add barchart for each topic row = 1 @@ -2800,20 +2803,17 @@

    Barchartscores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1] fig.add_trace( - go.Bar(x=scores, - y=words, - orientation='h', - marker_color=next(colors)), - row=row, col=column) + go.Bar(x=scores, y=words, orientation="h", marker_color=next(colors)), + row=row, + col=column, + ) if autoscale: if len(words) > 12: height = 250 + (len(words) - 12) * 11 if len(words) > 9: - fig.update_yaxes( - tickfont=dict(size=(height - 140) // len(words)) - ) + fig.update_yaxes(tickfont=dict(size=(height - 140) // len(words))) if column == columns: column = 1 @@ -2826,21 +2826,15 @@

    Barcharttemplate="plotly_white", showlegend=False, title={ - 'text': f"{title}", - 'x': .5, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "x": 0.5, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, - width=width*4, - height=height*rows if rows > 1 else height * 1.3, - hoverlabel=dict( - bgcolor="white", - font_size=16, - font_family="Rockwell" - ), + width=width * 4, + height=height * rows if rows > 1 else height * 1.3, + hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"), ) fig.update_xaxes(showgrid=True) diff --git a/api/plotting/distribution.html b/api/plotting/distribution.html index ebdaf2f8..2e92920d 100755 --- a/api/plotting/distribution.html +++ b/api/plotting/distribution.html @@ -2616,7 +2616,7 @@

    Distribution
    -

    Visualize the distribution of topic probabilities

    +

    Visualize the distribution of topic probabilities.

    Parameters:

    custom_labels Union[bool, str]

    If bool, whether to use custom topic labels that were defined using +

    If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1".

    False
    @@ -2651,7 +2651,7 @@

    Distribution

    @@ -2691,21 +2691,23 @@

    Distribution Source code in bertopic\plotting\_distribution.py -
    def visualize_distribution(topic_model,
    -                           probabilities: np.ndarray,
    -                           min_probability: float = 0.015,
    -                           custom_labels: Union[bool, str] = False,
    -                           title: str = "<b>Topic Probability Distribution</b>",
    -                           width: int = 800,
    -                           height: int = 600) -> go.Figure:
    -    """ Visualize the distribution of topic probabilities
    +          
    def visualize_distribution(
    +    topic_model,
    +    probabilities: np.ndarray,
    +    min_probability: float = 0.015,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Topic Probability Distribution</b>",
    +    width: int = 800,
    +    height: int = 600,
    +) -> go.Figure:
    +    """Visualize the distribution of topic probabilities.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
             probabilities: An array of probability scores
             min_probability: The minimum probability score to visualize.
                              All others are ignored.
    -        custom_labels: If bool, whether to use custom topic labels that were defined using 
    +        custom_labels: If bool, whether to use custom topic labels that were defined using
                            `topic_model.set_topic_labels`.
                            If `str`, it uses labels from other aspects, e.g., "Aspect1".
             title: Title of the plot.
    @@ -2713,7 +2715,6 @@ 

    Distribution height: The height of the figure. Examples: - Make sure to fit the model before and only input the probabilities of a single document: @@ -2731,11 +2732,15 @@

    Distribution style="width:1000px; height: 500px; border: 0px;""></iframe> """ if len(probabilities.shape) != 1: - raise ValueError("This visualization cannot be used if you have set `calculate_probabilities` to False " - "as it uses the topic probabilities of all topics. ") + raise ValueError( + "This visualization cannot be used if you have set `calculate_probabilities` to False " + "as it uses the topic probabilities of all topics. " + ) if len(probabilities[probabilities > min_probability]) == 0: - raise ValueError("There are no values where `min_probability` is higher than the " - "probabilities that were supplied. Lower `min_probability` to prevent this error.") + raise ValueError( + "There are no values where `min_probability` is higher than the " + "probabilities that were supplied. Lower `min_probability` to prevent this error." + ) # Get values and indices equal or exceed the minimum probability labels_idx = np.argwhere(probabilities >= min_probability).flatten() @@ -2744,7 +2749,7 @@

    Distribution# Create labels if isinstance(custom_labels, str): labels = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in labels_idx] - labels = ["_".join([label[0] for label in l[:4]]) for l in labels] + labels = ["_".join([label[0] for label in l[:4]]) for l in labels] # noqa: E741 labels = [label if len(label) < 30 else label[:27] + "..." for label in labels] elif topic_model.custom_labels_ is not None and custom_labels: labels = [topic_model.custom_labels_[idx + topic_model._outliers] for idx in labels_idx] @@ -2761,38 +2766,32 @@

    Distributionvals.remove(probabilities[idx]) # Create Figure - fig = go.Figure(go.Bar( - x=vals, - y=labels, - marker=dict( - color='#C8D2D7', - line=dict( - color='#6E8484', - width=1), - ), - orientation='h') + fig = go.Figure( + go.Bar( + x=vals, + y=labels, + marker=dict( + color="#C8D2D7", + line=dict(color="#6E8484", width=1), + ), + orientation="h", + ) ) fig.update_layout( xaxis_title="Probability", title={ - 'text': f"{title}", - 'y': .95, - 'x': 0.5, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "y": 0.95, + "x": 0.5, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, template="simple_white", width=width, height=height, - hoverlabel=dict( - bgcolor="white", - font_size=16, - font_family="Rockwell" - ), + hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"), ) return fig diff --git a/api/plotting/document_datamap.html b/api/plotting/document_datamap.html index fafdca01..8327cda5 100755 --- a/api/plotting/document_datamap.html +++ b/api/plotting/document_datamap.html @@ -2759,18 +2759,20 @@

    Documents with DataMapPlot Source code in bertopic\plotting\_datamap.py -
    def visualize_document_datamap(topic_model,
    -                               docs: List[str],
    -                               topics: List[int] = None,
    -                               embeddings: np.ndarray = None,
    -                               reduced_embeddings: np.ndarray = None,
    -                               custom_labels: Union[bool, str] = False,
    -                               title: str = "Documents and Topics",
    -                               sub_title: Union[str, None] = None,
    -                               width: int = 1200,
    -                               height: int = 1200,
    -                               **datamap_kwds) -> Figure:
    -    """ Visualize documents and their topics in 2D as a static plot for publication using
    +          
    def visualize_document_datamap(
    +    topic_model,
    +    docs: List[str],
    +    topics: List[int] = None,
    +    embeddings: np.ndarray = None,
    +    reduced_embeddings: np.ndarray = None,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "Documents and Topics",
    +    sub_title: Union[str, None] = None,
    +    width: int = 1200,
    +    height: int = 1200,
    +    **datamap_kwds,
    +) -> Figure:
    +    """Visualize documents and their topics in 2D as a static plot for publication using
         DataMapPlot.
     
         Arguments:
    @@ -2798,7 +2800,6 @@ 

    Documents with DataMapPlot figure: A Matplotlib Figure object. Examples: - To visualize the topics simply run: ```python @@ -2841,7 +2842,6 @@

    Documents with DataMapPlot <img src="../../getting_started/visualization/datamapplot.png", alt="DataMapPlot of 20-Newsgroups", width=800, height=800></img> """ - topic_per_doc = topic_model.topics_ df = pd.DataFrame({"topic": np.array(topic_per_doc)}) @@ -2856,7 +2856,7 @@

    Documents with DataMapPlot# Reduce input embeddings if reduced_embeddings is None: - umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric='cosine').fit(embeddings_to_reduce) + umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric="cosine").fit(embeddings_to_reduce) embeddings_2d = umap_model.embedding_ else: embeddings_2d = reduced_embeddings @@ -2871,7 +2871,10 @@

    Documents with DataMapPlotelif topic_model.custom_labels_ is not None and custom_labels: names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics] else: - names = [f"Topic-{topic}: " + " ".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics] + names = [ + f"Topic-{topic}: " + " ".join([word for word, value in topic_model.get_topic(topic)][:3]) + for topic in unique_topics + ] topic_name_mapping = {topic_num: topic_name for topic_num, topic_name in zip(unique_topics, names)} topic_name_mapping[-1] = "Unlabelled" @@ -2889,7 +2892,7 @@

    Documents with DataMapPlotfigure, axes = datamapplot.create_plot( embeddings_2d, named_topic_per_doc, - figsize=(width/100, height/100), + figsize=(width / 100, height / 100), dpi=100, title=title, sub_title=sub_title, diff --git a/api/plotting/documents.html b/api/plotting/documents.html index 4dbd85ea..67adeb2a 100755 --- a/api/plotting/documents.html +++ b/api/plotting/documents.html @@ -2616,7 +2616,7 @@

    Documents
    -

    Visualize documents and their topics in 2D

    +

    Visualize documents and their topics in 2D.

    Parameters:

    False
    @@ -2687,7 +2687,7 @@

    Documents

    - @@ -2750,19 +2750,21 @@

    Documents Source code in bertopic\plotting\_documents.py -
    def visualize_documents(topic_model,
    -                        docs: List[str],
    -                        topics: List[int] = None,
    -                        embeddings: np.ndarray = None,
    -                        reduced_embeddings: np.ndarray = None,
    -                        sample: float = None,
    -                        hide_annotations: bool = False,
    -                        hide_document_hover: bool = False,
    -                        custom_labels: Union[bool, str] = False,
    -                        title: str = "<b>Documents and Topics</b>",
    -                        width: int = 1200,
    -                        height: int = 750):
    -    """ Visualize documents and their topics in 2D
    +          
    def visualize_documents(
    +    topic_model,
    +    docs: List[str],
    +    topics: List[int] = None,
    +    embeddings: np.ndarray = None,
    +    reduced_embeddings: np.ndarray = None,
    +    sample: float = None,
    +    hide_annotations: bool = False,
    +    hide_document_hover: bool = False,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Documents and Topics</b>",
    +    width: int = 1200,
    +    height: int = 750,
    +):
    +    """Visualize documents and their topics in 2D.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
    @@ -2780,7 +2782,7 @@ 

    Documents hide_annotations: Hide the names of the traces on top of each cluster. hide_document_hover: Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualization. - custom_labels: If bool, whether to use custom topic labels that were defined using + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". title: Title of the plot. @@ -2788,7 +2790,6 @@

    Documents height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -2863,7 +2864,7 @@

    Documents# Reduce input embeddings if reduced_embeddings is None: - umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce) + umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce) embeddings_2d = umap_model.embedding_ elif sample is not None and reduced_embeddings is not None: embeddings_2d = reduced_embeddings[indices] @@ -2886,7 +2887,10 @@

    Documentselif topic_model.custom_labels_ is not None and custom_labels: names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics] else: - names = [f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics] + names = [ + f"{topic}_" + "_".join([word for word, value in topic_model.get_topic(topic)][:3]) + for topic in unique_topics + ] # Visualize fig = go.Figure() @@ -2898,7 +2902,13 @@

    Documentsselection = df.loc[df.topic.isin(non_selected_topics), :] selection["text"] = "" - selection.loc[len(selection), :] = [None, None, selection.x.mean(), selection.y.mean(), "Other documents"] + selection.loc[len(selection), :] = [ + None, + None, + selection.x.mean(), + selection.y.mean(), + "Other documents", + ] fig.add_trace( go.Scattergl( @@ -2906,10 +2916,10 @@

    Documentsy=selection.y, hovertext=selection.doc if not hide_document_hover else None, hoverinfo="text", - mode='markers+text', + mode="markers+text", name="other", showlegend=False, - marker=dict(color='#CFD8DC', size=5, opacity=0.5) + marker=dict(color="#CFD8DC", size=5, opacity=0.5), ) ) @@ -2920,7 +2930,13 @@

    Documentsselection["text"] = "" if not hide_annotations: - selection.loc[len(selection), :] = [None, None, selection.x.mean(), selection.y.mean(), name] + selection.loc[len(selection), :] = [ + None, + None, + selection.x.mean(), + selection.y.mean(), + name, + ] fig.add_trace( go.Scattergl( @@ -2929,24 +2945,40 @@

    Documentshovertext=selection.doc if not hide_document_hover else None, hoverinfo="text", text=selection.text, - mode='markers+text', + mode="markers+text", name=name, textfont=dict( size=12, ), - marker=dict(size=5, opacity=0.5) + marker=dict(size=5, opacity=0.5), ) ) # Add grid in a 'plus' shape - x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15)) - y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15)) - fig.add_shape(type="line", - x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1], - line=dict(color="#CFD8DC", width=2)) - fig.add_shape(type="line", - x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2, - line=dict(color="#9E9E9E", width=2)) + x_range = ( + df.x.min() - abs((df.x.min()) * 0.15), + df.x.max() + abs((df.x.max()) * 0.15), + ) + y_range = ( + df.y.min() - abs((df.y.min()) * 0.15), + df.y.max() + abs((df.y.max()) * 0.15), + ) + fig.add_shape( + type="line", + x0=sum(x_range) / 2, + y0=y_range[0], + x1=sum(x_range) / 2, + y1=y_range[1], + line=dict(color="#CFD8DC", width=2), + ) + fig.add_shape( + type="line", + x0=x_range[0], + y0=sum(y_range) / 2, + x1=x_range[1], + y1=sum(y_range) / 2, + line=dict(color="#9E9E9E", width=2), + ) fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10) fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10) @@ -2954,16 +2986,14 @@

    Documentsfig.update_layout( template="simple_white", title={ - 'text': f"{title}", - 'x': 0.5, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "x": 0.5, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, width=width, - height=height + height=height, ) fig.update_xaxes(visible=False) diff --git a/api/plotting/dtm.html b/api/plotting/dtm.html index 0331df15..f10d738f 100755 --- a/api/plotting/dtm.html +++ b/api/plotting/dtm.html @@ -2616,7 +2616,7 @@
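As a companion to the `visualize_documents` listing above, a minimal usage sketch mirroring the docstring's own example; the 20 Newsgroups corpus and the `all-MiniLM-L6-v2` sentence-transformer are illustrative choices, not requirements:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

# Prepare documents and pre-compute embeddings so they can be reused by the plot
docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

topic_model = BERTopic(embedding_model=sentence_model).fit(docs, embeddings)

# Passing the same embeddings avoids re-computing them inside the visualization
fig = topic_model.visualize_documents(docs, embeddings=embeddings)
fig.write_html("documents.html")
```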

    DTM
-Visualize topics over time
+Visualize topics over time.

    Parameters:

    custom_labels Union[bool, str]

    If bool, whether to use custom topic labels that were defined using +

    If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1".

    False
    @@ -2663,7 +2663,7 @@

    DTM

    @@ -2718,16 +2718,18 @@

    DTM Source code in bertopic\plotting\_topics_over_time.py -
    def visualize_topics_over_time(topic_model,
    -                               topics_over_time: pd.DataFrame,
    -                               top_n_topics: int = None,
    -                               topics: List[int] = None,
    -                               normalize_frequency: bool = False,
    -                               custom_labels: Union[bool, str] = False,
    -                               title: str = "<b>Topics over Time</b>",
    -                               width: int = 1250,
    -                               height: int = 450) -> go.Figure:
    -    """ Visualize topics over time
    +          
    def visualize_topics_over_time(
    +    topic_model,
    +    topics_over_time: pd.DataFrame,
    +    top_n_topics: int = None,
    +    topics: List[int] = None,
    +    normalize_frequency: bool = False,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Topics over Time</b>",
    +    width: int = 1250,
    +    height: int = 450,
    +) -> go.Figure:
    +    """Visualize topics over time.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
    @@ -2736,7 +2738,7 @@ 

    DTM top_n_topics: To visualize the most frequent topics instead of all topics: Select which topics you would like to be visualized normalize_frequency: Whether to normalize each topic's frequency individually - custom_labels: If bool, whether to use custom topic labels that were defined using + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". title: Title of the plot. @@ -2747,7 +2749,6 @@

    DTM A plotly.graph_objects.Figure including all traces Examples: - To visualize the topics over time, simply run: ```python @@ -2764,7 +2765,15 @@

    DTM <iframe src="../../getting_started/visualization/trump.html" style="width:1000px; height: 680px; border: 0px;""></iframe> """ - colors = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#0072B2", "#CC79A7"] + colors = [ + "#E69F00", + "#56B4E9", + "#009E73", + "#F0E442", + "#D55E00", + "#0072B2", + "#CC79A7", + ] # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() @@ -2783,10 +2792,13 @@

    DTMtopic_names = [label if len(label) < 30 else label[:27] + "..." for label in topic_names] topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())} elif topic_model.custom_labels_ is not None and custom_labels: - topic_names = {key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()} + topic_names = { + key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items() + } else: - topic_names = {key: value[:40] + "..." if len(value) > 40 else value - for key, value in topic_model.topic_labels_.items()} + topic_names = { + key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items() + } topics_over_time["Name"] = topics_over_time.Topic.map(topic_names) data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values(["Topic", "Timestamp"]) @@ -2800,12 +2812,17 @@

    DTMy = normalize(trace_data.Frequency.values.reshape(1, -1))[0] else: y = trace_data.Frequency - fig.add_trace(go.Scatter(x=trace_data.Timestamp, y=y, - mode='lines', - marker_color=colors[index % 7], - hoverinfo="text", - name=topic_name, - hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words])) + fig.add_trace( + go.Scatter( + x=trace_data.Timestamp, + y=y, + mode="lines", + marker_color=colors[index % 7], + hoverinfo="text", + name=topic_name, + hovertext=[f"<b>Topic {topic}</b><br>Words: {word}" for word in words], + ) + ) # Styling of the visualization fig.update_xaxes(showgrid=True) @@ -2813,26 +2830,20 @@

    DTMfig.update_layout( yaxis_title="Normalized Frequency" if normalize_frequency else "Frequency", title={ - 'text': f"{title}", - 'y': .95, - 'x': 0.40, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "y": 0.95, + "x": 0.40, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, template="simple_white", width=width, height=height, - hoverlabel=dict( - bgcolor="white", - font_size=16, - font_family="Rockwell" - ), + hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"), legend=dict( title="<b>Global Topic Representation", - ) + ), ) return fig
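For `visualize_topics_over_time`, a rough usage sketch; the integer timestamps here are synthetic placeholders purely for illustration, since real data would normally carry its own dates:

```python
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
# Synthetic timestamps, one per document, binned into 10 periods below
timestamps = [i % 10 for i in range(len(docs))]

topic_model = BERTopic().fit(docs)
topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=10)

# The resulting dataframe is what the plotting function expects
fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
```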

    diff --git a/api/plotting/heatmap.html b/api/plotting/heatmap.html index 13e50ba0..463db139 100755 --- a/api/plotting/heatmap.html +++ b/api/plotting/heatmap.html @@ -2616,9 +2616,9 @@

    Heatmap
-Visualize a heatmap of the topic's similarity matrix
-Based on the cosine similarity matrix between topic embeddings,
-a heatmap is created showing the similarity between topics.
+Visualize a heatmap of the topic's similarity matrix.
+Based on the cosine similarity matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding
+model), a heatmap is created showing the similarity between topics.

    Parameters:

    False
    @@ -2656,10 +2656,17 @@

    Heatmap

    + + + + + + - @@ -2714,18 +2721,21 @@

    Heatmap Source code in bertopic\plotting\_heatmap.py -
    def visualize_heatmap(topic_model,
    -                      topics: List[int] = None,
    -                      top_n_topics: int = None,
    -                      n_clusters: int = None,
    -                      custom_labels: Union[bool, str] = False,
    -                      title: str = "<b>Similarity Matrix</b>",
    -                      width: int = 800,
    -                      height: int = 800) -> go.Figure:
    -    """ Visualize a heatmap of the topic's similarity matrix
    -
    -    Based on the cosine similarity matrix between topic embeddings,
    -    a heatmap is created showing the similarity between topics.
    +          
    def visualize_heatmap(
    +    topic_model,
    +    topics: List[int] = None,
    +    top_n_topics: int = None,
    +    n_clusters: int = None,
    +    use_ctfidf: bool = False,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Similarity Matrix</b>",
    +    width: int = 800,
    +    height: int = 800,
    +) -> go.Figure:
    +    """Visualize a heatmap of the topic's similarity matrix.
    +
    +    Based on the cosine similarity matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding
    +    model), a heatmap is created showing the similarity between topics.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
    @@ -2733,7 +2743,9 @@ 

    Heatmap top_n_topics: Only select the top n most frequent topics. n_clusters: Create n clusters and order the similarity matrix by those clusters. - custom_labels: If bool, whether to use custom topic labels that were defined using + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings + from the embedding model are used. + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". title: Title of the plot. @@ -2744,7 +2756,6 @@

    Heatmap fig: A plotly figure Examples: - To visualize the similarity matrix of topics simply run: @@ -2761,12 +2772,9 @@

    Heatmap <iframe src="../../getting_started/visualization/heatmap.html" style="width:1000px; height: 720px; border: 0px;""></iframe> """ - - # Select topic embeddings - if topic_model.topic_embeddings_ is not None: - embeddings = np.array(topic_model.topic_embeddings_)[topic_model._outliers:] - else: - embeddings = topic_model.c_tf_idf_[topic_model._outliers:] + embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][ + topic_model._outliers : + ] # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() @@ -2782,12 +2790,11 @@

    Heatmapsorted_topics = topics if n_clusters: if n_clusters >= len(set(topics)): - raise ValueError("Make sure to set `n_clusters` lower than " - "the total number of unique topics.") + raise ValueError("Make sure to set `n_clusters` lower than " "the total number of unique topics.") distance_matrix = cosine_similarity(embeddings[topics]) - Z = linkage(distance_matrix, 'ward') - clusters = fcluster(Z, t=n_clusters, criterion='maxclust') + Z = linkage(distance_matrix, "ward") + clusters = fcluster(Z, t=n_clusters, criterion="maxclust") # Extract new order of topics mapping = {cluster: [] for cluster in clusters} @@ -2803,7 +2810,9 @@

    Heatmap# Create labels if isinstance(custom_labels, str): - new_labels = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in sorted_topics] + new_labels = [ + [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in sorted_topics + ] new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] elif topic_model.custom_labels_ is not None and custom_labels: @@ -2813,34 +2822,29 @@

    Heatmapnew_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] - fig = px.imshow(distance_matrix, - labels=dict(color="Similarity Score"), - x=new_labels, - y=new_labels, - color_continuous_scale='GnBu' - ) + fig = px.imshow( + distance_matrix, + labels=dict(color="Similarity Score"), + x=new_labels, + y=new_labels, + color_continuous_scale="GnBu", + ) fig.update_layout( title={ - 'text': f"{title}", - 'y': .95, - 'x': 0.55, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "y": 0.95, + "x": 0.55, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, width=width, height=height, - hoverlabel=dict( - bgcolor="white", - font_size=16, - font_family="Rockwell" - ), + hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"), ) fig.update_layout(showlegend=True) - fig.update_layout(legend_title_text='Trend') + fig.update_layout(legend_title_text="Trend") return fig
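The functional change in this file is the new `use_ctfidf` flag. A sketch of how it might be exercised, assuming the module-level `visualize_heatmap` is importable from `bertopic.plotting` (as in recent releases) and that the fitted model produces more than `n_clusters` topics:

```python
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic
from bertopic.plotting import visualize_heatmap

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic().fit(docs)

# Default behaviour: similarities come from the embedding-model topic embeddings
fig = visualize_heatmap(topic_model, n_clusters=10)

# With the flag enabled, similarities are computed from the c-TF-IDF vectors instead
fig_ctfidf = visualize_heatmap(topic_model, n_clusters=10, use_ctfidf=True)
```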

    diff --git a/api/plotting/hierarchical_documents.html b/api/plotting/hierarchical_documents.html index e1148d51..c4c60190 100755 --- a/api/plotting/hierarchical_documents.html +++ b/api/plotting/hierarchical_documents.html @@ -2616,7 +2616,7 @@

    Hierarchical Documents
-Visualize documents and their topics in 2D at different levels of hierarchy
+Visualize documents and their topics in 2D at different levels of hierarchy.

    Parameters:

    None
    use_ctfidfbool

    Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings + from the embedding model are used.

    False
    custom_labels Union[bool, str]

    If bool, whether to use custom topic labels that were defined using +

    If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1".

    False
    @@ -2629,6 +2629,12 @@

    Hierarchical DocumentsHierarchical DocumentsHierarchical Documents

    @@ -2780,24 +2786,27 @@

    Hierarchical Documents Source code in bertopic\plotting\_hierarchical_documents.py -
    def visualize_hierarchical_documents(topic_model,
    -                                     docs: List[str],
    -                                     hierarchical_topics: pd.DataFrame,
    -                                     topics: List[int] = None,
    -                                     embeddings: np.ndarray = None,
    -                                     reduced_embeddings: np.ndarray = None,
    -                                     sample: Union[float, int] = None,
    -                                     hide_annotations: bool = False,
    -                                     hide_document_hover: bool = True,
    -                                     nr_levels: int = 10,
    -                                     level_scale: str = 'linear', 
    -                                     custom_labels: Union[bool, str] = False,
    -                                     title: str = "<b>Hierarchical Documents and Topics</b>",
    -                                     width: int = 1200,
    -                                     height: int = 750) -> go.Figure:
    -    """ Visualize documents and their topics in 2D at different levels of hierarchy
    +          
    def visualize_hierarchical_documents(
    +    topic_model,
    +    docs: List[str],
    +    hierarchical_topics: pd.DataFrame,
    +    topics: List[int] = None,
    +    embeddings: np.ndarray = None,
    +    reduced_embeddings: np.ndarray = None,
    +    sample: Union[float, int] = None,
    +    hide_annotations: bool = False,
    +    hide_document_hover: bool = True,
    +    nr_levels: int = 10,
    +    level_scale: str = "linear",
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Hierarchical Documents and Topics</b>",
    +    width: int = 1200,
    +    height: int = 750,
    +) -> go.Figure:
    +    """Visualize documents and their topics in 2D at different levels of hierarchy.
     
         Arguments:
    +        topic_model: A fitted BERTopic instance.
             docs: The documents you used when calling either `fit` or `fit_transform`
             hierarchical_topics: A dataframe that contains a hierarchy of topics
                                  represented by their parents and their children
    @@ -2815,27 +2824,26 @@ 

    Hierarchical Documents hide_document_hover: Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualizations. nr_levels: The number of levels to be visualized in the hierarchy. First, the distances - in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances. - Then, for each list of distances, the merged topics are selected that have a + in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances. + Then, for each list of distances, the merged topics are selected that have a distance less or equal to the maximum distance of the selected list of distances. NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to the length of `hierarchical_topics`. - level_scale: Whether to apply a linear or logarithmic (log) scale levels of the distance - vector. Linear scaling will perform an equal number of merges at each level - while logarithmic scaling will perform more mergers in earlier levels to - provide more resolution at higher levels (this can be used for when the number - of topics is large). - custom_labels: If bool, whether to use custom topic labels that were defined using + level_scale: Whether to apply a linear or logarithmic (log) scale levels of the distance + vector. Linear scaling will perform an equal number of merges at each level + while logarithmic scaling will perform more mergers in earlier levels to + provide more resolution at higher levels (this can be used for when the number + of topics is large). + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". - NOTE: Custom labels are only generated for the original + NOTE: Custom labels are only generated for the original un-merged topics. title: Title of the plot. width: The width of the figure. height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -2877,7 +2885,7 @@

    Hierarchical Documents fig.write_html("path/to/file.html") ``` - NOTE: + Note: This visualization was inspired by the scatter plot representation of Doc2Map: https://github.com/louisgeisler/Doc2Map @@ -2893,7 +2901,7 @@

    Hierarchical Documentsindices = [] for topic in set(topic_per_doc): s = np.where(np.array(topic_per_doc) == topic)[0] - size = len(s) if len(s) < 100 else int(len(s)*sample) + size = len(s) if len(s) < 100 else int(len(s) * sample) indices.extend(np.random.choice(s, size=size, replace=False)) indices = np.array(indices) @@ -2915,7 +2923,7 @@

    Hierarchical Documents# Reduce input embeddings if reduced_embeddings is None: - umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce) + umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce) embeddings_2d = umap_model.embedding_ elif sample is not None and reduced_embeddings is not None: embeddings_2d = reduced_embeddings[indices] @@ -2928,17 +2936,28 @@

    Hierarchical Documents# Create topic list for each level, levels are created by calculating the distance distances = hierarchical_topics.Distance.to_list() - if level_scale == 'log' or level_scale == 'logarithmic': - log_indices = np.round(np.logspace(start=math.log(1,10), stop=math.log(len(distances)-1,10), num=nr_levels)).astype(int).tolist() + if level_scale == "log" or level_scale == "logarithmic": + log_indices = ( + np.round( + np.logspace( + start=math.log(1, 10), + stop=math.log(len(distances) - 1, 10), + num=nr_levels, + ) + ) + .astype(int) + .tolist() + ) log_indices.reverse() max_distances = [distances[i] for i in log_indices] - elif level_scale == 'lin' or level_scale == 'linear': - max_distances = [distances[indices[-1]] for indices in np.array_split(range(len(hierarchical_topics)), nr_levels)][::-1] + elif level_scale == "lin" or level_scale == "linear": + max_distances = [ + distances[indices[-1]] for indices in np.array_split(range(len(hierarchical_topics)), nr_levels) + ][::-1] else: raise ValueError("level_scale needs to be one of 'log' or 'linear'") for index, max_distance in enumerate(max_distances): - # Get topics below `max_distance` mapping = {topic: topic for topic in df.topic.unique()} selection = hierarchical_topics.loc[hierarchical_topics.Distance <= max_distance, :] @@ -2969,17 +2988,28 @@

    Hierarchical Documentsif topic < hierarchical_topics.Parent_ID.astype(int).min(): if topic_model.get_topic(topic): if isinstance(custom_labels, str): - trace_name = f"{topic}_" + "_".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3]) + trace_name = f"{topic}_" + "_".join( + list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3] + ) elif topic_model.custom_labels_ is not None and custom_labels: trace_name = topic_model.custom_labels_[topic + topic_model._outliers] else: trace_name = f"{topic}_" + "_".join([word[:20] for word, _ in topic_model.get_topic(topic)][:3]) - topic_names[topic] = {"trace_name": trace_name[:40], "plot_text": trace_name[:40]} + topic_names[topic] = { + "trace_name": trace_name[:40], + "plot_text": trace_name[:40], + } trace_names.append(trace_name) else: - trace_name = f"{topic}_" + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0] + trace_name = ( + f"{topic}_" + + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), "Parent_Name"].values[0] + ) plot_text = "_".join([name[:20] for name in trace_name.split("_")[:3]]) - topic_names[topic] = {"trace_name": trace_name[:40], "plot_text": plot_text[:40]} + topic_names[topic] = { + "trace_name": trace_name[:40], + "plot_text": plot_text[:40], + } trace_names.append(trace_name) # Prepare traces @@ -2990,17 +3020,17 @@

    Hierarchical Documents# Outliers if topic_model._outliers: traces.append( - go.Scattergl( - x=df.loc[(df[f"level_{level+1}"] == -1), "x"], - y=df.loc[df[f"level_{level+1}"] == -1, "y"], - mode='markers+text', - name="other", - hoverinfo="text", - hovertext=df.loc[(df[f"level_{level+1}"] == -1), "doc"] if not hide_document_hover else None, - showlegend=False, - marker=dict(color='#CFD8DC', size=5, opacity=0.5) - ) + go.Scattergl( + x=df.loc[(df[f"level_{level+1}"] == -1), "x"], + y=df.loc[df[f"level_{level+1}"] == -1, "y"], + mode="markers+text", + name="other", + hoverinfo="text", + hovertext=df.loc[(df[f"level_{level+1}"] == -1), "doc"] if not hide_document_hover else None, + showlegend=False, + marker=dict(color="#CFD8DC", size=5, opacity=0.5), ) + ) # Selected topics if topics: @@ -3012,8 +3042,7 @@

    Hierarchical Documentsfor topic in unique_topics: if topic != -1: if topics: - selection = df.loc[(df[f"level_{level+1}"] == topic) & - (df.topic.isin(topics)), :] + selection = df.loc[(df[f"level_{level+1}"] == topic) & (df.topic.isin(topics)), :] else: selection = df.loc[df[f"level_{level+1}"] == topic, :] @@ -3032,8 +3061,8 @@

    Hierarchical Documentshovertext=selection.doc if not hide_document_hover else None, hoverinfo="text", name=topic_names[int(topic)]["trace_name"], - mode='markers+text', - marker=dict(size=5, opacity=0.5) + mode="markers+text", + marker=dict(size=5, opacity=0.5), ) ) @@ -3063,27 +3092,39 @@

    Hierarchical Documentsstep = dict( method="update", label=str(index), - args=[{"visible": [False] * len(fig.data)}] + args=[{"visible": [False] * len(fig.data)}], ) - for index in range(indices[1]-indices[0]): - step["args"][0]["visible"][index+indices[0]] = True + for index in range(indices[1] - indices[0]): + step["args"][0]["visible"][index + indices[0]] = True steps.append(step) - sliders = [dict( - currentvalue={"prefix": "Level: "}, - pad={"t": 20}, - steps=steps - )] + sliders = [dict(currentvalue={"prefix": "Level: "}, pad={"t": 20}, steps=steps)] # Add grid in a 'plus' shape - x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15)) - y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15)) - fig.add_shape(type="line", - x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1], - line=dict(color="#CFD8DC", width=2)) - fig.add_shape(type="line", - x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2, - line=dict(color="#9E9E9E", width=2)) + x_range = ( + df.x.min() - abs((df.x.min()) * 0.15), + df.x.max() + abs((df.x.max()) * 0.15), + ) + y_range = ( + df.y.min() - abs((df.y.min()) * 0.15), + df.y.max() + abs((df.y.max()) * 0.15), + ) + fig.add_shape( + type="line", + x0=sum(x_range) / 2, + y0=y_range[0], + x1=sum(x_range) / 2, + y1=y_range[1], + line=dict(color="#CFD8DC", width=2), + ) + fig.add_shape( + type="line", + x0=x_range[0], + y0=sum(y_range) / 2, + x1=x_range[1], + y1=sum(y_range) / 2, + line=dict(color="#9E9E9E", width=2), + ) fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10) fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10) @@ -3092,13 +3133,11 @@

    Hierarchical Documentssliders=sliders, template="simple_white", title={ - 'text': f"{title}", - 'x': 0.5, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "x": 0.5, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, width=width, height=height, diff --git a/api/plotting/hierarchy.html b/api/plotting/hierarchy.html index 364ef002..b852b2ee 100755 --- a/api/plotting/hierarchy.html +++ b/api/plotting/hierarchy.html @@ -2616,10 +2616,10 @@
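For `visualize_hierarchical_documents`, a usage sketch along the lines of the docstring example above; pre-reducing the embeddings with UMAP is optional, and the model and data choices are illustrative:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from umap import UMAP
from bertopic import BERTopic

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

topic_model = BERTopic(embedding_model=sentence_model).fit(docs, embeddings)
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Reduce the embeddings up front so the plot does not have to run UMAP itself
reduced_embeddings = UMAP(
    n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine"
).fit_transform(embeddings)

fig = topic_model.visualize_hierarchical_documents(
    docs, hierarchical_topics, reduced_embeddings=reduced_embeddings, nr_levels=10
)
```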

    Hierarchy
-Visualize a hierarchical structure of the topics
+Visualize a hierarchical structure of the topics.
A ward linkage function is used to perform the hierarchical clustering based on the cosine distance
-matrix between topic embeddings.
+matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding model).

    Parameters:

    False
    @@ -2657,13 +2657,20 @@

    Hierarchy

    Only select the top n most frequent topics

    + + + + + + - @@ -2708,10 +2715,10 @@

    HierarchyCallable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix]

    @@ -2765,23 +2772,26 @@

    Hierarchy Source code in bertopic\plotting\_hierarchy.py -
    def visualize_hierarchy(topic_model,
    -                        orientation: str = "left",
    -                        topics: List[int] = None,
    -                        top_n_topics: int = None,
    -                        custom_labels: Union[bool, str] = False,
    -                        title: str = "<b>Hierarchical Clustering</b>",
    -                        width: int = 1000,
    -                        height: int = 600,
    -                        hierarchical_topics: pd.DataFrame = None,
    -                        linkage_function: Callable[[csr_matrix], np.ndarray] = None,
    -                        distance_function: Callable[[csr_matrix], csr_matrix] = None,
    -                        color_threshold: int = 1) -> go.Figure:
    -    """ Visualize a hierarchical structure of the topics
    +          
    def visualize_hierarchy(
    +    topic_model,
    +    orientation: str = "left",
    +    topics: List[int] = None,
    +    top_n_topics: int = None,
    +    use_ctfidf: bool = True,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Hierarchical Clustering</b>",
    +    width: int = 1000,
    +    height: int = 600,
    +    hierarchical_topics: pd.DataFrame = None,
    +    linkage_function: Callable[[csr_matrix], np.ndarray] = None,
    +    distance_function: Callable[[csr_matrix], csr_matrix] = None,
    +    color_threshold: int = 1,
    +) -> go.Figure:
    +    """Visualize a hierarchical structure of the topics.
     
         A ward linkage function is used to perform the
         hierarchical clustering based on the cosine distance
    -    matrix between topic embeddings.
    +    matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding model).
     
         Arguments:
             topic_model: A fitted BERTopic instance.
    @@ -2789,10 +2799,12 @@ 

    Hierarchy Either 'left' or 'bottom' topics: A selection of topics to visualize top_n_topics: Only select the top n most frequent topics - custom_labels: If bool, whether to use custom topic labels that were defined using + use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings + from the embedding model are used. + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". - NOTE: Custom labels are only generated for the original + NOTE: Custom labels are only generated for the original un-merged topics. title: Title of the plot. width: The width of the figure. Only works if orientation is set to 'left' @@ -2807,10 +2819,10 @@

    Hierarchy in `topic_model.hierarchical_topics`. distance_function: The distance function to use on the c-TF-IDF matrix. Default is: `lambda x: 1 - cosine_similarity(x)`. - You can pass any function that returns either a square matrix of - shape (n_samples, n_samples) with zeros on the diagonal and - non-negative values or condensed distance matrix of shape - (n_samples * (n_samples - 1) / 2,) containing the upper + You can pass any function that returns either a square matrix of + shape (n_samples, n_samples) with zeros on the diagonal and + non-negative values or condensed distance matrix of shape + (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix. NOTE: Make sure to use the same `distance_function` as used in `topic_model.hierarchical_topics`. @@ -2822,7 +2834,6 @@

    Hierarchy fig: A plotly figure Examples: - To visualize the hierarchical structure of topics simply run: @@ -2854,7 +2865,7 @@

    Hierarchydistance_function = lambda x: 1 - cosine_similarity(x) if linkage_function is None: - linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True) + linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True) # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() @@ -2871,85 +2882,88 @@

    Hierarchyindices = np.array([all_topics.index(topic) for topic in topics]) # Select topic embeddings - if topic_model.c_tf_idf_ is not None: - embeddings = topic_model.c_tf_idf_[indices] - else: - embeddings = np.array(topic_model.topic_embeddings_)[indices] + embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][ + indices + ] # Annotations if hierarchical_topics is not None and len(topics) == len(freq_df.Topic.to_list()): - annotations = _get_annotations(topic_model=topic_model, - hierarchical_topics=hierarchical_topics, - embeddings=embeddings, - distance_function=distance_function, - linkage_function=linkage_function, - orientation=orientation, - custom_labels=custom_labels) + annotations = _get_annotations( + topic_model=topic_model, + hierarchical_topics=hierarchical_topics, + embeddings=embeddings, + distance_function=distance_function, + linkage_function=linkage_function, + orientation=orientation, + custom_labels=custom_labels, + ) else: annotations = None # wrap distance function to validate input and return a condensed distance matrix - distance_function_viz = lambda x: validate_distance_matrix( - distance_function(x), embeddings.shape[0]) + distance_function_viz = lambda x: validate_distance_matrix(distance_function(x), embeddings.shape[0]) # Create dendogram - fig = ff.create_dendrogram(embeddings, - orientation=orientation, - distfun=distance_function_viz, - linkagefun=linkage_function, - hovertext=annotations, - color_threshold=color_threshold) + fig = ff.create_dendrogram( + embeddings, + orientation=orientation, + distfun=distance_function_viz, + linkagefun=linkage_function, + hovertext=annotations, + color_threshold=color_threshold, + ) # Create nicer labels axis = "yaxis" if orientation == "left" else "xaxis" if isinstance(custom_labels, str): - new_labels = [[[str(x), None]] + topic_model.topic_aspects_[custom_labels][x] for x in fig.layout[axis]["ticktext"]] + new_labels = [ + [[str(x), None]] + topic_model.topic_aspects_[custom_labels][x] for x in fig.layout[axis]["ticktext"] + ] new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] new_labels = [label if len(label) < 30 else label[:27] + "..." for label in new_labels] elif topic_model.custom_labels_ is not None and custom_labels: - new_labels = [topic_model.custom_labels_[topics[int(x)] + topic_model._outliers] for x in fig.layout[axis]["ticktext"]] + new_labels = [ + topic_model.custom_labels_[topics[int(x)] + topic_model._outliers] for x in fig.layout[axis]["ticktext"] + ] else: - new_labels = [[[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)]) - for x in fig.layout[axis]["ticktext"]] + new_labels = [ + [[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)]) for x in fig.layout[axis]["ticktext"] + ] new_labels = ["_".join([label[0] for label in labels[:4]]) for labels in new_labels] new_labels = [label if len(label) < 30 else label[:27] + "..." 
for label in new_labels] # Stylize layout fig.update_layout( - plot_bgcolor='#ECEFF1', + plot_bgcolor="#ECEFF1", template="plotly_white", title={ - 'text': f"{title}", - 'x': 0.5, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "x": 0.5, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, - hoverlabel=dict( - bgcolor="white", - font_size=16, - font_family="Rockwell" - ), + hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"), ) # Stylize orientation if orientation == "left": - fig.update_layout(height=200 + (15 * len(topics)), - width=width, - yaxis=dict(tickmode="array", - ticktext=new_labels)) + fig.update_layout( + height=200 + (15 * len(topics)), + width=width, + yaxis=dict(tickmode="array", ticktext=new_labels), + ) # Fix empty space on the bottom of the graph - y_max = max([trace['y'].max() + 5 for trace in fig['data']]) - y_min = min([trace['y'].min() - 5 for trace in fig['data']]) + y_max = max([trace["y"].max() + 5 for trace in fig["data"]]) + y_min = min([trace["y"].min() - 5 for trace in fig["data"]]) fig.update_layout(yaxis=dict(range=[y_min, y_max])) else: - fig.update_layout(width=200 + (15 * len(topics)), - height=height, - xaxis=dict(tickmode="array", - ticktext=new_labels)) + fig.update_layout( + width=200 + (15 * len(topics)), + height=height, + xaxis=dict(tickmode="array", ticktext=new_labels), + ) if hierarchical_topics is not None: for index in [0, 3]: @@ -2958,9 +2972,17 @@

    Hierarchyys = [data["y"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)] hovertext = [data["text"][index] for data in fig.data if (data["text"] and data[axis][index] > 0)] - fig.add_trace(go.Scatter(x=xs, y=ys, marker_color='black', - hovertext=hovertext, hoverinfo="text", - mode='markers', showlegend=False)) + fig.add_trace( + go.Scatter( + x=xs, + y=ys, + marker_color="black", + hovertext=hovertext, + hoverinfo="text", + mode="markers", + showlegend=False, + ) + ) return fig
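`visualize_hierarchy` likewise gains a `use_ctfidf` flag, defaulting to `True` in this revision. A sketch, again assuming the module-level function is importable from `bertopic.plotting`:

```python
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic
from bertopic.plotting import visualize_hierarchy

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic().fit(docs)

# Default here (use_ctfidf=True): ward linkage over the c-TF-IDF topic vectors
fig = visualize_hierarchy(topic_model, top_n_topics=30)

# Switching the flag off falls back to the embedding-model topic embeddings
fig_embeddings = visualize_hierarchy(topic_model, top_n_topics=30, use_ctfidf=False)
```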

    diff --git a/api/plotting/term.html b/api/plotting/term.html index a3c5a06e..4709a60e 100755 --- a/api/plotting/term.html +++ b/api/plotting/term.html @@ -2616,7 +2616,7 @@

    Term Score Decline
-Visualize the ranks of all terms across all topics
+Visualize the ranks of all terms across all topics.

    Each topic is represented by a set of words. These words, however, do not all equally represent the topic. This visualization shows how many words are needed to represent a topic and at which point @@ -2655,7 +2655,7 @@

    Term Score Decline

    - @@ -2720,14 +2720,16 @@

    Term Score Decline Source code in bertopic\plotting\_term_rank.py -
    def visualize_term_rank(topic_model,
    -                        topics: List[int] = None,
    -                        log_scale: bool = False,
    -                        custom_labels: Union[bool, str] = False,
    -                        title: str = "<b>Term score decline per Topic</b>",
    -                        width: int = 800,
    -                        height: int = 500) -> go.Figure:
    -    """ Visualize the ranks of all terms across all topics
    +          
    def visualize_term_rank(
    +    topic_model,
    +    topics: List[int] = None,
    +    log_scale: bool = False,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Term score decline per Topic</b>",
    +    width: int = 800,
    +    height: int = 500,
    +) -> go.Figure:
    +    """Visualize the ranks of all terms across all topics.
     
         Each topic is represented by a set of words. These words, however,
         do not all equally represent the topic. This visualization shows
    @@ -2739,7 +2741,7 @@ 

    Term Score Decline topics: A selection of topics to visualize. These will be colored red where all others will be colored black. log_scale: Whether to represent the ranking on a log scale - custom_labels: If bool, whether to use custom topic labels that were defined using + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". title: Title of the plot. @@ -2750,7 +2752,6 @@

    Term Score Decline fig: A plotly figure Examples: - To visualize the ranks of all words across all topics simply run: @@ -2779,7 +2780,6 @@

    Term Score Decline Reference to that specific analysis can be found [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html). """ - topics = [] if topics is None else topics topic_ids = topic_model.get_topic_info().Topic.unique().tolist() @@ -2792,7 +2792,6 @@

    Term Score Declinelines = [] for topic, x, y in zip(topic_ids, indices, values): if not any(y > 1.5): - # labels if isinstance(custom_labels, str): label = f"{topic}_" + "_".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3]) @@ -2804,17 +2803,20 @@

    Term Score Decline# line parameters color = "red" if topic in topics else "black" - opacity = 1 if topic in topics else .1 + opacity = 1 if topic in topics else 0.1 if any(y == 0): y[y == 0] = min(values[values > 0]) y = np.log10(y, out=y, where=y > 0) if log_scale else y - line = go.Scatter(x=x, y=y, - name="", - hovertext=label, - mode="lines+lines", - opacity=opacity, - line=dict(color=color, width=1.5)) + line = go.Scatter( + x=x, + y=y, + name="", + hovertext=label, + mode="lines+lines", + opacity=opacity, + line=dict(color=color, width=1.5), + ) lines.append(line) fig = go.Figure(data=lines) @@ -2825,29 +2827,23 @@

    Term Score Declineshowlegend=False, template="plotly_white", title={ - 'text': f"{title}", - 'y': .9, - 'x': 0.5, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "y": 0.9, + "x": 0.5, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, width=width, height=height, - hoverlabel=dict( - bgcolor="white", - font_size=16, - font_family="Rockwell" - ), + hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"), ) - fig.update_xaxes(title_text='Term Rank') + fig.update_xaxes(title_text="Term Rank") if log_scale: - fig.update_yaxes(title_text='c-TF-IDF score (log scale)') + fig.update_yaxes(title_text="c-TF-IDF score (log scale)") else: - fig.update_yaxes(title_text='c-TF-IDF score') + fig.update_yaxes(title_text="c-TF-IDF score") return fig
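A short usage sketch for `visualize_term_rank`; the data choice is illustrative:

```python
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic().fit(docs)

# Linear y-axis by default; a log scale often makes the decline easier to compare
fig = topic_model.visualize_term_rank()
fig_log = topic_model.visualize_term_rank(log_scale=True)
```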

    diff --git a/api/plotting/topics.html b/api/plotting/topics.html index 416fd4d0..5e71da18 100755 --- a/api/plotting/topics.html +++ b/api/plotting/topics.html @@ -2616,7 +2616,7 @@

    Topics
-Visualize topics, their sizes, and their corresponding words
+Visualize topics, their sizes, and their corresponding words.

    This visualization is highly inspired by LDAvis, a great visualization technique typically reserved for LDA.

    @@ -2649,10 +2649,16 @@

    Topics

    @@ -2691,14 +2697,17 @@

    Topics Source code in bertopic\plotting\_topics.py -
    def visualize_topics(topic_model,
    -                     topics: List[int] = None,
    -                     top_n_topics: int = None,
    -                     custom_labels: Union[bool, str] = False,
    -                     title: str = "<b>Intertopic Distance Map</b>",
    -                     width: int = 650,
    -                     height: int = 650) -> go.Figure:
    -    """ Visualize topics, their sizes, and their corresponding words
    +          
    def visualize_topics(
    +    topic_model,
    +    topics: List[int] = None,
    +    top_n_topics: int = None,
    +    use_ctfidf: bool = False,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Intertopic Distance Map</b>",
    +    width: int = 650,
    +    height: int = 650,
    +) -> go.Figure:
    +    """Visualize topics, their sizes, and their corresponding words.
     
         This visualization is highly inspired by LDAvis, a great visualization
         technique typically reserved for LDA.
    @@ -2707,7 +2716,8 @@ 

    Topics topic_model: A fitted BERTopic instance. topics: A selection of topics to visualize top_n_topics: Only select the top n most frequent topics - custom_labels: If bool, whether to use custom topic labels that were defined using + use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model. + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". title: Title of the plot. @@ -2715,7 +2725,6 @@

    Topics height: The height of the figure. Examples: - To visualize the topics simply run: ```python @@ -2757,17 +2766,30 @@

    Topicsall_topics = sorted(list(topic_model.get_topics().keys())) indices = np.array([all_topics.index(topic) for topic in topics]) - if topic_model.topic_embeddings_ is not None: - embeddings = topic_model.topic_embeddings_[indices] - embeddings = UMAP(n_neighbors=2, n_components=2, metric='cosine', random_state=42).fit_transform(embeddings) - else: - embeddings = topic_model.c_tf_idf_.toarray()[indices] + embeddings, c_tfidf_used = select_topic_representation( + topic_model.c_tf_idf_, + topic_model.topic_embeddings_, + use_ctfidf=use_ctfidf, + output_ndarray=True, + ) + embeddings = embeddings[indices] + + if c_tfidf_used: embeddings = MinMaxScaler().fit_transform(embeddings) - embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings) + embeddings = UMAP(n_neighbors=2, n_components=2, metric="hellinger", random_state=42).fit_transform(embeddings) + else: + embeddings = UMAP(n_neighbors=2, n_components=2, metric="cosine", random_state=42).fit_transform(embeddings) # Visualize with plotly - df = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1], - "Topic": topic_list, "Words": words, "Size": frequencies}) + df = pd.DataFrame( + { + "x": embeddings[:, 0], + "y": embeddings[:, 1], + "Topic": topic_list, + "Words": words, + "Size": frequencies, + } + ) return _plotly_topic_visualization(df, topic_list, title, width, height)
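`visualize_topics` also gains `use_ctfidf` (default `False`). A sketch under the same `bertopic.plotting` import assumption as above:

```python
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic
from bertopic.plotting import visualize_topics

docs = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))["data"]
topic_model = BERTopic().fit(docs)

# Default: 2D topic positions are derived from the embedding-model topic embeddings
fig = topic_model.visualize_topics()

# With the new flag, positions are derived from the c-TF-IDF representations instead
fig_ctfidf = visualize_topics(topic_model, use_ctfidf=True)
```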

    diff --git a/api/plotting/topics_per_class.html b/api/plotting/topics_per_class.html index e0f953af..d5b5a6d2 100755 --- a/api/plotting/topics_per_class.html +++ b/api/plotting/topics_per_class.html @@ -2616,7 +2616,7 @@

    Topics per Class
-Visualize topics per class
+Visualize topics per class.

    Parameters:

    None
    use_ctfidfbool

    Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings + from the embedding model are used.

    True
    custom_labels Union[bool, str]

    If bool, whether to use custom topic labels that were defined using +

    If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1". - NOTE: Custom labels are only generated for the original + NOTE: Custom labels are only generated for the original un-merged topics.

    False

    The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x). - You can pass any function that returns either a square matrix of - shape (n_samples, n_samples) with zeros on the diagonal and - non-negative values or condensed distance matrix of shape - (n_samples * (n_samples - 1) / 2,) containing the upper + You can pass any function that returns either a square matrix of + shape (n_samples, n_samples) with zeros on the diagonal and + non-negative values or condensed distance matrix of shape + (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix. NOTE: Make sure to use the same distance_function as used in topic_model.hierarchical_topics.

    custom_labels Union[bool, str]

    If bool, whether to use custom topic labels that were defined using +

    If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1".

    FalseFalse
    @@ -2663,7 +2663,7 @@

    Topics per Class

    - @@ -2718,16 +2718,18 @@

    Topics per Class Source code in bertopic\plotting\_topics_per_class.py -
    def visualize_topics_per_class(topic_model,
    -                               topics_per_class: pd.DataFrame,
    -                               top_n_topics: int = 10,
    -                               topics: List[int] = None,
    -                               normalize_frequency: bool = False,
    -                               custom_labels: Union[bool, str] = False,
    -                               title: str = "<b>Topics per Class</b>",
    -                               width: int = 1250,
    -                               height: int = 900) -> go.Figure:
    -    """ Visualize topics per class
    +          
    def visualize_topics_per_class(
    +    topic_model,
    +    topics_per_class: pd.DataFrame,
    +    top_n_topics: int = 10,
    +    topics: List[int] = None,
    +    normalize_frequency: bool = False,
    +    custom_labels: Union[bool, str] = False,
    +    title: str = "<b>Topics per Class</b>",
    +    width: int = 1250,
    +    height: int = 900,
    +) -> go.Figure:
    +    """Visualize topics per class.
     
         Arguments:
             topic_model: A fitted BERTopic instance.
    @@ -2736,7 +2738,7 @@ 

    Topics per Class top_n_topics: To visualize the most frequent topics instead of all topics: Select which topics you would like to be visualized normalize_frequency: Whether to normalize each topic's frequency individually - custom_labels: If bool, whether to use custom topic labels that were defined using + custom_labels: If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If `str`, it uses labels from other aspects, e.g., "Aspect1". title: Title of the plot. @@ -2747,7 +2749,6 @@

    Topics per Class A plotly.graph_objects.Figure including all traces Examples: - To visualize the topics per class, simply run: ```python @@ -2764,7 +2765,15 @@

    Topics per Class <iframe src="../../getting_started/visualization/topics_per_class.html" style="width:1400px; height: 1000px; border: 0px;""></iframe> """ - colors = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#D55E00", "#0072B2", "#CC79A7"] + colors = [ + "#E69F00", + "#56B4E9", + "#009E73", + "#F0E442", + "#D55E00", + "#0072B2", + "#CC79A7", + ] # Select topics based on top_n and topics args freq_df = topic_model.get_topic_freq() @@ -2783,10 +2792,13 @@

    Topics per Classtopic_names = [label if len(label) < 30 else label[:27] + "..." for label in topic_names] topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())} elif topic_model.custom_labels_ is not None and custom_labels: - topic_names = {key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()} + topic_names = { + key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items() + } else: - topic_names = {key: value[:40] + "..." if len(value) > 40 else value - for key, value in topic_model.topic_labels_.items()} + topic_names = { + key: value[:40] + "..." if len(value) > 40 else value for key, value in topic_model.topic_labels_.items() + } topics_per_class["Name"] = topics_per_class.Topic.map(topic_names) data = topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :] @@ -2804,14 +2816,18 @@

    Topics per Classx = normalize(trace_data.Frequency.values.reshape(1, -1))[0] else: x = trace_data.Frequency - fig.add_trace(go.Bar(y=trace_data.Class, - x=x, - visible=visible, - marker_color=colors[index % 7], - hoverinfo="text", - name=topic_name, - orientation="h", - hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words])) + fig.add_trace( + go.Bar( + y=trace_data.Class, + x=x, + visible=visible, + marker_color=colors[index % 7], + hoverinfo="text", + name=topic_name, + orientation="h", + hovertext=[f"<b>Topic {topic}</b><br>Words: {word}" for word in words], + ) + ) # Styling of the visualization fig.update_xaxes(showgrid=True) @@ -2820,26 +2836,20 @@

    Topics per Classxaxis_title="Normalized Frequency" if normalize_frequency else "Frequency", yaxis_title="Class", title={ - 'text': f"{title}", - 'y': .95, - 'x': 0.40, - 'xanchor': 'center', - 'yanchor': 'top', - 'font': dict( - size=22, - color="Black") + "text": f"{title}", + "y": 0.95, + "x": 0.40, + "xanchor": "center", + "yanchor": "top", + "font": dict(size=22, color="Black"), }, template="simple_white", width=width, height=height, - hoverlabel=dict( - bgcolor="white", - font_size=16, - font_family="Rockwell" - ), + hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell"), legend=dict( title="<b>Global Topic Representation", - ) + ), ) return fig
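For `visualize_topics_per_class`, a sketch that derives per-document class labels from the 20 Newsgroups targets; any list of one label per document would do:

```python
from sklearn.datasets import fetch_20newsgroups
from bertopic import BERTopic

data = fetch_20newsgroups(subset="all", remove=("headers", "footers", "quotes"))
docs = data["data"]
classes = [data["target_names"][i] for i in data["target"]]

topic_model = BERTopic().fit(docs)
topics_per_class = topic_model.topics_per_class(docs, classes=classes)
fig = topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)
```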

    diff --git a/api/representation/base.html b/api/representation/base.html index d10bfc11..0f8dd03f 100755 --- a/api/representation/base.html +++ b/api/representation/base.html @@ -2632,19 +2632,21 @@

    BaseRepresentation
-The base representation model for fine-tuning topic representations
+The base representation model for fine-tuning topic representations.

    Source code in bertopic\representation\_base.py
    class BaseRepresentation(BaseEstimator):
    -    """ The base representation model for fine-tuning topic representations """
    -    def extract_topics(self,
    -                       topic_model,
    -                       documents: pd.DataFrame,
    -                       c_tf_idf: csr_matrix,
    -                       topics: Mapping[str, List[Tuple[str, float]]]
    -                       ) -> Mapping[str, List[Tuple[str, float]]]:
    -        """ Extract topics
    +    """The base representation model for fine-tuning topic representations."""
    +
    +    def extract_topics(
    +        self,
    +        topic_model,
    +        documents: pd.DataFrame,
    +        c_tf_idf: csr_matrix,
    +        topics: Mapping[str, List[Tuple[str, float]]],
    +    ) -> Mapping[str, List[Tuple[str, float]]]:
    +        """Extract topics.
     
             Each representation model that inherits this class will have
             its arguments (topic_model, documents, c_tf_idf, topics)
    @@ -2695,7 +2697,7 @@ 

-Extract topics
+Extract topics.

    Each representation model that inherits this class will have its arguments (topic_model, documents, c_tf_idf, topics) automatically passed. Therefore, the representation model @@ -2750,13 +2752,14 @@

    Source code in bertopic\representation\_base.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, float]]]:
    -    """ Extract topics
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, float]]]:
    +    """Extract topics.
     
         Each representation model that inherits this class will have
         its arguments (topic_model, documents, c_tf_idf, topics)
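Because the listing above shows only the reformatted base class, a toy subclass may help illustrate the contract `extract_topics` is expected to fulfil. The `UppercaseRepresentation` name is invented for this sketch; only the signature comes from the source above, and the import path assumes `BaseRepresentation` is exported from `bertopic.representation` as in recent releases:

```python
from typing import List, Mapping, Tuple

import pandas as pd
from scipy.sparse import csr_matrix
from bertopic.representation import BaseRepresentation


class UppercaseRepresentation(BaseRepresentation):
    """Toy fine-tuner: keeps the c-TF-IDF words and weights but upper-cases the words."""

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        # The arguments are passed in automatically by BERTopic; only `topics`
        # is used here, and a mapping of the same shape is returned.
        return {
            topic: [(word.upper(), value) for word, value in words]
            for topic, words in topics.items()
        }
```

Plugging it in would then look like `BERTopic(representation_model=UppercaseRepresentation())`.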
    diff --git a/api/representation/cohere.html b/api/representation/cohere.html
    index c4229833..f5dd8b4c 100755
    --- a/api/representation/cohere.html
    +++ b/api/representation/cohere.html
    @@ -2675,8 +2675,8 @@ 

    CohereCohereCohereCohere Source code in bertopic\representation\_cohere.py
    class Cohere(BaseRepresentation):
    -    """ Use the Cohere API to generate topic labels based on their
    +    """Use the Cohere API to generate topic labels based on their
         generative model.
     
         Find more about their models here:
    @@ -2758,26 +2758,26 @@ 

    Cohere NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt to decide where the keywords and documents need to be inserted. - delay_in_seconds: The delay in seconds between consecutive prompts - in order to prevent RateLimitErrors. + delay_in_seconds: The delay in seconds between consecutive prompts + in order to prevent RateLimitErrors. nr_docs: The number of documents to pass to OpenAI if a prompt with the `["DOCUMENTS"]` tag is used. diversity: The diversity of documents to pass to OpenAI. - Accepts values between 0 and 1. A higher + Accepts values between 0 and 1. A higher values results in passing more diverse documents whereas lower values passes more similar documents. doc_length: The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. tokenizer: The tokenizer used to calculate to split the document into segments - used to count the length of a document. - * If tokenizer is 'char', then the document is split up + used to count the length of a document. + * If tokenizer is 'char', then the document is split up into characters which are counted to adhere to `doc_length` * If tokenizer is 'whitespace', the document is split up into words separated by whitespaces. These words are counted and truncated depending on `doc_length` * If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted - and trunctated depending on `doc_length` + and truncated depending on `doc_length` * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` @@ -2810,16 +2810,18 @@

    Cohere representation_model = Cohere(co, prompt=prompt) ``` """ - def __init__(self, - client, - model: str = "xlarge", - prompt: str = None, - delay_in_seconds: float = None, - nr_docs: int = 4, - diversity: float = None, - doc_length: int = None, - tokenizer: Union[str, Callable] = None - ): + + def __init__( + self, + client, + model: str = "xlarge", + prompt: str = None, + delay_in_seconds: float = None, + nr_docs: int = 4, + diversity: float = None, + doc_length: int = None, + tokenizer: Union[str, Callable] = None, + ): self.client = client self.model = model self.prompt = prompt if prompt is not None else DEFAULT_PROMPT @@ -2831,13 +2833,14 @@

    Cohereself.tokenizer = tokenizer self.prompts_ = [] - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract topics + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract topics. Arguments: topic_model: Not used @@ -2849,7 +2852,9 @@

    Cohere updated_topics: Updated topic representations """ # Extract the top 4 representative documents per topic - repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity) + repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs( + c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity + ) # Generate using Cohere's Language Model updated_topics = {} @@ -2862,11 +2867,13 @@

    Cohereif self.delay_in_seconds: time.sleep(self.delay_in_seconds) - request = self.client.generate(model=self.model, - prompt=prompt, - max_tokens=50, - num_generations=1, - stop_sequences=["\n"]) + request = self.client.generate( + model=self.model, + prompt=prompt, + max_tokens=50, + num_generations=1, + stop_sequences=["\n"], + ) label = request.generations[0].text.strip() updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)] @@ -2926,7 +2933,7 @@

    -

    Extract topics

    +

    Extract topics.

    Parameters:

    custom_labels Union[bool, str]

    If bool, whether to use custom topic labels that were defined using +

    If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1".

    False
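As a quick orientation for this part of the diff, here is a minimal usage sketch of the Cohere-based labeller. It assumes the `cohere` package is installed and that `MY_API_KEY` and `docs` are supplied by you; it mirrors the usage shown in the docstring above rather than prescribing anything new:

```python
import cohere
from bertopic import BERTopic
from bertopic.representation import Cohere

# Wrap a Cohere client in the representation model; delay_in_seconds guards
# against rate limits between consecutive prompts
co = cohere.Client(MY_API_KEY)
representation_model = Cohere(co, delay_in_seconds=5)

# Let Cohere fine-tune the topic labels on top of the default c-TF-IDF pipeline
topic_model = BERTopic(representation_model=representation_model)
topics, probs = topic_model.fit_transform(docs)
```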
    @@ -2982,13 +2989,14 @@

    Source code in bertopic\representation\_cohere.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, float]]]:
    -    """ Extract topics
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, float]]]:
    +    """Extract topics.
     
         Arguments:
             topic_model: Not used
    @@ -3000,7 +3008,9 @@ 

        updated_topics: Updated topic representations
    """
    # Extract the top 4 representative documents per topic
-    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)
+    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(
+        c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity
+    )

    # Generate using Cohere's Language Model
    updated_topics = {}
@@ -3013,11 +3023,13 @@

            if self.delay_in_seconds:
                time.sleep(self.delay_in_seconds)

-            request = self.client.generate(model=self.model,
-                                           prompt=prompt,
-                                           max_tokens=50,
-                                           num_generations=1,
-                                           stop_sequences=["\n"])
+            request = self.client.generate(
+                model=self.model,
+                prompt=prompt,
+                max_tokens=50,
+                num_generations=1,
+                stop_sequences=["\n"],
+            )
            label = request.generations[0].text.strip()
            updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

diff --git a/api/representation/generation.html b/api/representation/generation.html
index 979be98d..b05b95f5 100755
--- a/api/representation/generation.html
+++ b/api/representation/generation.html
@@ -2634,7 +2634,7 @@

    TextGeneration
    -

    Text2Text or text generation with transformers

    +

    Text2Text or text generation with transformers.
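Before the parameter listing, a hedged sketch of how this class is typically wired up; the pipeline task and model name below are illustrative assumptions, not requirements of this page:

```python
from transformers import pipeline
from bertopic import BERTopic
from bertopic.representation import TextGeneration

# Any "text-generation" or "text2text-generation" pipeline can be passed in
generator = pipeline("text2text-generation", model="google/flan-t5-base")
representation_model = TextGeneration(generator)

topic_model = BERTopic(representation_model=representation_model)
```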

    Parameters:

    @@ -2714,7 +2714,7 @@

TextGeneration

Source code in bertopic\representation\_textgeneration.py
    class TextGeneration(BaseRepresentation):
    -    """ Text2Text or text generation with transformers
    +    """Text2Text or text generation with transformers.
     
         Arguments:
             model: A transformers pipeline that should be initialized as "text-generation"
    @@ -2779,7 +2779,7 @@ 

    TextGeneration and truncated depending on `doc_length` * If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted - and trunctated depending on `doc_length` + and truncated depending on `doc_length` * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` @@ -2813,16 +2813,18 @@

    TextGeneration representation_model = TextGeneration(generator) ``` """ - def __init__(self, - model: Union[str, pipeline], - prompt: str = None, - pipeline_kwargs: Mapping[str, Any] = {}, - random_state: int = 42, - nr_docs: int = 4, - diversity: float = None, - doc_length: int = None, - tokenizer: Union[str, Callable] = None - ): + + def __init__( + self, + model: Union[str, pipeline], + prompt: str = None, + pipeline_kwargs: Mapping[str, Any] = {}, + random_state: int = 42, + nr_docs: int = 4, + diversity: float = None, + doc_length: int = None, + tokenizer: Union[str, Callable] = None, + ): self.random_state = random_state set_seed(random_state) if isinstance(model, str): @@ -2830,9 +2832,11 @@

    TextGenerationelif isinstance(model, Pipeline): self.model = model else: - raise ValueError("Make sure that the HF model that you" - "pass is either a string referring to a" - "HF model or a `transformers.pipeline` object.") + raise ValueError( + "Make sure that the HF model that you" + "pass is either a string referring to a" + "HF model or a `transformers.pipeline` object." + ) self.prompt = prompt if prompt is not None else DEFAULT_PROMPT self.default_prompt_ = DEFAULT_PROMPT self.pipeline_kwargs = pipeline_kwargs @@ -2843,13 +2847,14 @@

    TextGenerationself.prompts_ = [] - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract topic representations and return a single label + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract topic representations and return a single label. Arguments: topic_model: A BERTopic model @@ -2863,30 +2868,30 @@

    TextGeneration# Extract the top 4 representative documents per topic if self.prompt != DEFAULT_PROMPT and "[DOCUMENTS]" in self.prompt: repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs( - c_tf_idf, - documents, - topics, - 500, - self.nr_docs, - self.diversity + c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity ) else: repr_docs_mappings = {topic: None for topic in topics.keys()} updated_topics = {} for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose): - # Prepare prompt - truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] if docs is not None else docs + truncated_docs = ( + [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] + if docs is not None + else docs + ) prompt = self._create_prompt(truncated_docs, topic, topics) self.prompts_.append(prompt) # Extract result from generator and use that as label topic_description = self.model(prompt, **self.pipeline_kwargs) - topic_description = [(description["generated_text"].replace(prompt, ""), 1) for description in topic_description] + topic_description = [ + (description["generated_text"].replace(prompt, ""), 1) for description in topic_description + ] if len(topic_description) < 10: - topic_description += [("", 0) for _ in range(10-len(topic_description))] + topic_description += [("", 0) for _ in range(10 - len(topic_description))] updated_topics[topic] = topic_description @@ -2940,7 +2945,7 @@

    -

    Extract topic representations and return a single label

    +

    Extract topic representations and return a single label.

    Parameters:

    @@ -2996,13 +3001,14 @@

    Source code in bertopic\representation\_textgeneration.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, float]]]:
    -    """ Extract topic representations and return a single label
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, float]]]:
    +    """Extract topic representations and return a single label.
     
         Arguments:
             topic_model: A BERTopic model
    @@ -3016,30 +3022,30 @@ 

    # Extract the top 4 representative documents per topic if self.prompt != DEFAULT_PROMPT and "[DOCUMENTS]" in self.prompt: repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs( - c_tf_idf, - documents, - topics, - 500, - self.nr_docs, - self.diversity + c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity ) else: repr_docs_mappings = {topic: None for topic in topics.keys()} updated_topics = {} for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose): - # Prepare prompt - truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] if docs is not None else docs + truncated_docs = ( + [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] + if docs is not None + else docs + ) prompt = self._create_prompt(truncated_docs, topic, topics) self.prompts_.append(prompt) # Extract result from generator and use that as label topic_description = self.model(prompt, **self.pipeline_kwargs) - topic_description = [(description["generated_text"].replace(prompt, ""), 1) for description in topic_description] + topic_description = [ + (description["generated_text"].replace(prompt, ""), 1) for description in topic_description + ] if len(topic_description) < 10: - topic_description += [("", 0) for _ in range(10-len(topic_description))] + topic_description += [("", 0) for _ in range(10 - len(topic_description))] updated_topics[topic] = topic_description diff --git a/api/representation/keybert.html b/api/representation/keybert.html index 6f462f34..25ee979e 100755 --- a/api/representation/keybert.html +++ b/api/representation/keybert.html @@ -2650,13 +2650,15 @@

KeyBERTInspired

Source code in bertopic\representation\_keybert.py
    class KeyBERTInspired(BaseRepresentation):
    -    def __init__(self,
    -                 top_n_words: int = 10,
    -                 nr_repr_docs: int = 5,
    -                 nr_samples: int = 500,
    -                 nr_candidate_words: int = 100,
    -                 random_state: int = 42):
    -        """ Use a KeyBERT-like model to fine-tune the topic representations
    +    def __init__(
    +        self,
    +        top_n_words: int = 10,
    +        nr_repr_docs: int = 5,
    +        nr_samples: int = 500,
    +        nr_candidate_words: int = 100,
    +        random_state: int = 42,
    +    ):
    +        """Use a KeyBERT-like model to fine-tune the topic representations.
     
             The algorithm follows KeyBERT but does some optimization in
             order to speed up inference.
    @@ -2703,13 +2705,14 @@ 

    KeyBERTInspiredself.nr_candidate_words = nr_candidate_words self.random_state = random_state - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract topics + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract topics. Arguments: topic_model: A BERTopic model @@ -2721,7 +2724,9 @@

    KeyBERTInspired updated_topics: Updated topic representations """ # We extract the top n representative documents per class - _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs) + _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs( + c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs + ) # We extract the top n words per class topics = self._extract_candidate_words(topic_model, c_tf_idf, topics) @@ -2735,12 +2740,13 @@

    KeyBERTInspiredreturn updated_topics - def _extract_candidate_words(self, - topic_model, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ For each topic, extract candidate words based on the c-TF-IDF + def _extract_candidate_words( + self, + topic_model, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """For each topic, extract candidate words based on the c-TF-IDF representation. Arguments: @@ -2767,23 +2773,25 @@

    KeyBERTInspiredscores = np.take_along_axis(scores, sorted_indices, axis=1) # Get top 30 words per topic based on c-TF-IDF score - topics = {label: [(words[word_index], score) - if word_index is not None and score > 0 - else ("", 0.00001) - for word_index, score in zip(indices[index][::-1], scores[index][::-1]) - ] - for index, label in enumerate(labels)} - topics = {label: list(zip(*values[:self.nr_candidate_words]))[0] for label, values in topics.items()} + topics = { + label: [ + (words[word_index], score) if word_index is not None and score > 0 else ("", 0.00001) + for word_index, score in zip(indices[index][::-1], scores[index][::-1]) + ] + for index, label in enumerate(labels) + } + topics = {label: list(zip(*values[: self.nr_candidate_words]))[0] for label, values in topics.items()} return topics - def _extract_embeddings(self, - topic_model, - topics: Mapping[str, List[Tuple[str, float]]], - representative_docs: List[str], - repr_doc_indices: List[List[int]] - ) -> Union[np.ndarray, List[str]]: - """ Extract the representative document embeddings and create topic embeddings. + def _extract_embeddings( + self, + topic_model, + topics: Mapping[str, List[Tuple[str, float]]], + representative_docs: List[str], + repr_doc_indices: List[List[int]], + ) -> Union[np.ndarray, List[str]]: + """Extract the representative document embeddings and create topic embeddings. Then extract word embeddings and calculate the cosine similarity between topic embeddings and the word embeddings. Topic embeddings are the average of representative document embeddings. @@ -2801,7 +2809,7 @@

    KeyBERTInspired """ # Calculate representative docs embeddings and create topic embeddings repr_embeddings = topic_model._extract_embeddings(representative_docs, method="document", verbose=False) - topic_embeddings = [np.mean(repr_embeddings[i[0]:i[-1]+1], axis=0) for i in repr_doc_indices] + topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices] # Calculate word embeddings and extract best matching with updated topic_embeddings vocab = list(set([word for words in topics.values() for word in words])) @@ -2810,12 +2818,13 @@

    KeyBERTInspiredreturn sim, vocab - def _extract_top_words(self, - vocab: List[str], - topics: Mapping[str, List[Tuple[str, float]]], - sim: np.ndarray - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract the top n words per topic based on the + def _extract_top_words( + self, + vocab: List[str], + topics: Mapping[str, List[Tuple[str, float]]], + sim: np.ndarray, + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract the top n words per topic based on the similarity matrix between topics and words. Arguments: @@ -2832,8 +2841,10 @@

    KeyBERTInspiredfor i, topic in enumerate(labels): indices = [vocab.index(word) for word in topics[topic]] values = sim[:, indices][i] - word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words:]] - updated_topics[topic] = [(vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words:], word_indices)][::-1] + word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words :]] + updated_topics[topic] = [ + (vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words :], word_indices) + ][::-1] return updated_topics
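A small usage sketch of the class shown above (the parameter values are the documented defaults; `docs` is assumed to be your own list of documents):

```python
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# Re-rank the candidate c-TF-IDF words by embedding similarity, KeyBERT-style
representation_model = KeyBERTInspired(top_n_words=10, nr_repr_docs=5, nr_candidate_words=100)
topic_model = BERTopic(representation_model=representation_model)
topics, probs = topic_model.fit_transform(docs)
```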

    @@ -2866,7 +2877,7 @@

    -

    Use a KeyBERT-like model to fine-tune the topic representations

    +

    Use a KeyBERT-like model to fine-tune the topic representations.

    The algorithm follows KeyBERT but does some optimization in order to speed up inference.

    The steps are as follows. First, we extract the top n representative @@ -2939,13 +2950,15 @@

    Source code in bertopic\representation\_keybert.py -
    def __init__(self,
    -             top_n_words: int = 10,
    -             nr_repr_docs: int = 5,
    -             nr_samples: int = 500,
    -             nr_candidate_words: int = 100,
    -             random_state: int = 42):
    -    """ Use a KeyBERT-like model to fine-tune the topic representations
    +          
    def __init__(
    +    self,
    +    top_n_words: int = 10,
    +    nr_repr_docs: int = 5,
    +    nr_samples: int = 500,
    +    nr_candidate_words: int = 100,
    +    random_state: int = 42,
    +):
    +    """Use a KeyBERT-like model to fine-tune the topic representations.
     
         The algorithm follows KeyBERT but does some optimization in
         order to speed up inference.
    @@ -3011,7 +3024,7 @@ 

    -

    Extract topics

    +

    Extract topics.

    Parameters:

    @@ -3067,13 +3080,14 @@

    Source code in bertopic\representation\_keybert.py -

    @@ -2698,15 +2698,15 @@

LangChain

tokenizer

float

The diversity of documents to pass to LangChain.
-    Accepts values between 0 and 1. A higher
+    Accepts values between 0 and 1. A higher
     values results in passing more diverse documents
     whereas lower values passes more similar documents.

None

Union[str, Callable]

The tokenizer used to calculate to split the document into segments
-    used to count the length of a document.
-    * If tokenizer is 'char', then the document is split up
+    used to count the length of a document.
+    * If tokenizer is 'char', then the document is split up
     into characters which are counted to adhere to doc_length
     * If tokenizer is 'whitespace', the document is split up
     into words separated by whitespaces. These words are counted
     and truncated depending on doc_length
     * If tokenizer is 'vectorizer', then the internal CountVectorizer
     is used to tokenize the document. These tokens are counted
-    and trunctated depending on doc_length. They are decoded with
+    and truncated depending on doc_length. They are decoded with
     whitespaces.
     * If tokenizer is a callable, then that callable is used to tokenize
     the document. These tokens are counted and truncated depending
@@ -2786,7 +2786,7 @@

LangChain

Source code in bertopic\representation\_langchain.py
    class LangChain(BaseRepresentation):
    -    """ Using chains in langchain to generate topic labels.
    +    """Using chains in langchain to generate topic labels.
     
         The classic example uses `langchain.chains.question_answering.load_qa_chain`.
         This returns a chain that takes a list of documents and a question as input.
    @@ -2808,21 +2808,21 @@ 

    LangChain formats the representative documents within the prompt. nr_docs: The number of documents to pass to LangChain diversity: The diversity of documents to pass to LangChain. - Accepts values between 0 and 1. A higher + Accepts values between 0 and 1. A higher values results in passing more diverse documents whereas lower values passes more similar documents. doc_length: The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. tokenizer: The tokenizer used to calculate to split the document into segments - used to count the length of a document. - * If tokenizer is 'char', then the document is split up + used to count the length of a document. + * If tokenizer is 'char', then the document is split up into characters which are counted to adhere to `doc_length` * If tokenizer is 'whitespace', the document is split up into words separated by whitespaces. These words are counted and truncated depending on `doc_length` * If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted - and trunctated depending on `doc_length`. They are decoded with + and truncated depending on `doc_length`. They are decoded with whitespaces. * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending @@ -2905,15 +2905,17 @@

    LangChain representation_model = LangChain(chain, prompt=representation_prompt) ``` """ - def __init__(self, - chain, - prompt: str = None, - nr_docs: int = 4, - diversity: float = None, - doc_length: int = None, - tokenizer: Union[str, Callable] = None, - chain_config = None, - ): + + def __init__( + self, + chain, + prompt: str = None, + nr_docs: int = 4, + diversity: float = None, + doc_length: int = None, + tokenizer: Union[str, Callable] = None, + chain_config=None, + ): self.chain = chain self.prompt = prompt if prompt is not None else DEFAULT_PROMPT self.default_prompt_ = DEFAULT_PROMPT @@ -2923,13 +2925,14 @@

    LangChainself.doc_length = doc_length self.tokenizer = tokenizer - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, int]]]: - """ Extract topics + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, int]]]: + """Extract topics. Arguments: topic_model: A BERTopic model @@ -2947,20 +2950,13 @@

    LangChaintopics=topics, nr_samples=500, nr_repr_docs=self.nr_docs, - diversity=self.diversity + diversity=self.diversity, ) # Generate label using langchain's batch functionality chain_docs: List[List[Document]] = [ [ - Document( - page_content=truncate_document( - topic_model, - self.doc_length, - self.tokenizer, - doc - ) - ) + Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc)) for doc in docs ] for docs in repr_docs_mappings.values() @@ -2975,16 +2971,10 @@

    LangChainprompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) prompts.append(prompt) - inputs = [ - {"input_documents": docs, "question": prompt} - for docs, prompt in zip(chain_docs, prompts) - ] + inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)] else: - inputs = [ - {"input_documents": docs, "question": self.prompt} - for docs in chain_docs - ] + inputs = [{"input_documents": docs, "question": self.prompt} for docs in chain_docs] # `self.chain` must return a dict with an `output_text` key # same output key as the `StuffDocumentsChain` returned by `load_qa_chain` @@ -2992,8 +2982,7 @@

    LangChainlabels = [output["output_text"].strip() for output in outputs] updated_topics = { - topic: [(label, 1)] + [("", 0) for _ in range(9)] - for topic, label in zip(repr_docs_mappings.keys(), labels) + topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) } return updated_topics @@ -3025,7 +3014,7 @@

    -

    Extract topics

    +

    Extract topics.
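For context, a hedged sketch of the classic `load_qa_chain` setup named in the docstring; the LLM wrapper and API key handling are assumptions on your side, and the chain only has to return a dict with an `output_text` key:

```python
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI as LangChainOpenAI
from bertopic import BERTopic
from bertopic.representation import LangChain

# The chain receives the representative documents plus a question and must
# return {"output_text": ...}, which load_qa_chain's StuffDocumentsChain does
chain = load_qa_chain(LangChainOpenAI(temperature=0, openai_api_key=MY_API_KEY), chain_type="stuff")
representation_model = LangChain(chain)
topic_model = BERTopic(representation_model=representation_model)
```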

    Parameters:

    @@ -3081,13 +3070,14 @@

    Source code in bertopic\representation\_langchain.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, int]]]:
    -    """ Extract topics
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, int]]]:
    +    """Extract topics.
     
         Arguments:
             topic_model: A BERTopic model
    @@ -3105,20 +3095,13 @@ 

    topics=topics, nr_samples=500, nr_repr_docs=self.nr_docs, - diversity=self.diversity + diversity=self.diversity, ) # Generate label using langchain's batch functionality chain_docs: List[List[Document]] = [ [ - Document( - page_content=truncate_document( - topic_model, - self.doc_length, - self.tokenizer, - doc - ) - ) + Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc)) for doc in docs ] for docs in repr_docs_mappings.values() @@ -3133,16 +3116,10 @@

    prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) prompts.append(prompt) - inputs = [ - {"input_documents": docs, "question": prompt} - for docs, prompt in zip(chain_docs, prompts) - ] + inputs = [{"input_documents": docs, "question": prompt} for docs, prompt in zip(chain_docs, prompts)] else: - inputs = [ - {"input_documents": docs, "question": self.prompt} - for docs in chain_docs - ] + inputs = [{"input_documents": docs, "question": self.prompt} for docs in chain_docs] # `self.chain` must return a dict with an `output_text` key # same output key as the `StuffDocumentsChain` returned by `load_qa_chain` @@ -3150,8 +3127,7 @@

    labels = [output["output_text"].strip() for output in outputs] updated_topics = { - topic: [(label, 1)] + [("", 0) for _ in range(9)] - for topic, label in zip(repr_docs_mappings.keys(), labels) + topic: [(label, 1)] + [("", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels) } return updated_topics diff --git a/api/representation/mmr.html b/api/representation/mmr.html index a93ef582..7e5b7644 100755 --- a/api/representation/mmr.html +++ b/api/representation/mmr.html @@ -2679,7 +2679,7 @@

MaximalMarginalRelevance

Source code in bertopic\representation\_mmr.py
    class MaximalMarginalRelevance(BaseRepresentation):
    -    """ Calculate Maximal Marginal Relevance (MMR)
    +    """Calculate Maximal Marginal Relevance (MMR)
         between candidate keywords and the document.
     
         MMR considers the similarity of keywords/keyphrases with the
    @@ -2706,17 +2706,19 @@ 

    MaximalMarginalRelevance topic_model = BERTopic(representation_model=representation_model) ``` """ + def __init__(self, diversity: float = 0.1, top_n_words: int = 10): self.diversity = diversity self.top_n_words = top_n_words - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract topic representations + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract topic representations. Arguments: topic_model: The BERTopic model @@ -2727,18 +2729,27 @@

    MaximalMarginalRelevance Returns: updated_topics: Updated topic representations """ - if topic_model.embedding_model is None: - warnings.warn("MaximalMarginalRelevance can only be used BERTopic was instantiated" - "with the `embedding_model` parameter.") + warnings.warn( + "MaximalMarginalRelevance can only be used BERTopic was instantiated" + "with the `embedding_model` parameter." + ) return topics updated_topics = {} for topic, topic_words in topics.items(): words = [word[0] for word in topic_words] word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False) - topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape(1, -1) - topic_words = mmr(topic_embedding, word_embeddings, words, self.diversity, self.top_n_words) + topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape( + 1, -1 + ) + topic_words = mmr( + topic_embedding, + word_embeddings, + words, + self.diversity, + self.top_n_words, + ) updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words] return updated_topics
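A short usage sketch of the class above; note the warning in the source that BERTopic must have been instantiated with an `embedding_model`, otherwise the original topics are returned unchanged. The diversity value and embedding model name below are examples only:

```python
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance

# diversity ranges from 0 (keep the original ranking) to 1 (maximise diversity)
representation_model = MaximalMarginalRelevance(diversity=0.3, top_n_words=10)
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",  # needed to embed the candidate words
    representation_model=representation_model,
)
```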

    @@ -2769,7 +2780,7 @@

    -

    Extract topic representations

    +

    Extract topic representations.

    Parameters:

    @@ -2825,13 +2836,14 @@

    Source code in bertopic\representation\_mmr.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, float]]]:
    -    """ Extract topic representations
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, float]]]:
    +    """Extract topic representations.
     
         Arguments:
             topic_model: The BERTopic model
    @@ -2842,18 +2854,27 @@ 

    Returns: updated_topics: Updated topic representations """ - if topic_model.embedding_model is None: - warnings.warn("MaximalMarginalRelevance can only be used BERTopic was instantiated" - "with the `embedding_model` parameter.") + warnings.warn( + "MaximalMarginalRelevance can only be used BERTopic was instantiated" + "with the `embedding_model` parameter." + ) return topics updated_topics = {} for topic, topic_words in topics.items(): words = [word[0] for word in topic_words] word_embeddings = topic_model._extract_embeddings(words, method="word", verbose=False) - topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape(1, -1) - topic_words = mmr(topic_embedding, word_embeddings, words, self.diversity, self.top_n_words) + topic_embedding = topic_model._extract_embeddings(" ".join(words), method="word", verbose=False).reshape( + 1, -1 + ) + topic_words = mmr( + topic_embedding, + word_embeddings, + words, + self.diversity, + self.top_n_words, + ) updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words] return updated_topics

    diff --git a/api/representation/openai.html b/api/representation/openai.html index e6f4e172..60fbf055 100755 --- a/api/representation/openai.html +++ b/api/representation/openai.html @@ -2635,94 +2635,149 @@

    OpenAI

Using the OpenAI API to generate topic labels based
-on one of their Completion of ChatCompletion models.

    -
    The default method is `openai.Completion` if `chat=False`.
    +on one of their Completion of ChatCompletion models.

    +

The default method is openai.Completion if chat=False.
The prompts will also need to follow a completion task. If you
-are looking for a more interactive chats, use `chat=True`
-with `model=gpt-3.5-turbo`.
-
-For an overview see:
-https://platform.openai.com/docs/models
-
-!!! arguments
-    client: A `openai.OpenAI` client
-    !!! model "Model to use within OpenAI, defaults to `"text-ada-001"`."
-        NOTE: If a `gpt-3.5-turbo` model is used, make sure to set
-        `chat` to True.
-    !!! generator_kwargs "Kwargs passed to `openai.Completion.create`"
-        for fine-tuning the output.
-    !!! prompt "The prompt to be used in the model. If no prompt is given,"
-        `self.default_prompt_` is used instead.
-        NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
-        to decide where the keywords and documents need to be
-        inserted.
-    !!! delay_in_seconds "The delay in seconds between consecutive prompts"
-        in order to prevent RateLimitErrors.
-    !!! exponential_backoff "Retry requests with a random exponential backoff."
-        A short sleep is used when a rate limit error is hit,
-        then the requests is retried. Increase the sleep length
-        if errors are hit until 10 unsuccesfull requests.
-        If True, overrides `delay_in_seconds`.
-    !!! chat "Set this to True if a GPT-3.5 model is used."
-        See: https://platform.openai.com/docs/models/gpt-3-5
-    !!! nr_docs "The number of documents to pass to OpenAI if a prompt"
-        with the `["DOCUMENTS"]` tag is used.
-    !!! diversity "The diversity of documents to pass to OpenAI."
-        Accepts values between 0 and 1. A higher
-        values results in passing more diverse documents
-        whereas lower values passes more similar documents.
-    !!! doc_length "The maximum length of each document. If a document is longer,"
-        it will be truncated. If None, the entire document is passed.
-    !!! tokenizer "The tokenizer used to calculate to split the document into segments"
-        used to count the length of a document.
-        * If tokenizer is 'char', then the document is split up
-        into characters which are counted to adhere to `doc_length`
-        * If tokenizer is 'whitespace', the document is split up
-        into words separated by whitespaces. These words are counted
-        and truncated depending on `doc_length`
-        * If tokenizer is 'vectorizer', then the internal CountVectorizer
-        is used to tokenize the document. These tokens are counted
-        and trunctated depending on `doc_length`
-        * If tokenizer is a callable, then that callable is used to tokenize
-        the document. These tokens are counted and truncated depending
-        on `doc_length`
-
-Usage:
-
-To use this, you will need to install the openai package first:
-
-`pip install openai`
-
-Then, get yourself an API key and use OpenAI's API as follows:
-
-```python
-import openai
-from bertopic.representation import OpenAI
-from bertopic import BERTopic
-
-# Create your representation model
-client = openai.OpenAI(api_key=MY_API_KEY)
-representation_model = OpenAI(client, delay_in_seconds=5)
-
-# Use the representation model in BERTopic on top of the default pipeline
-topic_model = BERTopic(representation_model=representation_model)
-```
-
-You can also use a custom prompt:
-
-```python
-prompt = "I have the following documents: [DOCUMENTS]
+are looking for a more interactive chats, use chat=True
+with model=gpt-3.5-turbo.

    +

For an overview see:
+https://platform.openai.com/docs/models

    + +

    Parameters:

    +

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Name Type Description Default
    client

    A openai.OpenAI client

    required
model str

    Model to use within OpenAI, defaults to "text-ada-001". + NOTE: If a gpt-3.5-turbo model is used, make sure to set + chat to True.

    'text-embedding-3-small'
generator_kwargs Mapping[str, Any]

    Kwargs passed to openai.Completion.create + for fine-tuning the output.

    {}
prompt str

    The prompt to be used in the model. If no prompt is given, + self.default_prompt_ is used instead. + NOTE: Use "[KEYWORDS]" and "[DOCUMENTS]" in the prompt + to decide where the keywords and documents need to be + inserted.

    None
delay_in_seconds float

    The delay in seconds between consecutive prompts + in order to prevent RateLimitErrors.

    None
exponential_backoff bool

    Retry requests with a random exponential backoff. + A short sleep is used when a rate limit error is hit, + then the requests is retried. Increase the sleep length + if errors are hit until 10 unsuccessful requests. + If True, overrides delay_in_seconds.

    False
chat bool

    Set this to True if a GPT-3.5 model is used. + See: https://platform.openai.com/docs/models/gpt-3-5

    False
nr_docs int

    The number of documents to pass to OpenAI if a prompt + with the ["DOCUMENTS"] tag is used.

    4
diversity float

    The diversity of documents to pass to OpenAI. + Accepts values between 0 and 1. A higher + values results in passing more diverse documents + whereas lower values passes more similar documents.

    None
doc_length int

    The maximum length of each document. If a document is longer, + it will be truncated. If None, the entire document is passed.

    None
tokenizer Union[str, Callable]

    The tokenizer used to calculate to split the document into segments + used to count the length of a document. + * If tokenizer is 'char', then the document is split up + into characters which are counted to adhere to doc_length + * If tokenizer is 'whitespace', the document is split up + into words separated by whitespaces. These words are counted + and truncated depending on doc_length + * If tokenizer is 'vectorizer', then the internal CountVectorizer + is used to tokenize the document. These tokens are counted + and truncated depending on doc_length + * If tokenizer is a callable, then that callable is used to tokenize + the document. These tokens are counted and truncated depending + on doc_length

    None
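To make the table above concrete, a hedged configuration sketch that exercises several of these parameters together; the specific values (model name, `doc_length`, tokenizer choice) are illustrative only and `MY_API_KEY` is assumed to be yours:

```python
import openai
from bertopic import BERTopic
from bertopic.representation import OpenAI

client = openai.OpenAI(api_key=MY_API_KEY)

representation_model = OpenAI(
    client,
    model="gpt-3.5-turbo",     # chat models require chat=True
    chat=True,
    exponential_backoff=True,  # retry on rate limits instead of a fixed delay
    nr_docs=4,                 # representative documents injected at [DOCUMENTS]
    doc_length=100,            # truncate each document ...
    tokenizer="whitespace",    # ... counted in whitespace-separated words
)
topic_model = BERTopic(representation_model=representation_model)
```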

    Usage:

    +

    To use this, you will need to install the openai package first:

    +

    pip install openai

    +

    Then, get yourself an API key and use OpenAI's API as follows:

    +
    import openai
    +from bertopic.representation import OpenAI
    +from bertopic import BERTopic
    +
    +# Create your representation model
    +client = openai.OpenAI(api_key=MY_API_KEY)
    +representation_model = OpenAI(client, delay_in_seconds=5)
    +
    +# Use the representation model in BERTopic on top of the default pipeline
    +topic_model = BERTopic(representation_model=representation_model)
    +
    +

    You can also use a custom prompt:

    +
    prompt = "I have the following documents: [DOCUMENTS] \nThese documents are about the following topic: '"
    +representation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)
    +
    +

    If you want to use OpenAI's ChatGPT model:

    +
    representation_model = OpenAI(client, model="gpt-3.5-turbo", delay_in_seconds=10, chat=True)
     
    -

These documents are about the following topic: '"
-representation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)
-If you want to use OpenAI's ChatGPT model:python
-representation_model = OpenAI(client, model="gpt-3.5-turbo", delay_in_seconds=10, chat=True)
-```

    Source code in bertopic\representation\_openai.py
    class OpenAI(BaseRepresentation):
    -    """ Using the OpenAI API to generate topic labels based
    +    r"""Using the OpenAI API to generate topic labels based
         on one of their Completion of ChatCompletion models.
     
         The default method is `openai.Completion` if `chat=False`.
    @@ -2750,7 +2805,7 @@ 

    OpenAI exponential_backoff: Retry requests with a random exponential backoff. A short sleep is used when a rate limit error is hit, then the requests is retried. Increase the sleep length - if errors are hit until 10 unsuccesfull requests. + if errors are hit until 10 unsuccessful requests. If True, overrides `delay_in_seconds`. chat: Set this to True if a GPT-3.5 model is used. See: https://platform.openai.com/docs/models/gpt-3-5 @@ -2771,7 +2826,7 @@

    OpenAI and truncated depending on `doc_length` * If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted - and trunctated depending on `doc_length` + and truncated depending on `doc_length` * If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length` @@ -2810,19 +2865,21 @@

    OpenAI representation_model = OpenAI(client, model="gpt-3.5-turbo", delay_in_seconds=10, chat=True) ``` """ - def __init__(self, - client, - model: str = "text-embedding-3-small", - prompt: str = None, - generator_kwargs: Mapping[str, Any] = {}, - delay_in_seconds: float = None, - exponential_backoff: bool = False, - chat: bool = False, - nr_docs: int = 4, - diversity: float = None, - doc_length: int = None, - tokenizer: Union[str, Callable] = None - ): + + def __init__( + self, + client, + model: str = "text-embedding-3-small", + prompt: str = None, + generator_kwargs: Mapping[str, Any] = {}, + delay_in_seconds: float = None, + exponential_backoff: bool = False, + chat: bool = False, + nr_docs: int = 4, + diversity: float = None, + doc_length: int = None, + tokenizer: Union[str, Callable] = None, + ): self.client = client self.model = model @@ -2850,13 +2907,14 @@

    OpenAIif not self.generator_kwargs.get("stop") and not chat: self.generator_kwargs["stop"] = "\n" - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract topics + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract topics. Arguments: topic_model: A BERTopic model @@ -2868,7 +2926,9 @@

    OpenAI updated_topics: Updated topic representations """ # Extract the top n representative documents per topic - repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity) + repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs( + c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity + ) # Generate using OpenAI's Language Model updated_topics = {} @@ -2884,23 +2944,32 @@

    OpenAIif self.chat: messages = [ {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt} + {"role": "user", "content": prompt}, ] - kwargs = {"model": self.model, "messages": messages, **self.generator_kwargs} + kwargs = { + "model": self.model, + "messages": messages, + **self.generator_kwargs, + } if self.exponential_backoff: response = chat_completions_with_backoff(self.client, **kwargs) else: response = self.client.chat.completions.create(**kwargs) # Check whether content was actually generated - # Adresses #1570 for potential issues with OpenAI's content filter + # Addresses #1570 for potential issues with OpenAI's content filter if hasattr(response.choices[0].message, "content"): label = response.choices[0].message.content.strip().replace("topic: ", "") else: label = "No label returned" else: if self.exponential_backoff: - response = completions_with_backoff(self.client, model=self.model, prompt=prompt, **self.generator_kwargs) + response = completions_with_backoff( + self.client, + model=self.model, + prompt=prompt, + **self.generator_kwargs, + ) else: response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs) label = response.choices[0].text.strip() @@ -2963,7 +3032,7 @@

    -

    Extract topics

    +

    Extract topics.

    Parameters:

    @@ -3019,13 +3088,14 @@

    Source code in bertopic\representation\_openai.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, float]]]:
    -    """ Extract topics
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, float]]]:
    +    """Extract topics.
     
         Arguments:
             topic_model: A BERTopic model
    @@ -3037,7 +3107,9 @@ 

    updated_topics: Updated topic representations """ # Extract the top n representative documents per topic - repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity) + repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs( + c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity + ) # Generate using OpenAI's Language Model updated_topics = {} @@ -3053,23 +3125,32 @@

    if self.chat: messages = [ {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt} + {"role": "user", "content": prompt}, ] - kwargs = {"model": self.model, "messages": messages, **self.generator_kwargs} + kwargs = { + "model": self.model, + "messages": messages, + **self.generator_kwargs, + } if self.exponential_backoff: response = chat_completions_with_backoff(self.client, **kwargs) else: response = self.client.chat.completions.create(**kwargs) # Check whether content was actually generated - # Adresses #1570 for potential issues with OpenAI's content filter + # Addresses #1570 for potential issues with OpenAI's content filter if hasattr(response.choices[0].message, "content"): label = response.choices[0].message.content.strip().replace("topic: ", "") else: label = "No label returned" else: if self.exponential_backoff: - response = completions_with_backoff(self.client, model=self.model, prompt=prompt, **self.generator_kwargs) + response = completions_with_backoff( + self.client, + model=self.model, + prompt=prompt, + **self.generator_kwargs, + ) else: response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs) label = response.choices[0].text.strip() diff --git a/api/representation/pos.html b/api/representation/pos.html index 2707ccca..4ef73296 100755 --- a/api/representation/pos.html +++ b/api/representation/pos.html @@ -2632,7 +2632,7 @@

    PartOfSpeech
    -

    Extract Topic Keywords based on their Part-of-Speech

    +

    Extract Topic Keywords based on their Part-of-Speech.

    DEFAULT_PATTERNS = [ [{'POS': 'ADJ'}, {'POS': 'NOUN'}], [{'POS': 'NOUN'}], @@ -2701,7 +2701,7 @@

PartOfSpeech

Source code in bertopic\representation\_pos.py
    class PartOfSpeech(BaseRepresentation):
    -    """ Extract Topic Keywords based on their Part-of-Speech
    +    """Extract Topic Keywords based on their Part-of-Speech.
     
         DEFAULT_PATTERNS = [
                     [{'POS': 'ADJ'}, {'POS': 'NOUN'}],
    @@ -2750,36 +2750,43 @@ 

    PartOfSpeech representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns) ``` """ - def __init__(self, - model: Union[str, Language] = "en_core_web_sm", - top_n_words: int = 10, - pos_patterns: List[str] = None): + + def __init__( + self, + model: Union[str, Language] = "en_core_web_sm", + top_n_words: int = 10, + pos_patterns: List[str] = None, + ): if isinstance(model, str): self.model = spacy.load(model) elif isinstance(model, Language): self.model = model else: - raise ValueError("Make sure that the Spacy model that you" - "pass is either a string referring to a" - "Spacy model or a Spacy nlp object.") + raise ValueError( + "Make sure that the Spacy model that you" + "pass is either a string referring to a" + "Spacy model or a Spacy nlp object." + ) self.top_n_words = top_n_words if pos_patterns is None: self.pos_patterns = [ - [{'POS': 'ADJ'}, {'POS': 'NOUN'}], - [{'POS': 'NOUN'}], [{'POS': 'ADJ'}] + [{"POS": "ADJ"}, {"POS": "NOUN"}], + [{"POS": "NOUN"}], + [{"POS": "ADJ"}], ] else: self.pos_patterns = pos_patterns - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract topics + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract topics. Arguments: topic_model: A BERTopic model @@ -2828,14 +2835,16 @@

    PartOfSpeechupdated_topics = {topic: [] for topic in topics.keys()} for topic, candidate_keywords in candidate_topics.items(): - word_indices = np.sort([words_lookup.get(keyword) for keyword in candidate_keywords if words_lookup.get(keyword)]) + word_indices = np.sort( + [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup] + ) vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers] - indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1] - vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1] + indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1] + vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1] topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)] updated_topics[topic] = topic_words if len(updated_topics[topic]) < self.top_n_words: - updated_topics[topic] += [("", 0) for _ in range(self.top_n_words-len(updated_topics[topic]))] + updated_topics[topic] += [("", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))] return updated_topics
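A brief usage sketch matching the docstring example above (the spaCy model must be downloaded separately, e.g. via `python -m spacy download en_core_web_sm`, and the patterns follow spaCy's Matcher syntax):

```python
from bertopic import BERTopic
from bertopic.representation import PartOfSpeech

# Keep only adjective+noun and bare-noun keywords in the topic representations
pos_patterns = [[{"POS": "ADJ"}, {"POS": "NOUN"}], [{"POS": "NOUN"}]]
representation_model = PartOfSpeech("en_core_web_sm", pos_patterns=pos_patterns)
topic_model = BERTopic(representation_model=representation_model)
```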

    @@ -2866,7 +2875,7 @@

    -

    Extract topics

    +

    Extract topics.

    Parameters:

    @@ -2922,13 +2931,14 @@

    Source code in bertopic\representation\_pos.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, float]]]:
    -    """ Extract topics
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, float]]]:
    +    """Extract topics.
     
         Arguments:
             topic_model: A BERTopic model
    @@ -2977,14 +2987,16 @@ 

    updated_topics = {topic: [] for topic in topics.keys()} for topic, candidate_keywords in candidate_topics.items(): - word_indices = np.sort([words_lookup.get(keyword) for keyword in candidate_keywords if words_lookup.get(keyword)]) + word_indices = np.sort( + [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup] + ) vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers] - indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1] - vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1] + indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1] + vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1] topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)] updated_topics[topic] = topic_words if len(updated_topics[topic]) < self.top_n_words: - updated_topics[topic] += [("", 0) for _ in range(self.top_n_words-len(updated_topics[topic]))] + updated_topics[topic] += [("", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))] return updated_topics

    diff --git a/api/representation/zeroshot.html b/api/representation/zeroshot.html index b30e7097..9f8912e9 100755 --- a/api/representation/zeroshot.html +++ b/api/representation/zeroshot.html @@ -2632,7 +2632,7 @@

    ZeroShotClassification
    -

    Zero-shot Classification on topic keywords with candidate labels

    +

    Zero-shot Classification on topic keywords with candidate labels.
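Before the parameter listing, a hedged usage sketch; the candidate labels are placeholders and, as the source shows, a topic keeps its original keywords when no label reaches `min_prob`:

```python
from bertopic import BERTopic
from bertopic.representation import ZeroShotClassification

candidate_topics = ["space and nasa", "sports", "politics"]  # illustrative labels
representation_model = ZeroShotClassification(
    candidate_topics,
    model="facebook/bart-large-mnli",
    min_prob=0.8,
)
topic_model = BERTopic(representation_model=representation_model)
```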

    Parameters:

    @@ -2690,7 +2690,7 @@

ZeroShotClassification

Source code in bertopic\representation\_zeroshot.py
    class ZeroShotClassification(BaseRepresentation):
    -    """ Zero-shot Classification on topic keywords with candidate labels
    +    """Zero-shot Classification on topic keywords with candidate labels.
     
         Arguments:
             candidate_topics: A list of labels to assign to the topics if they
    @@ -2717,31 +2717,36 @@ 

    ZeroShotClassification topic_model = BERTopic(representation_model=representation_model) ``` """ - def __init__(self, - candidate_topics: List[str], - model: str = "facebook/bart-large-mnli", - pipeline_kwargs: Mapping[str, Any] = {}, - min_prob: float = 0.8 - ): + + def __init__( + self, + candidate_topics: List[str], + model: str = "facebook/bart-large-mnli", + pipeline_kwargs: Mapping[str, Any] = {}, + min_prob: float = 0.8, + ): self.candidate_topics = candidate_topics if isinstance(model, str): self.model = pipeline("zero-shot-classification", model=model) elif isinstance(model, Pipeline): self.model = model else: - raise ValueError("Make sure that the HF model that you" - "pass is either a string referring to a" - "HF model or a `transformers.pipeline` object.") + raise ValueError( + "Make sure that the HF model that you" + "pass is either a string referring to a" + "HF model or a `transformers.pipeline` object." + ) self.pipeline_kwargs = pipeline_kwargs self.min_prob = min_prob - def extract_topics(self, - topic_model, - documents: pd.DataFrame, - c_tf_idf: csr_matrix, - topics: Mapping[str, List[Tuple[str, float]]] - ) -> Mapping[str, List[Tuple[str, float]]]: - """ Extract topics + def extract_topics( + self, + topic_model, + documents: pd.DataFrame, + c_tf_idf: csr_matrix, + topics: Mapping[str, List[Tuple[str, float]]], + ) -> Mapping[str, List[Tuple[str, float]]]: + """Extract topics. Arguments: topic_model: Not used @@ -2776,7 +2781,7 @@

    ZeroShotClassificationif len(topic_description) == 0: topic_description = topics[topic] elif len(topic_description) < 10: - topic_description += [("", 0) for _ in range(10-len(topic_description))] + topic_description += [("", 0) for _ in range(10 - len(topic_description))] updated_topics[topic] = topic_description return updated_topics @@ -2808,7 +2813,7 @@

    -

    Extract topics

    +

    Extract topics.

    Parameters:

    @@ -2864,13 +2869,14 @@

    Source code in bertopic\representation\_zeroshot.py -
    def extract_topics(self,
    -                   topic_model,
    -                   documents: pd.DataFrame,
    -                   c_tf_idf: csr_matrix,
    -                   topics: Mapping[str, List[Tuple[str, float]]]
    -                   ) -> Mapping[str, List[Tuple[str, float]]]:
    -    """ Extract topics
    +          
    def extract_topics(
    +    self,
    +    topic_model,
    +    documents: pd.DataFrame,
    +    c_tf_idf: csr_matrix,
    +    topics: Mapping[str, List[Tuple[str, float]]],
    +) -> Mapping[str, List[Tuple[str, float]]]:
    +    """Extract topics.
     
         Arguments:
             topic_model: Not used
    @@ -2905,7 +2911,7 @@ 

    if len(topic_description) == 0: topic_description = topics[topic] elif len(topic_description) < 10: - topic_description += [("", 0) for _ in range(10-len(topic_description))] + topic_description += [("", 0) for _ in range(10 - len(topic_description))] updated_topics[topic] = topic_description return updated_topics diff --git a/changelog.html b/changelog.html index 68c7b427..49705366 100755 --- a/changelog.html +++ b/changelog.html @@ -2538,6 +2538,13 @@

    Note

    -

    The calculate_probabilties parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, this will not work when using a model other than HDBSCAN. Instead, we can approximate the topic distributions across all documents with .approximate_distribution.

    +

    The calculate_probabilities parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, this will not work when using a model other than HDBSCAN. Instead, we can approximate the topic distributions across all documents with .approximate_distribution.
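A minimal sketch of that alternative (assuming a fitted model and your own `docs`); `approximate_distribution` works regardless of the clustering backend:

```python
from bertopic import BERTopic

topic_model = BERTopic()             # any cluster model, not only HDBSCAN
topics, _ = topic_model.fit_transform(docs)

# Approximate the topic distribution for every document after fitting
topic_distr, _ = topic_model.approximate_distribution(docs)
```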

    Numpy gives me an error when running BERTopic

    With the release of Numpy 1.20.0, there have been significant issues with using that version (and previous ones) due to compilation issues and pypi.

    diff --git a/getting_started/best_practices/best_practices.html b/getting_started/best_practices/best_practices.html index c12fb607..65102816 100755 --- a/getting_started/best_practices/best_practices.html +++ b/getting_started/best_practices/best_practices.html @@ -3083,7 +3083,8 @@

    Inference
    from bertopic._utils import MyLogger
    -logger = MyLogger("ERROR")
    +logger = MyLogger()
    +logger.configure("ERROR")
     loaded_model.verbose = False
     topic_model.verbose = False
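```python
# Hedged companion sketch (not part of the diff above): the quieted logger and
# verbose flags are typically combined with loading a saved model and running
# plain inference on new documents. The path and `new_docs` are placeholders.
from bertopic import BERTopic

loaded_model = BERTopic.load("my_model_dir")
loaded_model.verbose = False
topics, probs = loaded_model.transform(new_docs)
```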
     

    diff --git a/getting_started/embeddings/embeddings.html b/getting_started/embeddings/embeddings.html index bbff0a97..182bd923 100755 --- a/getting_started/embeddings/embeddings.html +++ b/getting_started/embeddings/embeddings.html @@ -2967,7 +2967,7 @@

    Multimodal# Embed documents only doc_embeddings = model.embed_documents(docs) -# Embeding images only +# Embedding images only image_embeddings = model.embed_images(images) # Embed both images and documents, then average them diff --git a/getting_started/guided/guided.html b/getting_started/guided/guided.html index c6b270ee..5be65277 100755 --- a/getting_started/guided/guided.html +++ b/getting_started/guided/guided.html @@ -2724,7 +2724,7 @@

    Guided Topic Modeling

    + 2 -Mutiply the IDF values of the seeded keywords across all topics with 1.2. +Multiply the IDF values of the seeded keywords across all topics with 1.2. diff --git a/getting_started/guided/guided.svg b/getting_started/guided/guided.svg index 3d0742f9..ad45f17d 100755 --- a/getting_started/guided/guided.svg +++ b/getting_started/guided/guided.svg @@ -111,7 +111,7 @@ + 2 -Mutiply the IDF values of the seeded keywords across all topics with 1.2. +Multiply the IDF values of the seeded keywords across all topics with 1.2. diff --git a/getting_started/multimodal/multimodal.html b/getting_started/multimodal/multimodal.html index 1384fbe9..0fc7b856 100755 --- a/getting_started/multimodal/multimodal.html +++ b/getting_started/multimodal/multimodal.html @@ -2627,7 +2627,7 @@

    Text + Images
    # NOTE: This requires the `datasets` package which you can 
     # install with `pip install datasets`
     from datasets import load_dataset
    @@ -2692,7 +2692,7 @@ 

    Text + Images# Embed documents only doc_embeddings = model.embed_documents(docs) -# Embeding images only +# Embedding images only image_embeddings = model.embed_images(images) # Embed both images and documents, then average them diff --git a/getting_started/outlier_reduction/outlier_reduction.html b/getting_started/outlier_reduction/outlier_reduction.html index dc867b8b..06af3a4b 100755 --- a/getting_started/outlier_reduction/outlier_reduction.html +++ b/getting_started/outlier_reduction/outlier_reduction.html @@ -2835,10 +2835,10 @@

    Chain Strategies"c-tf-idf" strategy as it is quite fast. Then, we can perform the "distributions" strategy on the outliers that are left since this method is typically much slower:

    # Use the "c-TF-IDF" strategy with a threshold
    -new_topics = topic_model.reduce_outliers(docs, new_topics , strategy="c-tf-idf", threshold=0.1)
    +new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)
     
     # Reduce all outliers that are left with the "distributions" strategy
    -new_topics = topic_model.reduce_outliers(docs, topics, strategy="distributions")
    +new_topics = topic_model.reduce_outliers(docs, new_topics, strategy="distributions")
     

    Update Topics

    After generating our updated topics, we can feed them back into BERTopic in one of two ways. We can either update the topic representations themselves based on the documents that now belong to new topics or we can only update the topic frequency without updating the topic representations themselves.
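For the first option, a minimal sketch using the documented .update_topics method (assuming the docs and the new_topics produced by the reduction step above):

# Recompute the topic representations from the new document-topic assignments
topic_model.update_topics(docs, topics=new_topics)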

    diff --git a/getting_started/representation/llm.html b/getting_started/representation/llm.html index a88807af..71717220 100755 --- a/getting_started/representation/llm.html +++ b/getting_started/representation/llm.html @@ -2842,7 +2842,7 @@

    Truncating DocumentsLlama 2

    Three pieces of the prompt were created:

      -
    • system_prompt helps us guide the model during a conversation. For example, we can say that it is a helpful assisant that is specialized in labeling topics.
    • +
    • system_prompt helps us guide the model during a conversation. For example, we can say that it is a helpful assistant that is specialized in labeling topics.
    • example_prompt gives an example of a correctly labeled topic to guide Llama 2
    • main_prompt contains the main question we are going to ask it, namely to label a topic. Note that it uses the [DOCUMENTS] and [KEYWORDS] to provide the most relevant documents and keywords as additional context
    diff --git a/getting_started/representation/representation.html b/getting_started/representation/representation.html index e0b1c734..c9dc6821 100755 --- a/getting_started/representation/representation.html +++ b/getting_started/representation/representation.html @@ -2947,7 +2947,7 @@

    Zero-Shot ClassificationChain Models

    All of the above models can make use of the candidate topics, as generated by c-TF-IDF, to further fine-tune the topic representations. For example, MaximalMarginalRelevance takes the keywords in the candidate topics and re-ranks them. Similarly, the keywords in the candidate topic can be used as the input for GPT-prompts in OpenAI.

    Although the default candidate topics are generated by c-TF-IDF, what if we were to chain these models? For example, we can use MaximalMarginalRelevance to improve upon the keywords in each topic before passing them to OpenAI.

    -

    This is supported in BERTopic by simply passing a list of representation models when instantation the topic model:

    +

This is supported in BERTopic by simply passing a list of representation models when instantiating the topic model:

    from bertopic.representation import MaximalMarginalRelevance, OpenAI
     from bertopic import BERTopic
     import openai
    diff --git a/getting_started/serialization/serialization.html b/getting_started/serialization/serialization.html
    index b20cbdb3..30028d9c 100755
    --- a/getting_started/serialization/serialization.html
    +++ b/getting_started/serialization/serialization.html
    @@ -2784,7 +2784,7 @@ 

    HuggingFace Hubloaded_model = BERTopic.load("MaartenGr/BERTopic_ArXiv")

    Parameters

    -

    There are number of parameters that may be worthwile to know:

    +

There are a number of parameters that may be worthwhile to know (a short usage sketch follows the list):

    • private
      • Whether to create a private repository
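For instance, a minimal sketch of passing private when pushing a fitted model (the repository name is illustrative):

# The repository name below is illustrative
topic_model.push_to_hf_hub(
    repo_id="my-username/my-private-bertopic",
    private=True  # create a private repository
)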
      • diff --git a/getting_started/visualization/visualization.html b/getting_started/visualization/visualization.html index 0729d438..25547625 100755 --- a/getting_started/visualization/visualization.html +++ b/getting_started/visualization/visualization.html @@ -2621,8 +2621,8 @@
      • - - Visualize Probablities or Distribution + + Visualize Probabilities or Distribution
      • @@ -3155,7 +3155,7 @@

        Visualize Topics per Class< -

        Visualize Probablities or Distribution

        +

        Visualize Probabilities or Distribution

        We can generate the topic-document probability matrix by simply setting calculate_probabilities=True if a HDBSCAN model is used:

        from bertopic import BERTopic
         topic_model = BERTopic(calculate_probabilities=True)
        diff --git a/getting_started/visualization/visualize_documents.html b/getting_started/visualization/visualize_documents.html
        index 69fe02d6..7c45eea1 100755
        --- a/getting_started/visualization/visualize_documents.html
        +++ b/getting_started/visualization/visualize_documents.html
        @@ -717,8 +717,8 @@
         
               
                 
      • - - Visualize Probablities or Distribution + + Visualize Probabilities or Distribution
      • @@ -2634,8 +2634,8 @@
      • - - Visualize Probablities or Distribution + + Visualize Probabilities or Distribution
      • @@ -2714,7 +2714,7 @@

        Visualize documents with D
        fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
         fig.savefig("path/to/file.png", bbox_inches="tight")
         
        -

        Visualize Probablities or Distribution

        +

        Visualize Probabilities or Distribution

        We can generate the topic-document probability matrix by simply setting calculate_probabilities=True if a HDBSCAN model is used:

        from bertopic import BERTopic
         topic_model = BERTopic(calculate_probabilities=True)
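# Sketch (assuming `docs` as in the sections above): after fitting, `probs` holds
# the document-topic probabilities, which can be visualized per document with
# the documented .visualize_distribution method
topics, probs = topic_model.fit_transform(docs)
topic_model.visualize_distribution(probs[0])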
        diff --git a/getting_started/zeroshot/zeroshot.html b/getting_started/zeroshot/zeroshot.html
        index a1485ec2..f59afb0c 100755
        --- a/getting_started/zeroshot/zeroshot.html
        +++ b/getting_started/zeroshot/zeroshot.html
        @@ -2608,10 +2608,12 @@ 

        Zero-shot Topic Modeling

Zero-shot Topic Modeling is a technique that allows you to find predefined topics in large amounts of documents. When faced with many documents, you often have an idea of which topics will definitely be in there, whether that is a result of simply knowing your data or because a domain expert is involved in defining those topics.

        This method allows you to not only find those specific topics but also create new topics for documents that would not fit with your predefined topics. -This allows for extensive flexibility as there are three scenario's to explore.

        -

        First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.

        -

        Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.

        -

        Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.

+This allows for extensive flexibility as there are three scenarios to explore:

        +
          +
        • First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.
        • +
        • Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.
        • +
        • Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
        • +
        @@ -2619,16 +2621,16 @@

        Zero-shot Topic Modeling

        -"Clustering" +"Religion" the labels Embed cosine similaritydocumentzeroshot For each document, assign topics based on between and embeddings -ManualBERTopicBERTopic -Create two models: (zeroshot documents) (non-zeroshot documents) - the models into one -Merge + that could not be assigned to a zero-shot topic +Cluster documents + the to the to create a single list of topics +Appendclusteringtopicszero-shot topics zeroshot topicslabels Define through . "Clustering" @@ -2661,105 +2663,105 @@

        Zero-shot Topic Modeling

[Inline SVG flowchart diff, mostly coordinate and layout attribute changes: the block labelled "BERTopic" is renamed to "Cluster non-assigned docs" and "Manual BERTopic" is renamed to "Zero-shot Topics"; other visible text ("Topic Modeling", "Topic X/Y/Z", "LLM", "Clustering") is unchanged.] @@ -2769,124 +2771,124 @@

        Zero-shot Topic Modeling

[Inline SVG flowchart diff, mostly coordinate and layout attribute changes: the label "Large Language Models" is shortened to "LLM"; other visible text ("LLM", "Clustering", "Topic X/Y/Z", "No match found") is unchanged.] @@ -2897,8 +2899,7 @@

        Zero-shot Topic Modeling

        -

        This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with the predefined labels using cosine similarity. If they pass a user-defined threshold, the zero-shot topic is assigned to a document. If it does not, then that document, along with others, will be put through a regular BERTopic model.

        -

        This creates two models. One for the zero-shot topics and one for the non-zero-shot topics. We combine these two BERTopic models to create a single model that contains both zero-shot and non-zero-shot topics.

        +

This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with the predefined labels using cosine similarity. If the similarity passes a user-defined threshold, the zero-shot topic is assigned to the document. If it does not, that document, along with others, will follow the regular BERTopic pipeline and attempt to find clusters that do not fit with the zero-shot topics.
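To make that assignment step concrete, below is a minimal sketch of the idea, not BERTopic's internal implementation; the embedding model, documents, and threshold are illustrative (in BERTopic itself you would instead pass zeroshot_topic_list and zeroshot_min_similarity to the BERTopic constructor):

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Illustrative inputs: a couple of documents and the predefined (zero-shot) topic labels
docs = [
    "We cluster document embeddings with HDBSCAN.",
    "A recipe for sourdough bread with a long fermentation.",
]
zeroshot_topic_list = ["Clustering", "Topic Modeling", "Large Language Models"]

# Embed both the labels and the documents with the same embedding model
embedding_model = SentenceTransformer("thenlper/gte-small")
label_embeddings = embedding_model.encode(zeroshot_topic_list)
doc_embeddings = embedding_model.encode(docs)

# Assign a document to its most similar zero-shot topic if the similarity
# passes the threshold; otherwise it is left for the regular clustering pipeline
similarities = cosine_similarity(doc_embeddings, label_embeddings)
best_topic = similarities.argmax(axis=1)
is_assigned = similarities.max(axis=1) >= 0.85  # user-defined threshold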

        Example

In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers.
diff --git a/getting_started/zeroshot/zeroshot.svg b/getting_started/zeroshot/zeroshot.svg index d8d061f0..b908173b 100755 --- a/getting_started/zeroshot/zeroshot.svg +++ b/getting_started/zeroshot/zeroshot.svg
[SVG figure diff: the standalone zero-shot diagram receives the same text updates as the inline SVG above, e.g. "Clustering" is replaced with "Religion", "Manual BERTopic" becomes "Zero-shot Topics", "BERTopic" becomes "Cluster non-assigned docs", "Large Language Models" becomes "LLM", and the merge-two-models step is replaced with clustering the non-assigned documents and appending those topics to the zero-shot topics; the remaining changes are coordinate and layout attributes.]
diff --git a/objects.inv b/objects.inv index 563219f3..e985d2bb 100755 Binary files a/objects.inv and b/objects.inv differ
diff --git a/search/search_index.json b/search/search_index.json index 1680f53a..c6d0411d 100755 --- a/search/search_index.json +++ b/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"BERTopic","text":"

        BERTopic is a topic modeling technique that leverages \ud83e\udd17 transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

        BERTopic supports all kinds of topic modeling techniques:

        Guided Supervised Semi-supervised Manual Multi-topic distributions Hierarchical Class-based Dynamic Online/Incremental Multimodal Multi-aspect Text Generation/LLM Zero-shot (new!) Merge Models (new!) Seed Words (new!)

        Corresponding medium posts can be found here, here and here. For a more detailed overview, you can read the paper or see a brief overview.

        "},{"location":"index.html#installation","title":"Installation","text":"

        Installation, with sentence-transformers, can be done using pypi:

        pip install bertopic\n

        You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:

        # Choose an embedding backend\npip install bertopic[flair, gensim, spacy, use]\n\n# Topic modeling with images\npip install bertopic[vision]\n
        "},{"location":"index.html#quick-start","title":"Quick Start","text":"

        We start by extracting topics from the well-known 20 newsgroups dataset containing English documents:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        After generating topics and their probabilities, we can access the frequent topics that were generated:

        >>> topic_model.get_topic_info()\n\nTopic   Count   Name\n-1      4630    -1_can_your_will_any\n0       693     49_windows_drive_dos_file\n1       466     32_jesus_bible_christian_faith\n2       441     2_space_launch_orbit_lunar\n3       381     22_key_encryption_keys_encrypted\n

        -1 refers to all outliers and should typically be ignored. Next, let's take a look at the most frequent topic that was generated, topic 0:

        >>> topic_model.get_topic(0)\n\n[('windows', 0.006152228076250982),\n ('drive', 0.004982897610645755),\n ('dos', 0.004845038866360651),\n ('file', 0.004140142872194834),\n ('disk', 0.004131678774810884),\n ('mac', 0.003624848635985097),\n ('memory', 0.0034840976976789903),\n ('software', 0.0034415334250699077),\n ('email', 0.0034239554442333257),\n ('pc', 0.003047105930670237)]\n

        Using .get_document_info, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.:

        >>> topic_model.get_document_info(docs)\n\nDocument                               Topic    Name                        Top_n_words                     Probability    ...\nI am sure some bashers of Pens...       0       0_game_team_games_season    game - team - games...          0.200010       ...\nMy brother is in the market for...      -1     -1_can_your_will_any         can - your - will...            0.420668       ...\nFinally you said what you dream...      -1     -1_can_your_will_any         can - your - will...            0.807259       ...\nThink! It is the SCSI card doing...     49     49_windows_drive_dos_file    windows - drive - docs...       0.071746       ...\n1) I have an old Jasmine drive...       49     49_windows_drive_dos_file    windows - drive - docs...       0.038983       ...\n

        Multilingual

        Use BERTopic(language=\"multilingual\") to select a model that supports 50+ languages.

        "},{"location":"index.html#fine-tune-topic-representations","title":"Fine-tune Topic Representations","text":"

        In BERTopic, there are a number of different topic representations that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. A great start is KeyBERTInspired, which for many users increases the coherence and reduces stopwords from the resulting topic representations:

        from bertopic.representation import KeyBERTInspired\n\n# Fine-tune your topic representations\nrepresentation_model = KeyBERTInspired()\ntopic_model = BERTopic(representation_model=representation_model)\n

        However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more:

        import openai\nfrom bertopic.representation import OpenAI\n\n# Fine-tune topic representations with GPT\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True)\ntopic_model = BERTopic(representation_model=representation_model)\n

        Multi-aspect Topic Modeling

        Instead of iterating over all of these different topic representations, you can model them simultaneously with multi-aspect topic representations in BERTopic.

        "},{"location":"index.html#modularity","title":"Modularity","text":"

        By default, the main steps for topic modeling with BERTopic are sentence-transformers, UMAP, HDBSCAN, and c-TF-IDF run in sequence. However, it assumes some independence between these steps which makes BERTopic quite modular. In other words, BERTopic not only allows you to build your own topic model but to explore several topic modeling techniques on top of your customized topic model:

        You can swap out any of these models or even remove them entirely. The following steps are completely modular:

        1. Embedding documents
        2. Reducing dimensionality of embeddings
        3. Clustering reduced embeddings into topics
        4. Tokenization of topics
        5. Weight tokens
        6. Represent topics with one or multiple representations

You can find out more about the underlying algorithm and its assumptions here.
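For example, a minimal sketch of swapping out a single step while keeping the rest of the defaults (the CountVectorizer settings are illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Swap only the tokenization step; all other steps keep their defaults
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))
topic_model = BERTopic(vectorizer_model=vectorizer_model)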

        "},{"location":"index.html#overview","title":"Overview","text":"

BERTopic has many functions that can quickly become overwhelming. To alleviate this issue, you will find below an overview of all methods and a short description of their purpose.

        "},{"location":"index.html#common","title":"Common","text":"

        Below, you will find an overview of common functions in BERTopic.

        Method Code Fit the model .fit(docs) Fit the model and predict documents .fit_transform(docs) Predict new documents .transform([new_doc]) Access single topic .get_topic(topic=12) Access all topics .get_topics() Get topic freq .get_topic_freq() Get all topic information .get_topic_info() Get all document information .get_document_info(docs) Get representative docs per topic .get_representative_docs() Update topic representation .update_topics(docs, n_gram_range=(1, 3)) Generate topic labels .generate_topic_labels() Set topic labels .set_topic_labels(my_custom_labels) Merge topics .merge_topics(docs, topics_to_merge) Reduce nr of topics .reduce_topics(docs, nr_topics=30) Reduce outliers .reduce_outliers(docs, topics) Find topics .find_topics(\"vehicle\") Save model .save(\"my_model\", serialization=\"safetensors\") Load model BERTopic.load(\"my_model\") Get parameters .get_params()"},{"location":"index.html#attributes","title":"Attributes","text":"

After having trained your BERTopic model, several attributes are saved within your model. These attributes, in part, refer to how model information is stored on an estimator during fitting. The attributes that you see below all end in _ and are public attributes that can be used to access model information.
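For instance, a minimal sketch of reading a few of these attributes (assuming the fitted topic_model from the Quick Start above):

topic_per_doc = topic_model.topics_              # topic assigned to each document
topic_sizes = topic_model.topic_sizes_           # number of documents per topic
top_words = topic_model.topic_representations_   # top n words and their c-TF-IDF values per topic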

        Attribute Description .topics_ The topics that are generated for each document after training or updating the topic model. .probabilities_ The probabilities that are generated for each document if HDBSCAN is used. .topic_sizes_ The size of each topic .topic_mapper_ A class for tracking topics and their mappings anytime they are merged/reduced. .topic_representations_ The top n terms per topic and their respective c-TF-IDF values. .c_tf_idf_ The topic-term matrix as calculated through c-TF-IDF. .topic_aspects_ The different aspects, or representations, of each topic. .topic_labels_ The default labels for each topic. .custom_labels_ Custom labels for each topic as generated through .set_topic_labels. .topic_embeddings_ The embeddings for each topic if embedding_model was used. .representative_docs_ The representative documents for each topic if HDBSCAN is used."},{"location":"index.html#variations","title":"Variations","text":"

        There are many different use cases in which topic modeling can be used. As such, several variations of BERTopic have been developed such that one package can be used across many use cases.

        Method Code Topic Distribution Approximation .approximate_distribution(docs) Online Topic Modeling .partial_fit(doc) Semi-supervised Topic Modeling .fit(docs, y=y) Supervised Topic Modeling .fit(docs, y=y) Manual Topic Modeling .fit(docs, y=y) Multimodal Topic Modeling .fit(docs, images=images) Topic Modeling per Class .topics_per_class(docs, classes) Dynamic Topic Modeling .topics_over_time(docs, timestamps) Hierarchical Topic Modeling .hierarchical_topics(docs) Guided Topic Modeling BERTopic(seed_topic_list=seed_topic_list) Zero-shot Topic Modeling BERTopic(zeroshot_topic_list=zeroshot_topic_list) Merge Multiple Models BERTopic.merge_models([topic_model_1, topic_model_2])"},{"location":"index.html#visualizations","title":"Visualizations","text":"

        Evaluating topic models can be rather difficult due to the somewhat subjective nature of evaluation. Visualizing different aspects of the topic model helps in understanding the model and makes it easier to tweak the model to your liking.

        Method Code Visualize Topics .visualize_topics() Visualize Documents .visualize_documents() Visualize Document with DataMapPlot .visualize_document_datamap() Visualize Document Hierarchy .visualize_hierarchical_documents() Visualize Topic Hierarchy .visualize_hierarchy() Visualize Topic Tree .get_topic_tree(hierarchical_topics) Visualize Topic Terms .visualize_barchart() Visualize Topic Similarity .visualize_heatmap() Visualize Term Score Decline .visualize_term_rank() Visualize Topic Probability Distribution .visualize_distribution(probs[0]) Visualize Topics over Time .visualize_topics_over_time(topics_over_time) Visualize Topics per Class .visualize_topics_per_class(topics_per_class)"},{"location":"index.html#citation","title":"Citation","text":"

        To cite the BERTopic paper, please use the following bibtex reference:

        @article{grootendorst2022bertopic,\n  title={BERTopic: Neural topic modeling with a class-based TF-IDF procedure},\n  author={Grootendorst, Maarten},\n  journal={arXiv preprint arXiv:2203.05794},\n  year={2022}\n}\n
        "},{"location":"changelog.html","title":"Changelog","text":""},{"location":"changelog.html#version-0162","title":"Version 0.16.2","text":"

        Release date: 12 May, 2024

        Fixes:
        • Fix issue with zeroshot topic modeling missing outlier #1957
        • Bump github actions versions by @afuetterer in #1941
        • Drop support for python 3.7 by @afuetterer in #1949
        • Add testing python 3.10+ in Github actions by @afuetterer in #1968
        • Speed up fitting CountVectorizer by @dannywhuang in #1938
        • Fix transform when using cuML HDBSCAN by @beckernick in #1960
        • Fix wrong link in algorithm documentation by @naeyn in #1970
        "},{"location":"changelog.html#version-0161","title":"Version 0.16.1","text":"

        Release date: 21 April, 2024

        Highlights:
        • Add Quantized LLM Tutorial
        • Add optional datamapplot visualization using topic_model.visualize_document_datamap by @lmcinnes in #1750
        • Migrated OpenAIBackend to openai>=1 by @peguerosdc in #1724
        • Add automatic height scaling and font resize by @ir2718 in #1863
        • Use [KEYWORDS] tags with the LangChain representation model by @mcantimmy in #1871
        Fixes:
        • Fixed issue with .merge_models seemingly skipping topic #1898
        • Fixed Cohere client.embed TypeError #1904
        • Fixed AttributeError: 'TextGeneration' object has no attribute 'random_state' #1870
        • Fixed topic embeddings not properly updated if all outliers were removed #1838
        • Fixed issue with representation models not properly merging #1762
        • Fixed Embeddings not ordered correctly when using .merge_models #1804
        • Fixed Outlier topic not in the 0th position when using zero-shot topic modeling causing prediction issues (amongst others) #1804
        • Fixed Incorrect label in ZeroShot doc SVG #1732
        • Fixed MultiModalBackend throws error with clip-ViT-B-32-multilingual-v1 #1670
        • Fixed AuthenticationError while using OpenAI() #1678

        • Update FAQ on Apple Silicon by @benz0li in #1901

        • Add documentation DataMapPlot + FAQ for running on Apple Silicon by @dkapitan in #1854
        • Remove commas from pip install reference in readme by @luisoala in #1850
        • Spelling corrections by @joouha in #1801
        • Replacing the deprecated text-ada-001 model with the latest text-embedding-3-small from OpenAI by @atmb4u in #1800
        • Prevent invalid empty input error when retrieving embeddings with openai backend by @liaoelton in #1827
        • Remove spurious warning about missing embedding model by @sliedes in #1774
        • Fix type hint in ClassTfidfTransformer constructor @snape in #1803
        • Fix typo and simplify wording in OnlineCountVectorizer docstring by @chrisji in #1802
        • Fixed warning when saving a topic model without an embedding model by @zilch42 in #1740
        • Fix bug in TextGeneration by @manveersadhal in #1726
        • Fix an incorrect link to usecases.md by @nicholsonjf in #1731
        • Prevent model argument being passed twice when using generator_kwargs in OpenAI by @ninavandiermen in #1733
        • Several fixes to the docstrings by @arpadikuma in #1719
        • Remove unused cluster_df variable in hierarchical_topics by @shadiakiki1986 in #1701
        • Removed redundant quotation mark by @LawrenceFulton in #1695
        • Fix typo in merge models docs by @zilch42 in #1660
        "},{"location":"changelog.html#version-0160","title":"Version 0.16.0","text":"

        Release date: 26 November, 2023

        Highlights:
        • Merge pre-trained BERTopic models with .merge_models
          • Combine models with different representations together!
          • Use this for incremental/online topic modeling to detect new incoming topics
          • First step towards federated learning with BERTopic
        • Zero-shot Topic Modeling
          • Use a predefined list of topics to assign documents
          • If needed, allows for further exploration of undefined topics
        • Seed (domain-specific) words with ClassTfidfTransformer
          • Make sure selected words are more likely to end up in the representation without influencing the clustering process
        • Added params to truncate documents to length when using LLMs
        • Added LlamaCPP as a representation model
        • LangChain: Support for LCEL Runnables by @joshuasundance-swca in #1586
        • Added topics parameter to .topics_over_time to select a subset of documents and topics
        • Documentation:
          • Best practices Guide
          • Llama 2 Tutorial
          • Zephyr Tutorial
          • Improved embeddings guidance (MTEB)
          • Improved logging throughout the package
        • Added support for Cohere's Embed v3:
          cohere_model = CohereBackend(\n    client,\n    embedding_model=\"embed-english-v3.0\",\n    embed_kwargs={\"input_type\": \"clustering\"}\n)\n
        Fixes:
        • Fixed n-gram Keywords need delimiting in OpenAI() #1546
        • Fixed OpenAI v1.0 issues #1629
        • Improved documentation/logging to address #1589, #1591
        • Fixed engine support for Azure OpenAI embeddings #1577
        • Fixed OpenAI Representation: KeyError: 'content' #1570
        • Fixed Loading topic model with multiple topic aspects changes their format #1487
        • Fix expired link in algorithm.md by @burugaria7 in #1396
        • Fix guided topic modeling in cuML's UMAP by @stevetracvc in #1326
        • OpenAI: Allow retrying on Service Unavailable errors by @agamble in #1407
        • Fixed parameter naming for HDBSCAN in best practices by @rnckp in #1408
        • Fixed typo in tips_and_tricks.md by @aronnoordhoek in #1446
        • Fix typos in documentation by @bobchien in #1481
        • Fix IndexError when all outliers are removed by reduce_outliers by @Aratako in #1466
        • Fix TypeError on reduce_outliers \"probabilities\" by @ananaphasia in #1501
        • Add new line to fix markdown bullet point formatting by @saeedesmaili in #1519
        • Update typo in topicrepresentation.md by @oliviercaron in #1537
        • Fix typo in FAQ by @sandijou in #1542
        • Fixed typos in best practices documentation by @poomkusa in #1557
        • Correct TopicMapper doc example by @chrisji in #1637
        • Fix typing in hierarchical_topics by @dschwalm in #1364
        • Fixed typing issue with treshold parameter in reduce_outliers by @dschwalm in #1380
        • Fix several typos by @mertyyanik in #1307 (#1307)
        • Fix inconsistent naming by @rolanderdei in #1073
        Merge Pre-trained BERTopic Models

        The new .merge_models feature allows for any number of fitted BERTopic models to be merged. Doing so allows for a number of use cases:

        • Incremental topic modeling -- Continuously merge models together to detect whether new topics have appeared
        • Federated Learning - Train BERTopic models on different clients and combine them on a central server
        • Minimal compute - We can essentially batch the training process into multiple instances to reduce compute
        • Different datasets - When you have different datasets that you want to train separately on, for example with different languages, you can train each model separately and join them after training

        To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain.

        First, we train three separate models on different parts of the data:

        from umap import UMAP\nfrom bertopic import BERTopic\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\n\n# Extract abstracts to train on and corresponding titles\nabstracts_1 = dataset[\"abstract\"][:5_000]\nabstracts_2 = dataset[\"abstract\"][5_000:10_000]\nabstracts_3 = dataset[\"abstract\"][10_000:15_000]\n\n# Create topic models\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\ntopic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1)\ntopic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2)\ntopic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3)\n

        Then, we can combine all three models into one with .merge_models:

        # Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n
        Zero-shot Topic Modeling

Zeroshot Topic Modeling is a technique that allows you to find pre-defined topics in large amounts of documents. This method allows you to not only find those specific topics but also create new topics for documents that would not fit with your predefined topics. This allows for extensive flexibility as there are three scenarios to explore.

        • No zeroshot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
        • Only zeroshot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.
        • Both zeroshot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.

In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers. We know the data and believe there to be at least the following topics: clustering, topic modeling, and large language models. However, we are not sure whether other topics exist and want to explore those.

        Using this feature is straightforward:

        from datasets import load_dataset\n\nfrom bertopic import BERTopic\nfrom bertopic.representation import KeyBERTInspired\n\n# We select a subsample of 5000 abstracts from ArXiv\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\ndocs = dataset[\"abstract\"][:5_000]\n\n# We define a number of topics that we know are in the documents\nzeroshot_topic_list = [\"Clustering\", \"Topic Modeling\", \"Large Language Models\"]\n\n# We fit our model using the zero-shot topics\n# and we define a minimum similarity. For each document,\n# if the similarity does not exceed that value, it will be used\n# for clustering instead.\ntopic_model = BERTopic(\n    embedding_model=\"thenlper/gte-small\", \n    min_topic_size=15,\n    zeroshot_topic_list=zeroshot_topic_list,\n    zeroshot_min_similarity=.85,\n    representation_model=KeyBERTInspired()\n)\ntopics, _ = topic_model.fit_transform(docs)\n

        When we run topic_model.get_topic_info() you will see something like this:

        Seed (Domain-specific) Words

When performing Topic Modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain-specific abbreviations, slang, short forms, acronyms, etc. For example, the \"TNM\" classification is a method for identifying the stage of most cancers. The word \"TNM\" is an abbreviation and might not be correctly captured in generic embedding models.

        To make sure that certain domain specific words are weighted higher and are more often used in topic representations, you can set any number of seed_words in the bertopic.vectorizer.ClassTfidfTransformer. To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like \"agent\" and \"robot\" should be important in such a topic were it to be found. Using the ClassTfidfTransformer, we can define those seed_words and also choose by how much their values are multiplied.

        The full example is then as follows:

        from umap import UMAP\nfrom datasets import load_dataset\nfrom bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\n# Let's take a subset of ArXiv abstracts as the training data\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\nabstracts = dataset[\"abstract\"][:5_000]\n\n# For illustration purposes, we make sure the output is fixed when running this code multiple times\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n\n# We can choose any number of seed words for which we want their representation\n# to be strengthen. We increase the importance of these words as we want them to be more\n# likely to end up in the topic representations.\nctfidf_model = ClassTfidfTransformer(\n    seed_words=[\"agent\", \"robot\", \"behavior\", \"policies\", \"environment\"], \n    seed_multiplier=2\n)\n\n# We run the topic model with the seeded words\ntopic_model = BERTopic(\n    umap_model=umap_model,\n    min_topic_size=15,\n    ctfidf_model=ctfidf_model,\n).fit(abstracts)\n
        Truncate Documents in LLMs

        When using LLMs with BERTopic, we can truncate the input documents in [DOCUMENTS] in order to reduce the number of tokens that we have in our input prompt. To do so, all text generation modules have two parameters that we can tweak:

        • doc_length - The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.
• tokenizer - The tokenizer used to split the document into segments that are used to count the length of a document.
          • Options include 'char', 'whitespace', 'vectorizer', and a callable

This means that the definition of doc_length changes depending on what constitutes a token in the tokenizer parameter. If a token is a character, then doc_length refers to the max length in characters. If a token is a word, then doc_length refers to the max length in words.

        Let's illustrate this with an example. In the code below, we will use tiktoken to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated.

        We use bertopic.representation.OpenAI to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. Since we will put 4 documents in the prompt, they will total roughly 400 tokens:

        import openai\nimport tiktoken\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Tokenizer\ntokenizer= tiktoken.encoding_for_model(\"gpt-3.5-turbo\")\n\n# Create your representation model\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(\n    client,\n    model=\"gpt-3.5-turbo\",\n    delay_in_seconds=2, \n    chat=True,\n    nr_docs=4,\n    doc_length=100,\n    tokenizer=tokenizer\n)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"changelog.html#version-0150","title":"Version 0.15.0","text":"

        Release date: 29 May, 2023

        Highlights:
        • Multimodal Topic Modeling
          • Train your topic modeling on text, images, or images and text!
          • Use the bertopic.backend.MultiModalBackend to embed images, text, both or even caption images!
        • Multi-Aspect Topic Modeling
          • Create multiple topic representations simultaneously
        • Improved Serialization options
          • Push your model to the HuggingFace Hub with .push_to_hf_hub
          • Safer, smaller and more flexible serialization options with safetensors
          • Thanks to a great collaboration with HuggingFace and the authors of BERTransfer!
        • Added new embedding models
          • OpenAI: bertopic.backend.OpenAIBackend
          • Cohere: bertopic.backend.CohereBackend
        • Added example of summarizing topics with OpenAI's GPT-models
        • Added nr_docs and diversity parameters to OpenAI and Cohere representation models
        • Use custom_labels=\"Aspect1\" to use the aspect labels for visualizations instead
        • Added cuML support for probability calculation in .transform
        • Updated topic embeddings
          • Centroids by default and c-TF-IDF weighted embeddings for partial_fit and .update_topics
        • Added exponential_backoff parameter to OpenAI model
        Fixes:
        • Fixed custom prompt not working in TextGeneration
        • Fixed #1142
        • Add additional logic to handle cupy arrays by @metasyn in #1179
        • Fix hierarchy viz and handle any form of distance matrix by @elashrry in #1173
        • Updated languages list by @sam9111 in #1099
        • Added level_scale argument to visualize_hierarchical_documents by @zilch42 in #1106
        • Fix inconsistent naming by @rolanderdei in #1073
        Multimodal Topic Modeling

        With v0.15, we can now perform multimodal topic modeling in BERTopic! The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that it is expected that each document has an image and vice versa. Instagram pictures, for example, almost always have some descriptions to them.

In this example, we are going to use images from flickr that each have a caption associated with it:

        # NOTE: This requires the `datasets` package which you can \n# install with `pip install datasets`\nfrom datasets import load_dataset\n\nds = load_dataset(\"maderix/flickr_bw_rgb\")\nimages = ds[\"train\"][\"image\"]\ndocs = ds[\"train\"][\"caption\"]\n

        The docs variable contains the captions for each image in images. We can now use these variables to run our multimodal example:

        from bertopic import BERTopic\nfrom bertopic.representation import VisualRepresentation\n\n# Additional ways of representing a topic\nvisual_model = VisualRepresentation()\n\n# Make sure to add the `visual_model` to a dictionary\nrepresentation_model = {\n   \"Visual_Aspect\":  visual_model,\n}\ntopic_model = BERTopic(representation_model=representation_model, verbose=True)\n

        We can now access our image representations for each topic with topic_model.topic_aspects_[\"Visual_Aspect\"]. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:

        import base64\nfrom io import BytesIO\nfrom IPython.display import HTML\n\ndef image_base64(im):\n    if isinstance(im, str):\n        im = get_thumbnail(im)\n    with BytesIO() as buffer:\n        im.save(buffer, 'jpeg')\n        return base64.b64encode(buffer.getvalue()).decode()\n\n\ndef image_formatter(im):\n    return f'<img src=\"data:image/jpeg;base64,{image_base64(im)}\">'\n\n# Extract dataframe\ndf = topic_model.get_topic_info().drop(\"Representative_Docs\", 1).drop(\"Name\", 1)\n\n# Visualize the images\nHTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))\n

        Multi-aspect Topic Modeling

        In this new release, we introduce multi-aspect topic modeling! During the .fit or .fit_transform stages, you can now get multiple representations of a single topic. In practice, it works by generating and storing all kinds of different topic representations (see image below).

        The approach is rather straightforward. We might want to represent our topics using a PartOfSpeech representation model but we might also want to try out KeyBERTInspired and compare those representation models. We can do this as follows:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic.representation import PartOfSpeech\nfrom bertopic.representation import MaximalMarginalRelevance\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Documents to train on\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# The main representation of a topic\nmain_representation = KeyBERTInspired()\n\n# Additional ways of representing a topic\naspect_model1 = PartOfSpeech(\"en_core_web_sm\")\naspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]\n\n# Add all models together to be run in a single `fit`\nrepresentation_model = {\n   \"Main\": main_representation,\n   \"Aspect1\":  aspect_model1,\n   \"Aspect2\":  aspect_model2 \n}\ntopic_model = BERTopic(representation_model=representation_model).fit(docs)\n

As shown above, to perform multi-aspect topic modeling, we make sure that representation_model is a dictionary where each representation model pipeline is defined. The main pipeline, which is used in most visualization options, is defined with the \"Main\" key. All other aspects can be defined however you want. In the example above, the two additional aspects that we are interested in are defined as \"Aspect1\" and \"Aspect2\".

        After we have fitted our model, we can access all representations with topic_model.get_topic_info():

        As you can see, there are a number of different representations for our topics that we can inspect. All aspects are found in topic_model.topic_aspects_.

        Serialization

        Saving, loading, and sharing a BERTopic model can be done in several ways. With this new release, it is now advised to go with .safetensors as that allows for a small, safe, and fast method for saving your BERTopic model. However, other formats, such as .pickle and pytorch .bin are also possible.

        The methods are used as follows:

        topic_model = BERTopic().fit(my_docs)\n\n# Method 1 - safetensors\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 2 - pytorch\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"pytorch\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 3 - pickle\ntopic_model.save(\"my_model\", serialization=\"pickle\")\n

        Saving the topic modeling with .safetensors or pytorch has a number of advantages:

        • .safetensors is a relatively safe format
• The resulting model can be very small (often < 20MB) since no sub-models need to be saved
        • Although version control is important, there is a bit more flexibility with respect to specific versions of packages
        • More easily used in production
        • Share models with the HuggingFace Hub

The above image, a model trained on 100,000 documents, demonstrates the differences in size between safetensors, pytorch, and pickle. The difference in size can mostly be explained by the efficient saving procedure and by the fact that the clustering and dimensionality reduction models are not saved in safetensors/pytorch, since inference can be done based on the topic embeddings.

        HuggingFace Hub

When you have created a BERTopic model, you can easily share it with others through the HuggingFace Hub. First, you need to log in to your HuggingFace account:

        from huggingface_hub import login\nlogin()\n

        When you have logged in to your HuggingFace account, you can save and upload the model as follows:

        from bertopic import BERTopic\n\n# Train model\ntopic_model = BERTopic().fit(my_docs)\n\n# Push to HuggingFace Hub\ntopic_model.push_to_hf_hub(\n    repo_id=\"MaartenGr/BERTopic_ArXiv\",\n    save_ctfidf=True\n)\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_ArXiv\")\n
        "},{"location":"changelog.html#version-0141","title":"Version 0.14.1","text":"

        Release date: 2 March, 2023

        Highlights:
        • Use ChatGPT to create topic representations!:
        • Added delay_in_seconds parameter to OpenAI and Cohere representation models for throttling the API
• Setting this between 5 and 10 seconds allows trial users to use the API more easily without hitting RateLimitErrors
        • Fixed missing title param to visualization methods
        • Fixed probabilities not correctly aligning (#1024)
        • Fix typo in textgenerator @dkopljar27 in #1002
        ChatGPT

        Within OpenAI's API, the ChatGPT models use a different API structure compared to the GPT-3 models. In order to use ChatGPT with BERTopic, we need to define the model and make sure to set chat=True:

        import openai\nfrom bertopic import BERTopic\nfrom bertopic.representation import OpenAI\n\n# Create your representation model\nopenai.api_key = MY_API_KEY\nrepresentation_model = OpenAI(model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

Prompting with ChatGPT is very satisfying and can be customized in BERTopic by using certain tags. There are currently two tags, namely \"[KEYWORDS]\" and \"[DOCUMENTS]\". These tags indicate where in the prompt they are to be replaced with a topic's keywords and its top 4 most representative documents, respectively. For example, if we have the following prompt:

        prompt = \"\"\"\nI have topic that contains the following documents: \\n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n\"\"\"\n

        then that will be rendered as follows and passed to OpenAI's API:

        \"\"\"\nI have a topic that contains the following documents: \n- Our videos are also made possible by your support on patreon.co.\n- If you want to help us make more videos, you can do so on patreon.com or get one of our posters from our shop.\n- If you want to help us make more videos, you can do so there.\n- And if you want to support us in our endeavor to survive in the world of online video, and make more videos, you can do so on patreon.com.\n\nThe topic is described by the following keywords: videos video you our support want this us channel patreon make on we if facebook to patreoncom can for and more watch \n\nBased on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n\"\"\"\n

        Note

        Whenever you create a custom prompt, it is important to add

        Based on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n
        at the end of your prompt as BERTopic extracts everything that comes after topic:. Having said that, if topic: is not in the output, then it will simply extract the entire response, so feel free to experiment with the prompts.

        "},{"location":"changelog.html#version-0140","title":"Version 0.14.0","text":"

        Release date: 14 February, 2023

        Highlights:
        • Fine-tune topic representations with bertopic.representation
• Diverse range of models, including KeyBERT, MMR, POS, Transformers, OpenAI, and more!
          • Create your own prompts for text generation models, like GPT3:
            • Use \"[KEYWORDS]\" and \"[DOCUMENTS]\" in the prompt to decide where the keywords and set of representative documents need to be inserted.
          • Chain models to perform fine-grained fine-tuning
• Create and customize your representation model
        • Improved the topic reduction technique when using nr_topics=int
        • Added title parameters for all graphs (#800)
        Fixes:
        • Improve documentation (#837, #769, #954, #912, #911)
        • Bump pyyaml (#903)
        • Fix large number of representative docs (#965)
• Prevent stochastic behavior in .visualize_topics (#952)
        • Add custom labels parameter to .visualize_topics (#976)
        • Fix cuML HDBSCAN type checks by @FelSiq in #981
        API Changes:
        • The diversity parameter was removed in favor of bertopic.representation.MaximalMarginalRelevance
        • The representation_model parameter was added to bertopic.BERTopic

        Representation Models

        Fine-tune the c-TF-IDF representation with a variety of models. Whether that is through a KeyBERT-Inspired model or GPT-3, the choice is up to you!

        KeyBERTInspired

        The algorithm follows some principles of KeyBERT but does some optimization in order to speed up inference. Usage is straightforward:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        PartOfSpeech

        Our candidate topics, as extracted with c-TF-IDF, do not take into account a keyword's part of speech as extracting noun-phrases from all documents can be computationally quite expensive. Instead, we can leverage c-TF-IDF to perform part of speech on a subset of keywords and documents that best represent a topic.

        from bertopic.representation import PartOfSpeech\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        MaximalMarginalRelevance

When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like \"car\" and \"cars\" essentially represent the same information and are often redundant. We can use MaximalMarginalRelevance to improve the diversity of our candidate topics:

        from bertopic.representation import MaximalMarginalRelevance\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = MaximalMarginalRelevance(diversity=0.3)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Zero-Shot Classification

        To perform zero-shot classification, we feed the model with the keywords as generated through c-TF-IDF and a set of candidate labels. If, for a certain topic, we find a similar enough label, then it is assigned. If not, then we keep the original c-TF-IDF keywords.

        We use it in BERTopic as follows:

        from bertopic.representation import ZeroShotClassification\nfrom bertopic import BERTopic\n\n# Create your representation model\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: 🤗 Transformers

        Nearly every week, there are new and improved models released on the 🤗 Model Hub that, with some creativity, allow for further fine-tuning of our c-TF-IDF based topics. These models range from text generation to zero-shot classification. In BERTopic, wrappers around these methods are created as a way to support whatever might be released in the future.

        Using a GPT-like model from the Hugging Face Hub is rather straightforward:

        from bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = TextGeneration('gpt2')\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: Cohere

        Instead of using a language model from 🤗 transformers, we can use an external API that does the work for you. Here, we can use Cohere to extract our topic labels from the candidate documents and keywords. To use this, you will need to install cohere first:

        pip install cohere\n

        Then, get yourself an API key and use Cohere's API as follows:

        import cohere\nfrom bertopic.representation import Cohere\nfrom bertopic import BERTopic\n\n# Create your representation model\nco = cohere.Client(my_api_key)\nrepresentation_model = Cohere(co)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: OpenAI

        Instead of using a language model from 🤗 transformers, we can use an external API that does the work for you. Here, we can use OpenAI to extract our topic labels from the candidate documents and keywords. To use this, you will need to install openai first:

        pip install openai\n

        Then, get yourself an API key and use OpenAI's API as follows:

        import openai\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Create your representation model\nopenai.api_key = MY_API_KEY\nrepresentation_model = OpenAI()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: LangChain

        LangChain is a package that helps users chain large language models. In BERTopic, we can leverage this package to combine external knowledge more efficiently. Here, that external knowledge consists of the most representative documents in each topic.

        To use LangChain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support LangChain, like openai:

        pip install langchain openai

        Then, you can create your chain as follows:

        from langchain.chains.question_answering import load_qa_chain\nfrom langchain.llms import OpenAI\nchain = load_qa_chain(OpenAI(temperature=0, openai_api_key=MY_API_KEY), chain_type=\"stuff\")\n

        Finally, you can pass the chain to BERTopic as follows:

        from bertopic.representation import LangChain\n\n# Create your representation model\nrepresentation_model = LangChain(chain)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"changelog.html#version-0130","title":"Version 0.13.0","text":"

        Release date: 4 January, 2023

        Highlights:
        • Calculate topic distributions with .approximate_distribution regardless of the cluster model used
          • Generates topic distributions on both document and token levels
          • Can be used for any document regardless of its size!
        • Fully supervised BERTopic
          • You can now use a classification model instead of a cluster model to create a fully supervised topic model
        • Manual topic modeling
          • Generate topic representations from labels directly
          • Allows for skipping the embedding and clustering steps in order to go directly to the topic representation step
        • Reduce outliers with 4 different strategies using .reduce_outliers
        • Install BERTopic without SentenceTransformers for a lightweight package:
          • pip install --no-deps bertopic
          • pip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml
        • Get meta data of trained documents such as topics and probabilities using .get_document_info(docs)
        • Added more support for cuML's HDBSCAN
          • Calculate and predict probabilities during fit_transform and transform respectively
          • This should give a major speed-up when setting calculate_probabilities=True
        • More images to the documentation and a lot of changes/updates/clarifications
        • Get representative documents for non-HDBSCAN models by comparing document and topic c-TF-IDF representations
        • Sklearn Pipeline Embedder by @koaning in #791
        Fixes:
        • Improve .partial_fit documentation (#837)
        • Fixed scipy linkage usage (#807)
        • Fixed shifted heatmap (#782)
        • Fixed SpaCy backend (#744)
        • Fixed representative docs with small clusters (<3) (#703)
        • Typo fixed by @timpal0l in #734
        • Typo fixed by @srulikbd in #842
        • Correcting iframe urls by @Mustapha-AJEGHRIR in #798
        • Refactor embedding methods by @zachschillaci27 in #855
        • Added diversity parameter to update_topics() function by @anubhabdaserrr in #887
        Documentation

        Personally, I believe that documentation can be seen as a feature and is an often underestimated aspect of open-source. So I went a bit overboard 😅... and created an animation about the three pillars of BERTopic using Manim. There are many other visualizations added, one for each variation of BERTopic, and many smaller changes.

        Topic Distributions

        The difficulty with a cluster-based topic modeling technique is that it does not directly consider that documents may contain multiple topics. With the new release, we can now model the distributions of topics! We even consider that a single word might be related to multiple topics. If a document is a mixture of topics, what is preventing a single word from being the same?

        To do so, we approximate the distribution of topics in a document by calculating and summing the similarities of tokensets (achieved by applying a sliding window) with the topics:

        # After fitting your model run the following for either your trained documents or even unseen documents\ntopic_distr, _ = topic_model.approximate_distribution(docs)\n
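        The size of the tokensets used in that sliding window can be tuned as well. A minimal sketch, assuming the window and stride parameters described in the topic distribution documentation:

        # A wider window and larger stride give smoother, coarser-grained distributions
        topic_distr, _ = topic_model.approximate_distribution(docs, window=8, stride=4)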

        To calculate and visualize the topic distributions in a document on a token-level, we can run the following:

        # We need to calculate the topic distributions on a token level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Create a visualization using a styled dataframe if Jinja2 is installed\ndf = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0]); df\n
        Supervised Topic Modeling

        BERTopic now supports fully-supervised classification! Instead of using a clustering algorithm, like HDBSCAN, we can replace it with a classifier, like Logistic Regression:

        from bertopic import BERTopic\nfrom bertopic.dimensionality import BaseDimensionalityReduction\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.linear_model import LogisticRegression\n\n# Get labeled data\ndata= fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n\n# Allows us to skip over the dimensionality reduction step\nempty_dimensionality_model = BaseDimensionalityReduction()\n\n# Create a classifier to be used instead of the cluster model\nclf= LogisticRegression()\n\n# Create a fully supervised BERTopic instance\ntopic_model= BERTopic(\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=clf\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n
        Manual Topic Modeling

        When you already have a bunch of labels and simply want to extract topic representations from them, you might not need to actually learn how those can be predicted. We can bypass the embeddings -> dimensionality reduction -> clustering steps and go straight to the c-TF-IDF representation of our labels:

        from bertopic import BERTopic\nfrom bertopic.backend import BaseEmbedder\nfrom bertopic.cluster import BaseCluster\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\n# Prepare our empty sub-models and reduce frequent words while we are at it.\nempty_embedding_model = BaseEmbedder()\nempty_dimensionality_model = BaseDimensionalityReduction()\nempty_cluster_model = BaseCluster()\n\n# Fit BERTopic without actually performing any clustering\ntopic_model= BERTopic(\n        embedding_model=empty_embedding_model,\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=empty_cluster_model,\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n
        Outlier Reduction

        Outlier reduction is a frequently discussed topic in BERTopic as its default cluster model, HDBSCAN, has a tendency to generate many outliers. This often helps in the topic representation steps, as we do not consider documents that are less relevant, but you might still want to assign those outliers to actual topics. In the modular philosophy of BERTopic, keeping training times in mind, it is now possible to perform outlier reduction after having trained your topic model. This allows for ease of iteration and prevents having to train BERTopic many times to find the parameters you are searching for. There are 4 different strategies that you can use, so make sure to check out the documentation!

        Using it is rather straightforward:

        new_topics = topic_model.reduce_outliers(docs, topics)\n
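        Other strategies can be selected through the strategy parameter. A minimal sketch, assuming the strategy names listed in the outlier reduction documentation:

        # Re-assign outliers based on document-topic embedding similarity
        new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")

        # Or based on the c-TF-IDF representations of documents and topics
        new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf")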
        Lightweight BERTopic

        The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely "all-MiniLM-L6-v2". Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires pytorch which often results in a rather large environment, memory-wise.

        Fortunately, it is possible to install BERTopic without sentence-transformers and use it as a lightweight solution instead. The installation can be done as follows:

        pip install --no-deps bertopic\npip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml\n

        Then, we can use BERTopic without sentence-transformers as follows using a CPU-based embedding technique:

        from sklearn.pipeline import make_pipeline\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\npipe = make_pipeline(\n    TfidfVectorizer(),\n    TruncatedSVD(100)\n)\n\ntopic_model = BERTopic(embedding_model=pipe)\n

        As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary!

        Document Information

        Get information about the documents on which the topic model was trained, including the documents themselves, their respective topics, the name of each topic, the top n words of each topic, whether it is a representative document, and the probability of the clustering if the cluster model supports it. There are also options to include other metadata, such as the topic distributions or the x and y coordinates of the reduced embeddings, which you can learn more about here.

        To get the document info, you will only need to pass the documents on which the topic model was trained:

        >>> topic_model.get_document_info(docs)\n\nDocument                               Topic    Name                        Top_n_words                     Probability    ...\nI am sure some bashers of Pens...       0       0_game_team_games_season    game - team - games...          0.200010       ...\nMy brother is in the market for...      -1     -1_can_your_will_any         can - your - will...            0.420668       ...\nFinally you said what you dream...      -1     -1_can_your_will_any         can - your - will...            0.807259       ...\nThink! It is the SCSI card doing...     49     49_windows_drive_dos_file    windows - drive - docs...       0.071746       ...\n1) I have an old Jasmine drive...       49     49_windows_drive_dos_file    windows - drive - docs...       0.038983       ...\n
        "},{"location":"changelog.html#version-0120","title":"Version 0.12.0","text":"

        Release date: 5 September, 2022

        Highlights:

        • Perform online/incremental topic modeling with .partial_fit
        • Expose c-TF-IDF model for customization with bertopic.vectorizers.ClassTfidfTransformer
          • The parameters bm25_weighting and reduce_frequent_words were added to potentially improve representations
        • Expose attributes for easier access to internal data
        • Major changes to the Algorithm page of the documentation, which now contains three overviews of the algorithm:
          • Visualize Overview
          • Code Overview
          • Detailed Overview
        • Added an example of combining BERTopic with KeyBERT
        • Added many tests with the intention of making development a bit more stable

        Fixes:

        • Fixed iteratively merging topics (#632 and #648)
        • Fixed 0th topic not showing up in visualizations (#667)
        • Fixed lowercasing not being optional (#682)
        • Fixed spelling (#664 and #673)
        • Fixed 0th topic not shown in .get_topic_info by @oxymor0n in #660
        • Fixed spelling by @domenicrosati in #674
        • Add custom labels and title options to barchart by @leloykun in #694

        Online/incremental topic modeling:

        Online topic modeling (sometimes called "incremental topic modeling") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a .partial_fit function, which is also used in BERTopic.

        At a minimum, the cluster model needs to support a .partial_fit function in order to use this feature. The default HDBSCAN model will not work as it does not support online updating.

        from sklearn.datasets import fetch_20newsgroups\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.decomposition import IncrementalPCA\nfrom bertopic.vectorizers import OnlineCountVectorizer\nfrom bertopic import BERTopic\n\n# Prepare documents\nall_docs = fetch_20newsgroups(subset=subset,  remove=('headers', 'footers', 'quotes'))[\"data\"]\ndoc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)]\n\n# Prepare sub-models that support online learning\numap_model = IncrementalPCA(n_components=5)\ncluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\ntopic_model = BERTopic(umap_model=umap_model,\n                       hdbscan_model=cluster_model,\n                       vectorizer_model=vectorizer_model)\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n

        Only the topics for the most recent batch of documents are tracked. If you want to use online topic modeling not for a streaming setting but merely for low-memory use cases, it is advised to also update the .topics_ attribute, as variations such as hierarchical topic modeling will not work afterwards:

        # Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration\ntopics = []\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n    topics.extend(topic_model.topics_)\n\ntopic_model.topics_ = topics\n

        c-TF-IDF:

        Explicitly define, use, and adjust the ClassTfidfTransformer with new parameters, bm25_weighting and reduce_frequent_words, to potentially improve the topic representation:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(bm25_weighting=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n

        Attributes:

        After having fitted your BERTopic instance, you can use the following attributes to have quick access to certain information, such as the topic assignment for each document in topic_model.topics_.
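        A minimal sketch of accessing a few of these attributes after fitting (assuming docs is your list of documents):

        from bertopic import BERTopic

        topic_model = BERTopic().fit(docs)
        topic_model.topics_[:10]     # topic assignment of the first ten documents
        topic_model.topic_sizes_     # number of documents per topic
        topic_model.topic_labels_    # default label for each topic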

        • topics_ (List[int]): The topics that are generated for each document after training or updating the topic model. The most recent topics are tracked.
        • probabilities_ (List[float]): The probability of the assigned topic per document. These are only calculated if an HDBSCAN model is used for the clustering step. When calculate_probabilities=True, then it is the probabilities of all topics per document.
        • topic_sizes_ (Mapping[int, int]): The size of each topic.
        • topic_mapper_ (TopicMapper): A class for tracking topics and their mappings anytime they are merged, reduced, added, or removed.
        • topic_representations_ (Mapping[int, Tuple[int, float]]): The top n terms per topic and their respective c-TF-IDF values.
        • c_tf_idf_ (csr_matrix): The topic-term matrix as calculated through c-TF-IDF. To access its respective words, run .vectorizer_model.get_feature_names() or .vectorizer_model.get_feature_names_out().
        • topic_labels_ (Mapping[int, str]): The default labels for each topic.
        • custom_labels_ (List[str]): Custom labels for each topic as generated through .set_topic_labels.
        • topic_embeddings_ (np.ndarray): The embeddings for each topic. They are calculated by taking the weighted average of word embeddings in a topic based on their c-TF-IDF values.
        • representative_docs_ (Mapping[int, str]): The representative documents for each topic if HDBSCAN is used.

        Version 0.11.0

        Release date: 11 July, 2022

        Highlights:

        • Perform hierarchical topic modeling with .hierarchical_topics
        hierarchical_topics = topic_model.hierarchical_topics(docs, topics) \n
        • Visualize hierarchical topic representations with .visualize_hierarchy
        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n
        • Extract a text-based hierarchical topic representation with .get_topic_tree
        tree = topic_model.get_topic_tree(hierarchical_topics)\n
        • Visualize 2D documents with .visualize_documents()
        # Use input embeddings\ntopic_model.visualize_documents(docs, embeddings=embeddings)\n\n# or use 2D reduced embeddings through a method of your own (e.g., PCA, t-SNE, UMAP, etc.)\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n
        • Visualize 2D hierarchical documents with .visualize_hierarchical_documents()
        # Run the visualization with the original embeddings\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n# Or, if you have reduced the original embeddings already which speed things up quite a bit:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n
        • Create custom labels for the topics throughout most visualizations
        # Generate topic labels
        topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=", ")

        # Set them internally in BERTopic
        topic_model.set_topic_labels(topic_labels)
        • Manually merge topics with .merge_topics()
        # Merge topics 1, 2, and 3\ntopics_to_merge = [1, 2, 3]\ntopic_model.merge_topics(docs, topics, topics_to_merge)\n\n# Merge topics 1 and 2, and separately merge topics 3 and 4\ntopics_to_merge = [[1, 2], [3, 4]]\ntopic_model.merge_topics(docs, topics, topics_to_merge)\n
        • Added example for finding similar topics between two models in the tips & tricks page
        • Add multi-modal example in the tips & tricks page
        • Added native Hugging Face transformers support

        Fixes:

        • Fix support for k-Means in .visualize_heatmap (#532)
        • Fix missing topic 0 in .visualize_topics (#533)
        • Fix inconsistencies in .get_topic_info (#572) and (#581)
        • Add optimal_ordering parameter to .visualize_hierarchy by @rafaelvalero in #390
        • Fix RuntimeError when used as sklearn estimator by @simonfelding in #448
        • Fix typo in visualization documentation by @dwhdai in #475
        • Fix typo in docstrings by @xwwwwww in #549
        • Support higher Flair versions
        "},{"location":"changelog.html#version-0100","title":"Version 0.10.0","text":"

        Release date: 30 April, 2022

        Highlights:

        • Use any dimensionality reduction technique instead of UMAP:
        from bertopic import BERTopic\nfrom sklearn.decomposition import PCA\n\ndim_model = PCA(n_components=5)\ntopic_model = BERTopic(umap_model=dim_model)\n
        • Use any clustering technique instead of HDBSCAN:
        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\n\ncluster_model = KMeans(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n

        Documentation:

        • Add a CountVectorizer page with tips and tricks on how to create topic representations that fit your use case
        • Added pages on how to use other dimensionality reduction and clustering algorithms
        • Additional instructions on how to reduce outliers in the FAQ:
        import numpy as np\nprobability_threshold = 0.01\nnew_topics = [np.argmax(prob) if max(prob) >= probability_threshold else -1 for prob in probs] \n

        Fixes:

        • Fixed None being returned for probabilities when transforming unseen documents
        • Replaced all instances of arg: with Arguments: for consistency
        • Before saving a fitted BERTopic instance, we remove the stop words stored in the fitted CountVectorizer model, as it can get quite large due to the number of words that end up as stop words when min_df is set to a value larger than 1
        • Set \"hdbscan>=0.8.28\" to prevent numpy issues
        • Although this was already fixed by the new release of HDBSCAN, it is technically still possible to install 0.8.27 with BERTopic which leads to these numpy issues
        • Update gensim dependency to >=4.0.0 (#371)
        • Fix topic 0 not appearing in visualizations (#472)
        • Fix (#506)
        • Fix (#429)
        • Fix typo in DTM documentation by @hp0404 in #386
        "},{"location":"changelog.html#version-094","title":"Version 0.9.4","text":"

        Release date: 14 December, 2021

        A number of fixes, documentation updates, and small features:

        • Expose diversity parameter
          • Use BERTopic(diversity=0.1) to change how diverse the words in a topic representation are (ranges from 0 to 1)
        • Improve stability of topic reduction by only computing the cosine similarity within c-TF-IDF and not the topic embeddings
        • Added property to c-TF-IDF that all IDF values should be positive (#351)
        • Improve stability of .visualize_barchart() and .visualize_hierarchy()
        • Major documentation overhaul (mkdocs, tutorials, FAQ, images, etc. ) (#330)
        • Drop python 3.6 (#333)
        • Relax plotly dependency (#88)
        • Additional logging for .transform (#356)
        "},{"location":"changelog.html#version-093","title":"Version 0.9.3","text":"

        Release date: 17 October, 2021

        • Fix #282
          • As it turns out the old implementation of topic mapping was still found in the transform function
        • Fix #285
          • Fix getting all representative docs
        • Fix #288
          • A recent issue with the package pyyaml that can be found in Google Colab
        "},{"location":"changelog.html#version-092","title":"Version 0.9.2","text":"

        Release date: 12 October, 2021

        A release focused on algorithmic optimization and fixing several issues:

        Highlights:

        • Update the non-multilingual paraphrase- models to the all- models due to improved performance
        • Reduce necessary RAM in c-TF-IDF top 30 word extraction

        Fixes:

        • Fix topic mapping
          • When reducing the number of topics, these need to be mapped to the correct input/output which had some issues in the previous version
          • A new class was created as a way to track these mappings regardless of how many times they were executed
          • In other words, you can iteratively reduce the number of topics after training the model without the need to continuously train the model
        • Fix typo in embeddings page (#200)
        • Fix link in README (#233)
        • Fix documentation .visualize_term_rank() (#253)
        • Fix getting correct representative docs (#258)
        • Update memory FAQ with HDBSCAN pr
        "},{"location":"changelog.html#version-091","title":"Version 0.9.1","text":"

        Release date: 1 September, 2021

        A release focused on fixing several issues:

        Fixes:

        • Fix TypeError when auto-reducing topics (#210)
        • Fix mapping representative docs when reducing topics (#208)
        • Fix visualization issues with probabilities (#205)
        • Fix missing normalize_frequency param in plots (#213)
        "},{"location":"changelog.html#version-090","title":"Version 0.9.0","text":"

        Release date: 9 August, 2021

        Highlights:

        • Implemented a Guided BERTopic -> Use seeds to steer the Topic Modeling
        • Get the most representative documents per topic: topic_model.get_representative_docs(topic=1)
          • This allows users to see which documents are good representations of a topic and better understand the topics that were created
        • Added normalize_frequency parameter to visualize_topics_per_class and visualize_topics_over_time in order to better compare the relative topic frequencies between topics
        • Return flat probabilities as default, only calculate the probabilities of all topics per document if calculate_probabilities is True
        • Added several FAQs

        Fixes:

        • Fix loading pre-trained BERTopic model
        • Fix mapping of probabilities
        • Fix #190

        Guided BERTopic:

        Guided BERTopic works in two ways:

        First, we create embeddings for each seeded topic by joining them and passing them through the document embedder. These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label. If the document is most similar to a seeded topic, then it will get that topic's label. If it is most similar to the average document embedding, it will get the -1 label. These labels are then passed through UMAP to create a semi-supervised approach that should nudge the topic creation to the seeded topics.

        Second, we take all words in seed_topic_list and assign them a multiplier larger than 1. Those multipliers will be used to increase the IDF values of the words across all topics thereby increasing the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is now a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier.

        seed_topic_list = [[\"company\", \"billion\", \"quarter\", \"shrs\", \"earnings\"],\n                   [\"acquisition\", \"procurement\", \"merge\"],\n                   [\"exchange\", \"currency\", \"trading\", \"rate\", \"euro\"],\n                   [\"grain\", \"wheat\", \"corn\"],\n                   [\"coffee\", \"cocoa\"],\n                   [\"natural\", \"gas\", \"oil\", \"fuel\", \"products\", \"petrol\"]]\n\ntopic_model = BERTopic(seed_topic_list=seed_topic_list)\ntopics, probs = topic_model.fit_transform(docs)\n
        "},{"location":"changelog.html#version-081","title":"Version 0.8.1","text":"

        Release date: 8 June, 2021

        Highlights:

        • Improved models:
          • For English documents the default is now: "paraphrase-MiniLM-L6-v2"
          • For Non-English or multi-lingual documents the default is now: "paraphrase-multilingual-MiniLM-L12-v2"
          • Both models show not only great performance but are much faster!
        • Add interactive visualizations to the plotting API documentation

        For better performance, please use the following models:

        • English: \"paraphrase-mpnet-base-v2\"
        • Non-English or multi-lingual: \"paraphrase-multilingual-mpnet-base-v2\"

        Fixes:

        • Improved unit testing for more stability
        • Set transformers version for Flair
        "},{"location":"changelog.html#version-080","title":"Version 0.8.0","text":"

        Release date: 31 May, 2021

        Highlights:

        • Additional visualizations:
          • Topic Hierarchy: topic_model.visualize_hierarchy()
          • Topic Similarity Heatmap: topic_model.visualize_heatmap()
          • Topic Representation Barchart: topic_model.visualize_barchart()
          • Term Score Decline: topic_model.visualize_term_rank()
        • Created bertopic.plotting library to easily extend visualizations
        • Improved automatic topic reduction by using HDBSCAN to detect similar topics
        • Sort topic ids by their frequency. -1 is the outlier class and typically contains the most documents. After that, 0 is the largest topic, 1 the second largest, etc.

        Fixes:

        • Fix typo #113, #117
        • Fix #121 by removing these two lines
        • Fix mapping of topics after reduction (it now excludes 0) (#103)
        "},{"location":"changelog.html#version-070","title":"Version 0.7.0","text":"

        Release date: 26 April, 2021

        The two main features are (semi-)supervised topic modeling and several backends to use instead of Flair and SentenceTransformers!

        Highlights:

        • (semi-)supervised topic modeling by leveraging supervised options in UMAP
          • model.fit(docs, y=target_classes)
        • Backends:
          • Added Spacy, Gensim, USE (TFHub)
          • Use a different backend for document embeddings and word embeddings
          • Create your own backends with bertopic.backend.BaseEmbedder (a minimal sketch follows this list)
          • Click here for an overview of all new backends
        • Calculate and visualize topics per class
          • Calculate: topics_per_class = topic_model.topics_per_class(docs, topics, classes)
          • Visualize: topic_model.visualize_topics_per_class(topics_per_class)
        • Several tutorials were updated and added:
          • Topic Modeling with BERTopic
          • (Custom) Embedding Models in BERTopic
          • Advanced Customization in BERTopic
          • (semi-)Supervised Topic Modeling with BERTopic
          • Dynamic Topic Modeling with Trump's Tweets
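        Creating your own backend boils down to subclassing bertopic.backend.BaseEmbedder and implementing .embed. A minimal sketch, wrapping a SentenceTransformer model purely for illustration:

        from bertopic import BERTopic
        from bertopic.backend import BaseEmbedder
        from sentence_transformers import SentenceTransformer
        import numpy as np

        class CustomEmbedder(BaseEmbedder):
            def __init__(self, embedding_model):
                super().__init__()
                self.embedding_model = embedding_model

            def embed(self, documents, verbose=False) -> np.ndarray:
                # Return an (n_documents, n_dimensions) array of document embeddings
                return self.embedding_model.encode(documents, show_progress_bar=verbose)

        custom_embedder = CustomEmbedder(SentenceTransformer("all-MiniLM-L6-v2"))
        topic_model = BERTopic(embedding_model=custom_embedder)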

        Fixes:

        • Fixed issues with Torch req
        • Prevent saving term frequency matrix in CTFIDF class
        • Fixed DTM not working when reducing topics (#96)
        • Moved visualization dependencies to base BERTopic
          • pip install bertopic[visualization] becomes pip install bertopic
        • Allow precomputed embeddings in bertopic.find_topics() (#79):
        model = BERTopic(embedding_model=my_embedding_model)\nmodel.fit(docs, my_precomputed_embeddings)\nmodel.find_topics(search_term)\n
        "},{"location":"changelog.html#version-060","title":"Version 0.6.0","text":"

        Release date: 1 March, 2021

        Highlights:

        • DTM: Added a basic dynamic topic modeling technique based on the global c-TF-IDF representation
          • model.topics_over_time(docs, timestamps, global_tuning=True)
        • DTM: Option to evolve topics based on t-1 c-TF-IDF representation which results in evolving topics over time
          • Only uses topics at t-1 and skips evolution if there is a gap
          • model.topics_over_time(docs, timestamps, evolution_tuning=True)
        • DTM: Function to visualize topics over time
          • model.visualize_topics_over_time(topics_over_time)
        • DTM: Add binning of timestamps
          • model.topics_over_time(docs, timestamps, nr_bins=10)
        • Add function to get general information about topics (id, frequency, name, etc.)
          • get_topic_info()
        • Improved stability of c-TF-IDF by taking the average number of words across all topics instead of the number of documents

        Fixes:

        • _map_probabilities() does not take into account that there is no probability of the outlier class and the probabilities are mutated instead of copied (#63, #64)
        "},{"location":"changelog.html#version-050","title":"Version 0.5.0","text":"

        Release date: 8 February, 2021

        Highlights:

        • Add Flair to allow for more (custom) token/document embeddings, including 🤗 transformers
        • Option to use custom UMAP, HDBSCAN, and CountVectorizer
        • Added low_memory parameter to reduce memory during computation
        • Improved verbosity (shows progress bar)
        • Return the figure of visualize_topics()
        • Expose all parameters with a single function: get_params()

        Fixes:

        • To simplify the API, the parameters stop_words and n_neighbors were removed. These can still be used when a custom UMAP or CountVectorizer is used.
        • Set calculate_probabilities to False as a default. Calculating probabilities with HDBSCAN significantly increases computation time and memory usage. Better to remove calculating probabilities or only allow it by manually turning this on.
        • Use the newest version of sentence-transformers as it speeds up encoding significantly
        "},{"location":"changelog.html#version-042","title":"Version 0.4.2","text":"

        Release date: 10 January, 2021

        Fixes:

        • Selecting embedding_model did not work when language was also used. This led to the user needing to set language to None before being able to use embedding_model. Fixed by using embedding_model when language is used (as a default parameter).
        "},{"location":"changelog.html#version-041","title":"Version 0.4.1","text":"

        Release date: 7 January, 2021

        Fixes:

        • Simple fix by lowering the languages variable to match the lowered input language.
        "},{"location":"changelog.html#version-040","title":"Version 0.4.0","text":"

        Release date: 21 December, 2020

        Highlights:

        • Visualize Topics similar to LDAvis
        • Added option to reduce topics after training
        • Added option to update topic representation after training
        • Added option to search topics using a search term
        • Significantly improved the stability of generating clusters
        • Finetune the topic words by selecting the most coherent words with the highest c-TF-IDF values
        • More extensive tutorials in the documentation

        Notable Changes:

        • Option to select language instead of sentence-transformers models to minimize the complexity of using BERTopic
        • Improved logging (remove duplicates)
        • Check if BERTopic is fitted
        • Added TF-IDF as an embedder instead of transformer models (see tutorial)
        • Numpy for Python 3.6 will be dropped and was therefore removed from the workflow.
        • Preprocess text before passing it through c-TF-IDF
        • Merged get_topics_freq() with get_topic_freq()

        Fixes:

        • Fix error handling topic probabilities
        "},{"location":"changelog.html#version-032","title":"Version 0.3.2","text":"

        Release date: 16 November, 2020

        Highlights:

        • Fixed a bug with the topic reduction method that seems to reduce the number of topics but not to the nr_topics as defined in the class. Since this was, to a certain extent, breaking the topic reduction method, a new release was necessary.
        "},{"location":"changelog.html#version-031","title":"Version 0.3.1","text":"

        Release date: 4 November, 2020

        Highlights:

        • Added the option to use custom embeddings or embeddings that you generated beforehand with whatever package you'd like to use. This allows users to further customize BERTopic to their liking.
        "},{"location":"changelog.html#version-030","title":"Version 0.3.0","text":"

        Release date: 29 October, 2020

        Highlights:

        • transform() and fit_transform() now also return the topic probability distributions
        • Added visualize_distribution() which visualizes the topic probability distribution for a single document
        "},{"location":"changelog.html#version-022","title":"Version 0.2.2","text":"

        Release date: 17 October, 2020

        Highlights:

        • Fixed n_gram_range not being used
        • Added option for using stopwords
        "},{"location":"changelog.html#version-021","title":"Version 0.2.1","text":"

        Release date: 11 October, 2020

        Highlights:

        • Improved the calculation of the class-based TF-IDF procedure by limiting the calculation to sparse matrices. This prevents out-of-memory problems when faced with large datasets.
        "},{"location":"changelog.html#version-020","title":"Version 0.2.0","text":"

        Release date: 11 October, 2020

        Highlights:

        • Changed the c-TF-IDF procedure such that it implements a version of scikit-learn's procedure. This should also speed up the calculation of the sparse matrix and prevent memory errors.
        • Added automated unit tests
        "},{"location":"changelog.html#version-012","title":"Version 0.1.2","text":"

        Release date: 1 October, 2020

        Highlights:

        • When transforming new documents, self.mapped_topics seemed to be missing. Added to the init.
        "},{"location":"changelog.html#version-011","title":"Version 0.1.1","text":"

        Release date: 24 September, 2020

        Highlights:

        • Fixed requirements --> Issue with pytorch
        • Update documentation
        "},{"location":"changelog.html#version-010","title":"Version 0.1.0","text":"

        Release date: 24 September, 2020

        Highlights:

        • First release of BERTopic
        • Added parameters for UMAP and HDBSCAN
        • Option to choose sentence-transformer model
        • Method for transforming unseen documents
        • Save and load trained models (UMAP and HDBSCAN)
        • Extract topics and their sizes

        Notable Changes:

        • Optimized c-TF-IDF
        • Improved documentation
        • Improved topic reduction
        "},{"location":"faq.html","title":"Frequently Asked Questions","text":""},{"location":"faq.html#why-are-the-results-not-consistent-between-runs","title":"Why are the results not consistent between runs?","text":"

        Due to the stochastic nature of UMAP, the results from BERTopic might differ even if you run the same code multiple times. Using custom embeddings allows you to try out BERTopic several times until you find the topics that suit you best. You only need to generate the embeddings themselves once and run BERTopic several times with different parameters.

        If you want to reproduce the results, at the expense of performance, you can set a random_state in UMAP to prevent any stochastic behavior:

        from bertopic import BERTopic\nfrom umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=5, \n                  min_dist=0.0, metric='cosine', random_state=42)\ntopic_model = BERTopic(umap_model=umap_model)\n
        "},{"location":"faq.html#which-embedding-model-should-i-choose","title":"Which embedding model should I choose?","text":"

        Unfortunately, there is not a definitive list of the best models for each language; this highly depends on your data, the model, and your specific use case. However, the default model in BERTopic ("all-MiniLM-L6-v2") works great for English documents. In contrast, for multi-lingual documents or any other language, "paraphrase-multilingual-MiniLM-L12-v2" has shown great performance.

        If you want to use a model that provides a higher quality, but takes more computing time, then I would advise using all-mpnet-base-v2 and paraphrase-multilingual-mpnet-base-v2 instead.

        MTEB Leaderboard: New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the MTEB leaderboard. It is an excellent place for selecting the embedding that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.

        Many of these models can be used with SentenceTransformers in BERTopic, like so:

        from bertopic import BERTopic\nfrom sentence_transformers import SentenceTransformer\n\nembedding_model = SentenceTransformer(\"BAAI/bge-base-en-v1.5\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n

        SentenceTransformers: SentenceTransformers typically work quite well and are the preferred models to use. They are great at generating document embeddings and have several multi-lingual versions available.

        🤗 transformers: BERTopic allows you to use any 🤗 transformers model. These models are typically embeddings created on a word/sentence level but can easily be pooled using Flair (see Guides/Embeddings). If you have a specific language for which you want to generate embeddings, you can choose the model here.

        "},{"location":"faq.html#how-do-i-reduce-topic-outliers","title":"How do I reduce topic outliers?","text":"

        There are several ways we can reduce outliers.

        First, the number of data points classified as outliers is controlled by the min_samples parameter in HDBSCAN. This value is automatically set to the same value as min_cluster_size. However, you can set it independently if you want to reduce the number of generated outliers. Lowering this value will result in less noise being generated.

        from bertopic import BERTopic\nfrom hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', \n                        cluster_selection_method='eom', prediction_data=True, min_samples=5)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Note

        Although this will lower outliers found in the data, this might force outliers to be put into topics where they do not belong. So make sure to strike a balance between keeping noise and reducing outliers.

        Second, after training our BERTopic model, we can assign outliers to topics by making use of the .reduce_outliers function in BERTopic. An advantage of using this approach is that there are four built-in strategies one can choose from for reducing outliers. Moreover, this technique allows the user to experiment with reducing outliers across a number of strategies and parameters without actually having to re-train the topic model each time. You can learn more about the .reduce_outliers function here. The following is a minimal example of how to use this function:

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers\nnew_topics = topic_model.reduce_outliers(docs, topics)\n

        Third, we can replace HDBSCAN with any other clustering algorithm that we want. So we can choose a clustering algorithm, like k-Means, that does not produce any outliers at all. Using k-Means instead of HDBSCAN is straightforward:

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\n\ncluster_model = KMeans(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n
        "},{"location":"faq.html#how-do-i-remove-stop-words","title":"How do I remove stop words?","text":"

        At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context to create accurate embeddings.

        Instead, we can use the CountVectorizer to preprocess our documents after having generated embeddings and clustered our documents. I have found almost no disadvantages to using the CountVectorizer to remove stop words and it is something I would strongly advise to try out:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

        We can also use the ClassTfidfTransformer to reduce the impact of frequent words. The result is very similar to explicitly removing stop words but this process does this automatically:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n
        "},{"location":"faq.html#how-can-i-speed-up-bertopic","title":"How can I speed up BERTopic?","text":"

        You can speed up BERTopic by either generating your embeddings beforehand or by setting calculate_probabilities to False. Calculating the probabilities is quite expensive and can significantly increase the computation time. Thus, only use it if you do not mind waiting a bit before the model is done running or if you have less than a couple of hundred thousand documents.
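        A minimal sketch of pre-computing the embeddings once and re-using them across runs (the model name below is simply the BERTopic default):

        from bertopic import BERTopic
        from sentence_transformers import SentenceTransformer

        # Compute the embeddings a single time
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = embedding_model.encode(docs, show_progress_bar=True)

        # Re-use them for every run while tuning other parameters
        topic_model = BERTopic(embedding_model=embedding_model, calculate_probabilities=False)
        topics, probs = topic_model.fit_transform(docs, embeddings)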

        Also, make sure to use a GPU when extracting the sentence/document embeddings. Transformer models typically require a GPU and using only a CPU can slow down computation time quite a lot. However, if you do not have access to a GPU, looking into quantization might help.

        Lastly, it is also possible to speed up BERTopic with cuML's GPU acceleration of UMAP and HDBSCAN:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\nfrom cuml.manifold import UMAP\n\n# Create instances of GPU-accelerated UMAP and HDBSCAN\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\n\n# Pass the above models to be used in BERTopic\ntopic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)\n
        "},{"location":"faq.html#i-am-facing-memory-issues-help","title":"I am facing memory issues. Help!","text":"

        There are several ways to perform computation with large datasets:

        • First, you can set low_memory to True when instantiating BERTopic. This may prevent blowing up the memory in UMAP (a minimal sketch combining this with the next point follows this list).

        • Second, setting calculate_probabilities to False when instantiating BERTopic prevents a huge document-topic probability matrix from being created. Moreover, HDBSCAN is quite slow when it tries to calculate probabilities on large datasets.

        • Third, you can set the minimum frequency of words in the CountVectorizer class to reduce the size of the resulting sparse c-TF-IDF matrix. You can do this as follows:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\", min_df=10)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

        The min_df parameter is used to indicate the minimum frequency of words. Setting this value larger than 1 can significantly reduce memory.

        • Fourth, you can use online topic modeling instead to use BERTopic on big data by training the model in chunks
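        Combining the first two points above, a minimal sketch of a low-memory configuration:

        from bertopic import BERTopic

        # Keep UMAP's memory footprint low and skip the document-topic probability matrix
        topic_model = BERTopic(low_memory=True, calculate_probabilities=False)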

        If the problem persists, then this could be an issue related to your available memory. The processing of millions of documents is quite computationally expensive and sufficient RAM is necessary.

        "},{"location":"faq.html#i-have-only-a-few-topics-how-do-i-increase-them","title":"I have only a few topics, how do I increase them?","text":"

        There are several reasons why your topic model may result in only a few topics:

        • First, you might only have a few documents (~1000). This makes it very difficult to properly extract topics due to the small amount of data available. Increasing the number of documents might solve your issues.

        • Second, min_topic_size might be simply too large for your number of documents. If you decrease the minimum size of topics, then you are much more likely to increase the number of topics generated (see the sketch after this list). You could also decrease the n_neighbors parameter used in UMAP if this does not work.

        • Third, although this does not happen very often, there simply aren't that many topics to be found in your documents. You can often see this when many documents end up in topic -1, which is not a topic but a category of outliers.
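        Building on the second point, a minimal sketch that lowers both min_topic_size and UMAP's n_neighbors (the exact values are only illustrative):

        from bertopic import BERTopic
        from umap import UMAP

        umap_model = UMAP(n_neighbors=5, n_components=5, min_dist=0.0, metric='cosine')
        topic_model = BERTopic(umap_model=umap_model, min_topic_size=5)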

        "},{"location":"faq.html#i-have-too-many-topics-how-do-i-decrease-them","title":"I have too many topics, how do I decrease them?","text":"

        If you have a large dataset, then it is possible to generate thousands of topics. Especially with large datasets, there is a good chance they contain many small topics. In practice, you might want a few hundred topics at most to interpret them nicely.

        There are a few ways of decreasing the number of generated topics:

        • First, we can set the min_topic_size in the BERTopic initialization much higher (e.g., 300) to make sure that those small clusters will not be generated. This is an HDBSCAN parameter that specifies the minimum number of documents needed in a cluster. More documents in a cluster mean fewer topics will be generated.

        • Second, you can create a custom UMAP model and set n_neighbors much higher than the default 15 (e.g., 200). This also prevents those micro clusters from being generated as it will need many neighboring documents to create a cluster.

        • Third, we can set nr_topics to a value that seems logical to the user. Do note that topics are forced to merge, which might result in a lower quality of topics. In practice, I would advise using nr_topics="auto" as that will merge topics that are very similar. Dissimilar topics will therefore remain separated.
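        A minimal sketch combining the three options above (the exact values are only illustrative):

        from bertopic import BERTopic
        from umap import UMAP

        # Larger clusters and more neighbors generally result in fewer topics
        umap_model = UMAP(n_neighbors=200, n_components=5, min_dist=0.0, metric='cosine')
        topic_model = BERTopic(umap_model=umap_model, min_topic_size=300, nr_topics="auto")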

        "},{"location":"faq.html#how-do-i-calculate-the-probabilities-of-all-topics-in-a-document","title":"How do I calculate the probabilities of all topics in a document?","text":"

        Although it is possible to calculate all the probabilities, the process of doing so is quite computationally inefficient and might significantly increase the computation time. To prevent this, the probabilities are not calculated as a default. To calculate them, you will have to set calculate_probabilities to True:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs) \n

        Note

        The calculate_probabilities parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, it will not work when using a model other than HDBSCAN. Instead, we can approximate the topic distributions across all documents with .approximate_distribution.

        "},{"location":"faq.html#numpy-gives-me-an-error-when-running-bertopic","title":"Numpy gives me an error when running BERTopic","text":"

        With the release of Numpy 1.20.0, there have been significant issues with using that version (and previous ones) due to compilation issues and pypi.

        This is a known issue with the order of installation using pypi. You can find more details about this issue here and here.

        I would suggest doing one of the following:

        • Install the newest version from BERTopic (>= v0.5).
        • You can install hdbscan with pip install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation which might resolve the issue
        • Install BERTopic in a fresh environment using these steps.
        "},{"location":"faq.html#how-can-i-run-bertopic-without-an-internet-connection","title":"How can I run BERTopic without an internet connection?","text":"

        The great thing about using sentence-transformers is that it searches automatically for an embedding model locally. If it cannot find one, it will download the pre-trained model from its servers. Make sure that you set the correct path for sentence-transformers to work. You can find a bit more about that here.

        You can download the corresponding model here and unzip it. Then, simply use the following to create your embedding model:

        from sentence_transformers import SentenceTransformer\nembedding_model = SentenceTransformer('path/to/unzipped/model')\n

        Then, pass it to BERTopic:

        from bertopic import BERTopic\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"faq.html#can-i-use-the-gpu-to-speed-up-the-model","title":"Can I use the GPU to speed up the model?","text":"

        Yes. The GPU is automatically used when you use a SentenceTransformer or Flair embedding model. Using a CPU would then definitely slow things down. However, you can use other embeddings like TF-IDF or Doc2Vec embeddings in BERTopic which do not depend on GPU acceleration.

        You can use cuML to speed up both UMAP and HDBSCAN through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\nfrom cuml.manifold import UMAP\n\n# Create instances of GPU-accelerated UMAP and HDBSCAN\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\n\n# Pass the above models to be used in BERTopic\ntopic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Depending on the embeddings you are using, you might want to normalize them first to force a cosine-related distance metric in UMAP:

        from cuml.preprocessing import normalize\nembeddings = normalize(embeddings)\n
        "},{"location":"faq.html#how-can-i-use-bertopic-with-chinese-documents","title":"How can I use BERTopic with Chinese documents?","text":"

        Currently, CountVectorizer tokenizes text by splitting whitespace which does not work for Chinese. To get it to work, you will have to create a custom CountVectorizer with jieba:

        from sklearn.feature_extraction.text import CountVectorizer\nimport jieba\n\ndef tokenize_zh(text):\n    words = jieba.lcut(text)\n    return words\n\nvectorizer = CountVectorizer(tokenizer=tokenize_zh)\n

        Next, we pass our custom vectorizer to BERTopic and create our topic model:

        from bertopic import BERTopic\ntopic_model = BERTopic(embedding_model=model, verbose=True, vectorizer_model=vectorizer)\ntopics, _ = topic_model.fit_transform(docs, embeddings=embeddings)\n
        "},{"location":"faq.html#why-does-it-take-so-long-to-import-bertopic","title":"Why does it take so long to import BERTopic?","text":"

        The main culprit here seems to be UMAP. After running tests with Tuna, we can see that most of the resources used when importing BERTopic can be attributed to UMAP.

        Unfortunately, there currently is no fix for this issue. The most recent ticket regarding this issue can be found here.

        "},{"location":"faq.html#should-i-preprocess-the-data","title":"Should I preprocess the data?","text":"

        No. By using document embeddings there is typically no need to preprocess the data as all parts of a document are important in understanding the general topic of the document. Although this holds in 99% of cases, if you have data that contains a lot of noise, for example, HTML-tags, then it would be best to remove them. HTML-tags typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply topic modeling to HTML-code to extract topics of code, then it becomes important.
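        If you do need to strip HTML tags beforehand, a minimal sketch using a hypothetical strip_html helper based on a simple regular expression:

        import re
        from bertopic import BERTopic

        def strip_html(doc: str) -> str:
            # Hypothetical helper: replace anything that looks like an HTML tag with a space
            return re.sub(r"<[^>]+>", " ", doc)

        cleaned_docs = [strip_html(doc) for doc in docs]
        topic_model = BERTopic().fit(cleaned_docs)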

        "},{"location":"faq.html#i-run-into-issues-running-on-apple-silicon-what-should-i-do","title":"I run into issues running on Apple Silicon. What should I do?","text":"

        Apple Silicon chips (M1 & M2) are based on arm64 (aka AArch64, not to be confused with amd64/x86_64). There are known issues with upstream dependencies for this architecture, for example numba. You may not always run into this issue, depending on the extras that you need.

        One possible solution is to use VS Code Dev Containers, which allow you to set up a Linux-based environment. To run BERTopic effectively, you need to be aware of two things:

        1. Make sure to use a Docker image specifically built for arm64
        2. Make sure to use a volume instead of a bind-mount ℹ️ the latter has significantly slower disk I/O

        Using the pre-configured Data Science Dev Containers makes sure these settings are optimized. To start using them, do the following:

        • Install and run Docker
        • Clone repository data-science-devcontainers
        • Open VS Code, build the Python base or Python scipy container and start working ℹ️ Change PYTHON_VERSION to 3.11 in the respective devcontainer.json to work with the latest patch release of Python 3.11
        • Note that data is persisted in the container
        • When using an unmodified devcontainer.json: Work in /home/vscode 👉 This is the home directory of user vscode
        • Python packages are installed to the home directory by default 👉 This is due to env variable PIP_USER=1
        • Note that the directory /workspaces is also persisted
        "},{"location":"faq.html#do-these-data-science-dev-containers-support-gpu-acceleration","title":"Do these Data Science Dev Containers support GPU acceleration?","text":"

        Yes, but only on Linux and Windows.

        The CUDA-enabled variants require the following in addition to Docker:

        • NVIDIA GPU
        • NVIDIA driver
        • Linux: NVIDIA Container Toolkit
        • Windows: GPU support in Docker Desktop

        ℹ️ The host running the GPU-accelerated Dev Containers only requires the NVIDIA driver; the CUDA toolkit does not have to be installed.

        See the CUDA Version Matrix regarding Ubuntu/CUDA/Python versions and recommended NVIDIA drivers.

        "},{"location":"usecases.html","title":"Use Cases","text":"

        Over the last few years, BERTopic has been used on a wide variety of use cases and domains, from cancer research and voice perception, to employee surveys and social media. This diversity allows for interesting use cases but it might quickly become overwhelming. This page is meant to demonstrate how, when, and why BERTopic is used in practice.

        "},{"location":"usecases.html#examples","title":"Examples","text":"

        Below are a number of use cases that have been applied in practice. These use cases were collected from and written by data professionals.

        Note

        If you would like to add your use case, feel free to open up a PR! You only need to update this file and add your example. You can just copy-paste one of the existing examples and adjust it to contain a description of your use case.

        "},{"location":"usecases.html#app-user-feedback","title":"App User Feedback","text":"

        \"Analyzing user reviews from the App Store and Play Store helps us reveal valuable customer information, fix technical or usability issues, and help constantly improve customer experience. We utilize BERTopic for topic modeling and supervised classification of predefined categories.\" \u2022\u2022\u2022Tibor Fabian, Ph.D.Lead/Master Data ScientistTelef\u00f3nica Germany

        "},{"location":"usecases.html#employee-surveys","title":"Employee Surveys","text":"

        \"We are using BERTopic to support analysis of employee surveys. Here, we use BERTopic to compute the topics of discussion found in employee responses to open-ended survey questions. To further understand how employees feel about certain topics, we combined BERTopic with sentiment analysis to identify the sentiments associated with different topics and vice versa.\" \u2022\u2022\u2022Steve Quirolgico, Ph.D.Principal Engineer U.S. Department of Homeland Security

        "},{"location":"usecases.html#voice-perception","title":"Voice Perception","text":"

        \"A research project on voice perception to categorize what people describe when they make first impressions based on hearing people say, \"Hi\".\" preprint | code \u2022\u2022\u2022David FeinbergAssociate ProfessorMcMaster University

        "},{"location":"usecases.html#social-media","title":"Social Media","text":"

        \"We use BERTopic to detect trending topics in social media, Our product (AIM Insights) is a social media monitoring tool so detecting trending topics in social media helps our clients to capitalize on them for their campaigns. We use BERTopic to group social media posts into clusters, sort them by engagement to detect the ones that are trending, and then use OpenAI's GPT-3 to generate a label for each of the top clusters based on the most relevant documents in it. This is all done on Arabic posts using an in-house sentence embeddings model.\" \u2022\u2022\u2022Ahmed RashwanAI leadAIM Technologies

        "},{"location":"usecases.html#it-service-management","title":"IT Service Management","text":"

        \"In IT Service Management systems (e.g., Service Now) we receive Incidents, Problems, Change requests etc. We use BERTopic to categorize them into a group of topics/clusters to understand the distribution of the work requests over the period of time to plan and act accordingly for the future.\" \u2022\u2022\u2022Rajesh ThanaseelanData Science ConsultantDXC Technology

        "},{"location":"usecases.html#colon-cancer","title":"Colon Cancer","text":"

        \"We use BERTopic to evaluate P53 in Ovarian cancer for Computational backgrounds researchers, who find it easier to relate Artificial Intelligence with advancing the transformer model and unstructured medical data. The paper explores the heterogeneity of keyBERT, BERTopic, PyCaret, and LDAs as key phrase generators and topic model extractors, with P53 in ovarian cancer as a use case.\" \u2022\u2022\u2022 Mary AdewunmiPhD Student in Colon Cancer and AIUTAS

        "},{"location":"usecases.html#telephone-help-line","title":"Telephone Help Line","text":"

        \"We analyzed 100K+ phone call memos from a telephone help line. The Help Line is open to all people, regardless of religion, culture, and origin. It follows the principles of IFOTES (International Federation Of Telephone Emergency Services). The regional offices each offer independent counseling services via telephone or online. The phone call memos are written by hundreds of independent volunteers and come in various shapes, lengths, forms, and wordings - additionally to have them in multiple languages. While using BERTopic we ran a few tests to figure out if the topic modeling works. Selecting only one language with ~60K data points and a mixed language model we achieved good results. It helped identify topics within the calls and therefore show the organization what reasons there are for people calling them. We identified in a workshop a few interesting topics, which they were not aware of, for example, religious topics. The identification of existing and new, arising topics is crucial for the service quality of the organization. It furthermore helps detect trends over time, which can then be reported directly to Public Health institutions, which can then come up with campaigns to inform the public and help reduce certain psychological concerns. It acts as a representative psychological health barometer of the population.\" \u2022\u2022\u2022Kevin KuhnChief Executive Officergopf

        "},{"location":"usecases.html#regional-newspaper","title":"Regional Newspaper","text":"

        \"Recently, we wanted to evaluate our overall section structure, especially our local news section. As you can imagine, local news is quite a big part of what we do in a regional newspaper. We used BERTopic on a year's worth of local news data to explore the topics in local news and define a new section structure. The results from this analysis helped to define the new section structure, which was implemented this month. \" \u2022\u2022\u2022Thomas HuskenData ScientistBergens Tidende

        "},{"location":"usecases.html#intelligent-virtual-assistants","title":"Intelligent Virtual Assistants","text":"

        \"We have been using BERTopic as an early step in our exploratory analysis for intelligent virtual assistants. It helps us get a quick read on what some of the intents may be. The results help in the design discussions with customers.\" \u2022\u2022\u2022Stephen DrewVP, AI and Automation SolutionsFive9

        "},{"location":"usecases.html#electronic-health-records","title":"Electronic Health Records","text":"

        \"Given physician-created documents from hospitals, find themes in the text as well as differentiate between \"relevant\" and \"irrelevant\" text, and disambiguate homonyms. \" \u2022\u2022\u2022 Alexis RaykhelSenior NLP EngineerIodine Software

        "},{"location":"usecases.html#teaching","title":"Teaching","text":"

        \"BERTopic was used to determine a taxonomy of climate change risks discussed in financial news, and to compute firms' related exposure. It was used in a context a course offering on Climate Risks modelling with NLP.\" \u2022\u2022\u2022 Thomas LoransSenior Associate, Quantitative Analyst

        "},{"location":"usecases.html#zero-hunger-lab","title":"Zero Hunger Lab","text":"

        \"I am a PhD student at Tilburg University, at a lab called Zero Hunger Lab, where we try to use data science methods to improve food insecurity. One key issue is classifying and predicting food insecurity in food-insecure nations. The Integrated Food Security Phase Classification (IPC) system serves this purpose. The IPC categorizes food insecurity into five phases, ranging from minimal food insecurity to famine, and serves as a guide for directing humanitarian resources to the most affected regions. The IPC system strives to be based on evidence, however, obtaining accurate information about food insecurity in remote regions can prove challenging. Despite the availability of weather data, data in the socio-economic domain, such as food prices and conflict, can be scarce or unreliable due to limited infrastructure and bureaucratic obstacles. These complications often result in infrequent releases of IPC classifications and projections, making it difficult to effectively respond to food insecurity in these areas. One large source of daily-updated information is local news. Thus, one can build a model that classifies/predicts IPC by relying on news features obtained by NLP methods in addition to stuff like weather data. Previous research shows this is possible (see https://arxiv.org/pdf/2111.15602.pdf). The authors find words related to food insecurity using semantic frame parsing. After which, they count the occurrence of these words to create features. The features are put into a linear classifier. We wanted to apply more advanced methods and use local news sources (which we suppose contain more localized information). We used BERTopic on over a million articles scraped from Somali news websites. Because articles are both in English and Somali, we use a multilingual sentence encoder (LaBSE, which outperforms newer models in Somali). The results are quite nice. For example, topics most strongly correlated with known conflict casualty data are topics about terrorist attacks, car bombings, etc. And topics most negatively correlated with known conflict casualty data are about peace talks. We can also get an indication of food price development and forced migration. Most importantly, we can track the development of topics relating to food insecurity over time. While topic modelling cannot replace evidence-based food insecurity assessment, it can give a quick insight into a local situation when 'hard data' is lacking. I applaud you on your success with BERTopic. The package is incredibly clean and easy to use, and the method works well with little parameter tuning. To me, the fact that you were able to deliver such a useful tool on your own is incredible, especially in the field of NLP, which is dominated by large organizations such as Google and Meta. \" \u2022\u2022\u2022Cascha van WanrooijPhD StudentTilburg University

        "},{"location":"usecases.html#papers","title":"Papers","text":"

        BERTopic has also been adopted more and more in the academic field. Here are a few papers from a variety of research domains with interesting applications:

        • Adewunmi, M., Sharma, S. K., Sharma, N., Sushma, N. S., & Mounmo, B. (2022). Cancer Health Disparities drivers with BERTopic modelling and PyCaret Evaluation. Cancer Health Disparities, 6.
        • Ebeling, R., Sáenz, C. A. C., Nobre, J. C., & Becker, K. (2022, May). Analysis of the influence of political polarization in the vaccination stance: the Brazilian COVID-19 scenario. In Proceedings of the International AAAI Conference on Web and Social Media (Vol. 16, pp. 159-170).
        • Hoseini, M., Melo, P., Benevenuto, F., Feldmann, A., & Zannettou, S. (2021). On the globalization of the QAnon conspiracy theory through Telegram. arXiv preprint arXiv:2105.13020.
        • Falkenberg, M., Galeazzi, A., Torricelli, M., Di Marco, N., Larosa, F., Sas, M., ... & Baronchelli, A. (2022). Growing polarization around climate change on social media. Nature Climate Change, 1-8.
        • Sánchez-Franco, M. J., & Rey-Moreno, M. (2022). Do travelers' reviews depend on the destination? An analysis in coastal and urban peer-to-peer lodgings. Psychology & Marketing, 39(2), 441-459.
        • Zhunis, A., Lima, G., Song, H., Han, J., & Cha, M. (2022, April). Emotion bubbles: Emotional composition of online discourse before and after the COVID-19 outbreak. In Proceedings of the ACM Web Conference 2022 (pp. 2603-2613).
        • Alhaj, F., Al-Haj, A., Sharieh, A., & Jabri, R. (2022). Improving Arabic cognitive distortion classification in Twitter using BERTopic. International Journal of Advanced Computer Science and Applications, 13(1), 854-860.

        Click here for a full overview of papers citing BERTopic.

        "},{"location":"algorithm/algorithm.html","title":"The Algorithm","text":"

        Below, you will find different types of overviews of each step in BERTopic's main algorithm. Each successive overview will be more in-depth than the previous overview. This approach aims to make the underlying algorithm as intuitive as possible for a wide range of users.

        "},{"location":"algorithm/algorithm.html#visual-overview","title":"Visual Overview","text":"

        BERTopic can be viewed as a sequence of steps to create its topic representations. There are five steps to this process.

        Although these steps are the default, there is some modularity to BERTopic. Each step in this process was carefully selected such that they are all somewhat independent of one another. For example, the tokenization step is not directly influenced by the embedding model used to convert the documents, which allows us to be creative in how we perform the tokenization step.

        This effect is especially strong in the clustering step. Models like HDBSCAN assume that clusters can have different shapes and forms. As a result, using a centroid-based technique to model the topic representations would not be beneficial since the centroid is not always representative of these types of clusters. A bag-of-words representation, however, makes very few assumptions concerning the shape and form of a cluster.

        As a result, BERTopic is quite modular and can maintain its quality of topic generation throughout a variety of sub-models. In other words, BERTopic essentially allows you to build your own topic model.

        There is extensive documentation on how to use each step in this pipeline:

        1. Embeddings
        2. Dimensionality Reduction
        3. Clustering
        4. Tokenizer
        5. Weighting Scheme
        6. Representation Tuning
          • Large Language Models (LLM)
        "},{"location":"algorithm/algorithm.html#code-overview","title":"Code Overview","text":"

        After going through the visual overview, this code overview demonstrates the algorithm using BERTopic. An advantage of using BERTopic is that each major step in its algorithm can be explicitly defined, thereby making the process not only transparent but also more intuitive.

        from umap import UMAP\nfrom hdbscan import HDBSCAN\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nfrom bertopic import BERTopic\nfrom bertopic.representation import KeyBERTInspired\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\n\n# Step 1 - Extract embeddings\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n\n# Step 2 - Reduce dimensionality\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')\n\n# Step 3 - Cluster reduced embeddings\nhdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)\n\n# Step 4 - Tokenize topics\nvectorizer_model = CountVectorizer(stop_words=\"english\")\n\n# Step 5 - Create topic representation\nctfidf_model = ClassTfidfTransformer()\n\n# Step 6 - (Optional) Fine-tune topic representations with \n# a `bertopic.representation` model\nrepresentation_model = KeyBERTInspired()\n\n# All steps together\ntopic_model = BERTopic(\n  embedding_model=embedding_model,          # Step 1 - Extract embeddings\n  umap_model=umap_model,                    # Step 2 - Reduce dimensionality\n  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings\n  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics\n  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words\n  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations\n)\n
        "},{"location":"algorithm/algorithm.html#detailed-overview","title":"Detailed Overview","text":"

        This overview describes each step in more detail such that you can get an intuitive feeling as to what models might fit best at each step in your use case.

        "},{"location":"algorithm/algorithm.html#1-embed-documents","title":"1. Embed documents","text":"

        We start by converting our documents to numerical representations. Although there are many methods for doing so, the default in BERTopic is sentence-transformers. These models are often optimized for semantic similarity, which helps tremendously in our clustering task. Moreover, they are great for creating either document or sentence embeddings. In BERTopic, you can choose any sentence-transformers model, but two models are set as defaults:

        • \"all-MiniLM-L6-v2\"
        • \"paraphrase-multilingual-MiniLM-L12-v2\"

        The first is an English-language model trained specifically for semantic similarity tasks, which works quite well for most use cases. The second model is very similar to the first, the major difference being that the multilingual model works for 50+ languages. It is quite a bit larger than the first and is only selected if you choose a language other than English.

        Tip

        Although BERTopic uses sentence-transformers models as a default, you can choose any embedding model that fits your use case. Follow the guide here for selecting and customizing your model.
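        As a small illustration, you can either pass one of the two default models explicitly or let the language parameter pick the multilingual default for you (a minimal sketch, not the only way to do this):

        from bertopic import BERTopic
        from sentence_transformers import SentenceTransformer

        # Explicitly pass one of the default models mentioned above
        embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        topic_model = BERTopic(embedding_model=embedding_model)

        # Or let BERTopic select the multilingual default for non-English corpora
        topic_model = BERTopic(language="multilingual")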

        "},{"location":"algorithm/algorithm.html#2-dimensionality-reduction","title":"2. Dimensionality reduction","text":"

        After having created our numerical representations of the documents we have to reduce the dimensionality of these representations. Cluster models typically have difficulty handling high dimensional data due to the curse of dimensionality. There are great approaches that can reduce dimensionality, such as PCA, but as a default UMAP is selected in BERTopic. It is a technique that can keep some of a dataset's local and global structure when reducing its dimensionality. This structure is important to keep as it contains the information necessary to create clusters of semantically similar documents.

        Tip

        Although BERTopic uses UMAP as a default, you can choose any dimensionality reduction model that fits your use case. Follow the guide here for selecting and customizing your model.
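        For example, any reducer that exposes .fit and .transform can stand in for UMAP. A minimal sketch with scikit-learn's PCA (the number of components is an illustrative assumption):

        from sklearn.decomposition import PCA
        from bertopic import BERTopic

        # PCA implements .fit and .transform, so it can replace the default UMAP model
        dim_model = PCA(n_components=5)
        topic_model = BERTopic(umap_model=dim_model)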

        "},{"location":"algorithm/algorithm.html#3-cluster-documents","title":"3. Cluster Documents","text":"

        After having reduced our embeddings, we can start clustering our data. For that, we leverage a density-based clustering technique, HDBSCAN. It can find clusters of different shapes and has the nice feature of identifying outliers where possible. As a result, we do not force documents into a cluster where they might not belong. This will improve the resulting topic representation as there is less noise to draw from.

        Tip

        Although BERTopic uses HDBSCAN as a default, you can choose any cluster model that fits your use case. Follow the guide here for selecting and customizing your model.
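        As a sketch, you can pass your own HDBSCAN instance to control the granularity of the topics, or swap in another clustering model entirely (the parameter values below are illustrative):

        from hdbscan import HDBSCAN
        from bertopic import BERTopic

        # A larger min_cluster_size typically results in fewer, larger topics
        hdbscan_model = HDBSCAN(min_cluster_size=50, metric="euclidean",
                                cluster_selection_method="eom", prediction_data=True)
        topic_model = BERTopic(hdbscan_model=hdbscan_model)

        # Alternatively, any model with .fit, .predict, and .labels_ can be used,
        # e.g. sklearn's KMeans, although it does not identify outliers like HDBSCAN does:
        # from sklearn.cluster import KMeans
        # topic_model = BERTopic(hdbscan_model=KMeans(n_clusters=50))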

        "},{"location":"algorithm/algorithm.html#4-bag-of-words","title":"4. Bag-of-words","text":"

        Before we can start creating the topic representation, we first need to select a technique that allows for modularity in BERTopic's algorithm. When we use HDBSCAN as a cluster model, we may assume that our clusters have different degrees of density and different shapes. This means that a centroid-based topic representation technique might not be the best-fitting model. In other words, we want a topic representation technique that makes little to no assumption on the expected structure of the clusters. To do this, we first combine all documents in a cluster into a single document. That very long document then represents the cluster. Then, we can count how often each word appears in each cluster. This generates something called a bag-of-words representation in which the frequency of each word in each cluster can be found. This bag-of-words representation is therefore on a cluster level and not on a document level. This distinction is important as we are interested in words on a topic level (i.e., cluster level). By using a bag-of-words representation, no assumption is made concerning the structure of the clusters. Moreover, the bag-of-words representation is L1-normalized to account for clusters that have different sizes.

        Tip

        There are many ways you can tune or change the bag-of-words step. This step allows for processing the documents however you want without affecting the first step, embedding the documents. You can follow the guide here for more information about tokenization options in BERTopic.
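        A minimal sketch of tuning this step through a custom CountVectorizer (the parameter values are illustrative):

        from sklearn.feature_extraction.text import CountVectorizer
        from bertopic import BERTopic

        # Remove English stop words, include bigrams, and ignore very rare words
        vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=5)
        topic_model = BERTopic(vectorizer_model=vectorizer_model)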

        "},{"location":"algorithm/algorithm.html#5-topic-representation","title":"5. Topic representation","text":"

        From the generated bag-of-words representation, we want to know what makes one cluster different from another. Which words are typical for cluster 1 and not so much for all other clusters? To solve this, we need to modify TF-IDF such that it considers topics (i.e., clusters) instead of documents. When you apply TF-IDF as usual on a set of documents, what you are doing is comparing the importance of words between documents. Now, what if we instead treat all documents in a single category (e.g., a cluster) as a single document and then apply TF-IDF? The result would be importance scores for words within a cluster. The more important a word is within a cluster, the more representative it is of that topic. In other words, if we extract the most important words per cluster, we get descriptions of topics! This model is called class-based TF-IDF:

        Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word x in class c, where c refers to the cluster we created before. This results in our class-based tf representation. This representation is L1-normalized to account for the differences in topic sizes. Then, we take the logarithm of one plus the average number of words per class A divided by the frequency of word x across all classes. We add plus one within the logarithm to force values to be positive. This results in our class-based idf representation. Like with the classic TF-IDF, we then multiply tf with idf to get the importance score per word in each class. In other words, the classical TF-IDF procedure is not used here but a modified version of the algorithm that allows for a much better representation.
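        The description above can be condensed into a small numpy sketch. This is a simplification for intuition only, not the library's exact implementation (which lives in ClassTfidfTransformer), and the toy count matrix is an assumption:

        import numpy as np

        # counts[c, x] = frequency of word x in class (cluster) c; a toy example
        counts = np.array([[4., 0., 1.],
                           [0., 3., 2.]])

        # Class-based tf: L1-normalize each class to account for differing topic sizes
        tf = counts / counts.sum(axis=1, keepdims=True)

        # Class-based idf: log of one plus the average number of words per class (A)
        # divided by the frequency of word x across all classes
        A = counts.sum(axis=1).mean()
        f = counts.sum(axis=0)
        idf = np.log(1 + A / f)

        # Importance score per word in each class
        ctfidf = tf * idf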

        Tip

        In the ClassTfidfTransformer, there are a few parameters that might be worth exploring, including an option to perform additional BM-25 weighting. You can find more information about that here.
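        A minimal sketch of enabling that option (the flag shown is the BM-25 weighting referred to in the tip; everything else stays at its default):

        from bertopic import BERTopic
        from bertopic.vectorizers import ClassTfidfTransformer

        # Enable the additional BM-25-style weighting mentioned above
        ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)
        topic_model = BERTopic(ctfidf_model=ctfidf_model)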

        "},{"location":"algorithm/algorithm.html#6-optional-fine-tune-topic-representation","title":"6. (Optional) Fine-tune Topic representation","text":"

        After having generated the c-TF-IDF representations, we have a set of words that describe a collection of documents. c-TF-IDF is a method that can quickly generate accurate topic representations. However, with the fast developments in the NLP world, new and exciting methods are released weekly. To keep up with what is happening, you can further fine-tune these c-TF-IDF topics using GPT, T5, KeyBERT, spaCy, and other techniques. Many are implemented in BERTopic for you to use and play around with.

        More specifically, we can consider the c-TF-IDF generated topics to be candidate topics. They each contain a set of keywords and representative documents that we can use to further fine-tune the topic representations. Having a set of representative documents for each topic is a huge advantage as it allows for fine-tuning on a reduced number of documents. This reduces computation for large models as they only need to operate on that small set of representative documents for each topic. As a result, large language models like GPT and T5 become feasible in production settings and typically take less wall time than the dimensionality reduction and clustering steps.

        The following models are implemented in bertopic.representation:

        • MaximalMarginalRelevance
        • PartOfSpeech
        • KeyBERTInspired
        • ZeroShotClassification
        • TextGeneration
        • Cohere
        • OpenAI
        • LangChain
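        As a small illustration of how these are used, one of the listed models is passed to BERTopic through representation_model (a minimal sketch; the diversity value in the commented alternative is an illustrative assumption):

        from bertopic import BERTopic
        from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

        # Fine-tune the candidate c-TF-IDF topics with one of the models listed above
        representation_model = KeyBERTInspired()
        topic_model = BERTopic(representation_model=representation_model)

        # Or diversify the topic words instead:
        # representation_model = MaximalMarginalRelevance(diversity=0.3)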
        "},{"location":"api/bertopic.html","title":"BERTopic","text":"

        BERTopic is a topic modeling technique that leverages BERT embeddings and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

        The default embedding model is all-MiniLM-L6-v2 when selecting language=\"english\" and paraphrase-multilingual-MiniLM-L12-v2 when selecting language=\"multilingual\".

        Attributes:

        • topics_ (List[int]): The topics that are generated for each document after training or updating the topic model. The most recent topics are tracked.
        • probabilities_ (List[float]): The probability of the assigned topic per document. These are only calculated if a HDBSCAN model is used for the clustering step. When calculate_probabilities=True, then it is the probabilities of all topics per document.
        • topic_sizes_ (Mapping[int, int]): The size of each topic.
        • topic_mapper_ (TopicMapper): A class for tracking topics and their mappings anytime they are merged, reduced, added, or removed.
        • topic_representations_ (Mapping[int, Tuple[int, float]]): The top n terms per topic and their respective c-TF-IDF values.
        • c_tf_idf_ (csr_matrix): The topic-term matrix as calculated through c-TF-IDF. To access its respective words, run .vectorizer_model.get_feature_names() or .vectorizer_model.get_feature_names_out()
        • topic_labels_ (Mapping[int, str]): The default labels for each topic.
        • custom_labels_ (List[str]): Custom labels for each topic.
        • topic_embeddings_ (np.ndarray): The embeddings for each topic. They are calculated by taking the centroid embedding of each cluster.
        • representative_docs_ (Mapping[int, str]): The representative documents for each topic.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic()\ntopics, probabilities = topic_model.fit_transform(docs)\n

        If you want to use your own embedding model, use it as follows:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\ntopic_model = BERTopic(embedding_model=sentence_model)\n

        Due to the stochastic nature of UMAP, the results from BERTopic might differ between runs and the quality can degrade. Using your own embeddings allows you to try out BERTopic several times until you find the topics that suit you best.
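        A minimal sketch of that workflow: encode the documents once, then re-fit BERTopic as often as you like on the same embeddings (the min_topic_size values are illustrative):

        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups
        from sentence_transformers import SentenceTransformer

        docs = fetch_20newsgroups(subset='all')['data']
        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = sentence_model.encode(docs, show_progress_bar=True)

        # Re-use the same embeddings across several runs and configurations
        for min_topic_size in (10, 30, 50):
            topic_model = BERTopic(min_topic_size=min_topic_size)
            topics, probs = topic_model.fit_transform(docs, embeddings)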

        Source code in bertopic\\_bertopic.py
        class BERTopic:\n    \"\"\"BERTopic is a topic modeling technique that leverages BERT embeddings and\n    c-TF-IDF to create dense clusters allowing for easily interpretable topics\n    whilst keeping important words in the topic descriptions.\n\n    The default embedding model is `all-MiniLM-L6-v2` when selecting `language=\"english\"`\n    and `paraphrase-multilingual-MiniLM-L12-v2` when selecting `language=\"multilingual\"`.\n\n    Attributes:\n        topics_ (List[int]) : The topics that are generated for each document after training or updating\n                              the topic model. The most recent topics are tracked.\n        probabilities_ (List[float]): The probability of the assigned topic per document. These are\n                                      only calculated if a HDBSCAN model is used for the clustering step.\n                                      When `calculate_probabilities=True`, then it is the probabilities\n                                      of all topics per document.\n        topic_sizes_ (Mapping[int, int]) : The size of each topic.\n        topic_mapper_ (TopicMapper) : A class for tracking topics and their mappings anytime they are\n                                      merged, reduced, added, or removed.\n        topic_representations_ (Mapping[int, Tuple[int, float]]) : The top n terms per topic and their respective\n                                                                   c-TF-IDF values.\n        c_tf_idf_ (csr_matrix) : The topic-term matrix as calculated through c-TF-IDF. To access its respective\n                                 words, run `.vectorizer_model.get_feature_names()`  or\n                                 `.vectorizer_model.get_feature_names_out()`\n        topic_labels_ (Mapping[int, str]) : The default labels for each topic.\n        custom_labels_ (List[str]) : Custom labels for each topic.\n        topic_embeddings_ (np.ndarray) : The embeddings for each topic. They are calculated by taking the\n                                         centroid embedding of each cluster.\n        representative_docs_ (Mapping[int, str]) : The representative documents for each topic.\n\n    Examples:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic()\n    topics, probabilities = topic_model.fit_transform(docs)\n    ```\n\n    If you want to use your own embedding model, use it as follows:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    topic_model = BERTopic(embedding_model=sentence_model)\n    ```\n\n    Due to the stochastic nature of UMAP, the results from BERTopic might differ\n    and the quality can degrade. 
Using your own embeddings allows you to\n    try out BERTopic several times until you find the topics that suit\n    you best.\n    \"\"\"\n    def __init__(self,\n                 language: str = \"english\",\n                 top_n_words: int = 10,\n                 n_gram_range: Tuple[int, int] = (1, 1),\n                 min_topic_size: int = 10,\n                 nr_topics: Union[int, str] = None,\n                 low_memory: bool = False,\n                 calculate_probabilities: bool = False,\n                 seed_topic_list: List[List[str]] = None,\n                 zeroshot_topic_list: List[str] = None,\n                 zeroshot_min_similarity: float = .7,\n                 embedding_model=None,\n                 umap_model: UMAP = None,\n                 hdbscan_model: hdbscan.HDBSCAN = None,\n                 vectorizer_model: CountVectorizer = None,\n                 ctfidf_model: TfidfTransformer = None,\n                 representation_model: BaseRepresentation = None,\n                 verbose: bool = False,\n                 ):\n        \"\"\"BERTopic initialization\n\n        Arguments:\n            language: The main language used in your documents. The default sentence-transformers\n                      model for \"english\" is `all-MiniLM-L6-v2`. For a full overview of\n                      supported languages see bertopic.backend.languages. Select\n                      \"multilingual\" to load in the `paraphrase-multilingual-MiniLM-L12-v2`\n                      sentence-transformers model that supports 50+ languages.\n                      NOTE: This is not used if `embedding_model` is used.\n            top_n_words: The number of words per topic to extract. Setting this\n                         too high can negatively impact topic embeddings as topics\n                         are typically best represented by at most 10 words.\n            n_gram_range: The n-gram range for the CountVectorizer.\n                          Advised to keep high values between 1 and 3.\n                          More would likely lead to memory issues.\n                          NOTE: This param will not be used if you pass in your own\n                          CountVectorizer.\n            min_topic_size: The minimum size of the topic. Increasing this value will lead\n                            to a lower number of clusters/topics and vice versa. \n                            It is the same parameter as `min_cluster_size` in HDBSCAN.\n                            NOTE: This param will not be used if you are using `hdbscan_model`.\n            nr_topics: Specifying the number of topics will reduce the initial\n                       number of topics to the value specified. This reduction can take\n                       a while as each reduction in topics (-1) activates a c-TF-IDF\n                       calculation. If this is set to None, no reduction is applied. Use\n                       \"auto\" to automatically reduce topics using HDBSCAN.\n                       NOTE: Controlling the number of topics is best done by adjusting\n                       `min_topic_size` first before adjusting this parameter.\n            low_memory: Sets UMAP low memory to True to make sure less memory is used.\n                        NOTE: This is only used in UMAP. 
For example, if you use PCA instead of UMAP\n                        this parameter will not be used.\n            calculate_probabilities: Calculate the probabilities of all topics\n                                     per document instead of the probability of the assigned\n                                     topic per document. This could slow down the extraction\n                                     of topics if you have many documents (> 100_000).\n                                     NOTE: If false you cannot use the corresponding\n                                     visualization method `visualize_probabilities`.\n                                     NOTE: This is an approximation of topic probabilities\n                                     as used in HDBSCAN and not an exact representation.\n            seed_topic_list: A list of seed words per topic to converge around\n            zeroshot_topic_list: A list of topic names to use for zero-shot classification\n            zeroshot_min_similarity: The minimum similarity between a zero-shot topic and\n                                     a document for assignment. The higher this value, the more\n                                     confident the model needs to be to assign a zero-shot topic to a document.\n            verbose: Changes the verbosity of the model, Set to True if you want\n                     to track the stages of the model.\n            embedding_model: Use a custom embedding model.\n                             The following backends are currently supported\n                               * SentenceTransformers\n                               * Flair\n                               * Spacy\n                               * Gensim\n                               * USE (TF-Hub)\n                             You can also pass in a string that points to one of the following\n                             sentence-transformers models:\n                               * https://www.sbert.net/docs/pretrained_models.html\n            umap_model: Pass in a UMAP model to be used instead of the default.\n                        NOTE: You can also pass in any dimensionality reduction algorithm as long\n                        as it has `.fit` and `.transform` functions.\n            hdbscan_model: Pass in a hdbscan.HDBSCAN model to be used instead of the default\n                           NOTE: You can also pass in any clustering algorithm as long as it has\n                           `.fit` and `.predict` functions along with the `.labels_` variable.\n            vectorizer_model: Pass in a custom `CountVectorizer` instead of the default model.\n            ctfidf_model: Pass in a custom ClassTfidfTransformer instead of the default model.\n            representation_model: Pass in a model that fine-tunes the topic representations\n                                  calculated through c-TF-IDF. 
Models from `bertopic.representation`\n                                  are supported.\n        \"\"\"\n        # Topic-based parameters\n        if top_n_words > 100:\n            logger.warning(\"Note that extracting more than 100 words from a sparse \"\n                           \"can slow down computation quite a bit.\")\n\n        self.top_n_words = top_n_words\n        self.min_topic_size = min_topic_size\n        self.nr_topics = nr_topics\n        self.low_memory = low_memory\n        self.calculate_probabilities = calculate_probabilities\n        self.verbose = verbose\n        self.seed_topic_list = seed_topic_list\n        self.zeroshot_topic_list = zeroshot_topic_list\n        self.zeroshot_min_similarity = zeroshot_min_similarity\n\n        # Embedding model\n        self.language = language if not embedding_model else None\n        self.embedding_model = embedding_model\n\n        # Vectorizer\n        self.n_gram_range = n_gram_range\n        self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range)\n        self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n\n        # Representation model\n        self.representation_model = representation_model\n\n        # UMAP or another algorithm that has .fit and .transform functions\n        self.umap_model = umap_model or UMAP(n_neighbors=15,\n                                             n_components=5,\n                                             min_dist=0.0,\n                                             metric='cosine',\n                                             low_memory=self.low_memory)\n\n        # HDBSCAN or another clustering algorithm that has .fit and .predict functions and\n        # the .labels_ variable to extract the labels\n        self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(min_cluster_size=self.min_topic_size,\n                                                              metric='euclidean',\n                                                              cluster_selection_method='eom',\n                                                              prediction_data=True)\n\n        # Public attributes\n        self.topics_ = None\n        self.probabilities_ = None\n        self.topic_sizes_ = None\n        self.topic_mapper_ = None\n        self.topic_representations_ = None\n        self.topic_embeddings_ = None\n        self.topic_labels_ = None\n        self.custom_labels_ = None\n        self.c_tf_idf_ = None\n        self.representative_images_ = None\n        self.representative_docs_ = {}\n        self.topic_aspects_ = {}\n\n        # Private attributes for internal tracking purposes\n        self._outliers = 1\n        self._merged_topics = None\n\n        if verbose:\n            logger.set_level(\"DEBUG\")\n        else:\n            logger.set_level(\"WARNING\")\n\n    def fit(self,\n            documents: List[str],\n            embeddings: np.ndarray = None,\n            images: List[str] = None,\n            y: Union[List[int], np.ndarray] = None):\n        \"\"\" Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics\n\n        Arguments:\n            documents: A list of documents to fit on\n            embeddings: Pre-trained document embeddings. These can be used\n                        instead of the sentence-transformer model\n            images: A list of paths to the images to fit on or the images themselves\n            y: The target class for (semi)-supervised modeling. 
Use -1 if no class for a\n               specific instance is specified.\n\n        Examples:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all')['data']\n        topic_model = BERTopic().fit(docs)\n        ```\n\n        If you want to use your own embeddings, use it as follows:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n\n        # Create embeddings\n        docs = fetch_20newsgroups(subset='all')['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n        # Create topic model\n        topic_model = BERTopic().fit(docs, embeddings)\n        ```\n        \"\"\"\n        self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images)\n        return self\n\n    def fit_transform(self,\n                      documents: List[str],\n                      embeddings: np.ndarray = None,\n                      images: List[str] = None,\n                      y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],\n                                                                       Union[np.ndarray, None]]:\n        \"\"\" Fit the models on a collection of documents, generate topics,\n        and return the probabilities and topic per document.\n\n        Arguments:\n            documents: A list of documents to fit on\n            embeddings: Pre-trained document embeddings. These can be used\n                        instead of the sentence-transformer model\n            images: A list of paths to the images to fit on or the images themselves\n            y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n               specific instance is specified.\n\n        Returns:\n            predictions: Topic predictions for each documents\n            probabilities: The probability of the assigned topic per document.\n                           If `calculate_probabilities` in BERTopic is set to True, then\n                           it calculates the probabilities of all topics across all documents\n                           instead of only the assigned topic. 
This, however, slows down\n                           computation and may increase memory usage.\n\n        Examples:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all')['data']\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        ```\n\n        If you want to use your own embeddings, use it as follows:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n\n        # Create embeddings\n        docs = fetch_20newsgroups(subset='all')['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n        # Create topic model\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs, embeddings)\n        ```\n        \"\"\"\n        if documents is not None:\n            check_documents_type(documents)\n            check_embeddings_shape(embeddings, documents)\n\n        doc_ids = range(len(documents)) if documents is not None else range(len(images))\n        documents = pd.DataFrame({\"Document\": documents,\n                                  \"ID\": doc_ids,\n                                  \"Topic\": None,\n                                  \"Image\": images})\n\n        # Extract embeddings\n        if embeddings is None:\n            logger.info(\"Embedding - Transforming documents to embeddings.\")\n            self.embedding_model = select_backend(self.embedding_model,\n                                                  language=self.language)\n            embeddings = self._extract_embeddings(documents.Document.values.tolist(),\n                                                  images=images,\n                                                  method=\"document\",\n                                                  verbose=self.verbose)\n            logger.info(\"Embedding - Completed \\u2713\")\n        else:\n            if self.embedding_model is not None:\n                self.embedding_model = select_backend(self.embedding_model,\n                                                      language=self.language)\n\n        # Guided Topic Modeling\n        if self.seed_topic_list is not None and self.embedding_model is not None:\n            y, embeddings = self._guided_topic_modeling(embeddings)\n\n        # Zero-shot Topic Modeling\n        if self._is_zeroshot():\n            documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(documents, embeddings)\n            if documents is None:\n                return self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings)\n\n        # Reduce dimensionality\n        umap_embeddings = self._reduce_dimensionality(embeddings, y)\n\n        # Cluster reduced embeddings\n        documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)\n\n        # Sort and Map Topic IDs by their frequency\n        if not self.nr_topics:\n            documents = self._sort_mappings_by_frequency(documents)\n\n        # Create documents from images if we have images only\n        if documents.Document.values[0] is None:\n            custom_documents = self._images_to_text(documents, embeddings)\n\n            # Extract topics by calculating c-TF-IDF\n            
self._extract_topics(custom_documents, embeddings=embeddings)\n            self._create_topic_vectors(documents=documents, embeddings=embeddings)\n\n            # Reduce topics\n            if self.nr_topics:\n                custom_documents = self._reduce_topics(custom_documents)\n\n            # Save the top 3 most representative documents per topic\n            self._save_representative_docs(custom_documents)\n        else:\n            # Extract topics by calculating c-TF-IDF\n            self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)\n\n            # Reduce topics\n            if self.nr_topics:\n                documents = self._reduce_topics(documents)\n\n            # Save the top 3 most representative documents per topic\n            self._save_representative_docs(documents)\n\n        # Resulting output\n        self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)\n        predictions = documents.Topic.to_list()\n\n        # Combine Zero-shot with outliers\n        if self._is_zeroshot() and len(documents) != len(doc_ids):\n            predictions = self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings)\n\n        return predictions, self.probabilities_\n\n    def transform(self,\n                  documents: Union[str, List[str]],\n                  embeddings: np.ndarray = None,\n                  images: List[str] = None) -> Tuple[List[int], np.ndarray]:\n        \"\"\" After having fit a model, use transform to predict new instances\n\n        Arguments:\n            documents: A single document or a list of documents to predict on\n            embeddings: Pre-trained document embeddings. These can be used\n                        instead of the sentence-transformer model.\n            images: A list of paths to the images to predict on or the images themselves\n\n        Returns:\n            predictions: Topic predictions for each documents\n            probabilities: The topic probability distribution which is returned by default.\n                           If `calculate_probabilities` in BERTopic is set to False, then the\n                           probabilities are not calculated to speed up computation and\n                           decrease memory usage.\n\n        Examples:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all')['data']\n        topic_model = BERTopic().fit(docs)\n        topics, probs = topic_model.transform(docs)\n        ```\n\n        If you want to use your own embeddings:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n\n        # Create embeddings\n        docs = fetch_20newsgroups(subset='all')['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n        # Create topic model\n        topic_model = BERTopic().fit(docs, embeddings)\n        topics, probs = topic_model.transform(docs, embeddings)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_embeddings_shape(embeddings, documents)\n\n        if isinstance(documents, str) or documents is None:\n            documents = [documents]\n\n        if embeddings is None:\n            embeddings = self._extract_embeddings(documents,\n                 
                                 images=images,\n                                                  method=\"document\",\n                                                  verbose=self.verbose)\n\n        # Check if an embedding model was found\n        if embeddings is None:\n            raise ValueError(\"No embedding model was found to embed the documents.\"\n                             \"Make sure when loading in the model using BERTopic.load()\"\n                             \"to also specify the embedding model.\")\n\n        # Transform without hdbscan_model and umap_model using only cosine similarity\n        elif type(self.hdbscan_model) == BaseCluster:\n            logger.info(\"Predicting topic assignments through cosine similarity of topic and document embeddings.\")\n            sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))\n            predictions = np.argmax(sim_matrix, axis=1) - self._outliers\n\n            if self.calculate_probabilities:\n                probabilities = sim_matrix\n            else:\n                probabilities = np.max(sim_matrix, axis=1)\n\n        # Transform with full pipeline\n        else:\n            logger.info(\"Dimensionality - Reducing dimensionality of input embeddings.\")\n            umap_embeddings = self.umap_model.transform(embeddings)\n            logger.info(\"Dimensionality - Completed \\u2713\")\n\n            # Extract predictions and probabilities if it is a HDBSCAN-like model\n            logger.info(\"Clustering - Approximating new points with `hdbscan_model`\")\n            if is_supported_hdbscan(self.hdbscan_model):\n                predictions, probabilities = hdbscan_delegator(self.hdbscan_model, \"approximate_predict\", umap_embeddings)\n\n                # Calculate probabilities\n                if self.calculate_probabilities:\n                    logger.info(\"Probabilities - Start calculation of probabilities with HDBSCAN\")\n                    probabilities = hdbscan_delegator(self.hdbscan_model, \"membership_vector\", umap_embeddings)\n                    logger.info(\"Probabilities - Completed \\u2713\")\n            else:\n                predictions = self.hdbscan_model.predict(umap_embeddings)\n                probabilities = None\n            logger.info(\"Cluster - Completed \\u2713\")\n\n            # Map probabilities and predictions\n            probabilities = self._map_probabilities(probabilities, original_topics=True)\n            predictions = self._map_predictions(predictions)\n        return predictions, probabilities\n\n    def partial_fit(self,\n                    documents: List[str],\n                    embeddings: np.ndarray = None,\n                    y: Union[List[int], np.ndarray] = None):\n        \"\"\" Fit BERTopic on a subset of the data and perform online learning\n        with batch-like data.\n\n        Online topic modeling in BERTopic is performed by using dimensionality\n        reduction and cluster algorithms that support a `partial_fit` method\n        in order to incrementally train the topic model.\n\n        Likewise, the `bertopic.vectorizers.OnlineCountVectorizer` is used\n        to dynamically update its vocabulary when presented with new data.\n        It has several parameters for modeling decay and updating the\n        representations.\n\n        In other words, although the main algorithm stays the same, the training\n        procedure now works as follows:\n\n        For each subset of the data:\n\n        1. 
Generate embeddings with a pre-traing language model\n        2. Incrementally update the dimensionality reduction algorithm with `partial_fit`\n        3. Incrementally update the cluster algorithm with `partial_fit`\n        4. Incrementally update the OnlineCountVectorizer and apply some form of decay\n\n        Note that it is advised to use `partial_fit` with batches and\n        not single documents for the best performance.\n\n        Arguments:\n            documents: A list of documents to fit on\n            embeddings: Pre-trained document embeddings. These can be used\n                        instead of the sentence-transformer model\n            y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n               specific instance is specified.\n\n        Examples:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sklearn.cluster import MiniBatchKMeans\n        from sklearn.decomposition import IncrementalPCA\n        from bertopic.vectorizers import OnlineCountVectorizer\n        from bertopic import BERTopic\n\n        # Prepare documents\n        docs = fetch_20newsgroups(subset=subset,  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\n        # Prepare sub-models that support online learning\n        umap_model = IncrementalPCA(n_components=5)\n        cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\n        vectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\n        topic_model = BERTopic(umap_model=umap_model,\n                               hdbscan_model=cluster_model,\n                               vectorizer_model=vectorizer_model)\n\n        # Incrementally fit the topic model by training on 1000 documents at a time\n        for index in range(0, len(docs), 1000):\n            topic_model.partial_fit(docs[index: index+1000])\n        ```\n        \"\"\"\n        # Checks\n        check_embeddings_shape(embeddings, documents)\n        if not hasattr(self.hdbscan_model, \"partial_fit\"):\n            raise ValueError(\"In order to use `.partial_fit`, the cluster model should have \"\n                             \"a `.partial_fit` function.\")\n\n        # Prepare documents\n        if isinstance(documents, str):\n            documents = [documents]\n        documents = pd.DataFrame({\"Document\": documents,\n                                  \"ID\": range(len(documents)),\n                                  \"Topic\": None})\n\n        # Extract embeddings\n        if embeddings is None:\n            if self.topic_representations_ is None:\n                self.embedding_model = select_backend(self.embedding_model,\n                                                      language=self.language)\n            embeddings = self._extract_embeddings(documents.Document.values.tolist(),\n                                                  method=\"document\",\n                                                  verbose=self.verbose)\n        else:\n            if self.embedding_model is not None and self.topic_representations_ is None:\n                self.embedding_model = select_backend(self.embedding_model,\n                                                      language=self.language)\n\n        # Reduce dimensionality\n        if self.seed_topic_list is not None and self.embedding_model is not None:\n            y, embeddings = self._guided_topic_modeling(embeddings)\n        umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True)\n\n        
# Cluster reduced embeddings\n        documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True)\n        topics = documents.Topic.to_list()\n\n        # Map and find new topics\n        if not self.topic_mapper_:\n            self.topic_mapper_ = TopicMapper(topics)\n        mappings = self.topic_mapper_.get_mappings()\n        new_topics = set(topics).difference(set(mappings.keys()))\n        new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)}\n        self.topic_mapper_.add_new_topics(new_topic_ids)\n        updated_mappings = self.topic_mapper_.get_mappings()\n        updated_topics = [updated_mappings[topic] for topic in topics]\n        documents[\"Topic\"] = updated_topics\n\n        # Add missing topics (topics that were originally created but are now missing)\n        if self.topic_representations_:\n            missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics))\n            for missing_topic in missing_topics:\n                documents.loc[len(documents), :] = [\" \", len(documents), missing_topic]\n        else:\n            missing_topics = {}\n\n        # Prepare documents\n        documents_per_topic = documents.sort_values(\"Topic\").groupby(['Topic'], as_index=False)\n        updated_topics = documents_per_topic.first().Topic.astype(int)\n        documents_per_topic = documents_per_topic.agg({'Document': ' '.join})\n\n        # Update topic representations\n        self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True)\n        self.topic_representations_ = self._extract_words_per_topic(updated_words, documents, self.c_tf_idf_, calculate_aspects=False)\n        self._create_topic_vectors()\n        self.topic_labels_ = {key: f\"{key}_\" + \"_\".join([word[0] for word in values[:4]])\n                              for key, values in self.topic_representations_.items()}\n\n        # Update topic sizes\n        if len(missing_topics) > 0:\n            documents = documents.iloc[:-len(missing_topics)]\n\n        if self.topic_sizes_ is None:\n            self._update_topic_size(documents)\n        else:\n            sizes = documents.groupby(['Topic'], as_index=False).count()\n            for _, row in sizes.iterrows():\n                topic = int(row.Topic)\n                if self.topic_sizes_.get(topic) is not None and topic not in missing_topics:\n                    self.topic_sizes_[topic] += int(row.Document)\n                elif self.topic_sizes_.get(topic) is None:\n                    self.topic_sizes_[topic] = int(row.Document)\n            self.topics_ = documents.Topic.astype(int).tolist()\n\n        return self\n\n    def topics_over_time(self,\n                         docs: List[str],\n                         timestamps: Union[List[str],\n                                           List[int]],\n                         topics: List[int] = None,\n                         nr_bins: int = None,\n                         datetime_format: str = None,\n                         evolution_tuning: bool = True,\n                         global_tuning: bool = True) -> pd.DataFrame:\n        \"\"\" Create topics over time\n\n        To create the topics over time, BERTopic needs to be already fitted once.\n        From the fitted models, the c-TF-IDF representations are calculate at\n        each timestamp t. 
Then, the c-TF-IDF representations at timestamp t are\n        averaged with the global c-TF-IDF representations in order to fine-tune the\n        local representations.\n\n        NOTE:\n            Make sure to use a limited number of unique timestamps (<100) as the\n            c-TF-IDF representation will be calculated at each single unique timestamp.\n            Having a large number of unique timestamps can take some time to be calculated.\n            Moreover, there aren't many use-cases where you would like to see the difference\n            in topic representations over more than 100 different timestamps.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            timestamps: The timestamp of each document. This can be either a list of strings or ints.\n                        If it is a list of strings, then the datetime format will be automatically\n                        inferred. If it is a list of ints, then the documents will be ordered in\n                        ascending order.\n            topics: A list of topics where each topic is related to a document in `docs` and\n                    a timestamp in `timestamps`. You can use this to apply topics_over_time on\n                    a subset of the data. Make sure that `docs`, `timestamps`, and `topics`\n                    all correspond to one another and have the same size.\n            nr_bins: The number of bins you want to create for the timestamps. The left interval will\n                     be chosen as the timestamp. An additional column will be created with the\n                     entire interval.\n            datetime_format: The datetime format of the timestamps if they are strings, eg \u201c%d/%m/%Y\u201d.\n                             Set this to None if you want to have it automatically detect the format.\n                             See strftime documentation for more information on choices:\n                             https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.\n            evolution_tuning: Fine-tune each topic representation at timestamp *t* by averaging its\n                              c-TF-IDF matrix with the c-TF-IDF matrix at timestamp *t-1*. This creates\n                              evolutionary topic representations.\n            global_tuning: Fine-tune each topic representation at timestamp *t* by averaging its c-TF-IDF matrix\n                       with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                       topic representations that could not be found in the documents at timestamp *t*.\n\n        Returns:\n            topics_over_time: A dataframe that contains the topic, words, and frequency of topic\n                              at timestamp *t*.\n\n        Examples:\n\n        The timestamps variable represents the timestamp of each document. 
If you have over\n        100 unique timestamps, it is advised to bin the timestamps as shown below:\n\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        selected_topics = topics if topics else self.topics_\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": selected_topics, \"Timestamps\": timestamps})\n        global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False)\n\n        all_topics = sorted(list(documents.Topic.unique()))\n        all_topics_indices = {topic: index for index, topic in enumerate(all_topics)}\n\n        if isinstance(timestamps[0], str):\n            infer_datetime_format = True if not datetime_format else False\n            documents[\"Timestamps\"] = pd.to_datetime(documents[\"Timestamps\"],\n                                                     infer_datetime_format=infer_datetime_format,\n                                                     format=datetime_format)\n\n        if nr_bins:\n            documents[\"Bins\"] = pd.cut(documents.Timestamps, bins=nr_bins)\n            documents[\"Timestamps\"] = documents.apply(lambda row: row.Bins.left, 1)\n\n        # Sort documents in chronological order\n        documents = documents.sort_values(\"Timestamps\")\n        timestamps = documents.Timestamps.unique()\n        if len(timestamps) > 100:\n            logger.warning(f\"There are more than 100 unique timestamps (i.e., {len(timestamps)}) \"\n                           \"which significantly slows down the application. Consider setting `nr_bins` \"\n                           \"to a value lower than 100 to speed up calculation. 
\")\n\n        # For each unique timestamp, create topic representations\n        topics_over_time = []\n        for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose):\n\n            # Calculate c-TF-IDF representation for a specific timestamp\n            selection = documents.loc[documents.Timestamps == timestamp, :]\n            documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join,\n                                                                                    \"Timestamps\": \"count\"})\n            c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n            if global_tuning or evolution_tuning:\n                c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False)\n\n            # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF\n            # matrix at timestamp t-1\n            if evolution_tuning and index != 0:\n                current_topics = sorted(list(documents_per_topic.Topic.values))\n                overlapping_topics = sorted(list(set(previous_topics).intersection(set(current_topics))))\n\n                current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]\n                previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics]\n\n                c_tf_idf.tolil()[current_overlap_idx] = ((c_tf_idf[current_overlap_idx] +\n                                                          previous_c_tf_idf[previous_overlap_idx]) / 2.0).tolil()\n\n            # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation\n            # by simply taking the average of the two\n            if global_tuning:\n                selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]\n                c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0\n\n            # Extract the words per topic\n            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n            topic_frequency = pd.Series(documents_per_topic.Timestamps.values,\n                                        index=documents_per_topic.Topic).to_dict()\n\n            # Fill dataframe with results\n            topics_at_timestamp = [(topic,\n                                    \", \".join([words[0] for words in values][:5]),\n                                    topic_frequency[topic],\n                                    timestamp) for topic, values in words_per_topic.items()]\n            topics_over_time.extend(topics_at_timestamp)\n\n            if evolution_tuning:\n                previous_topics = sorted(list(documents_per_topic.Topic.values))\n                previous_c_tf_idf = c_tf_idf.copy()\n\n        return pd.DataFrame(topics_over_time, columns=[\"Topic\", \"Words\", \"Frequency\", \"Timestamp\"])\n\n    def topics_per_class(self,\n                         docs: List[str],\n                         classes: Union[List[int], List[str]],\n                         global_tuning: bool = True) -> pd.DataFrame:\n        \"\"\" Create topics per class\n\n        To create the topics per class, BERTopic needs to be already fitted once.\n        From the fitted models, the c-TF-IDF representations are calculated at\n        each class c. Then, the c-TF-IDF representations at class c are\n        averaged with the global c-TF-IDF representations in order to fine-tune the\n        local representations. 
This can be turned off if the pure representation is\n        needed.\n\n        NOTE:\n            Make sure to use a limited number of unique classes (<100) as the\n            c-TF-IDF representation will be calculated at each single unique class.\n            Having a large number of unique classes can take some time to be calculated.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            classes: The class of each document. This can be either a list of strings or ints.\n            global_tuning: Fine-tune each topic representation for class c by averaging its c-TF-IDF matrix\n                           with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                           topic representations that could not be found in the documents for class c.\n\n        Returns:\n            topics_per_class: A dataframe that contains the topic, words, and frequency of topics\n                              for each class.\n\n        Examples:\n\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        topics_per_class = topic_model.topics_per_class(docs, classes)\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Class\": classes})\n        global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False)\n\n        # For each unique class, create topic representations\n        topics_per_class = []\n        for _, class_ in tqdm(enumerate(set(classes)), disable=not self.verbose):\n\n            # Calculate c-TF-IDF representation for a specific class\n            selection = documents.loc[documents.Class == class_, :]\n            documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join,\n                                                                                    \"Class\": \"count\"})\n            c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n            # Fine-tune the class c-TF-IDF representation based on the global c-TF-IDF representation\n            # by simply taking the average of the two\n            if global_tuning:\n                c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False)\n                c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0\n\n            # Extract the words per topic\n            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n            topic_frequency = pd.Series(documents_per_topic.Class.values,\n                                        index=documents_per_topic.Topic).to_dict()\n\n            # Fill dataframe with results\n            topics_at_class = [(topic,\n                                \", \".join([words[0] for words in values][:5]),\n                                topic_frequency[topic],\n                                class_) for topic, values in words_per_topic.items()]\n            topics_per_class.extend(topics_at_class)\n\n        topics_per_class = pd.DataFrame(topics_per_class, columns=[\"Topic\", \"Words\", \"Frequency\", \"Class\"])\n\n        return topics_per_class\n\n    def hierarchical_topics(self,\n                            docs: List[str],\n                            linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n         
                   distance_function: Callable[[csr_matrix], csr_matrix] = None) -> pd.DataFrame:\n        \"\"\" Create a hierarchy of topics\n\n        To create this hierarchy, BERTopic needs to be already fitted once.\n        Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF\n        representation using `scipy.cluster.hierarchy.linkage`.\n\n        Based on that hierarchy, we calculate the topic representation at each\n        merged step. This is a local representation, as we only assume that the\n        chosen step is merged and not all others which typically improves the\n        topic representation.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            linkage_function: The linkage function to use. Default is:\n                              `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n            distance_function: The distance function to use on the c-TF-IDF matrix. Default is:\n                               `lambda x: 1 - cosine_similarity(x)`.\n                               You can pass any function that returns either a square matrix of \n                               shape (n_samples, n_samples) with zeros on the diagonal and \n                               non-negative values or condensed distance matrix of shape\n                               (n_samples * (n_samples - 1) / 2,) containing the upper\n                               triangular of the distance matrix.\n\n        Returns:\n            hierarchical_topics: A dataframe that contains a hierarchy of topics\n                                 represented by their parents and their children\n\n        Examples:\n\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n        ```\n\n        A custom linkage function can be used as follows:\n\n        ```python\n        from scipy.cluster import hierarchy as sch\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n\n        # Hierarchical topics\n        linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\n        hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        if distance_function is None:\n            distance_function = lambda x: 1 - cosine_similarity(x)\n\n        if linkage_function is None:\n            linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\n\n        # Calculate distance\n        embeddings = self.c_tf_idf_[self._outliers:]\n        X = distance_function(embeddings)\n        X = validate_distance_matrix(X, embeddings.shape[0])\n\n        # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix\n        Z = linkage_function(X)\n\n        # Calculate basic bag-of-words to be iteratively merged later\n        documents = pd.DataFrame({\"Document\": docs,\n                                  \"ID\": range(len(docs)),\n                                  \"Topic\": self.topics_})\n        documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})\n        documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :]\n        clean_documents = 
self._preprocess_text(documents_per_topic.Document.values)\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = self.vectorizer_model.get_feature_names_out()\n        else:\n            words = self.vectorizer_model.get_feature_names()\n\n        bow = self.vectorizer_model.transform(clean_documents)\n\n        # Extract clusters\n        hier_topics = pd.DataFrame(columns=[\"Parent_ID\", \"Parent_Name\", \"Topics\",\n                                            \"Child_Left_ID\", \"Child_Left_Name\",\n                                            \"Child_Right_ID\", \"Child_Right_Name\"])\n        for index in tqdm(range(len(Z))):\n\n            # Find clustered documents\n            clusters = sch.fcluster(Z, t=Z[index][2], criterion='distance') - self._outliers\n            nr_clusters = len(clusters)\n\n            # Extract first topic we find to get the set of topics in a merged topic\n            topic = None\n            val = Z[index][0]\n            while topic is None:\n                if val - len(clusters) < 0:\n                    topic = int(val)\n                else:\n                    val = Z[int(val - len(clusters))][0]\n            clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]]\n\n            # Group bow per cluster, calculate c-TF-IDF and extract words\n            grouped = csr_matrix(bow[clustered_topics].sum(axis=0))\n            c_tf_idf = self.ctfidf_model.transform(grouped)\n            selection = documents.loc[documents.Topic.isin(clustered_topics), :]\n            selection.Topic = 0\n            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n\n            # Extract parent's name and ID\n            parent_id = index + len(clusters)\n            parent_name = \"_\".join([x[0] for x in words_per_topic[0]][:5])\n\n            # Extract child's name and ID\n            Z_id = Z[index][0]\n            child_left_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n            if Z_id - nr_clusters < 0:\n                child_left_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n            else:\n                child_left_name = hier_topics.iloc[int(child_left_id)].Parent_Name\n\n            # Extract child's name and ID\n            Z_id = Z[index][1]\n            child_right_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n            if Z_id - nr_clusters < 0:\n                child_right_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n            else:\n                child_right_name = hier_topics.iloc[int(child_right_id)].Parent_Name\n\n            # Save results\n            hier_topics.loc[len(hier_topics), :] = [parent_id, parent_name,\n                                                    clustered_topics,\n                                                    int(Z[index][0]), child_left_name,\n                                                    int(Z[index][1]), child_right_name]\n\n        hier_topics[\"Distance\"] = Z[:, 2]\n        hier_topics = hier_topics.sort_values(\"Parent_ID\", ascending=False)\n        hier_topics[[\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]] = hier_topics[[\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]].astype(str)\n\n        return hier_topics\n\n    def 
approximate_distribution(self,\n                                 documents: Union[str, List[str]],\n                                 window: int = 4,\n                                 stride: int = 1,\n                                 min_similarity: float = 0.1,\n                                 batch_size: int = 1000,\n                                 padding: bool = False,\n                                 use_embedding_model: bool = False,\n                                 calculate_tokens: bool = False,\n                                 separator: str = \" \") -> Tuple[np.ndarray,\n                                                                Union[List[np.ndarray], None]]:\n        \"\"\" A post-hoc approximation of topic distributions across documents.\n\n        In order to perform this approximation, each document is split into tokens\n        according to the provided tokenizer in the `CountVectorizer`. Then, a\n        sliding window is applied on each document creating subsets of the document.\n        For example, with a window size of 3 and stride of 1, the sentence:\n\n        `Solving the right problem is difficult.`\n\n        can be split up into `solving the right`, `the right problem`, `right problem is`,\n        and `problem is difficult`. These are called tokensets. For each of these\n        tokensets, we calculate their c-TF-IDF representation and find out\n        how similar they are to the previously generated topics. Then, the\n        similarities to the topics for each tokenset are summed up in order to\n        create a topic distribution for the entire document.\n\n        We can also dive into this a bit deeper by then splitting these tokensets\n        up into individual tokens and calculate how much a word, in a specific sentence,\n        contributes to the topics found in that document. This can be enabled by\n        setting `calculate_tokens=True` which can be used for visualization purposes\n        in `topic_model.visualize_approximate_distribution`.\n\n        The main output, `topic_distributions`, can also be used directly in\n        `.visualize_distribution(topic_distributions[index])` by simply selecting\n        a single distribution.\n\n        Arguments:\n            documents: A single document or a list of documents for which we\n                       approximate their topic distributions\n            window: Size of the moving window which indicates the number of\n                    tokens being considered.\n            stride: How far the window should move at each step.\n            min_similarity: The minimum similarity of a document's tokenset\n                            with respect to the topics.\n            batch_size: The number of documents to process at a time. 
If None,\n                        then all documents are processed at once.\n                        NOTE: With a large number of documents, it is not\n                        advised to process all documents at once.\n            padding: Whether to pad the beginning and ending of a document with\n                     empty tokens.\n            use_embedding_model: Whether to use the topic model's embedding\n                                 model to calculate the similarity between\n                                 tokensets and topics instead of using c-TF-IDF.\n            calculate_tokens: Calculate the similarity of tokens with all topics.\n                              NOTE: This is computation-wise more expensive and\n                              can require more memory. Using this over batches of\n                              documents might be preferred.\n            separator: The separator used to merge tokens into tokensets.\n\n        Returns:\n            topic_distributions: A `n` x `m` matrix containing the topic distributions\n                                 for all input documents with `n` being the documents\n                                 and `m` the topics.\n            topic_token_distributions: A list of `t` x `m` arrays with `t` being the\n                                       number of tokens for the respective document\n                                       and `m` the topics.\n\n        Examples:\n\n        After fitting the model, the topic distributions can be calculated regardless\n        of the clustering model and regardless of whether the documents were previously\n        seen or not:\n\n        ```python\n        topic_distr, _ = topic_model.approximate_distribution(docs)\n        ```\n\n        As a result, the topic distributions are calculated in `topic_distr` for the\n        entire document based on a token set with a specific window size and stride.\n\n        If you want to calculate the topic distributions on a token-level:\n\n        ```python\n        topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n        ```\n\n        The `topic_token_distr` then contains, for each token, the best fitting topics.\n        As with `topic_distr`, it can contain multiple topics for a single token.\n        \"\"\"\n        if isinstance(documents, str):\n            documents = [documents]\n\n        if batch_size is None:\n            batch_size = len(documents)\n            batches = 1\n        else:\n            batches = math.ceil(len(documents)/batch_size)\n\n        topic_distributions = []\n        topic_token_distributions = []\n\n        for i in tqdm(range(batches), disable=not self.verbose):\n            doc_set = documents[i*batch_size: (i+1) * batch_size]\n\n            # Extract tokens\n            analyzer = self.vectorizer_model.build_tokenizer()\n            tokens = [analyzer(document) for document in doc_set]\n\n            # Extract token sets\n            all_sentences = []\n            all_indices = [0]\n            all_token_sets_ids = []\n\n            for tokenset in tokens:\n                if len(tokenset) < window:\n                    token_sets = [tokenset]\n                    token_sets_ids = [list(range(len(tokenset)))]\n                else:\n\n                    # Extract tokensets using window and stride parameters\n                    stride_indices = list(range(len(tokenset)))[::stride]\n                    token_sets = []\n                    token_sets_ids = []\n  
                  for stride_index in stride_indices:\n                        selected_tokens = tokenset[stride_index: stride_index+window]\n\n                        if padding or len(selected_tokens) == window:\n                            token_sets.append(selected_tokens)\n                            token_sets_ids.append(list(range(stride_index, stride_index+len(selected_tokens))))\n\n                    # Add empty tokens at the beginning and end of a document\n                    if padding:\n                        padded = []\n                        padded_ids = []\n                        t = math.ceil(window / stride) - 1\n                        for i in range(math.ceil(window / stride) - 1):\n                            padded.append(tokenset[:window - ((t-i) * stride)])\n                            padded_ids.append(list(range(0, window - ((t-i) * stride))))\n\n                        token_sets = padded + token_sets\n                        token_sets_ids = padded_ids + token_sets_ids\n\n                # Join the tokens\n                sentences = [separator.join(token) for token in token_sets]\n                all_sentences.extend(sentences)\n                all_token_sets_ids.extend(token_sets_ids)\n                all_indices.append(all_indices[-1] + len(sentences))\n\n            # Calculate similarity between embeddings of token sets and the topics\n            if use_embedding_model:\n                embeddings = self._extract_embeddings(all_sentences, method=\"document\", verbose=True)\n                similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers:])\n\n            # Calculate similarity between c-TF-IDF of token sets and the topics\n            else:\n                bow_doc = self.vectorizer_model.transform(all_sentences)\n                c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n                similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:])\n\n            # Only keep similarities that exceed the minimum\n            similarity[similarity < min_similarity] = 0\n\n            # Aggregate results on an individual token level\n            if calculate_tokens:\n                topic_distribution = []\n                topic_token_distribution = []\n                for index, token in enumerate(tokens):\n                    start = all_indices[index]\n                    end = all_indices[index+1]\n\n                    if start == end:\n                        end = end + 1\n\n                    # Assign topics to individual tokens\n                    token_id = [i for i in range(len(token))]\n                    token_val = {index: [] for index in token_id}\n                    for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]):\n                        for token in token_set:\n                            if token in token_val:\n                                token_val[token].append(sim)\n\n                    matrix = []\n                    for _, value in token_val.items():\n                        matrix.append(np.add.reduce(value))\n\n                    # Take empty documents into account\n                    matrix = np.array(matrix)\n                    if len(matrix.shape) == 1:\n                        matrix = np.zeros((1, len(self.topic_labels_) - self._outliers))\n\n                    topic_token_distribution.append(np.array(matrix))\n                    topic_distribution.append(np.add.reduce(matrix))\n\n                topic_distribution = 
normalize(topic_distribution, norm='l1', axis=1)\n\n            # Aggregate on a tokenset level indicated by the window and stride\n            else:\n                topic_distribution = []\n                for index in range(len(all_indices)-1):\n                    start = all_indices[index]\n                    end = all_indices[index+1]\n\n                    if start == end:\n                        end = end + 1\n                    group = similarity[start:end].sum(axis=0)\n                    topic_distribution.append(group)\n                topic_distribution = normalize(np.array(topic_distribution), norm='l1', axis=1)\n                topic_token_distribution = None\n\n            # Combine results\n            topic_distributions.append(topic_distribution)\n            if topic_token_distribution is None:\n                topic_token_distributions = None\n            else:\n                topic_token_distributions.extend(topic_token_distribution)\n\n        topic_distributions = np.vstack(topic_distributions)\n\n        return topic_distributions, topic_token_distributions\n\n    def find_topics(self,\n                    search_term: str = None,\n                    image: str = None,\n                    top_n: int = 5) -> Tuple[List[int], List[float]]:\n        \"\"\" Find topics most similar to a search_term\n\n        Creates an embedding for search_term and compares that with\n        the topic embeddings. The most similar topics are returned\n        along with their similarity values.\n\n        The search_term can be of any size but since it is compared\n        with the topic representation it is advised to keep it\n        below 5 words.\n\n        Arguments:\n            search_term: the term you want to use to search for topics.\n            top_n: the number of topics to return\n\n        Returns:\n            similar_topics: the most similar topics from high to low\n            similarity: the similarity scores from high to low\n\n        Examples:\n\n        You can use the underlying embedding model to find topics that\n        best represent the search term:\n\n        ```python\n        topics, similarity = topic_model.find_topics(\"sports\", top_n=5)\n        ```\n\n        Note that the search query is typically more accurate if the\n        search_term consists of a phrase or multiple words.\n        \"\"\"\n        if self.embedding_model is None:\n            raise Exception(\"This method can only be used if you did not use custom embeddings.\")\n\n        topic_list = list(self.topic_representations_.keys())\n        topic_list.sort()\n\n        # Extract search_term embeddings and compare with topic embeddings\n        if search_term is not None:\n            search_embedding = self._extract_embeddings([search_term],\n                                                        method=\"word\",\n                                                        verbose=False).flatten()\n        elif image is not None:\n            search_embedding = self._extract_embeddings([None],\n                                                        images=[image],\n                                                        method=\"document\",\n                                                        verbose=False).flatten()\n        sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten()\n\n        # Extract topics most similar to search_term\n        ids = np.argsort(sims)[-top_n:]\n        similarity = [sims[i] for i in ids][::-1]\n        
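        # Editorial note (not part of the original BERTopic source): `np.argsort` returns the
        # indices of the `top_n` most similar topics in ascending order of similarity, so the
        # similarity scores above and the topic IDs below are both reversed to run from the
        # most to the least similar topic. A hypothetical usage sketch:
        #
        #     similar_topics, similarity = topic_model.find_topics('electric cars', top_n=5)
        #     topic_model.get_topic(similar_topics[0])  # words of the best-matching topic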
similar_topics = [topic_list[index] for index in ids][::-1]\n\n        return similar_topics, similarity\n\n    def update_topics(self,\n                      docs: List[str],\n                      images: List[str] = None,\n                      topics: List[int] = None,\n                      top_n_words: int = 10,\n                      n_gram_range: Tuple[int, int] = None,\n                      vectorizer_model: CountVectorizer = None,\n                      ctfidf_model: ClassTfidfTransformer = None,\n                      representation_model: BaseRepresentation = None):\n        \"\"\" Updates the topic representation by recalculating c-TF-IDF with the new\n        parameters as defined in this function.\n\n        When you have trained a model and viewed the topics and the words that represent them,\n        you might not be satisfied with the representation. Perhaps you forgot to remove\n        stop_words or you want to try out a different n_gram_range. This function allows you\n        to update the topic representation after they have been formed.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            images: The images you used when calling either `fit` or `fit_transform`\n            topics: A list of topics where each topic is related to a document in `docs`.\n                    Use this variable to change or map the topics.\n                    NOTE: Using a custom list of topic assignments may lead to errors if\n                          topic reduction techniques are used afterwards. Make sure that\n                          manually assigning topics is the last step in the pipeline\n            top_n_words: The number of words per topic to extract. Setting this\n                         too high can negatively impact topic embeddings as topics\n                         are typically best represented by at most 10 words.\n            n_gram_range: The n-gram range for the CountVectorizer.\n            vectorizer_model: Pass in your own CountVectorizer from scikit-learn\n            ctfidf_model: Pass in your own c-TF-IDF model to update the representations\n            representation_model: Pass in a model that fine-tunes the topic representations\n                                  calculated through c-TF-IDF. Models from `bertopic.representation`\n                                  are supported.\n\n        Examples:\n\n        In order to update the topic representation, you will need to first fit the topic\n        model and extract topics from them. 
Based on these, you can update the representation:\n\n        ```python\n        topic_model.update_topics(docs, n_gram_range=(2, 3))\n        ```\n\n        You can also use a custom vectorizer to update the representation:\n\n        ```python\n        from sklearn.feature_extraction.text import CountVectorizer\n        vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\")\n        topic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n        ```\n\n        You can also use this function to change or map the topics to something else.\n        You can update them as follows:\n\n        ```python\n        topic_model.update_topics(docs, topics=my_updated_topics)\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        check_is_fitted(self)\n        if not n_gram_range:\n            n_gram_range = self.n_gram_range\n\n        if top_n_words > 100:\n            logger.warning(\"Note that extracting more than 100 words from a sparse matrix \"\n                           \"can slow down computation quite a bit.\")\n        self.top_n_words = top_n_words\n        self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)\n        self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n        self.representation_model = representation_model\n\n        if topics is None:\n            topics = self.topics_\n        else:\n            logger.warning(\"Using a custom list of topic assignments may lead to errors if \"\n                           \"topic reduction techniques are used afterwards. Make sure that \"\n                           \"manually assigning topics is the last step in the pipeline. \"\n                           \"Note that topic embeddings will also be created through weighted \"\n                           \"c-TF-IDF embeddings instead of centroid embeddings.\")\n\n        self._outliers = 1 if -1 in set(topics) else 0\n\n        # Extract words\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": topics, \"ID\": range(len(docs)), \"Image\": images})\n        documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})\n        self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)\n        self.topic_representations_ = self._extract_words_per_topic(words, documents)\n\n        # Update topic vectors\n        if set(topics) != self.topics_:\n\n            # Remove outlier topic embedding if all that has changed is the outlier class\n            same_position = all([True if old_topic == new_topic else False for old_topic, new_topic in zip(self.topics_, topics) if old_topic != -1])\n            if same_position and -1 not in topics and -1 in self.topics_:\n                self.topic_embeddings_ = self.topic_embeddings_[1:]\n            else:\n                self._create_topic_vectors()\n\n        # Update topic labels\n        self.topic_labels_ = {key: f\"{key}_\" + \"_\".join([word[0] for word in values[:4]])\n                              for key, values in\n                              self.topic_representations_.items()}\n        self._update_topic_size(documents)\n\n    def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:\n        \"\"\" Return topics with top n words and their c-TF-IDF score\n\n        Arguments:\n            full: If True, returns all different forms of topic representations\n                  for each topic, including aspects\n\n        Returns:\n            self.topic_representations_: The top n words per 
topic and the corresponding c-TF-IDF score\n\n        Examples:\n\n        ```python\n        all_topics = topic_model.get_topics()\n        ```\n        \"\"\"\n        check_is_fitted(self)\n\n        if full:\n            topic_representations = {\"Main\": self.topic_representations_}\n            topic_representations.update(self.topic_aspects_)\n            return topic_representations\n        else:\n            return self.topic_representations_\n\n    def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]:\n        \"\"\" Return top n words for a specific topic and their c-TF-IDF scores\n\n        Arguments:\n            topic: A specific topic for which you want its representation\n            full: If True, returns all different forms of topic representations\n                  for a topic, including aspects\n\n        Returns:\n            The top n words for a specific word and its respective c-TF-IDF scores\n\n        Examples:\n\n        ```python\n        topic = topic_model.get_topic(12)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        if topic in self.topic_representations_:\n            if full:\n                representations = {\"Main\": self.topic_representations_[topic]}\n                aspects = {aspect: representations[topic] for aspect, representations in self.topic_aspects_.items()}\n                representations.update(aspects)\n                return representations\n            else:\n                return self.topic_representations_[topic]\n        else:\n            return False\n\n    def get_topic_info(self, topic: int = None) -> pd.DataFrame:\n        \"\"\" Get information about each topic including its ID, frequency, and name.\n\n        Arguments:\n            topic: A specific topic for which you want the frequency\n\n        Returns:\n            info: The information relating to either a single topic or all topics\n\n        Examples:\n\n        ```python\n        info_df = topic_model.get_topic_info()\n        ```\n        \"\"\"\n        check_is_fitted(self)\n\n        info = pd.DataFrame(self.topic_sizes_.items(), columns=[\"Topic\", \"Count\"]).sort_values(\"Topic\")\n        info[\"Name\"] = info.Topic.map(self.topic_labels_)\n\n        # Custom label\n        if self.custom_labels_ is not None:\n            if len(self.custom_labels_) == len(info):\n                labels = {topic - self._outliers: label for topic, label in enumerate(self.custom_labels_)}\n                info[\"CustomName\"] = info[\"Topic\"].map(labels)\n\n        # Main Keywords\n        values = {topic: list(list(zip(*values))[0]) for topic, values in self.topic_representations_.items()}\n        info[\"Representation\"] = info[\"Topic\"].map(values)\n\n        # Extract all topic aspects\n        if self.topic_aspects_:\n            for aspect, values in self.topic_aspects_.items():\n                if isinstance(list(values.values())[-1], list):\n                    if isinstance(list(values.values())[-1][0], tuple) or isinstance(list(values.values())[-1][0], list):\n                        values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()}\n                    elif isinstance(list(values.values())[-1][0], str):\n                        values = {topic: \" \".join(value).strip() for topic, value in values.items()}\n                info[aspect] = info[\"Topic\"].map(values)\n\n        # Representative Docs / Images\n        if self.representative_docs_ is not None:\n         
   info[\"Representative_Docs\"] = info[\"Topic\"].map(self.representative_docs_)\n        if self.representative_images_ is not None:\n            info[\"Representative_Images\"] = info[\"Topic\"].map(self.representative_images_)\n\n        # Select specific topic to return\n        if topic is not None:\n            info = info.loc[info.Topic == topic, :]\n\n        return info.reset_index(drop=True)\n\n    def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:\n        \"\"\" Return the size of topics (descending order)\n\n        Arguments:\n            topic: A specific topic for which you want the frequency\n\n        Returns:\n            Either the frequency of a single topic or dataframe with\n            the frequencies of all topics\n\n        Examples:\n\n        To extract the frequency of all topics:\n\n        ```python\n        frequency = topic_model.get_topic_freq()\n        ```\n\n        To get the frequency of a single topic:\n\n        ```python\n        frequency = topic_model.get_topic_freq(12)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        if isinstance(topic, int):\n            return self.topic_sizes_[topic]\n        else:\n            return pd.DataFrame(self.topic_sizes_.items(), columns=['Topic', 'Count']).sort_values(\"Count\",\n                                                                                                   ascending=False)\n\n    def get_document_info(self,\n                          docs: List[str],\n                          df: pd.DataFrame = None,\n                          metadata: Mapping[str, Any] = None) -> pd.DataFrame:\n        \"\"\" Get information about the documents on which the topic was trained\n        including the documents themselves, their respective topics, the name\n        of each topic, the top n words of each topic, whether it is a\n        representative document, and probability of the clustering if the cluster\n        model supports it.\n\n        There are also options to include other meta data, such as the topic\n        distributions or the x and y coordinates of the reduced embeddings.\n\n        Arguments:\n            docs: The documents on which the topic model was trained.\n            df: A dataframe containing the metadata and the documents on which\n                the topic model was originally trained on.\n            metadata: A dictionary with meta data for each document in the form\n                      of column name (key) and the respective values (value).\n\n        Returns:\n            document_info: A dataframe with several statistics regarding\n                           the documents on which the topic model was trained.\n\n        Usage:\n\n        To get the document info, you will only need to pass the documents on which\n        the topic model was trained:\n\n        ```python\n        document_info = topic_model.get_document_info(docs)\n        ```\n\n        There are additionally options to include meta data, such as the topic\n        distributions. 
Moreover, we can pass the original dataframe that contains\n        the documents and extend it with the information retrieved from BERTopic:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n\n        # The original data in a dataframe format to include the target variable\n        data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\n        df = pd.DataFrame({\"Document\": data['data'], \"Class\": data['target']})\n\n        # Add information about the percentage of the document that relates to the topic\n        topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)\n        distributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)]\n\n        # Create our documents dataframe using the original dataframe and meta data about\n        # the topic distributions\n        document_info = topic_model.get_document_info(docs, df=df,\n                                                      metadata={\"Topic_distribution\": distributions})\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        if df is not None:\n            document_info = df.copy()\n            document_info[\"Document\"] = docs\n            document_info[\"Topic\"] = self.topics_\n        else:\n            document_info = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_})\n\n        # Add topic info through `.get_topic_info()`\n        topic_info = self.get_topic_info().drop(\"Count\", axis=1)\n        document_info = pd.merge(document_info, topic_info, on=\"Topic\", how=\"left\")\n\n        # Add top n words\n        top_n_words = {topic: \" - \".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)}\n        document_info[\"Top_n_words\"] = document_info.Topic.map(top_n_words)\n\n        # Add flat probabilities\n        if self.probabilities_ is not None:\n            if len(self.probabilities_.shape) == 1:\n                document_info[\"Probability\"] = self.probabilities_\n            else:\n                document_info[\"Probability\"] = [max(probs) if topic != -1 else 1-sum(probs)\n                                                for topic, probs in zip(self.topics_, self.probabilities_)]\n\n        # Add representative document labels\n        repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs]\n        document_info[\"Representative_document\"] = False\n        document_info.loc[document_info.Document.isin(repr_docs), \"Representative_document\"] = True\n\n        # Add custom meta data provided by the user\n        if metadata is not None:\n            for column, values in metadata.items():\n                document_info[column] = values\n        return document_info\n\n    def get_representative_docs(self, topic: int = None) -> List[str]:\n        \"\"\" Extract the best representing documents per topic.\n\n        NOTE:\n            This does not extract all documents per topic as all documents\n            are not saved within BERTopic. 
To get all documents, please\n            run the following:\n\n            ```python\n            # When you used `.fit_transform`:\n            df = pd.DataFrame({\"Document\": docs, \"Topic\": topic})\n\n            # When you used `.fit`:\n            df = pd.DataFrame({\"Document\": docs, \"Topic\": topic_model.topics_})\n            ```\n\n        Arguments:\n            topic: A specific topic for which you want\n                   the representative documents\n\n        Returns:\n            Representative documents of the chosen topic\n\n        Examples:\n\n        To extract the representative docs of all topics:\n\n        ```python\n        representative_docs = topic_model.get_representative_docs()\n        ```\n\n        To get the representative docs of a single topic:\n\n        ```python\n        representative_docs = topic_model.get_representative_docs(12)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        if isinstance(topic, int):\n            if self.representative_docs_.get(topic):\n                return self.representative_docs_[topic]\n            else:\n                return None\n        else:\n            return self.representative_docs_\n\n    @staticmethod\n    def get_topic_tree(hier_topics: pd.DataFrame,\n                       max_distance: float = None,\n                       tight_layout: bool = False) -> str:\n        \"\"\" Extract the topic tree such that it can be printed\n\n        Arguments:\n            hier_topics: A dataframe containing the structure of the topic tree.\n                         This is the output of `topic_model.hierachical_topics()`\n            max_distance: The maximum distance between two topics. This value is\n                          based on the Distance column in `hier_topics`.\n            tight_layout: Whether to use a tight layout (narrow width) for\n                          easier readability if you have hundreds of topics.\n\n        Returns:\n            A tree that has the following structure when printed:\n                .\n                .\n                \u2514\u2500health_medical_disease_patients_hiv\n                    \u251c\u2500patients_medical_disease_candida_health\n                    \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n                    \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n                    \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n                    \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n                    \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n\n            The blocks (\u25a0) indicate that the topic is one you can directly access\n            from `topic_model.get_topic`. 
In other words, they are the original un-grouped topics.\n\n        Examples:\n\n        ```python\n        # Train model\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n        # Print topic tree\n        tree = topic_model.get_topic_tree(hierarchical_topics)\n        print(tree)\n        ```\n        \"\"\"\n        width = 1 if tight_layout else 4\n        if max_distance is None:\n            max_distance = hier_topics.Distance.max() + 1\n\n        max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1\n\n        # Extract mapping from ID to name\n        topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name))\n        topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name)))\n        topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()}\n\n        # Create tree\n        tree = {str(row[1].Parent_ID): [str(row[1].Child_Left_ID), str(row[1].Child_Right_ID)]\n                for row in hier_topics.iterrows()}\n\n        def get_tree(start, tree):\n            \"\"\" Based on: https://stackoverflow.com/a/51920869/10532563 \"\"\"\n\n            def _tree(to_print, start, parent, tree, grandpa=None, indent=\"\"):\n\n                # Get distance between merged topics\n                distance = hier_topics.loc[(hier_topics.Child_Left_ID == parent) |\n                                           (hier_topics.Child_Right_ID == parent), \"Distance\"]\n                distance = distance.values[0] if len(distance) > 0 else 10\n\n                if parent != start:\n                    if grandpa is None:\n                        to_print += topic_to_name[parent]\n                    else:\n                        if int(parent) <= max_original_topic:\n\n                            # Do not append topic ID if they are not merged\n                            if distance < max_distance:\n                                to_print += \"\u25a0\u2500\u2500\" + topic_to_name[parent] + f\" \u2500\u2500 Topic: {parent}\" + \"\\n\"\n                            else:\n                                to_print += \"O \\n\"\n                        else:\n                            to_print += topic_to_name[parent] + \"\\n\"\n\n                if parent not in tree:\n                    return to_print\n\n                for child in tree[parent][:-1]:\n                    to_print += indent + \"\u251c\" + \"\u2500\"\n                    to_print = _tree(to_print, start, child, tree, parent, indent + \"\u2502\" + \" \" * width)\n\n                child = tree[parent][-1]\n                to_print += indent + \"\u2514\" + \"\u2500\"\n                to_print = _tree(to_print, start, child, tree, parent, indent + \" \" * (width+1))\n\n                return to_print\n\n            to_print = \".\" + \"\\n\"\n            to_print = _tree(to_print, start, start, tree)\n            return to_print\n\n        start = str(hier_topics.Parent_ID.astype(int).max())\n        return get_tree(start, tree)\n\n    def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None:\n        \"\"\" Set custom topic labels in your fitted BERTopic model\n\n        Arguments:\n            topic_labels: If a list of topic labels, it should contain the same number\n                          of labels as there are topics. 
This must be ordered\n                          from the topic with the lowest ID to the highest ID,\n                          including topic -1 if it exists.\n                          If a dictionary of `topic ID`: `topic_label`, it can have\n                          any number of topics as it will only map the topics found\n                          in the dictionary.\n\n        Examples:\n\n        First, we define our topic labels with `.generate_topic_labels` in which\n        we can customize our topic labels:\n\n        ```python\n        topic_labels = topic_model.generate_topic_labels(nr_words=2,\n                                                    topic_prefix=True,\n                                                    word_length=10,\n                                                    separator=\", \")\n        ```\n\n        Then, we pass these `topic_labels` to our topic model which\n        can be accessed at any time with `.custom_labels_`:\n\n        ```python\n        topic_model.set_topic_labels(topic_labels)\n        topic_model.custom_labels_\n        ```\n\n        You might want to change only a few topic labels instead of all of them.\n        To do so, you can pass a dictionary where the keys are the topic IDs and\n        the values are the topic labels:\n\n        ```python\n        topic_model.set_topic_labels({0: \"Space\", 1: \"Sports\", 2: \"Medicine\"})\n        topic_model.custom_labels_\n        ```\n        \"\"\"\n        unique_topics = sorted(set(self.topics_))\n\n        if isinstance(topic_labels, dict):\n            if self.custom_labels_ is not None:\n                original_labels = {topic: label for topic, label in zip(unique_topics, self.custom_labels_)}\n            else:\n                info = self.get_topic_info()\n                original_labels = dict(zip(info.Topic, info.Name))\n            custom_labels = [topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] for topic in unique_topics]\n\n        elif isinstance(topic_labels, list):\n            if len(topic_labels) == len(unique_topics):\n                custom_labels = topic_labels\n            else:\n                raise ValueError(\"Make sure that `topic_labels` contains the same number \"\n                                 \"of labels as there are topics.\")\n\n        self.custom_labels_ = custom_labels\n\n    def generate_topic_labels(self,\n                              nr_words: int = 3,\n                              topic_prefix: bool = True,\n                              word_length: int = None,\n                              separator: str = \"_\",\n                              aspect: str = None) -> List[str]:\n        \"\"\" Get labels for each topic in a user-defined format\n\n        Arguments:\n            nr_words: Top `n` words per topic to use\n            topic_prefix: Whether to use the topic ID as a prefix.\n                          If set to True, the topic ID will be separated\n                          using the `separator`\n            word_length: The maximum length of each word in the topic label.\n                         Some words might be relatively long and setting this\n                         value helps to make sure that all labels have relatively\n                         similar lengths.\n            separator: The string with which the words and topic prefix will be\n                       separated. 
Underscores are the default but a nice alternative\n                       is `\", \"`.\n            aspect: The aspect from which to generate topic labels\n\n        Returns:\n            topic_labels: A list of topic labels sorted from the lowest topic ID to the highest.\n                          If the topic model was trained using HDBSCAN, the lowest topic ID is -1,\n                          otherwise it is 0.\n\n        Examples:\n\n        To create our custom topic labels, usage is rather straightforward:\n\n        ```python\n        topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=\", \")\n        ```\n        \"\"\"\n        unique_topics = sorted(set(self.topics_))\n\n        topic_labels = []\n        for topic in unique_topics:\n            if aspect:\n                words, _ = zip(*self.topic_aspects_[aspect][topic])\n            else:\n                words, _ = zip(*self.get_topic(topic))\n\n            if word_length:\n                words = [word[:word_length] for word in words][:nr_words]\n            else:\n                words = list(words)[:nr_words]\n\n            if topic_prefix:\n                topic_label = f\"{topic}{separator}\" + separator.join(words)\n            else:\n                topic_label = separator.join(words)\n\n            topic_labels.append(topic_label)\n\n        return topic_labels\n\n    def merge_topics(self,\n                     docs: List[str],\n                     topics_to_merge: List[Union[Iterable[int], int]],\n                     images: List[str] = None) -> None:\n        \"\"\"\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            topics_to_merge: Either a list of topics or a list of list of topics\n                             to merge. 
For example:\n                                [1, 2, 3] will merge topics 1, 2 and 3\n                                [[1, 2], [3, 4]] will merge topics 1 and 2, and\n                                separately merge topics 3 and 4.\n            images: A list of paths to the images used when calling either\n                    `fit` or `fit_transform`\n\n        Examples:\n\n        If you want to merge topics 1, 2, and 3:\n\n        ```python\n        topics_to_merge = [1, 2, 3]\n        topic_model.merge_topics(docs, topics_to_merge)\n        ```\n\n        or if you want to merge topics 1 and 2, and separately\n        merge topics 3 and 4:\n\n        ```python\n        topics_to_merge = [[1, 2],\n                            [3, 4]]\n        topic_model.merge_topics(docs, topics_to_merge)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Image\": images, \"ID\": range(len(docs))})\n\n        mapping = {topic: topic for topic in set(self.topics_)}\n        if isinstance(topics_to_merge[0], int):\n            for topic in sorted(topics_to_merge):\n                mapping[topic] = topics_to_merge[0]\n        elif isinstance(topics_to_merge[0], Iterable):\n            for topic_group in sorted(topics_to_merge):\n                for topic in topic_group:\n                    mapping[topic] = topic_group[0]\n        else:\n            raise ValueError(\"Make sure that `topics_to_merge` is either \"\n                             \"a list of topics or a list of list of topics.\")\n\n        # Track mappings and sizes of topics for merging topic embeddings\n        mappings = defaultdict(list)\n        for key, val in sorted(mapping.items()):\n            mappings[val].append(key)\n        mappings = {topic_from:\n                    {\"topics_to\": topics_to,\n                     \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_to]}\n                    for topic_from, topics_to in mappings.items()}\n\n        # Update topics\n        documents.Topic = documents.Topic.map(mapping)\n        self.topic_mapper_.add_mappings(mapping)\n        documents = self._sort_mappings_by_frequency(documents)\n        self._extract_topics(documents, mappings=mappings)\n        self._update_topic_size(documents)\n        self._save_representative_docs(documents)\n        self.probabilities_ = self._map_probabilities(self.probabilities_)\n\n    def reduce_topics(self,\n                      docs: List[str],\n                      nr_topics: Union[int, str] = 20,\n                      images: List[str] = None) -> None:\n        \"\"\" Reduce the number of topics to a fixed number of topics\n        or automatically.\n\n        If nr_topics is an integer, then the number of topics is reduced\n        to nr_topics using `AgglomerativeClustering` on the cosine distance matrix\n        of the topic embeddings.\n\n        If nr_topics is `\"auto\"`, then HDBSCAN is used to automatically\n        reduce the number of topics by running it on the topic embeddings.\n\n        The topics, their sizes, and representations are updated.\n\n        Arguments:\n            docs: The docs you used when calling either `fit` or `fit_transform`\n            nr_topics: The number of topics you want reduced to\n            images: A list of paths to the images used when calling either\n                    `fit` or `fit_transform`\n\n        Updates:\n            topics_ : Assigns topics to 
their merged representations.\n            probabilities_ : Assigns probabilities to their merged representations.\n\n        Examples:\n\n        You can further reduce the topics by passing the documents with their\n        topics and probabilities (if they were calculated):\n\n        ```python\n        topic_model.reduce_topics(docs, nr_topics=30)\n        ```\n\n        You can then access the updated topics and probabilities with:\n\n        ```python\n        topics = topic_model.topics_\n        probabilities = topic_model.probabilities_\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n\n        self.nr_topics = nr_topics\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Image\": images, \"ID\": range(len(docs))})\n\n        # Reduce number of topics\n        documents = self._reduce_topics(documents)\n        self._merged_topics = None\n        self._save_representative_docs(documents)\n        self.probabilities_ = self._map_probabilities(self.probabilities_)\n\n        return self\n\n    def reduce_outliers(self,\n                        documents: List[str],\n                        topics: List[int],\n                        images: List[str] = None,\n                        strategy: str = \"distributions\",\n                        probabilities: np.ndarray = None,\n                        threshold: float = 0,\n                        embeddings: np.ndarray = None,\n                        distributions_params: Mapping[str, Any] = {}) -> List[int]:\n        \"\"\" Reduce outliers by merging them with their nearest topic according\n        to one of several strategies.\n\n        When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created\n        that do not fall within any of the created topics. These are labeled as -1.\n        This function allows the user to match outlier documents with their nearest topic\n        using one of the following strategies using the `strategy` parameter:\n            * \"probabilities\"\n                This uses the soft-clustering as performed by HDBSCAN to find the\n                best matching topic for each outlier document. To use this, make\n                sure to calculate the `probabilities` beforehand by instantiating\n                BERTopic with `calculate_probabilities=True`.\n            * \"distributions\"\n                Use the topic distributions, as calculated with `.approximate_distribution`\n                to find the most frequent topic in each outlier document. 
You can use the\n                `distributions_params` variable to tweak the parameters of\n                `.approximate_distribution`.\n            * \"c-tf-idf\"\n                Calculate the c-TF-IDF representation for each outlier document and\n                find the best matching c-TF-IDF topic representation using\n                cosine similarity.\n            * \"embeddings\"\n                Using the embeddings of each outlier documents, find the best\n                matching topic embedding using cosine similarity.\n\n        Arguments:\n            documents: A list of documents for which we reduce or remove the outliers.\n            topics: The topics that correspond to the documents\n            images: A list of paths to the images used when calling either\n                    `fit` or `fit_transform`\n            strategy: The strategy used for reducing outliers.\n                    Options:\n                        * \"probabilities\"\n                            This uses the soft-clustering as performed by HDBSCAN\n                            to find the best matching topic for each outlier document.\n\n                        * \"distributions\"\n                            Use the topic distributions, as calculated with `.approximate_distribution`\n                            to find the most frequent topic in each outlier document.\n\n                        * \"c-tf-idf\"\n                            Calculate the c-TF-IDF representation for outlier documents and\n                            find the best matching c-TF-IDF topic representation.\n\n                        * \"embeddings\"\n                            Calculate the embeddings for outlier documents and\n                            find the best matching topic embedding.\n            threshold: The threshold for assigning topics to outlier documents. 
This value\n                       represents the minimum probability when `strategy=\"probabilities\"`.\n                       For all other strategies, it represents the minimum similarity.\n            embeddings: The pre-computed embeddings to be used when `strategy=\"embeddings\"`.\n                        If this is None, then it will compute the embeddings for the outlier documents.\n            distributions_params: The parameters used in `.approximate_distribution` when using\n                                  the strategy `\"distributions\"`.\n\n        Returns:\n            new_topics: The updated topics\n\n        Examples:\n\n        The default setting uses the `\"distributions\"` strategy:\n\n        ```python\n        new_topics = topic_model.reduce_outliers(docs, topics)\n        ```\n\n        When you use the `\"probabilities\"` strategy, make sure to also pass the probabilities\n        as generated through HDBSCAN:\n\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic(calculate_probabilities=True)\n        topics, probs = topic_model.fit_transform(docs)\n\n        new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n        ```\n        \"\"\"\n        if images is not None:\n            strategy = \"embeddings\"\n\n        # Check correct use of parameters\n        if strategy.lower() == \"probabilities\" and probabilities is None:\n            raise ValueError(\"Make sure to pass in `probabilities` in order to use the probabilities strategy\")\n\n        # Reduce outliers by extracting most likely topics through the topic-term probability matrix\n        if strategy.lower() == \"probabilities\":\n            new_topics = [np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic\n                          for topic, prob in zip(topics, probabilities)]\n\n        # Reduce outliers by extracting most frequent topics through calculation of Topic Distributions\n        elif strategy.lower() == \"distributions\":\n            outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n            outlier_docs = [documents[index] for index in outlier_ids]\n            topic_distr, _ = self.approximate_distribution(outlier_docs, min_similarity=threshold, **distributions_params)\n            outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr])\n            new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n        # Reduce outliers by finding the most similar c-TF-IDF representations\n        elif strategy.lower() == \"c-tf-idf\":\n            outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n            outlier_docs = [documents[index] for index in outlier_ids]\n\n            # Calculate c-TF-IDF of outlier documents with all topics\n            bow_doc = self.vectorizer_model.transform(outlier_docs)\n            c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n            similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:])\n\n            # Update topics\n            similarity[similarity < threshold] = 0\n            outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n            new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n        # Reduce outliers by finding the most similar topic embeddings\n        elif strategy.lower() == \"embeddings\":\n            if 
self.embedding_model is None and embeddings is None:\n                raise ValueError(\"To use this strategy, you will need to pass a model to `embedding_model` \"\n                                 \"when instantiating BERTopic.\")\n            outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n            if images is not None:\n                outlier_docs = [images[index] for index in outlier_ids]\n            else:\n                outlier_docs = [documents[index] for index in outlier_ids]\n\n            # Extract or calculate embeddings for outlier documents\n            if embeddings is not None:\n                outlier_embeddings = np.array([embeddings[index] for index in outlier_ids])\n            elif images is not None:\n                outlier_images = [images[index] for index in outlier_ids]\n                outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose)\n            else:\n                outlier_embeddings = self.embedding_model.embed_documents(outlier_docs)\n            similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers:])\n\n            # Update topics\n            similarity[similarity < threshold] = 0\n            outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n            new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n        return new_topics\n\n    def visualize_topics(self,\n                         topics: List[int] = None,\n                         top_n_topics: int = None,\n                         custom_labels: bool = False,\n                         title: str = \"<b>Intertopic Distance Map</b>\",\n                         width: int = 650,\n                         height: int = 650) -> go.Figure:\n        \"\"\" Visualize topics, their sizes, and their corresponding words\n\n        This visualization is highly inspired by LDAvis, a great visualization\n        technique typically reserved for LDA.\n\n        Arguments:\n            topics: A selection of topics to visualize.\n                    Not to be confused with the topics that you get from `.fit_transform`.\n                    For example, if you want to visualize only topics 1 through 5:\n                    `topics = [1, 2, 3, 4, 5]`.\n            top_n_topics: Only select the top n most frequent topics\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_topics()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_topics()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_topics(self,\n                                         topics=topics,\n                                         top_n_topics=top_n_topics,\n                                         custom_labels=custom_labels,\n                                         title=title,\n                                         width=width,\n                                         height=height)\n\n    def visualize_documents(self,\n                            docs: 
List[str],\n                            topics: List[int] = None,\n                            embeddings: np.ndarray = None,\n                            reduced_embeddings: np.ndarray = None,\n                            sample: float = None,\n                            hide_annotations: bool = False,\n                            hide_document_hover: bool = False,\n                            custom_labels: bool = False,\n                            title: str = \"<b>Documents and Topics</b>\",\n                            width: int = 1200,\n                            height: int = 750) -> go.Figure:\n        \"\"\" Visualize documents and their topics in 2D\n\n        Arguments:\n            topic_model: A fitted BERTopic instance.\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            topics: A selection of topics to visualize.\n                    Not to be confused with the topics that you get from `.fit_transform`.\n                    For example, if you want to visualize only topics 1 through 5:\n                    `topics = [1, 2, 3, 4, 5]`.\n            embeddings: The embeddings of all documents in `docs`.\n            reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n            sample: The percentage of documents in each topic that you would like to keep.\n                    Value can be between 0 and 1. Setting this value to, for example,\n                    0.1 (10% of documents in each topic) makes it easier to visualize\n                    millions of documents as a subset is chosen.\n            hide_annotations: Hide the names of the traces on top of each cluster.\n            hide_document_hover: Hide the content of the documents when hovering over\n                                specific points. 
Helps to speed up generation of visualization.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_documents(docs)\n        ```\n\n        Do note that this re-calculates the embeddings and reduces them to 2D.\n        The advised and preferred pipeline for using this function is as follows:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n        from bertopic import BERTopic\n        from umap import UMAP\n\n        # Prepare embeddings\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n        # Train BERTopic\n        topic_model = BERTopic().fit(docs, embeddings)\n\n        # Reduce dimensionality of embeddings, this step is optional\n        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n        # Run the visualization with the original embeddings\n        topic_model.visualize_documents(docs, embeddings=embeddings)\n\n        # Or, if you have reduced the original embeddings already:\n        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n        fig.write_html(\"path/to/file.html\")\n        ```\n\n        <iframe src=\"../getting_started/visualization/documents.html\"\n        style=\"width:1000px; height: 800px; border: 0px;\"\"></iframe>\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        return plotting.visualize_documents(self,\n                                            docs=docs,\n                                            topics=topics,\n                                            embeddings=embeddings,\n                                            reduced_embeddings=reduced_embeddings,\n                                            sample=sample,\n                                            hide_annotations=hide_annotations,\n                                            hide_document_hover=hide_document_hover,\n                                            custom_labels=custom_labels,\n                                            title=title,\n                                            width=width,\n                                            height=height)\n\n    def visualize_document_datamap(self,\n                                   docs: List[str],\n                                   topics: List[int] = None,\n                                   embeddings: np.ndarray = None,\n                                   reduced_embeddings: np.ndarray = None,\n                                   custom_labels: Union[bool, str] = False,\n                                   title: str = \"Documents and Topics\",\n                                   sub_title: Union[str, None] = None,\n                                   width: int = 1200,\n          
                         height: int = 1200,\n                                   **datamap_kwds):\n        \"\"\" Visualize documents and their topics in 2D as a static plot for publication using\n        DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best\n        to use a sufficiently large `min_topic_size` or set `nr_topics` when building the model.\n\n        Arguments:\n            topic_model:  A fitted BERTopic instance.\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            embeddings:  The embeddings of all documents in `docs`.\n            reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.\n            custom_labels:  If bool, whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n                           If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n            title: Title of the plot.\n            sub_title: Sub-title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n            **datamap_kwds:  All further keyword args will be passed on to DataMapPlot's\n                             `create_plot` function. See the DataMapPlot documentation\n                             for more details.\n\n        Returns:\n            figure: A Matplotlib Figure object.\n\n        Examples:\n\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_document_datamap(docs)\n        ```\n\n        Do note that this re-calculates the embeddings and reduces them to 2D.\n        The advised and preferred pipeline for using this function is as follows:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n        from bertopic import BERTopic\n        from umap import UMAP\n\n        # Prepare embeddings\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n        # Train BERTopic\n        topic_model = BERTopic(min_topic_size=36).fit(docs, embeddings)\n\n        # Reduce dimensionality of embeddings, this step is optional\n        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n        # Run the visualization with the original embeddings\n        topic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n        # Or, if you have reduced the original embeddings already:\n        topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n        fig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        return plotting.visualize_document_datamap(self,\n                                                   docs,\n                                                   topics,\n                                                   embeddings,\n                                                   reduced_embeddings,\n                                                
   custom_labels,\n                                                   title,\n                                                   sub_title,\n                                                   width,\n                                                   height,\n                                                   **datamap_kwds)\n    def visualize_hierarchical_documents(self,\n                                         docs: List[str],\n                                         hierarchical_topics: pd.DataFrame,\n                                         topics: List[int] = None,\n                                         embeddings: np.ndarray = None,\n                                         reduced_embeddings: np.ndarray = None,\n                                         sample: Union[float, int] = None,\n                                         hide_annotations: bool = False,\n                                         hide_document_hover: bool = True,\n                                         nr_levels: int = 10,\n                                         level_scale: str = 'linear',\n                                         custom_labels: bool = False,\n                                         title: str = \"<b>Hierarchical Documents and Topics</b>\",\n                                         width: int = 1200,\n                                         height: int = 750) -> go.Figure:\n        \"\"\" Visualize documents and their topics in 2D at different levels of hierarchy\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            hierarchical_topics: A dataframe that contains a hierarchy of topics\n                                represented by their parents and their children\n            topics: A selection of topics to visualize.\n                    Not to be confused with the topics that you get from `.fit_transform`.\n                    For example, if you want to visualize only topics 1 through 5:\n                    `topics = [1, 2, 3, 4, 5]`.\n            embeddings: The embeddings of all documents in `docs`.\n            reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n            sample: The percentage of documents in each topic that you would like to keep.\n                    Value can be between 0 and 1. Setting this value to, for example,\n                    0.1 (10% of documents in each topic) makes it easier to visualize\n                    millions of documents as a subset is chosen.\n            hide_annotations: Hide the names of the traces on top of each cluster.\n            hide_document_hover: Hide the content of the documents when hovering over\n                                 specific points. Helps to speed up generation of visualizations.\n            nr_levels: The number of levels to be visualized in the hierarchy. First, the distances\n                       in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with\n                       equal length. Then, for each list of distances, the merged topics, that have \n                       a distance less or equal to the maximum distance of the selected list of distances, are selected.\n                       NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to\n                       the length of `hierarchical_topics`.\n            level_scale: Whether to apply a linear or logarithmic ('log') scale levels of the distance\n                         vector. 
Linear scaling will perform an equal number of merges at each level\n                         while logarithmic scaling will perform more mergers in earlier levels to\n                         provide more resolution at higher levels (this can be used for when the number\n                         of topics is large).\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n                           NOTE: Custom labels are only generated for the original\n                           un-merged topics.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)\n        ```\n\n        Do note that this re-calculates the embeddings and reduces them to 2D.\n        The advised and preferred pipeline for using this function is as follows:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n        from bertopic import BERTopic\n        from umap import UMAP\n\n        # Prepare embeddings\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n        # Train BERTopic and extract hierarchical topics\n        topic_model = BERTopic().fit(docs, embeddings)\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n        # Reduce dimensionality of embeddings, this step is optional\n        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n        # Run the visualization with the original embeddings\n        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n        # Or, if you have reduced the original embeddings already:\n        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n        fig.write_html(\"path/to/file.html\")\n        ```\n\n        <iframe src=\"../getting_started/visualization/hierarchical_documents.html\"\n        style=\"width:1000px; height: 770px; border: 0px;\"\"></iframe>\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        return plotting.visualize_hierarchical_documents(self,\n                                                         docs=docs,\n                                                         hierarchical_topics=hierarchical_topics,\n                                                         topics=topics,\n                                                         embeddings=embeddings,\n                                                         reduced_embeddings=reduced_embeddings,\n                                                         sample=sample,\n                                                         hide_annotations=hide_annotations,\n                                                         
hide_document_hover=hide_document_hover,\n                                                         nr_levels=nr_levels,\n                                                         level_scale=level_scale,\n                                                         custom_labels=custom_labels,\n                                                         title=title,\n                                                         width=width,\n                                                         height=height)\n\n    def visualize_term_rank(self,\n                            topics: List[int] = None,\n                            log_scale: bool = False,\n                            custom_labels: bool = False,\n                            title: str = \"<b>Term score decline per Topic</b>\",\n                            width: int = 800,\n                            height: int = 500) -> go.Figure:\n        \"\"\" Visualize the ranks of all terms across all topics\n\n        Each topic is represented by a set of words. These words, however,\n        do not all equally represent the topic. This visualization shows\n        how many words are needed to represent a topic and at which point\n        the beneficial effect of adding words starts to decline.\n\n        Arguments:\n            topics: A selection of topics to visualize. These will be colored\n                    red where all others will be colored black.\n            log_scale: Whether to represent the ranking on a log scale\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n\n        To visualize the ranks of all words across\n        all topics simply run:\n\n        ```python\n        topic_model.visualize_term_rank()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_term_rank()\n        fig.write_html(\"path/to/file.html\")\n        ```\n\n        Reference:\n\n        This visualization was heavily inspired by the\n        \"Term Probability Decline\" visualization found in an\n        analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).\n        Reference to that specific analysis can be found\n        [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_term_rank(self,\n                                            topics=topics,\n                                            log_scale=log_scale,\n                                            custom_labels=custom_labels,\n                                            title=title,\n                                            width=width,\n                                            height=height)\n\n    def visualize_topics_over_time(self,\n                                   topics_over_time: pd.DataFrame,\n                                   top_n_topics: int = None,\n                                   topics: List[int] = None,\n                                   normalize_frequency: bool = False,\n                                   custom_labels: bool = False,\n                                   title: str = \"<b>Topics over Time</b>\",\n                                   width: int = 1250,\n     
                              height: int = 450) -> go.Figure:\n        \"\"\" Visualize topics over time\n\n        Arguments:\n            topics_over_time: The topics you would like to be visualized with the\n                              corresponding topic representation\n            top_n_topics: To visualize the most frequent topics instead of all\n            topics: Select which topics you would like to be visualized\n            normalize_frequency: Whether to normalize each topic's frequency individually\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            A plotly.graph_objects.Figure including all traces\n\n        Examples:\n\n        To visualize the topics over time, simply run:\n\n        ```python\n        topics_over_time = topic_model.topics_over_time(docs, timestamps)\n        topic_model.visualize_topics_over_time(topics_over_time)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_topics_over_time(topics_over_time)\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_topics_over_time(self,\n                                                   topics_over_time=topics_over_time,\n                                                   top_n_topics=top_n_topics,\n                                                   topics=topics,\n                                                   normalize_frequency=normalize_frequency,\n                                                   custom_labels=custom_labels,\n                                                   title=title,\n                                                   width=width,\n                                                   height=height)\n\n    def visualize_topics_per_class(self,\n                                   topics_per_class: pd.DataFrame,\n                                   top_n_topics: int = 10,\n                                   topics: List[int] = None,\n                                   normalize_frequency: bool = False,\n                                   custom_labels: bool = False,\n                                   title: str = \"<b>Topics per Class</b>\",\n                                   width: int = 1250,\n                                   height: int = 900) -> go.Figure:\n        \"\"\" Visualize topics per class\n\n        Arguments:\n            topics_per_class: The topics you would like to be visualized with the\n                              corresponding topic representation\n            top_n_topics: To visualize the most frequent topics instead of all\n            topics: Select which topics you would like to be visualized\n            normalize_frequency: Whether to normalize each topic's frequency individually\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            A plotly.graph_objects.Figure including all traces\n\n        Examples:\n\n        To visualize the topics per class, simply run:\n\n        ```python\n       
 topics_per_class = topic_model.topics_per_class(docs, classes)\n        topic_model.visualize_topics_per_class(topics_per_class)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_topics_per_class(topics_per_class)\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_topics_per_class(self,\n                                                   topics_per_class=topics_per_class,\n                                                   top_n_topics=top_n_topics,\n                                                   topics=topics,\n                                                   normalize_frequency=normalize_frequency,\n                                                   custom_labels=custom_labels,\n                                                   title=title,\n                                                   width=width,\n                                                   height=height)\n\n    def visualize_distribution(self,\n                               probabilities: np.ndarray,\n                               min_probability: float = 0.015,\n                               custom_labels: bool = False,\n                               title: str = \"<b>Topic Probability Distribution</b>\",\n                               width: int = 800,\n                               height: int = 600) -> go.Figure:\n        \"\"\" Visualize the distribution of topic probabilities\n\n        Arguments:\n            probabilities: An array of probability scores\n            min_probability: The minimum probability score to visualize.\n                             All others are ignored.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n\n        Make sure to fit the model before and only input the\n        probabilities of a single document:\n\n        ```python\n        topic_model.visualize_distribution(topic_model.probabilities_[0])\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_distribution(topic_model.probabilities_[0])\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_distribution(self,\n                                               probabilities=probabilities,\n                                               min_probability=min_probability,\n                                               custom_labels=custom_labels,\n                                               title=title,\n                                               width=width,\n                                               height=height)\n\n    def visualize_approximate_distribution(self,\n                                           document: str,\n                                           topic_token_distribution: np.ndarray,\n                                           normalize: bool = False):\n        \"\"\" Visualize the topic distribution calculated by `.approximate_topic_distribution`\n        on a token level. Thereby indicating the extent to which a certain word or phrase belongs\n        to a specific topic. 
The assumption here is that a single word can belong to multiple\n        similar topics and as such can give information about the broader set of topics within\n        a single document.\n\n        Arguments:\n            topic_model: A fitted BERTopic instance.\n            document: The document for which you want to visualize\n                      the approximated topic distribution.\n            topic_token_distribution: The topic-token distribution of the document as\n                                      extracted by `.approximate_topic_distribution`\n            normalize: Whether to normalize, between 0 and 1 (summing up to 1), the\n                       topic distribution values.\n\n        Returns:\n            df: A stylized dataframe indicating the best fitting topics\n                for each token.\n\n        Examples:\n\n        ```python\n        # Calculate the topic distributions on a token level\n        # Note that we need to have `calculate_token_level=True`\n        topic_distr, topic_token_distr = topic_model.approximate_distribution(\n                docs, calculate_token_level=True\n        )\n\n        # Visualize the approximated topic distributions\n        df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])\n        df\n        ```\n\n        To revert this stylized dataframe back to a regular dataframe,\n        you can run the following:\n\n        ```python\n        df.data.columns = [column.strip() for column in df.data.columns]\n        df = df.data\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_approximate_distribution(self,\n                                                           document=document,\n                                                           topic_token_distribution=topic_token_distribution,\n                                                           normalize=normalize)\n\n    def visualize_hierarchy(self,\n                            orientation: str = \"left\",\n                            topics: List[int] = None,\n                            top_n_topics: int = None,\n                            custom_labels: bool = False,\n                            title: str = \"<b>Hierarchical Clustering</b>\",\n                            width: int = 1000,\n                            height: int = 600,\n                            hierarchical_topics: pd.DataFrame = None,\n                            linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n                            distance_function: Callable[[csr_matrix], csr_matrix] = None,\n                            color_threshold: int = 1) -> go.Figure:\n        \"\"\" Visualize a hierarchical structure of the topics\n\n        A ward linkage function is used to perform the\n        hierarchical clustering based on the cosine distance\n        matrix between topic embeddings.\n\n        Arguments:\n            topic_model: A fitted BERTopic instance.\n            orientation: The orientation of the figure.\n                         Either 'left' or 'bottom'\n            topics: A selection of topics to visualize\n            top_n_topics: Only select the top n most frequent topics\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n                           NOTE: Custom labels are only generated for the original\n                           un-merged topics.\n            title: Title of the plot.\n  
          width: The width of the figure. Only works if orientation is set to 'left'\n            height: The height of the figure. Only works if orientation is set to 'bottom'\n            hierarchical_topics: A dataframe that contains a hierarchy of topics\n                                 represented by their parents and their children.\n                                 NOTE: The hierarchical topic names are only visualized\n                                 if both `topics` and `top_n_topics` are not set.\n            linkage_function: The linkage function to use. Default is:\n                              `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n                              NOTE: Make sure to use the same `linkage_function` as used\n                              in `topic_model.hierarchical_topics`.\n            distance_function: The distance function to use on the c-TF-IDF matrix. Default is:\n                               `lambda x: 1 - cosine_similarity(x)`\n                               NOTE: Make sure to use the same `distance_function` as used\n                               in `topic_model.hierarchical_topics`.\n            color_threshold: Value at which the separation of clusters will be made which\n                             will result in different colors for different clusters.\n                             A higher value will typically lead to less colored clusters.\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n\n        To visualize the hierarchical structure of\n        topics simply run:\n\n        ```python\n        topic_model.visualize_hierarchy()\n        ```\n\n        If you also want the labels of hierarchical topics visualized,\n        run the following:\n\n        ```python\n        # Extract hierarchical topics and their representations\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n        # Visualize these representations\n        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n        ```\n\n        If you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_hierarchy()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        <iframe src=\"../getting_started/visualization/hierarchy.html\"\n        style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_hierarchy(self,\n                                            orientation=orientation,\n                                            topics=topics,\n                                            top_n_topics=top_n_topics,\n                                            custom_labels=custom_labels,\n                                            title=title,\n                                            width=width,\n                                            height=height,\n                                            hierarchical_topics=hierarchical_topics,\n                                            linkage_function=linkage_function,\n                                            distance_function=distance_function,\n                                            color_threshold=color_threshold\n                                            )\n\n    def visualize_heatmap(self,\n                          topics: List[int] = None,\n                          top_n_topics: int = None,\n                          n_clusters: int = None,\n                          custom_labels: bool = 
False,\n                          title: str = \"<b>Similarity Matrix</b>\",\n                          width: int = 800,\n                          height: int = 800) -> go.Figure:\n        \"\"\" Visualize a heatmap of the topic's similarity matrix\n\n        Based on the cosine similarity matrix between topic embeddings,\n        a heatmap is created showing the similarity between topics.\n\n        Arguments:\n            topics: A selection of topics to visualize.\n            top_n_topics: Only select the top n most frequent topics.\n            n_clusters: Create n clusters and order the similarity\n                        matrix by those clusters.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n\n        To visualize the similarity matrix of\n        topics simply run:\n\n        ```python\n        topic_model.visualize_heatmap()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_heatmap()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_heatmap(self,\n                                          topics=topics,\n                                          top_n_topics=top_n_topics,\n                                          n_clusters=n_clusters,\n                                          custom_labels=custom_labels,\n                                          title=title,\n                                          width=width,\n                                          height=height)\n\n    def visualize_barchart(self,\n                           topics: List[int] = None,\n                           top_n_topics: int = 8,\n                           n_words: int = 5,\n                           custom_labels: bool = False,\n                           title: str = \"Topic Word Scores\",\n                           width: int = 250,\n                           height: int = 250,\n                           autoscale: bool=False) -> go.Figure:\n        \"\"\" Visualize a barchart of selected topics\n\n        Arguments:\n            topics: A selection of topics to visualize.\n            top_n_topics: Only select the top n most frequent topics.\n            n_words: Number of words to show in a topic\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of each figure.\n            height: The height of each figure.\n            autoscale: Whether to automatically calculate the height of the figures to fit the whole bar text\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n\n        To visualize the barchart of selected topics\n        simply run:\n\n        ```python\n        topic_model.visualize_barchart()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_barchart()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_barchart(self,\n                                       
    topics=topics,\n                                           top_n_topics=top_n_topics,\n                                           n_words=n_words,\n                                           custom_labels=custom_labels,\n                                           title=title,\n                                           width=width,\n                                           height=height,\n                                           autoscale=autoscale)\n\n    def save(self,\n             path,\n             serialization: Literal[\"safetensors\", \"pickle\", \"pytorch\"] = \"pickle\",\n             save_embedding_model: Union[bool, str] = True,\n             save_ctfidf: bool = False):\n        \"\"\" Saves the model to the specified path or folder\n\n        When saving the model, make sure to also keep track of the versions\n        of dependencies and Python used. Loading and saving the model should\n        be done using the same dependencies and Python. Moreover, models\n        saved in one version of BERTopic should not be loaded in other versions.\n\n        Arguments:\n            path: If `serialization` is 'safetensors' or `pytorch`, this is a directory.\n                  If `serialization` is `pickle`, then this is a file.\n            serialization: If `pickle`, the entire model will be pickled. If `safetensors`\n                           or `pytorch` the model will be saved without the embedding,\n                           dimensionality reduction, and clustering algorithms.\n                           This is a very efficient format and typically advised.\n            save_embedding_model: If serialization is `pickle`, then you can choose to skip\n                                  saving the embedding model. If serialization is `safetensors`\n                                  or `pytorch`, this variable can be used as a string pointing\n                                  towards a huggingface model.\n            save_ctfidf: Whether to save c-TF-IDF information if serialization is `safetensors`\n                         or `pytorch`\n\n        Examples:\n\n        To save the model in an efficient and safe format (safetensors) with c-TF-IDF information:\n\n        ```python\n        topic_model.save(\"model_dir\", serialization=\"safetensors\", save_ctfidf=True)\n        ```\n\n        If you wish to also add a pointer to the embedding model, which will be downloaded from\n        HuggingFace upon loading:\n\n        ```python\n        embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\n        topic_model.save(\"model_dir\", serialization=\"safetensors\", save_embedding_model=embedding_model)\n        ```\n\n        or if you want to save the full model with pickle:\n\n        ```python\n        topic_model.save(\"my_model\")\n        ```\n\n        NOTE: Pickle can run arbitrary code and is generally considered to be less safe than\n        safetensors.\n        \"\"\"\n        if serialization == \"pickle\":\n            logger.warning(\"When you use `pickle` to save/load a BERTopic model, \"\n                           \"please make sure that the environments in which you save \"\n                           \"and load the model are **exactly** the same. 
The version of BERTopic, \"\n                           \"its dependencies, and Python need to remain the same.\")\n\n            with open(path, 'wb') as file:\n\n                # This prevents the vectorizer from being too large in size if `min_df` was\n                # set to a value higher than 1\n                self.vectorizer_model.stop_words_ = None\n\n                if not save_embedding_model:\n                    embedding_model = self.embedding_model\n                    self.embedding_model = None\n                    joblib.dump(self, file)\n                    self.embedding_model = embedding_model\n                else:\n                    joblib.dump(self, file)\n        elif serialization == \"safetensors\" or serialization == \"pytorch\":\n\n            # Directory\n            save_directory = Path(path)\n            save_directory.mkdir(exist_ok=True, parents=True)\n\n            # Check embedding model\n            if save_embedding_model and hasattr(self.embedding_model, '_hf_model') and not isinstance(save_embedding_model, str):\n                save_embedding_model = self.embedding_model._hf_model\n            elif not save_embedding_model:\n                logger.warning(\"You are saving a BERTopic model without explicitly defining an embedding model. \"\n                               \"If you are using a sentence-transformers model or a HuggingFace model supported \"\n                               \"by sentence-transformers, please save the model by using a pointer towards that model. \"\n                               \"For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`\")\n\n            # Minimal\n            save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization)\n            save_utils.save_topics(model=self, path=save_directory / \"topics.json\")\n            save_utils.save_images(model=self, path=save_directory / \"images\")\n            save_utils.save_config(model=self, path=save_directory / 'config.json', embedding_model=save_embedding_model)\n\n            # Additional\n            if save_ctfidf:\n                save_utils.save_ctfidf(model=self, save_directory=save_directory, serialization=serialization)\n                save_utils.save_ctfidf_config(model=self, path=save_directory / 'ctfidf_config.json')\n\n    @classmethod\n    def load(cls,\n             path: str,\n             embedding_model=None):\n        \"\"\" Loads the model from the specified path or directory\n\n        Arguments:\n            path: Either load a BERTopic model from a file (`.pickle`) or a folder containing\n                  `.safetensors` or `.bin` files.\n            embedding_model: Additionally load in an embedding model if it was not saved\n                             in the BERTopic model file or directory.\n\n        Examples:\n\n        ```python\n        BERTopic.load(\"model_dir\")\n        ```\n\n        or if you did not save the embedding model:\n\n        ```python\n        BERTopic.load(\"model_dir\", embedding_model=\"all-MiniLM-L6-v2\")\n        ```\n        \"\"\"\n        file_or_dir = Path(path)\n\n        # Load from Pickle\n        if file_or_dir.is_file():\n            with open(file_or_dir, 'rb') as file:\n                if embedding_model:\n                    topic_model = joblib.load(file)\n                    topic_model.embedding_model = select_backend(embedding_model)\n                else:\n                    topic_model = joblib.load(file)\n                return 
topic_model\n\n        # Load from directory or HF\n        if file_or_dir.is_dir():\n            topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_local_files(file_or_dir)\n        elif \"/\" in str(path):\n            topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path)\n        else:\n            raise ValueError(\"Make sure to either pass a valid directory or HF model.\")\n        topic_model = _create_model_from_files(topics, params, tensors, ctfidf_tensors, ctfidf_config, images,\n                                               warn_no_backend=(embedding_model is None))\n\n        # Replace embedding model if one is specifically chosen\n        if embedding_model is not None:\n            topic_model.embedding_model = select_backend(embedding_model)\n\n        return topic_model\n\n    @classmethod\n    def merge_models(cls, models, min_similarity: float = .7, embedding_model=None):\n        \"\"\" Merge multiple pre-trained BERTopic models into a single model.\n\n        The models are merged as if they were all saved using pytorch or\n        safetensors, so a minimal version without c-TF-IDF.\n\n        To do this, we choose the first model in the list of\n        models as a baseline. Then, we check each model whether\n        they contain topics that are not in the baseline.\n        This check is based on the cosine similarity between\n        topics embeddings. If topic embeddings between two models\n        are similar, then the topic of the second model is re-assigned\n        to the first. If they are dissimilar, the topic of the second\n        model is assigned to the first.\n\n        In essence, we simply check whether sufficiently \"new\"\n        topics emerge and add them.\n\n        Arguments:\n            models: A list of fitted BERTopic models\n            min_similarity: The minimum similarity for when topics are merged.\n            embedding_model: Additionally load in an embedding model if necessary.\n\n        Returns:\n            A new BERTopic model that was created as if you were\n            loading a model from the HuggingFace Hub without c-TF-IDF\n\n        Examples:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n        # Create three separate models\n        topic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000])\n        topic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000])\n        topic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:])\n\n        # Combine all models into one\n        merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n        ```\n        \"\"\"\n        import torch\n\n        # Temporarily save model and push to HF\n        with TemporaryDirectory() as tmpdir:\n\n            # Save model weights and config.\n            all_topics, all_params, all_tensors = [], [], []\n            for index, model in enumerate(models):\n                model.save(tmpdir, serialization=\"pytorch\")\n                topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir))\n                all_topics.append(topics)\n                all_params.append(params)\n                all_tensors.append(np.array(tensors[\"topic_embeddings\"]))\n\n                # Create a base set of parameters\n                if index == 0:\n             
       merged_topics = topics\n                    merged_params = params\n                    merged_tensors = np.array(tensors[\"topic_embeddings\"])\n                    merged_topics[\"custom_labels\"] = None\n\n        for tensors, selected_topics in zip(all_tensors[1:], all_topics[1:]):\n            # Calculate similarity matrix\n            sim_matrix = cosine_similarity(tensors, merged_tensors)\n            sims = np.max(sim_matrix, axis=1)\n\n            # Extract new topics\n            new_topics = sorted([index - selected_topics[\"_outliers\"] for index, sim in enumerate(sims) if sim < min_similarity])\n            max_topic = max(set(merged_topics[\"topics\"]))\n\n            # Merge Topic Representations\n            new_topics_dict = {}\n            for new_topic in new_topics:\n                if new_topic != -1:\n                    max_topic += 1\n                    new_topics_dict[new_topic] = max_topic\n                    merged_topics[\"topic_representations\"][str(max_topic)] = selected_topics[\"topic_representations\"][str(new_topic)]\n                    merged_topics[\"topic_labels\"][str(max_topic)] = selected_topics[\"topic_labels\"][str(new_topic)]\n\n                    # Add new aspects\n                    if selected_topics[\"topic_aspects\"]:\n                        aspects_1 = set(merged_topics[\"topic_aspects\"].keys())\n                        aspects_2 = set(selected_topics[\"topic_aspects\"].keys())\n                        aspects_diff = aspects_2.difference(aspects_1)\n                        if aspects_diff:\n                            for aspect in aspects_diff:\n                                merged_topics[\"topic_aspects\"][aspect] = {}\n\n                        # If the original model does not have topic aspects but the to be added model does\n                        if not merged_topics.get(\"topic_aspects\"):\n                            merged_topics[\"topic_aspects\"] = selected_topics[\"topic_aspects\"]\n\n                        # If they both contain topic aspects, add to the existing set of aspects\n                        else:\n                            for aspect, values in selected_topics[\"topic_aspects\"].items():\n                                merged_topics[\"topic_aspects\"][aspect][str(max_topic)] = values[str(new_topic)]\n\n                    # Add new embeddings\n                    new_tensors = tensors[new_topic + selected_topics[\"_outliers\"]]\n                    merged_tensors = np.vstack([merged_tensors, new_tensors])\n\n            # Topic Mapper\n            merged_topics[\"topic_mapper\"] = TopicMapper(list(range(-1, max_topic+1, 1))).mappings_\n\n            # Find similar topics and re-assign those from the new models\n            sims_idx = np.argmax(sim_matrix, axis=1)\n            sims = np.max(sim_matrix, axis=1)\n            to_merge = {\n                a - selected_topics[\"_outliers\"]:\n                b - merged_topics[\"_outliers\"] for a, (b, val) in enumerate(zip(sims_idx, sims))\n                if val >= min_similarity\n            }\n            to_merge.update(new_topics_dict)\n            to_merge[-1] = -1\n            topics = [to_merge[topic] for topic in selected_topics[\"topics\"]]\n            merged_topics[\"topics\"].extend(topics)\n            merged_topics[\"topic_sizes\"] = dict(Counter(merged_topics[\"topics\"]))\n\n        # Create a new model from the merged parameters\n        merged_tensors = {\"topic_embeddings\": torch.from_numpy(merged_tensors)}\n        merged_model = 
_create_model_from_files(merged_topics, merged_params, merged_tensors, None, None, None, warn_no_backend=False)\n        merged_model.embedding_model = models[0].embedding_model\n\n        # Replace embedding model if one is specifically chosen\n        if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder:\n            merged_model.embedding_model = select_backend(embedding_model)\n        return merged_model\n\n    def push_to_hf_hub(\n            self,\n            repo_id: str,\n            commit_message: str = 'Add BERTopic model',\n            token: str = None,\n            revision: str = None,\n            private: bool = False,\n            create_pr: bool = False,\n            model_card: bool = True,\n            serialization: str = \"safetensors\",\n            save_embedding_model: Union[str, bool] = True,\n            save_ctfidf: bool = False,\n            ):\n        \"\"\" Push your BERTopic model to a HuggingFace Hub\n\n        Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:\n\n        * Log in to your HuggingFace account with the following command:\n            ```bash\n            huggingface-cli login\n\n            # or using an environment variable\n            huggingface-cli login --token $HUGGINGFACE_TOKEN\n            ```\n        * Alternatively, you can programmatically login using login() in a notebook or a script:\n            ```python\n            from huggingface_hub import login\n            login()\n            ```\n        * Or you can give a token with the `token` variable\n\n        Arguments:\n            repo_id: The name of your HuggingFace repository\n            commit_message: A commit message\n            token: Token to add if not already logged in\n            revision: Repository revision\n            private: Whether to create a private repository\n            create_pr: Whether to upload the model as a Pull Request\n            model_card: Whether to automatically create a modelcard\n            serialization: The type of serialization.\n                           Either `safetensors` or `pytorch`\n            save_embedding_model: A pointer towards a HuggingFace model to be loaded in with\n                                  SentenceTransformers. 
E.g.,\n                                  `sentence-transformers/all-MiniLM-L6-v2`\n            save_ctfidf: Whether to save c-TF-IDF information\n\n\n        Examples:\n\n        ```python\n        topic_model.push_to_hf_hub(\n            repo_id=\"ArXiv\",\n            save_ctfidf=True,\n            save_embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\"\n        )\n        ```\n        \"\"\"\n        return save_utils.push_to_hf_hub(model=self, repo_id=repo_id, commit_message=commit_message,\n                                         token=token, revision=revision, private=private, create_pr=create_pr,\n                                         model_card=model_card, serialization=serialization,\n                                         save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf)\n\n    def get_params(self, deep: bool = False) -> Mapping[str, Any]:\n        \"\"\" Get parameters for this estimator.\n\n        Adapted from:\n            https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178\n\n        Arguments:\n            deep: bool, default=False\n                  If True, will return the parameters for this estimator and\n                  contained subobjects that are estimators.\n\n        Returns:\n            out: Parameter names mapped to their values.\n        \"\"\"\n        out = dict()\n        for key in self._get_param_names():\n            value = getattr(self, key)\n            if deep and hasattr(value, 'get_params'):\n                deep_items = value.get_params().items()\n                out.update((key + '__' + k, val) for k, val in deep_items)\n            out[key] = value\n        return out\n\n    def _extract_embeddings(self,\n                            documents: Union[List[str], str],\n                            images: List[str] = None,\n                            method: str = \"document\",\n                            verbose: bool = None) -> np.ndarray:\n        \"\"\" Extract sentence/document embeddings through pre-trained embeddings.\n        For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n            images: A list of paths to the images to fit on or the images themselves\n            method: Whether to extract document or word-embeddings, options are \"document\" and \"word\"\n            verbose: Whether to show a progressbar demonstrating the time to extract embeddings\n\n        Returns:\n            embeddings: The extracted embeddings.\n        \"\"\"\n        if isinstance(documents, str):\n            documents = [documents]\n\n        if images is not None and hasattr(self.embedding_model, \"embed_images\"):\n            embeddings = self.embedding_model.embed(documents=documents, images=images, verbose=verbose)\n        elif method == \"word\":\n            embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose)\n        elif method == \"document\":\n            embeddings = self.embedding_model.embed_documents(documents, verbose=verbose)\n        elif documents[0] is None and images is None:\n            raise ValueError(\"Make sure to use an embedding model that can either embed documents \"\n                             \"or images depending on which you want to embed.\")\n        else:\n            raise ValueError(\"Wrong method for extracting document/word embeddings. 
\"\n                             \"Either choose 'word' or 'document' as the method. \")\n        return embeddings\n\n    def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:\n        \"\"\" Convert images to text \"\"\"\n        logger.info(\"Images - Converting images to text. This might take a while.\")\n        if isinstance(self.representation_model, dict):\n            for tuner in self.representation_model.values():\n                if getattr(tuner, 'image_to_text_model', False):\n                    documents = tuner.image_to_text(documents, embeddings)\n        elif isinstance(self.representation_model, list):\n            for tuner in self.representation_model:\n                if getattr(tuner, 'image_to_text_model', False):\n                    documents = tuner.image_to_text(documents, embeddings)\n        elif isinstance(self.representation_model, BaseRepresentation):\n            if getattr(self.representation_model, 'image_to_text_model', False):\n                documents = self.representation_model.image_to_text(documents, embeddings)\n        logger.info(\"Images - Completed \\u2713\")\n        return documents\n\n    def _map_predictions(self, predictions: List[int]) -> List[int]:\n        \"\"\" Map predictions to the correct topics if topics were reduced \"\"\"\n        mappings = self.topic_mapper_.get_mappings(original_topics=True)\n        mapped_predictions = [mappings[prediction]\n                              if prediction in mappings\n                              else -1\n                              for prediction in predictions]\n        return mapped_predictions\n\n    def _reduce_dimensionality(self,\n                               embeddings: Union[np.ndarray, csr_matrix],\n                               y: Union[List[int], np.ndarray] = None,\n                               partial_fit: bool = False) -> np.ndarray:\n        \"\"\" Reduce dimensionality of embeddings using UMAP and train a UMAP model\n\n        Arguments:\n            embeddings: The extracted embeddings using the sentence transformer module.\n            y: The target class for (semi)-supervised dimensionality reduction\n            partial_fit: Whether to run `partial_fit` for online learning\n\n        Returns:\n            umap_embeddings: The reduced embeddings\n        \"\"\"\n        logger.info(\"Dimensionality - Fitting the dimensionality reduction algorithm\")\n        # Partial fit\n        if partial_fit:\n            if hasattr(self.umap_model, \"partial_fit\"):\n                self.umap_model = self.umap_model.partial_fit(embeddings)\n            elif self.topic_representations_ is None:\n                self.umap_model.fit(embeddings)\n\n        # Regular fit\n        else:\n            try:\n                # cuml umap needs y to be an numpy array\n                y = np.array(y) if y is not None else None\n                self.umap_model.fit(embeddings, y=y)\n            except TypeError:\n\n                self.umap_model.fit(embeddings)\n\n        umap_embeddings = self.umap_model.transform(embeddings)\n        logger.info(\"Dimensionality - Completed \\u2713\")\n        return np.nan_to_num(umap_embeddings)\n\n    def _cluster_embeddings(self,\n                            umap_embeddings: np.ndarray,\n                            documents: pd.DataFrame,\n                            partial_fit: bool = False,\n                            y: np.ndarray = None) -> Tuple[pd.DataFrame,\n                                              
             np.ndarray]:\n        \"\"\" Cluster UMAP embeddings with HDBSCAN\n\n        Arguments:\n            umap_embeddings: The reduced sentence embeddings with UMAP\n            documents: Dataframe with documents and their corresponding IDs\n            partial_fit: Whether to run `partial_fit` for online learning\n\n        Returns:\n            documents: Updated dataframe with documents and their corresponding IDs\n                       and newly added Topics\n            probabilities: The distribution of probabilities\n        \"\"\"\n        logger.info(\"Cluster - Start clustering the reduced embeddings\")\n        if partial_fit:\n            self.hdbscan_model = self.hdbscan_model.partial_fit(umap_embeddings)\n            labels = self.hdbscan_model.labels_\n            documents['Topic'] = labels\n            self.topics_ = labels\n        else:\n            try:\n                self.hdbscan_model.fit(umap_embeddings, y=y)\n            except TypeError:\n                self.hdbscan_model.fit(umap_embeddings)\n\n            try:\n                labels = self.hdbscan_model.labels_\n            except AttributeError:\n                labels = y\n            documents['Topic'] = labels\n            self._update_topic_size(documents)\n\n        # Some algorithms have outlier labels (-1) that can be tricky to work\n        # with if you are slicing data based on that labels. Therefore, we\n        # track if there are outlier labels and act accordingly when slicing.\n        self._outliers = 1 if -1 in set(labels) else 0\n\n        # Extract probabilities\n        probabilities = None\n        if hasattr(self.hdbscan_model, \"probabilities_\"):\n            probabilities = self.hdbscan_model.probabilities_\n\n            if self.calculate_probabilities and is_supported_hdbscan(self.hdbscan_model):\n                probabilities = hdbscan_delegator(self.hdbscan_model, \"all_points_membership_vectors\")\n\n        if not partial_fit:\n            self.topic_mapper_ = TopicMapper(self.topics_)\n        logger.info(\"Cluster - Completed \\u2713\")\n        return documents, probabilities\n\n    def _zeroshot_topic_modeling(self, documents: pd.DataFrame, embeddings: np.ndarray) -> Tuple[pd.DataFrame, np.array,\n                                                                                                 pd.DataFrame, np.array]:\n        \"\"\" Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list\n\n        We transform the topics in `self.zeroshot_topic_list` to embeddings and\n        compare them through cosine similarity with the document embeddings.\n        If they pass the `self.zeroshot_min_similarity` threshold, they are assigned.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n            embeddings: The document embeddings\n\n        Returns:\n            documents: The leftover documents that were not assigned to any topic\n            embeddings: The leftover embeddings that were not assigned to any topic\n        \"\"\"\n        logger.info(\"Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics\")\n        # Similarity between document and zero-shot topic embeddings\n        zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list)\n        cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings)\n        assignment = np.argmax(cosine_similarities, 1)\n        assignment_vals = 
np.max(cosine_similarities, 1)\n        assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity]\n        non_assigned_ids = [index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity]\n\n        # Assign topics\n        assigned_documents = documents.iloc[assigned_ids]\n        assigned_documents[\"Topic\"] = [topic for topic in assignment[assigned_ids]]\n        assigned_documents[\"Old_ID\"] = assigned_documents[\"ID\"].copy()\n        assigned_documents[\"ID\"] = range(len(assigned_documents))\n        assigned_embeddings = embeddings[assigned_ids]\n\n        # Select non-assigned topics to be clustered\n        documents = documents.iloc[non_assigned_ids]\n        documents[\"Old_ID\"] = documents[\"ID\"].copy()\n        documents[\"ID\"] = range(len(documents))\n        embeddings = embeddings[non_assigned_ids]\n\n        # If only matches were found\n        if len(non_assigned_ids) == 0:\n            return None, None, assigned_documents, assigned_embeddings\n        logger.info(\"Zeroshot Step 1 - Completed \\u2713\")\n        return documents, embeddings, assigned_documents, assigned_embeddings\n\n    def _is_zeroshot(self):\n        \"\"\" Check whether zero-shot topic modeling is possible\n\n        * There should be a cluster model used\n        * Embedding model is necessary to convert zero-shot topics to embeddings\n        * Zero-shot topics should be defined\n        \"\"\"\n        if self.zeroshot_topic_list is not None and self.embedding_model is not None and type(self.hdbscan_model) != BaseCluster:\n            return True\n        return False\n\n    def _combine_zeroshot_topics(self,\n                                 documents: pd.DataFrame,\n                                 assigned_documents: pd.DataFrame,\n                                 embeddings: np.ndarray) -> Union[Tuple[np.ndarray, np.ndarray], np.ndarray]:\n        \"\"\" Combine the zero-shot topics with the clustered topics\n\n        There are three cases considered:\n        * Only zero-shot topics were found which will only return the zero-shot topic model\n        * Only clustered topics were found which will only return the clustered topic model\n        * Both zero-shot and clustered topics were found which will return a merged model\n          * This merged model is created using the `merge_models` function which will ignore\n            the underlying UMAP and HDBSCAN models\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n            assigned_documents: Dataframe with documents and their corresponding IDs\n                                that were assigned to a zero-shot topic\n            embeddings: The document embeddings\n\n        Returns:\n            topics: The topics for each document\n            probabilities: The probabilities for each document\n        \"\"\"\n        logger.info(\"Zeroshot Step 2 - Clustering documents that were not found in the zero-shot model...\")\n\n        # Fit BERTopic without actually performing any clustering\n        docs = assigned_documents.Document.tolist()\n        y = assigned_documents.Topic.tolist()\n        empty_dimensionality_model = BaseDimensionalityReduction()\n        empty_cluster_model = BaseCluster()\n        zeroshot_model = BERTopic(\n                n_gram_range=self.n_gram_range,\n                low_memory=self.low_memory,\n                calculate_probabilities=self.calculate_probabilities,\n   
             embedding_model=self.embedding_model,\n                umap_model=empty_dimensionality_model,\n                hdbscan_model=empty_cluster_model,\n                vectorizer_model=self.vectorizer_model,\n                ctfidf_model=self.ctfidf_model,\n                representation_model=self.representation_model,\n                verbose=self.verbose\n        ).fit(docs, embeddings=embeddings, y=y)\n        logger.info(\"Zeroshot Step 2 - Completed \\u2713\")\n        logger.info(\"Zeroshot Step 3 - Combining clustered topics with the zeroshot model\")\n\n        # Update model\n        self.umap_model = BaseDimensionalityReduction()\n        self.hdbscan_model = BaseCluster()\n\n        # Update topic label\n        assigned_topics = assigned_documents.groupby(\"Topic\").first().reset_index()\n        indices, topics = assigned_topics.ID.values, assigned_topics.Topic.values\n        labels = [zeroshot_model.topic_labels_[zeroshot_model.topics_[index]] for index in indices]\n        labels = {label: self.zeroshot_topic_list[topic] for label, topic in zip(labels, topics)}\n\n        # If only zero-shot matches were found and clustering was not performed\n        if documents is None:\n            for topic in range(len(set(y))):\n                if zeroshot_model.topic_labels_.get(topic):\n                    if labels.get(zeroshot_model.topic_labels_[topic]):\n                        zeroshot_model.topic_labels_[topic] = labels[zeroshot_model.topic_labels_[topic]]\n            self.__dict__.clear()\n            self.__dict__.update(zeroshot_model.__dict__)\n            return self.topics_, self.probabilities_\n\n        # Merge the two topic models\n        merged_model = BERTopic.merge_models([zeroshot_model, self], min_similarity=1)\n\n        # Update topic labels and representative docs of the zero-shot model\n        for topic in range(len(set(y))):\n            if merged_model.topic_labels_.get(topic):\n                if labels.get(merged_model.topic_labels_[topic]):\n                    label = labels[merged_model.topic_labels_[topic]]\n                    merged_model.topic_labels_[topic] = label\n                    merged_model.representative_docs_[topic] = zeroshot_model.representative_docs_[topic]\n\n        # Add representative docs of the clustered model\n        for topic in set(self.topics_):\n            merged_model.representative_docs_[topic + self._outliers + len(set(y))] = self.representative_docs_[topic]\n\n        if self._outliers and merged_model.topic_sizes_.get(-1):\n            merged_model.topic_sizes_[len(set(y))] = merged_model.topic_sizes_[-1]\n            del merged_model.topic_sizes_[-1]\n\n        # Update topic assignment by finding the documents with the\n        # correct updated topics\n        zeroshot_indices = list(assigned_documents.Old_ID.values)\n        zeroshot_topics = [self.zeroshot_topic_list[topic] for topic in assigned_documents.Topic.values]\n\n        cluster_indices = list(documents.Old_ID.values)\n        cluster_names = list(merged_model.topic_labels_.values())[len(set(y)):]\n        if self._outliers:\n            cluster_topics = [cluster_names[topic] if topic != -1 else \"Outliers\" for topic in documents.Topic.values]\n        else:\n            cluster_topics = [cluster_names[topic] for topic in documents.Topic.values]\n\n        df = pd.DataFrame({\n            \"Indices\": zeroshot_indices + cluster_indices,\n            \"Label\": zeroshot_topics + cluster_topics}\n        ).sort_values(\"Indices\")\n        
reverse_topic_labels = dict((v, k) for k, v in merged_model.topic_labels_.items())\n        if self._outliers:\n            reverse_topic_labels[\"Outliers\"] = -1\n        df.Label = df.Label.map(reverse_topic_labels)\n        merged_model.topics_ = df.Label.astype(int).tolist()\n\n        # Update the class internally\n        has_outliers = bool(self._outliers)\n        self.__dict__.clear()\n        self.__dict__.update(merged_model.__dict__)\n        logger.info(\"Zeroshot Step 3 - Completed \\u2713\")\n\n        # Move -1 topic back to position 0 if it exists\n        if has_outliers:\n            nr_zeroshot_topics = len(set(y))\n\n            # Re-map the topics such that the -1 topic is at position 0\n            new_mappings = {}\n            for topic in self.topics_:\n                if topic < nr_zeroshot_topics:\n                    new_mappings[topic] = topic\n                elif topic == nr_zeroshot_topics:\n                    new_mappings[topic] = -1\n                else:\n                    new_mappings[topic] = topic - 1\n\n            # Re-map the topics including all representations (labels, sizes, embeddings, etc.)\n            self.topics_ = [new_mappings[topic] for topic in self.topics_]\n            self.topic_representations_ = {new_mappings[topic]: repr for topic, repr in self.topic_representations_.items()}\n            self.topic_labels_ = {new_mappings[topic]: label for topic, label in self.topic_labels_.items()}\n            self.topic_sizes_ = collections.Counter(self.topics_)\n            self.topic_embeddings_ = np.vstack([\n                self.topic_embeddings_[nr_zeroshot_topics],\n                self.topic_embeddings_[:nr_zeroshot_topics],\n                self.topic_embeddings_[nr_zeroshot_topics+1:]\n            ])\n            self._outliers = 1\n\n        return self.topics_\n\n    def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]:\n        \"\"\" Apply Guided Topic Modeling\n\n        We transform the seeded topics to embeddings using the\n        same embedder as used for generating document embeddings.\n\n        Then, we apply cosine similarity between the embeddings\n        and set labels for documents that are more similar to\n        one of the topics than the average document.\n\n        If a document is more similar to the average document\n        than any of the topics, it gets the -1 label and is\n        thereby not included in UMAP.\n\n        Arguments:\n            embeddings: The document embeddings\n\n        Returns\n            y: The labels for each seeded topic\n            embeddings: Updated embeddings\n        \"\"\"\n        logger.info(\"Guided - Find embeddings highly related to seeded topics.\")\n        # Create embeddings from the seeded topics\n        seed_topic_list = [\" \".join(seed_topic) for seed_topic in self.seed_topic_list]\n        seed_topic_embeddings = self._extract_embeddings(seed_topic_list, verbose=self.verbose)\n        seed_topic_embeddings = np.vstack([seed_topic_embeddings, embeddings.mean(axis=0)])\n\n        # Label documents that are most similar to one of the seeded topics\n        sim_matrix = cosine_similarity(embeddings, seed_topic_embeddings)\n        y = [np.argmax(sim_matrix[index]) for index in range(sim_matrix.shape[0])]\n        y = [val if val != len(seed_topic_list) else -1 for val in y]\n\n        # Average the document embeddings related to the seeded topics with the\n        # embedding of the seeded topic to force the documents in a 
cluster\n        for seed_topic in range(len(seed_topic_list)):\n            indices = [index for index, topic in enumerate(y) if topic == seed_topic]\n            embeddings[indices] = np.average([embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1])\n        logger.info(\"Guided - Completed \\u2713\")\n        return y, embeddings\n\n    def _extract_topics(self, documents: pd.DataFrame, embeddings: np.ndarray = None, mappings=None, verbose: bool = False):\n        \"\"\" Extract topics from the clusters using a class-based TF-IDF\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n            embeddings: The document embeddings\n            mappings: The mappings from topic to word\n            verbose: Whether to log the process of extracting topics\n\n        Returns:\n            c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic\n        \"\"\"\n        if verbose:\n            logger.info(\"Representation - Extracting topics from clusters using representation models.\")\n        documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})\n        self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)\n        self.topic_representations_ = self._extract_words_per_topic(words, documents)\n        self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)\n        self.topic_labels_ = {key: f\"{key}_\" + \"_\".join([word[0] for word in values[:4]])\n                              for key, values in\n                              self.topic_representations_.items()}\n        if verbose:\n            logger.info(\"Representation - Completed \\u2713\")\n\n    def _save_representative_docs(self, documents: pd.DataFrame):\n        \"\"\" Save the 3 most representative docs per topic\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n\n        Updates:\n            self.representative_docs_: Populate each topic with 3 representative docs\n        \"\"\"\n        repr_docs, _, _, _ = self._extract_representative_docs(\n            self.c_tf_idf_,\n            documents,\n            self.topic_representations_,\n            nr_samples=500,\n            nr_repr_docs=3\n        )\n        self.representative_docs_ = repr_docs\n\n    def _extract_representative_docs(self,\n                                     c_tf_idf: csr_matrix,\n                                     documents: pd.DataFrame,\n                                     topics: Mapping[str, List[Tuple[str, float]]],\n                                     nr_samples: int = 500,\n                                     nr_repr_docs: int = 5,\n                                     diversity: float = None\n                                     ) -> Union[List[str], List[List[int]]]:\n        \"\"\" Approximate most representative documents per topic by sampling\n        a subset of the documents in each topic and calculating which are\n        most representative of their topic based on the cosine similarity between\n        c-TF-IDF representations.\n\n        Arguments:\n            c_tf_idf: The topic c-TF-IDF representation\n            documents: All input documents\n            topics: The candidate topics as calculated with c-TF-IDF\n            nr_samples: The number of candidate documents to extract per topic\n            nr_repr_docs: The number of representative documents to extract per topic\n            
diversity: The diversity between the most representative documents.\n                       If None, no MMR is used. Otherwise, accepts values between 0 and 1.\n\n        Returns:\n            repr_docs_mappings: A dictionary from topic to representative documents\n            representative_docs: A flat list of representative documents\n            repr_doc_indices: Ordered indices of representative documents\n                              that belong to each topic\n            repr_doc_ids: The indices of representative documents\n                          that belong to each topic\n        \"\"\"\n        # Sample documents per topic\n        documents_per_topic = (\n            documents.drop(\"Image\", axis=1, errors=\"ignore\")\n                     .groupby('Topic')\n                     .sample(n=nr_samples, replace=True, random_state=42)\n                     .drop_duplicates()\n        )\n\n        # Find and extract documents that are most similar to the topic\n        repr_docs = []\n        repr_docs_indices = []\n        repr_docs_mappings = {}\n        repr_docs_ids = []\n        labels = sorted(list(topics.keys()))\n        for index, topic in enumerate(labels):\n\n            # Slice data\n            selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]\n            selected_docs = selection[\"Document\"].values\n            selected_docs_ids = selection.index.tolist()\n\n            # Calculate similarity\n            nr_docs = nr_repr_docs if len(selected_docs) > nr_repr_docs else len(selected_docs)\n            bow = self.vectorizer_model.transform(selected_docs)\n            ctfidf = self.ctfidf_model.transform(bow)\n            sim_matrix = cosine_similarity(ctfidf, c_tf_idf[index])\n\n            # Use MMR to find representative but diverse documents\n            if diversity:\n                docs = mmr(c_tf_idf[index], ctfidf, selected_docs, top_n=nr_docs, diversity=diversity)\n\n            # Extract top n most representative documents\n            else:\n                indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[-nr_docs:]\n                docs = [selected_docs[index] for index in indices]\n\n            doc_ids = [selected_docs_ids[index] for index, doc in enumerate(selected_docs) if doc in docs]\n            repr_docs_ids.append(doc_ids)\n            repr_docs.extend(docs)\n            repr_docs_indices.append([repr_docs_indices[-1][-1] + i + 1 if index != 0 else i for i in range(nr_docs)])\n        repr_docs_mappings = {topic: repr_docs[i[0]:i[-1]+1] for topic, i in zip(topics.keys(), repr_docs_indices)}\n\n        return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids\n\n    def _create_topic_vectors(self, documents: pd.DataFrame = None, embeddings: np.ndarray = None, mappings=None):\n        \"\"\" Creates embeddings per topics based on their topic representation\n\n        As a default, topic vectors (topic embeddings) are created by taking\n        the average of all document embeddings within a topic. If topics are\n        merged, then a weighted average of topic embeddings is taken based on\n        the initial topic sizes.\n\n        For the `.partial_fit` and `.update_topics` method, the average\n        of all document embeddings is not taken since those are not known.\n        Instead, the weighted average of the embeddings of the top n words\n        is taken for each topic. The weighting is done based on the c-TF-IDF\n        score. 
This will put more emphasis to words that represent a topic best.\n        \"\"\"\n        # Topic embeddings based on input embeddings\n        if embeddings is not None and documents is not None:\n            topic_embeddings = []\n            topics = documents.sort_values(\"Topic\").Topic.unique()\n            for topic in topics:\n                indices = documents.loc[documents.Topic == topic, \"ID\"].values\n                indices = [int(index) for index in indices]\n                topic_embedding = np.mean(embeddings[indices], axis=0)\n                topic_embeddings.append(topic_embedding)\n            self.topic_embeddings_ = np.array(topic_embeddings)\n\n        # Topic embeddings when merging topics\n        elif self.topic_embeddings_ is not None and mappings is not None:\n            topic_embeddings_dict = {}\n            for topic_from, topics_to in mappings.items():\n                topic_ids = topics_to[\"topics_to\"]\n                topic_sizes = topics_to[\"topic_sizes\"]\n                if topic_ids:\n                    embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers]\n                    topic_embedding = np.average(embds, axis=0, weights=topic_sizes)\n                    topic_embeddings_dict[topic_from] = topic_embedding\n\n            # Re-order topic embeddings\n            topics_to_map = {topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]}\n            topic_embeddings = {}\n            for topic, embds in topic_embeddings_dict.items():\n                topic_embeddings[topics_to_map[topic]] = embds\n            unique_topics = sorted(list(topic_embeddings.keys()))\n            self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics])\n\n        # Topic embeddings based on keyword representations\n        elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder:\n            topic_list = list(self.topic_representations_.keys())\n            topic_list.sort()\n\n            # Only extract top n words\n            n = len(self.topic_representations_[topic_list[0]])\n            if self.top_n_words < n:\n                n = self.top_n_words\n\n            # Extract embeddings for all words in all topics\n            topic_words = [self.get_topic(topic) for topic in topic_list]\n            topic_words = [word[0] for topic in topic_words for word in topic]\n            word_embeddings = self._extract_embeddings(\n                topic_words,\n                method=\"word\",\n                verbose=False\n            )\n\n            # Take the weighted average of word embeddings in a topic based on their c-TF-IDF value\n            # The embeddings var is a single numpy matrix and therefore slicing is necessary to\n            # access the words per topic\n            topic_embeddings = []\n            for i, topic in enumerate(topic_list):\n                word_importance = [val[1] for val in self.get_topic(topic)]\n                if sum(word_importance) == 0:\n                    word_importance = [1 for _ in range(len(self.get_topic(topic)))]\n                topic_embedding = np.average(word_embeddings[i * n: n + (i * n)], weights=word_importance, axis=0)\n                topic_embeddings.append(topic_embedding)\n\n            self.topic_embeddings_ = np.array(topic_embeddings)\n\n    def _c_tf_idf(self,\n                  documents_per_topic: pd.DataFrame,\n                  fit: bool = True,\n                 
 partial_fit: bool = False) -> Tuple[csr_matrix, List[str]]:\n        \"\"\" Calculate a class-based TF-IDF where m is the number of total documents.\n\n        Arguments:\n            documents_per_topic: The joined documents per topic such that each topic has a single\n                                 string made out of multiple documents\n            m: The total number of documents (unjoined)\n            fit: Whether to fit a new vectorizer or use the fitted self.vectorizer_model\n            partial_fit: Whether to run `partial_fit` for online learning\n\n        Returns:\n            tf_idf: The resulting matrix giving a value (importance score) for each word per topic\n            words: The names of the words to which values were given\n        \"\"\"\n        documents = self._preprocess_text(documents_per_topic.Document.values)\n\n        if partial_fit:\n            X = self.vectorizer_model.partial_fit(documents).update_bow(documents)\n        elif fit:\n            X = self.vectorizer_model.fit_transform(documents)\n        else:\n            X = self.vectorizer_model.transform(documents)\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = self.vectorizer_model.get_feature_names_out()\n        else:\n            words = self.vectorizer_model.get_feature_names()\n\n        multiplier = None\n        if self.ctfidf_model.seed_words and self.seed_topic_list:\n            seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]\n            multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])\n            multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)])\n        elif self.ctfidf_model.seed_words:\n            multiplier = np.array([self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words])\n        elif self.seed_topic_list:\n            seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]\n            multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words])\n\n        if fit:\n            self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)\n\n        c_tf_idf = self.ctfidf_model.transform(X)\n\n        return c_tf_idf, words\n\n    def _update_topic_size(self, documents: pd.DataFrame):\n        \"\"\" Calculate the topic sizes\n\n        Arguments:\n            documents: Updated dataframe with documents and their corresponding IDs and newly added Topics\n        \"\"\"\n        self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())\n        self.topics_ = documents.Topic.astype(int).tolist()\n\n    def _extract_words_per_topic(self,\n                                 words: List[str],\n                                 documents: pd.DataFrame,\n                                 c_tf_idf: csr_matrix = None,\n                                 calculate_aspects: bool = True) -> Mapping[str,\n                                                                            List[Tuple[str, float]]]:\n        \"\"\" Based on tf_idf scores per topic, extract the top n words per topic\n\n        If the top words per topic need to be extracted, then only the `words` parameter\n        needs to be passed. 
If the top words per topic need to be extracted for a specific timestamp, then it\n        is important to pass the timestamp-based c-TF-IDF matrix and its corresponding\n        labels.\n\n        Arguments:\n            words: List of all words (sorted according to tf_idf matrix position)\n            documents: DataFrame with documents and their topic IDs\n            c_tf_idf: A c-TF-IDF matrix from which to calculate the top words\n\n        Returns:\n            topics: The top words per topic\n        \"\"\"\n        if c_tf_idf is None:\n            c_tf_idf = self.c_tf_idf_\n\n        labels = sorted(list(documents.Topic.unique()))\n        labels = [int(label) for label in labels]\n\n        # Get at least the top 30 indices and values per row in a sparse c-TF-IDF matrix\n        top_n_words = max(self.top_n_words, 30)\n        indices = self._top_n_idx_sparse(c_tf_idf, top_n_words)\n        scores = self._top_n_values_sparse(c_tf_idf, indices)\n        sorted_indices = np.argsort(scores, 1)\n        indices = np.take_along_axis(indices, sorted_indices, axis=1)\n        scores = np.take_along_axis(scores, sorted_indices, axis=1)\n\n        # Get top 30 words per topic based on c-TF-IDF score\n        topics = {label: [(words[word_index], score)\n                          if word_index is not None and score > 0\n                          else (\"\", 0.00001)\n                          for word_index, score in zip(indices[index][::-1], scores[index][::-1])\n                          ]\n                  for index, label in enumerate(labels)}\n\n        # Fine-tune the topic representations\n        if isinstance(self.representation_model, list):\n            for tuner in self.representation_model:\n                topics = tuner.extract_topics(self, documents, c_tf_idf, topics)\n        elif isinstance(self.representation_model, BaseRepresentation):\n            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)\n        elif isinstance(self.representation_model, dict):\n            if self.representation_model.get(\"Main\"):\n                topics = self.representation_model[\"Main\"].extract_topics(self, documents, c_tf_idf, topics)\n        topics = {label: values[:self.top_n_words] for label, values in topics.items()}\n\n        # Extract additional topic aspects\n        if calculate_aspects and isinstance(self.representation_model, dict):\n            for aspect, aspect_model in self.representation_model.items():\n                aspects = topics.copy()\n                if aspect != \"Main\":\n                    if isinstance(aspect_model, list):\n                        for tuner in aspect_model:\n                            aspects = tuner.extract_topics(self, documents, c_tf_idf, aspects)\n                        self.topic_aspects_[aspect] = aspects\n                    elif isinstance(aspect_model, BaseRepresentation):\n                        self.topic_aspects_[aspect] = aspect_model.extract_topics(self, documents, c_tf_idf, aspects)\n\n        return topics\n\n    def _reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:\n        \"\"\" Reduce topics to self.nr_topics\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n\n        Returns:\n            documents: Updated dataframe with documents and the reduced number of Topics\n        \"\"\"\n        logger.info(\"Topic reduction - Reducing number of topics\")\n        initial_nr_topics = len(self.get_topics())\n\n        if 
isinstance(self.nr_topics, int):\n            if self.nr_topics < initial_nr_topics:\n                documents = self._reduce_to_n_topics(documents)\n        elif isinstance(self.nr_topics, str):\n            documents = self._auto_reduce_topics(documents)\n        else:\n            raise ValueError(\"nr_topics needs to be an int or 'auto'! \")\n\n        logger.info(f\"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}\")\n        return documents\n\n    def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:\n        \"\"\" Reduce topics to self.nr_topics\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n\n        Returns:\n            documents: Updated dataframe with documents and the reduced number of Topics\n        \"\"\"\n        topics = documents.Topic.tolist().copy()\n\n        # Create topic distance matrix\n        if self.topic_embeddings_ is not None:\n            topic_embeddings = self.topic_embeddings_[self._outliers:, ]\n        else:\n            topic_embeddings = self.c_tf_idf_[self._outliers:, ].toarray()\n        distance_matrix = 1-cosine_similarity(topic_embeddings)\n        np.fill_diagonal(distance_matrix, 0)\n\n        # Cluster the topic embeddings using AgglomerativeClustering\n        if version.parse(sklearn_version) >= version.parse(\"1.4.0\"):\n            cluster = AgglomerativeClustering(self.nr_topics - self._outliers, metric=\"precomputed\", linkage=\"average\")\n        else:\n            cluster = AgglomerativeClustering(self.nr_topics - self._outliers, affinity=\"precomputed\", linkage=\"average\")\n        cluster.fit(distance_matrix)\n        new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics]\n\n        # Track mappings and sizes of topics for merging topic embeddings\n        mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)}\n        mappings = defaultdict(list)\n        for key, val in sorted(mapped_topics.items()):\n            mappings[val].append(key)\n        mappings = {topic_from:\n                    {\"topics_to\": topics_to,\n                     \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_to]}\n                    for topic_from, topics_to in mappings.items()}\n\n        # Map topics\n        documents.Topic = new_topics\n        self._update_topic_size(documents)\n        self.topic_mapper_.add_mappings(mapped_topics)\n\n        # Update representations\n        documents = self._sort_mappings_by_frequency(documents)\n        self._extract_topics(documents, mappings=mappings)\n        self._update_topic_size(documents)\n        return documents\n\n    def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:\n        \"\"\" Reduce the number of topics automatically using HDBSCAN\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n\n        Returns:\n            documents: Updated dataframe with documents and the reduced number of Topics\n        \"\"\"\n        topics = documents.Topic.tolist().copy()\n        unique_topics = sorted(list(documents.Topic.unique()))[self._outliers:]\n        max_topic = unique_topics[-1]\n\n        # Find similar topics\n        if self.topic_embeddings_ is not None:\n            embeddings = np.array(self.topic_embeddings_)\n        else:\n            embeddings = self.c_tf_idf_.toarray()\n        norm_data = 
normalize(embeddings, norm='l2')\n        predictions = hdbscan.HDBSCAN(min_cluster_size=2,\n                                      metric='euclidean',\n                                      cluster_selection_method='eom',\n                                      prediction_data=True).fit_predict(norm_data[self._outliers:])\n\n        # Map similar topics\n        mapped_topics = {unique_topics[index]: prediction + max_topic\n                         for index, prediction in enumerate(predictions)\n                         if prediction != -1}\n        documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)\n        mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}\n\n        # Track mappings and sizes of topics for merging topic embeddings\n        mappings = defaultdict(list)\n        for key, val in sorted(mapped_topics.items()):\n            mappings[val].append(key)\n        mappings = {topic_from:\n                    {\"topics_to\": topics_to,\n                     \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_to]}\n                    for topic_from, topics_to in mappings.items()}\n\n        # Update documents and topics\n        self.topic_mapper_.add_mappings(mapped_topics)\n        documents = self._sort_mappings_by_frequency(documents)\n        self._extract_topics(documents, mappings=mappings)\n        self._update_topic_size(documents)\n        return documents\n\n    def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame:\n        \"\"\" Reorder mappings by their frequency.\n\n        For example, if topic 88 was mapped to topic\n        5 and topic 5 turns out to be the largest topic,\n        then topic 5 will be topic 0. The second largest\n        will be topic 1, etc.\n\n        If there are no mappings since no reduction of topics\n        took place, then the topics will simply be ordered\n        by their frequency and will get the topic ids based\n        on that order.\n\n        This means that -1 will remain the outlier class, and\n        that the rest of the topics will be in descending order\n        of ids and frequency.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n\n        Returns:\n            documents: Updated dataframe with documents and the mapped\n                       and re-ordered topic ids\n        \"\"\"\n        self._update_topic_size(documents)\n\n        # Map topics based on frequency\n        df = pd.DataFrame(self.topic_sizes_.items(), columns=[\"Old_Topic\", \"Size\"]).sort_values(\"Size\", ascending=False)\n        df = df[df.Old_Topic != -1]\n        sorted_topics = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))}\n        self.topic_mapper_.add_mappings(sorted_topics)\n\n        # Map documents\n        documents.Topic = documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int)\n        self._update_topic_size(documents)\n        return documents\n\n    def _map_probabilities(self,\n                           probabilities: Union[np.ndarray, None],\n                           original_topics: bool = False) -> Union[np.ndarray, None]:\n        \"\"\" Map the probabilities to the reduced topics.\n        This is achieved by adding together the probabilities\n        of all topics that are mapped to the same topic. 
Then,\n        the topics that were mapped from are set to 0 as they\n        were reduced.\n\n        Arguments:\n            probabilities: An array containing probabilities\n            original_topics: Whether we want to map from the\n                             original topics to the most recent topics\n                             or from the second-most recent topics.\n\n        Returns:\n            mapped_probabilities: Updated probabilities\n        \"\"\"\n        mappings = self.topic_mapper_.get_mappings(original_topics)\n\n        # Map array of probabilities (probability for assigned topic per document)\n        if probabilities is not None:\n            if len(probabilities.shape) == 2:\n                mapped_probabilities = np.zeros((probabilities.shape[0],\n                                                 len(set(mappings.values())) - self._outliers))\n                for from_topic, to_topic in mappings.items():\n                    if to_topic != -1 and from_topic != -1:\n                        mapped_probabilities[:, to_topic] += probabilities[:, from_topic]\n\n                return mapped_probabilities\n\n        return probabilities\n\n    def _preprocess_text(self, documents: np.ndarray) -> List[str]:\n        \"\"\" Basic preprocessing of text\n\n        Steps:\n            * Replace \\n and \\t with whitespace\n            * Only keep alpha-numerical characters\n        \"\"\"\n        cleaned_documents = [doc.replace(\"\\n\", \" \") for doc in documents]\n        cleaned_documents = [doc.replace(\"\\t\", \" \") for doc in cleaned_documents]\n        if self.language == \"english\":\n            cleaned_documents = [re.sub(r'[^A-Za-z0-9 ]+', '', doc) for doc in cleaned_documents]\n        cleaned_documents = [doc if doc != \"\" else \"emptydoc\" for doc in cleaned_documents]\n        return cleaned_documents\n\n    @staticmethod\n    def _top_n_idx_sparse(matrix: csr_matrix, n: int) -> np.ndarray:\n        \"\"\" Return indices of top n values in each row of a sparse matrix\n\n        Retrieved from:\n            https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix\n\n        Arguments:\n            matrix: The sparse matrix from which to get the top n indices per row\n            n: The number of highest values to extract from each row\n\n        Returns:\n            indices: The top n indices per row\n        \"\"\"\n        indices = []\n        for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):\n            n_row_pick = min(n, ri - le)\n            values = matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]]\n            values = [values[index] if len(values) >= index + 1 else None for index in range(n)]\n            indices.append(values)\n        return np.array(indices)\n\n    @staticmethod\n    def _top_n_values_sparse(matrix: csr_matrix, indices: np.ndarray) -> np.ndarray:\n        \"\"\" Return the top n values for each row in a sparse matrix\n\n        Arguments:\n            matrix: The sparse matrix from which to get the top n indices per row\n            indices: The top n indices per row\n\n        Returns:\n            top_values: The top n scores per row\n        \"\"\"\n        top_values = []\n        for row, values in enumerate(indices):\n            scores = np.array([matrix[row, value] if value is not None else 0 for value in values])\n            top_values.append(scores)\n        return np.array(top_values)\n\n    @classmethod\n    def 
_get_param_names(cls):\n        \"\"\"Get parameter names for the estimator\n\n        Adapted from:\n            https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178\n        \"\"\"\n        init_signature = inspect.signature(cls.__init__)\n        parameters = sorted([p.name for p in init_signature.parameters.values()\n                             if p.name != 'self' and p.kind != p.VAR_KEYWORD])\n        return parameters\n\n    def __str__(self):\n        \"\"\"Get a string representation of the current object.\n\n        Returns:\n            str: Human readable representation of the most important model parameters.\n                 The parameters that represent models are ignored due to their length.\n        \"\"\"\n        parameters = \"\"\n        for parameter, value in self.get_params().items():\n            value = str(value)\n            if \"(\" in value and value[0] != \"(\":\n                value = value.split(\"(\")[0] + \"(...)\"\n            parameters += f\"{parameter}={value}, \"\n\n        return f\"BERTopic({parameters[:-2]})\"\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.__init__","title":"__init__(self, language='english', top_n_words=10, n_gram_range=(1, 1), min_topic_size=10, nr_topics=None, low_memory=False, calculate_probabilities=False, seed_topic_list=None, zeroshot_topic_list=None, zeroshot_min_similarity=0.7, embedding_model=None, umap_model=None, hdbscan_model=None, vectorizer_model=None, ctfidf_model=None, representation_model=None, verbose=False) special","text":"

        BERTopic initialization

        Parameters:

        Name Type Description Default language str

        The main language used in your documents. The default sentence-transformers model for \"english\" is all-MiniLM-L6-v2. For a full overview of supported languages see bertopic.backend.languages. Select \"multilingual\" to load in the paraphrase-multilingual-MiniLM-L12-v2 sentence-transformers model that supports 50+ languages. NOTE: This is not used if embedding_model is used.

        'english' top_n_words int

        The number of words per topic to extract. Setting this too high can negatively impact topic embeddings as topics are typically best represented by at most 10 words.

        10 n_gram_range Tuple[int, int]

        The n-gram range for the CountVectorizer. It is advised to keep the upper value of the range between 1 and 3; higher values would likely lead to memory issues. NOTE: This param will not be used if you pass in your own CountVectorizer.

        (1, 1) min_topic_size int

        The minimum size of the topic. Increasing this value will lead to a lower number of clusters/topics and vice versa. It is the same parameter as min_cluster_size in HDBSCAN. NOTE: This param will not be used if you are using hdbscan_model.

        10 nr_topics Union[int, str]

        Specifying the number of topics will reduce the initial number of topics to the value specified. This reduction can take a while as each reduction in topics (-1) activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use \"auto\" to automatically reduce topics using HDBSCAN. NOTE: Controlling the number of topics is best done by adjusting min_topic_size first before adjusting this parameter.

        None low_memory bool

        Sets UMAP low memory to True to make sure less memory is used. NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP this parameter will not be used.

        False calculate_probabilities bool

        Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document. This could slow down the extraction of topics if you have many documents (> 100_000). NOTE: If false you cannot use the corresponding visualization method visualize_probabilities. NOTE: This is an approximation of topic probabilities as used in HDBSCAN and not an exact representation.

        False seed_topic_list List[List[str]]

        A list of seed words per topic to converge around

        None zeroshot_topic_list List[str]

        A list of topic names to use for zero-shot classification

        None zeroshot_min_similarity float

        The minimum similarity between a zero-shot topic and a document for assignment. The higher this value, the more confident the model needs to be to assign a zero-shot topic to a document.

        0.7 verbose bool

        Changes the verbosity of the model. Set to True if you want to track the stages of the model.

        False embedding_model

        Use a custom embedding model. The following backends are currently supported: SentenceTransformers, Flair, Spacy, Gensim, and USE (TF-Hub). You can also pass in a string that points to one of the sentence-transformers models listed at https://www.sbert.net/docs/pretrained_models.html.

        None umap_model UMAP

        Pass in a UMAP model to be used instead of the default. NOTE: You can also pass in any dimensionality reduction algorithm as long as it has .fit and .transform functions.

        None hdbscan_model HDBSCAN

        Pass in an hdbscan.HDBSCAN model to be used instead of the default. NOTE: You can also pass in any clustering algorithm as long as it has .fit and .predict functions along with the .labels_ variable.

        None vectorizer_model CountVectorizer

        Pass in a custom CountVectorizer instead of the default model.

        None ctfidf_model TfidfTransformer

        Pass in a custom ClassTfidfTransformer instead of the default model.

        None representation_model BaseRepresentation

        Pass in a model that fine-tunes the topic representations calculated through c-TF-IDF. Models from bertopic.representation are supported.

        None Source code in bertopic\\_bertopic.py
        def __init__(self,\n             language: str = \"english\",\n             top_n_words: int = 10,\n             n_gram_range: Tuple[int, int] = (1, 1),\n             min_topic_size: int = 10,\n             nr_topics: Union[int, str] = None,\n             low_memory: bool = False,\n             calculate_probabilities: bool = False,\n             seed_topic_list: List[List[str]] = None,\n             zeroshot_topic_list: List[str] = None,\n             zeroshot_min_similarity: float = .7,\n             embedding_model=None,\n             umap_model: UMAP = None,\n             hdbscan_model: hdbscan.HDBSCAN = None,\n             vectorizer_model: CountVectorizer = None,\n             ctfidf_model: TfidfTransformer = None,\n             representation_model: BaseRepresentation = None,\n             verbose: bool = False,\n             ):\n    \"\"\"BERTopic initialization\n\n    Arguments:\n        language: The main language used in your documents. The default sentence-transformers\n                  model for \"english\" is `all-MiniLM-L6-v2`. For a full overview of\n                  supported languages see bertopic.backend.languages. Select\n                  \"multilingual\" to load in the `paraphrase-multilingual-MiniLM-L12-v2`\n                  sentence-transformers model that supports 50+ languages.\n                  NOTE: This is not used if `embedding_model` is used.\n        top_n_words: The number of words per topic to extract. Setting this\n                     too high can negatively impact topic embeddings as topics\n                     are typically best represented by at most 10 words.\n        n_gram_range: The n-gram range for the CountVectorizer.\n                      Advised to keep high values between 1 and 3.\n                      More would likely lead to memory issues.\n                      NOTE: This param will not be used if you pass in your own\n                      CountVectorizer.\n        min_topic_size: The minimum size of the topic. Increasing this value will lead\n                        to a lower number of clusters/topics and vice versa. \n                        It is the same parameter as `min_cluster_size` in HDBSCAN.\n                        NOTE: This param will not be used if you are using `hdbscan_model`.\n        nr_topics: Specifying the number of topics will reduce the initial\n                   number of topics to the value specified. This reduction can take\n                   a while as each reduction in topics (-1) activates a c-TF-IDF\n                   calculation. If this is set to None, no reduction is applied. Use\n                   \"auto\" to automatically reduce topics using HDBSCAN.\n                   NOTE: Controlling the number of topics is best done by adjusting\n                   `min_topic_size` first before adjusting this parameter.\n        low_memory: Sets UMAP low memory to True to make sure less memory is used.\n                    NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP\n                    this parameter will not be used.\n        calculate_probabilities: Calculate the probabilities of all topics\n                                 per document instead of the probability of the assigned\n                                 topic per document. 
This could slow down the extraction\n                                 of topics if you have many documents (> 100_000).\n                                 NOTE: If false you cannot use the corresponding\n                                 visualization method `visualize_probabilities`.\n                                 NOTE: This is an approximation of topic probabilities\n                                 as used in HDBSCAN and not an exact representation.\n        seed_topic_list: A list of seed words per topic to converge around\n        zeroshot_topic_list: A list of topic names to use for zero-shot classification\n        zeroshot_min_similarity: The minimum similarity between a zero-shot topic and\n                                 a document for assignment. The higher this value, the more\n                                 confident the model needs to be to assign a zero-shot topic to a document.\n        verbose: Changes the verbosity of the model, Set to True if you want\n                 to track the stages of the model.\n        embedding_model: Use a custom embedding model.\n                         The following backends are currently supported\n                           * SentenceTransformers\n                           * Flair\n                           * Spacy\n                           * Gensim\n                           * USE (TF-Hub)\n                         You can also pass in a string that points to one of the following\n                         sentence-transformers models:\n                           * https://www.sbert.net/docs/pretrained_models.html\n        umap_model: Pass in a UMAP model to be used instead of the default.\n                    NOTE: You can also pass in any dimensionality reduction algorithm as long\n                    as it has `.fit` and `.transform` functions.\n        hdbscan_model: Pass in a hdbscan.HDBSCAN model to be used instead of the default\n                       NOTE: You can also pass in any clustering algorithm as long as it has\n                       `.fit` and `.predict` functions along with the `.labels_` variable.\n        vectorizer_model: Pass in a custom `CountVectorizer` instead of the default model.\n        ctfidf_model: Pass in a custom ClassTfidfTransformer instead of the default model.\n        representation_model: Pass in a model that fine-tunes the topic representations\n                              calculated through c-TF-IDF. 
Models from `bertopic.representation`\n                              are supported.\n    \"\"\"\n    # Topic-based parameters\n    if top_n_words > 100:\n        logger.warning(\"Note that extracting more than 100 words from a sparse \"\n                       \"can slow down computation quite a bit.\")\n\n    self.top_n_words = top_n_words\n    self.min_topic_size = min_topic_size\n    self.nr_topics = nr_topics\n    self.low_memory = low_memory\n    self.calculate_probabilities = calculate_probabilities\n    self.verbose = verbose\n    self.seed_topic_list = seed_topic_list\n    self.zeroshot_topic_list = zeroshot_topic_list\n    self.zeroshot_min_similarity = zeroshot_min_similarity\n\n    # Embedding model\n    self.language = language if not embedding_model else None\n    self.embedding_model = embedding_model\n\n    # Vectorizer\n    self.n_gram_range = n_gram_range\n    self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range)\n    self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n\n    # Representation model\n    self.representation_model = representation_model\n\n    # UMAP or another algorithm that has .fit and .transform functions\n    self.umap_model = umap_model or UMAP(n_neighbors=15,\n                                         n_components=5,\n                                         min_dist=0.0,\n                                         metric='cosine',\n                                         low_memory=self.low_memory)\n\n    # HDBSCAN or another clustering algorithm that has .fit and .predict functions and\n    # the .labels_ variable to extract the labels\n    self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(min_cluster_size=self.min_topic_size,\n                                                          metric='euclidean',\n                                                          cluster_selection_method='eom',\n                                                          prediction_data=True)\n\n    # Public attributes\n    self.topics_ = None\n    self.probabilities_ = None\n    self.topic_sizes_ = None\n    self.topic_mapper_ = None\n    self.topic_representations_ = None\n    self.topic_embeddings_ = None\n    self.topic_labels_ = None\n    self.custom_labels_ = None\n    self.c_tf_idf_ = None\n    self.representative_images_ = None\n    self.representative_docs_ = {}\n    self.topic_aspects_ = {}\n\n    # Private attributes for internal tracking purposes\n    self._outliers = 1\n    self._merged_topics = None\n\n    if verbose:\n        logger.set_level(\"DEBUG\")\n    else:\n        logger.set_level(\"WARNING\")\n
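
        Examples:

        For illustration, these parameters are typically combined along the following lines (a sketch; the sub-models and values shown are assumptions, not required defaults):

        from umap import UMAP\nfrom hdbscan import HDBSCAN\nfrom bertopic import BERTopic\n\n# Illustrative sub-models; any dimensionality reduction or clustering algorithm\n# with the expected .fit/.transform or .fit/.predict interface can be used\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')\nhdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)\n\ntopic_model = BERTopic(\n    language=\"english\",\n    top_n_words=10,\n    nr_topics=\"auto\",\n    calculate_probabilities=True,\n    umap_model=umap_model,\n    hdbscan_model=hdbscan_model,\n    verbose=True,\n)\n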
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.__str__","title":"__str__(self) special","text":"

        Get a string representation of the current object.

        Returns:

        Type Description str

        Human readable representation of the most important model parameters. The parameters that represent models are ignored due to their length.
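
        For example (illustrative), printing a fitted model shows these parameters, with model-typed parameters abbreviated:

        print(topic_model)\n# e.g. BERTopic(calculate_probabilities=False, ..., top_n_words=10, verbose=False)\n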

        Source code in bertopic\\_bertopic.py
        def __str__(self):\n    \"\"\"Get a string representation of the current object.\n\n    Returns:\n        str: Human readable representation of the most important model parameters.\n             The parameters that represent models are ignored due to their length.\n    \"\"\"\n    parameters = \"\"\n    for parameter, value in self.get_params().items():\n        value = str(value)\n        if \"(\" in value and value[0] != \"(\":\n            value = value.split(\"(\")[0] + \"(...)\"\n        parameters += f\"{parameter}={value}, \"\n\n    return f\"BERTopic({parameters[:-2]})\"\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.approximate_distribution","title":"approximate_distribution(self, documents, window=4, stride=1, min_similarity=0.1, batch_size=1000, padding=False, use_embedding_model=False, calculate_tokens=False, separator=' ')","text":"

        A post-hoc approximation of topic distributions across documents.

        In order to perform this approximation, each document is split into tokens according to the provided tokenizer in the CountVectorizer. Then, a sliding window is applied on each document creating subsets of the document. For example, with a window size of 3 and stride of 1, the sentence:

        Solving the right problem is difficult.

        can be split up into solving the right, the right problem, right problem is, and problem is difficult. These are called tokensets. For each of these tokensets, we calculate their c-TF-IDF representation and find out how similar they are to the previously generated topics. Then, the similarities to the topics for each tokenset are summed up in order to create a topic distribution for the entire document.

        We can also dive into this a bit deeper by splitting these tokensets up into individual tokens and calculating how much a word, in a specific sentence, contributes to the topics found in that document. This can be enabled by setting calculate_tokens=True, which can be used for visualization purposes in topic_model.visualize_approximate_distribution.

        The main output, topic_distributions, can also be used directly in .visualize_distribution(topic_distributions[index]) by simply selecting a single distribution.
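
        The tokenset construction described above can be sketched as follows (an illustration with window=3, stride=1, and a simplified whitespace tokenizer, not the library's internal code):

        tokens = \"Solving the right problem is difficult\".lower().split()\nwindow, stride = 3, 1\n# Slide a window of 3 tokens over the document, moving 1 token at a time\ntokensets = [\" \".join(tokens[i:i + window]) for i in range(0, len(tokens), stride)\n             if len(tokens[i:i + window]) == window]\n# ['solving the right', 'the right problem', 'right problem is', 'problem is difficult']\n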

        Parameters:

        Name Type Description Default documents Union[str, List[str]]

        A single document or a list of documents for which we approximate the topic distributions.

        required window int

        Size of the moving window which indicates the number of tokens being considered.

        4 stride int

        How far the window should move at each step.

        1 min_similarity float

        The minimum similarity of a document's tokenset with respect to the topics.

        0.1 batch_size int

        The number of documents to process at a time. If None, then all documents are processed at once. NOTE: With a large number of documents, it is not advised to process all documents at once.

        1000 padding bool

        Whether to pad the beginning and ending of a document with empty tokens.

        False use_embedding_model bool

        Whether to use the topic model's embedding model to calculate the similarity between tokensets and topics instead of using c-TF-IDF.

        False calculate_tokens bool

        Calculate the similarity of tokens with all topics. NOTE: This is computationally more expensive and can require more memory. Using this over batches of documents might be preferred.

        False separator str

        The separator used to merge tokens into tokensets.

        ' '

        Returns:

        Type Description topic_distributions

        An n x m matrix containing the topic distributions for all input documents, with n being the documents and m the topics. topic_token_distributions: A list of t x m arrays, with t being the number of tokens for the respective document and m the topics.

        Examples:

        After fitting the model, the topic distributions can be calculated regardless of the clustering model and regardless of whether the documents were previously seen or not:

        topic_distr, _ = topic_model.approximate_distribution(docs)\n

        As a result, the topic distributions are calculated in topic_distr for the entire document based on a token set with a specific window size and stride.

        If you want to calculate the topic distributions on a token-level:

        topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n

        The topic_token_distr then contains, for each token, the best fitting topics. As with topic_distr, it can contain multiple topics for a single token.

        Source code in bertopic\\_bertopic.py
        def approximate_distribution(self,\n                             documents: Union[str, List[str]],\n                             window: int = 4,\n                             stride: int = 1,\n                             min_similarity: float = 0.1,\n                             batch_size: int = 1000,\n                             padding: bool = False,\n                             use_embedding_model: bool = False,\n                             calculate_tokens: bool = False,\n                             separator: str = \" \") -> Tuple[np.ndarray,\n                                                            Union[List[np.ndarray], None]]:\n    \"\"\" A post-hoc approximation of topic distributions across documents.\n\n    In order to perform this approximation, each document is split into tokens\n    according to the provided tokenizer in the `CountVectorizer`. Then, a\n    sliding window is applied on each document creating subsets of the document.\n    For example, with a window size of 3 and stride of 1, the sentence:\n\n    `Solving the right problem is difficult.`\n\n    can be split up into `solving the right`, `the right problem`, `right problem is`,\n    and `problem is difficult`. These are called tokensets. For each of these\n    tokensets, we calculate their c-TF-IDF representation and find out\n    how similar they are to the previously generated topics. Then, the\n    similarities to the topics for each tokenset are summed up in order to\n    create a topic distribution for the entire document.\n\n    We can also dive into this a bit deeper by then splitting these tokensets\n    up into individual tokens and calculate how much a word, in a specific sentence,\n    contributes to the topics found in that document. This can be enabled by\n    setting `calculate_tokens=True` which can be used for visualization purposes\n    in `topic_model.visualize_approximate_distribution`.\n\n    The main output, `topic_distributions`, can also be used directly in\n    `.visualize_distribution(topic_distributions[index])` by simply selecting\n    a single distribution.\n\n    Arguments:\n        documents: A single document or a list of documents for which we\n                   approximate their topic distributions\n        window: Size of the moving window which indicates the number of\n                tokens being considered.\n        stride: How far the window should move at each step.\n        min_similarity: The minimum similarity of a document's tokenset\n                        with respect to the topics.\n        batch_size: The number of documents to process at a time. If None,\n                    then all documents are processed at once.\n                    NOTE: With a large number of documents, it is not\n                    advised to process all documents at once.\n        padding: Whether to pad the beginning and ending of a document with\n                 empty tokens.\n        use_embedding_model: Whether to use the topic model's embedding\n                             model to calculate the similarity between\n                             tokensets and topics instead of using c-TF-IDF.\n        calculate_tokens: Calculate the similarity of tokens with all topics.\n                          NOTE: This is computation-wise more expensive and\n                          can require more memory. 
Using this over batches of\n                          documents might be preferred.\n        separator: The separator used to merge tokens into tokensets.\n\n    Returns:\n        topic_distributions: A `n` x `m` matrix containing the topic distributions\n                             for all input documents with `n` being the documents\n                             and `m` the topics.\n        topic_token_distributions: A list of `t` x `m` arrays with `t` being the\n                                   number of tokens for the respective document\n                                   and `m` the topics.\n\n    Examples:\n\n    After fitting the model, the topic distributions can be calculated regardless\n    of the clustering model and regardless of whether the documents were previously\n    seen or not:\n\n    ```python\n    topic_distr, _ = topic_model.approximate_distribution(docs)\n    ```\n\n    As a result, the topic distributions are calculated in `topic_distr` for the\n    entire document based on a token set with a specific window size and stride.\n\n    If you want to calculate the topic distributions on a token-level:\n\n    ```python\n    topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n    ```\n\n    The `topic_token_distr` then contains, for each token, the best fitting topics.\n    As with `topic_distr`, it can contain multiple topics for a single token.\n    \"\"\"\n    if isinstance(documents, str):\n        documents = [documents]\n\n    if batch_size is None:\n        batch_size = len(documents)\n        batches = 1\n    else:\n        batches = math.ceil(len(documents)/batch_size)\n\n    topic_distributions = []\n    topic_token_distributions = []\n\n    for i in tqdm(range(batches), disable=not self.verbose):\n        doc_set = documents[i*batch_size: (i+1) * batch_size]\n\n        # Extract tokens\n        analyzer = self.vectorizer_model.build_tokenizer()\n        tokens = [analyzer(document) for document in doc_set]\n\n        # Extract token sets\n        all_sentences = []\n        all_indices = [0]\n        all_token_sets_ids = []\n\n        for tokenset in tokens:\n            if len(tokenset) < window:\n                token_sets = [tokenset]\n                token_sets_ids = [list(range(len(tokenset)))]\n            else:\n\n                # Extract tokensets using window and stride parameters\n                stride_indices = list(range(len(tokenset)))[::stride]\n                token_sets = []\n                token_sets_ids = []\n                for stride_index in stride_indices:\n                    selected_tokens = tokenset[stride_index: stride_index+window]\n\n                    if padding or len(selected_tokens) == window:\n                        token_sets.append(selected_tokens)\n                        token_sets_ids.append(list(range(stride_index, stride_index+len(selected_tokens))))\n\n                # Add empty tokens at the beginning and end of a document\n                if padding:\n                    padded = []\n                    padded_ids = []\n                    t = math.ceil(window / stride) - 1\n                    for i in range(math.ceil(window / stride) - 1):\n                        padded.append(tokenset[:window - ((t-i) * stride)])\n                        padded_ids.append(list(range(0, window - ((t-i) * stride))))\n\n                    token_sets = padded + token_sets\n                    token_sets_ids = padded_ids + token_sets_ids\n\n            # Join the tokens\n       
     sentences = [separator.join(token) for token in token_sets]\n            all_sentences.extend(sentences)\n            all_token_sets_ids.extend(token_sets_ids)\n            all_indices.append(all_indices[-1] + len(sentences))\n\n        # Calculate similarity between embeddings of token sets and the topics\n        if use_embedding_model:\n            embeddings = self._extract_embeddings(all_sentences, method=\"document\", verbose=True)\n            similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers:])\n\n        # Calculate similarity between c-TF-IDF of token sets and the topics\n        else:\n            bow_doc = self.vectorizer_model.transform(all_sentences)\n            c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n            similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:])\n\n        # Only keep similarities that exceed the minimum\n        similarity[similarity < min_similarity] = 0\n\n        # Aggregate results on an individual token level\n        if calculate_tokens:\n            topic_distribution = []\n            topic_token_distribution = []\n            for index, token in enumerate(tokens):\n                start = all_indices[index]\n                end = all_indices[index+1]\n\n                if start == end:\n                    end = end + 1\n\n                # Assign topics to individual tokens\n                token_id = [i for i in range(len(token))]\n                token_val = {index: [] for index in token_id}\n                for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]):\n                    for token in token_set:\n                        if token in token_val:\n                            token_val[token].append(sim)\n\n                matrix = []\n                for _, value in token_val.items():\n                    matrix.append(np.add.reduce(value))\n\n                # Take empty documents into account\n                matrix = np.array(matrix)\n                if len(matrix.shape) == 1:\n                    matrix = np.zeros((1, len(self.topic_labels_) - self._outliers))\n\n                topic_token_distribution.append(np.array(matrix))\n                topic_distribution.append(np.add.reduce(matrix))\n\n            topic_distribution = normalize(topic_distribution, norm='l1', axis=1)\n\n        # Aggregate on a tokenset level indicated by the window and stride\n        else:\n            topic_distribution = []\n            for index in range(len(all_indices)-1):\n                start = all_indices[index]\n                end = all_indices[index+1]\n\n                if start == end:\n                    end = end + 1\n                group = similarity[start:end].sum(axis=0)\n                topic_distribution.append(group)\n            topic_distribution = normalize(np.array(topic_distribution), norm='l1', axis=1)\n            topic_token_distribution = None\n\n        # Combine results\n        topic_distributions.append(topic_distribution)\n        if topic_token_distribution is None:\n            topic_token_distributions = None\n        else:\n            topic_token_distributions.extend(topic_token_distribution)\n\n    topic_distributions = np.vstack(topic_distributions)\n\n    return topic_distributions, topic_token_distributions\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.find_topics","title":"find_topics(self, search_term=None, image=None, top_n=5)","text":"

        Find topics most similar to a search_term

        Creates an embedding for search_term and compares that with the topic embeddings. The most similar topics are returned along with their similarity values.

        The search_term can be of any size but since it is compared with the topic representation it is advised to keep it below 5 words.

        Parameters:

        Name Type Description Default search_term str

        the term you want to use to search for topics.

        None top_n int

        the number of topics to return

        5

        Returns:

        Type Description similar_topics

        the most similar topics from high to low. similarity: the similarity scores from high to low

        Examples:

        You can use the underlying embedding model to find topics that best represent the search term:

        topics, similarity = topic_model.find_topics(\"sports\", top_n=5)\n

        Note that the search query is typically more accurate if the search_term consists of a phrase or multiple words.
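
        For instance, a short descriptive phrase (the query below is purely illustrative) often retrieves topics more reliably than a single word:

        topics, similarity = topic_model.find_topics(\"motorcycles and biking gear\", top_n=5)\n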

        Source code in bertopic\\_bertopic.py
        def find_topics(self,\n                search_term: str = None,\n                image: str = None,\n                top_n: int = 5) -> Tuple[List[int], List[float]]:\n    \"\"\" Find topics most similar to a search_term\n\n    Creates an embedding for search_term and compares that with\n    the topic embeddings. The most similar topics are returned\n    along with their similarity values.\n\n    The search_term can be of any size but since it is compared\n    with the topic representation it is advised to keep it\n    below 5 words.\n\n    Arguments:\n        search_term: the term you want to use to search for topics.\n        top_n: the number of topics to return\n\n    Returns:\n        similar_topics: the most similar topics from high to low\n        similarity: the similarity scores from high to low\n\n    Examples:\n\n    You can use the underlying embedding model to find topics that\n    best represent the search term:\n\n    ```python\n    topics, similarity = topic_model.find_topics(\"sports\", top_n=5)\n    ```\n\n    Note that the search query is typically more accurate if the\n    search_term consists of a phrase or multiple words.\n    \"\"\"\n    if self.embedding_model is None:\n        raise Exception(\"This method can only be used if you did not use custom embeddings.\")\n\n    topic_list = list(self.topic_representations_.keys())\n    topic_list.sort()\n\n    # Extract search_term embeddings and compare with topic embeddings\n    if search_term is not None:\n        search_embedding = self._extract_embeddings([search_term],\n                                                    method=\"word\",\n                                                    verbose=False).flatten()\n    elif image is not None:\n        search_embedding = self._extract_embeddings([None],\n                                                    images=[image],\n                                                    method=\"document\",\n                                                    verbose=False).flatten()\n    sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten()\n\n    # Extract topics most similar to search_term\n    ids = np.argsort(sims)[-top_n:]\n    similarity = [sims[i] for i in ids][::-1]\n    similar_topics = [topic_list[index] for index in ids][::-1]\n\n    return similar_topics, similarity\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.fit","title":"fit(self, documents, embeddings=None, images=None, y=None)","text":"

        Fit the models (BERT, UMAP, and HDBSCAN) on a collection of documents and generate topics.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents to fit on

        required embeddings ndarray

        Pre-trained document embeddings. These can be used instead of the sentence-transformer model

        None images List[str]

        A list of paths to the images to fit on or the images themselves

        None y Union[List[int], numpy.ndarray]

        The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified.

        None

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic().fit(docs)\n

        If you want to use your own embeddings, use it as follows:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Create embeddings\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# Create topic model\ntopic_model = BERTopic().fit(docs, embeddings)\n
        Source code in bertopic\\_bertopic.py
        def fit(self,\n        documents: List[str],\n        embeddings: np.ndarray = None,\n        images: List[str] = None,\n        y: Union[List[int], np.ndarray] = None):\n    \"\"\" Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics\n\n    Arguments:\n        documents: A list of documents to fit on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model\n        images: A list of paths to the images to fit on or the images themselves\n        y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n           specific instance is specified.\n\n    Examples:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic().fit(docs)\n    ```\n\n    If you want to use your own embeddings, use it as follows:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    # Create embeddings\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n    # Create topic model\n    topic_model = BERTopic().fit(docs, embeddings)\n    ```\n    \"\"\"\n    self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images)\n    return self\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.fit_transform","title":"fit_transform(self, documents, embeddings=None, images=None, y=None)","text":"

        Fit the models on a collection of documents, generate topics, and return the probabilities and topic per document.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents to fit on

        required embeddings ndarray

        Pre-trained document embeddings. These can be used instead of the sentence-transformer model

        None images List[str]

        A list of paths to the images to fit on or the images themselves

        None y Union[List[int], numpy.ndarray]

        The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified.

        None

        Returns:

        Type Description predictions

        Topic predictions for each document. probabilities: The probability of the assigned topic per document. If calculate_probabilities in BERTopic is set to True, then it calculates the probabilities of all topics across all documents instead of only the assigned topic. This, however, slows down computation and may increase memory usage.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        If you want to use your own embeddings, use it as follows:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Create embeddings\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# Create topic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n
        Source code in bertopic\\_bertopic.py
        def fit_transform(self,\n                  documents: List[str],\n                  embeddings: np.ndarray = None,\n                  images: List[str] = None,\n                  y: Union[List[int], np.ndarray] = None) -> Tuple[List[int],\n                                                                   Union[np.ndarray, None]]:\n    \"\"\" Fit the models on a collection of documents, generate topics,\n    and return the probabilities and topic per document.\n\n    Arguments:\n        documents: A list of documents to fit on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model\n        images: A list of paths to the images to fit on or the images themselves\n        y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n           specific instance is specified.\n\n    Returns:\n        predictions: Topic predictions for each documents\n        probabilities: The probability of the assigned topic per document.\n                       If `calculate_probabilities` in BERTopic is set to True, then\n                       it calculates the probabilities of all topics across all documents\n                       instead of only the assigned topic. This, however, slows down\n                       computation and may increase memory usage.\n\n    Examples:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    ```\n\n    If you want to use your own embeddings, use it as follows:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    # Create embeddings\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n    # Create topic model\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs, embeddings)\n    ```\n    \"\"\"\n    if documents is not None:\n        check_documents_type(documents)\n        check_embeddings_shape(embeddings, documents)\n\n    doc_ids = range(len(documents)) if documents is not None else range(len(images))\n    documents = pd.DataFrame({\"Document\": documents,\n                              \"ID\": doc_ids,\n                              \"Topic\": None,\n                              \"Image\": images})\n\n    # Extract embeddings\n    if embeddings is None:\n        logger.info(\"Embedding - Transforming documents to embeddings.\")\n        self.embedding_model = select_backend(self.embedding_model,\n                                              language=self.language)\n        embeddings = self._extract_embeddings(documents.Document.values.tolist(),\n                                              images=images,\n                                              method=\"document\",\n                                              verbose=self.verbose)\n        logger.info(\"Embedding - Completed \\u2713\")\n    else:\n        if self.embedding_model is not None:\n            self.embedding_model = select_backend(self.embedding_model,\n                                                  language=self.language)\n\n    # Guided Topic Modeling\n    if self.seed_topic_list is not None and 
self.embedding_model is not None:\n        y, embeddings = self._guided_topic_modeling(embeddings)\n\n    # Zero-shot Topic Modeling\n    if self._is_zeroshot():\n        documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(documents, embeddings)\n        if documents is None:\n            return self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings)\n\n    # Reduce dimensionality\n    umap_embeddings = self._reduce_dimensionality(embeddings, y)\n\n    # Cluster reduced embeddings\n    documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)\n\n    # Sort and Map Topic IDs by their frequency\n    if not self.nr_topics:\n        documents = self._sort_mappings_by_frequency(documents)\n\n    # Create documents from images if we have images only\n    if documents.Document.values[0] is None:\n        custom_documents = self._images_to_text(documents, embeddings)\n\n        # Extract topics by calculating c-TF-IDF\n        self._extract_topics(custom_documents, embeddings=embeddings)\n        self._create_topic_vectors(documents=documents, embeddings=embeddings)\n\n        # Reduce topics\n        if self.nr_topics:\n            custom_documents = self._reduce_topics(custom_documents)\n\n        # Save the top 3 most representative documents per topic\n        self._save_representative_docs(custom_documents)\n    else:\n        # Extract topics by calculating c-TF-IDF\n        self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)\n\n        # Reduce topics\n        if self.nr_topics:\n            documents = self._reduce_topics(documents)\n\n        # Save the top 3 most representative documents per topic\n        self._save_representative_docs(documents)\n\n    # Resulting output\n    self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)\n    predictions = documents.Topic.to_list()\n\n    # Combine Zero-shot with outliers\n    if self._is_zeroshot() and len(documents) != len(doc_ids):\n        predictions = self._combine_zeroshot_topics(documents, assigned_documents, assigned_embeddings)\n\n    return predictions, self.probabilities_\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.generate_topic_labels","title":"generate_topic_labels(self, nr_words=3, topic_prefix=True, word_length=None, separator='_', aspect=None)","text":"

        Get labels for each topic in a user-defined format

        Parameters:

        Name Type Description Default nr_words int

        Top n words per topic to use

        3 topic_prefix bool

        Whether to use the topic ID as a prefix. If set to True, the topic ID will be separated using the separator

        True word_length int

        The maximum length of each word in the topic label. Some words might be relatively long and setting this value helps to make sure that all labels have relatively similar lengths.

        None separator str

        The string with which the words and topic prefix will be separated. Underscores are the default but a nice alternative is \", \".

        '_' aspect str

        The aspect from which to generate topic labels

        None

        Returns:

        Type Description topic_labels

        A list of topic labels sorted from the lowest topic ID to the highest. If the topic model was trained using HDBSCAN, the lowest topic ID is -1, otherwise it is 0.

        Examples:

        To create our custom topic labels, usage is rather straightforward:

        topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=\", \")\n
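
        Similarly, the topic ID prefix can be dropped and the words joined with spaces (illustrative):

        topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=\" \")\n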
        Source code in bertopic\\_bertopic.py
        def generate_topic_labels(self,\n                          nr_words: int = 3,\n                          topic_prefix: bool = True,\n                          word_length: int = None,\n                          separator: str = \"_\",\n                          aspect: str = None) -> List[str]:\n    \"\"\" Get labels for each topic in a user-defined format\n\n    Arguments:\n        nr_words: Top `n` words per topic to use\n        topic_prefix: Whether to use the topic ID as a prefix.\n                      If set to True, the topic ID will be separated\n                      using the `separator`\n        word_length: The maximum length of each word in the topic label.\n                     Some words might be relatively long and setting this\n                     value helps to make sure that all labels have relatively\n                     similar lengths.\n        separator: The string with which the words and topic prefix will be\n                   separated. Underscores are the default but a nice alternative\n                   is `\", \"`.\n        aspect: The aspect from which to generate topic labels\n\n    Returns:\n        topic_labels: A list of topic labels sorted from the lowest topic ID to the highest.\n                      If the topic model was trained using HDBSCAN, the lowest topic ID is -1,\n                      otherwise it is 0.\n\n    Examples:\n\n    To create our custom topic labels, usage is rather straightforward:\n\n    ```python\n    topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=\", \")\n    ```\n    \"\"\"\n    unique_topics = sorted(set(self.topics_))\n\n    topic_labels = []\n    for topic in unique_topics:\n        if aspect:\n            words, _ = zip(*self.topic_aspects_[aspect][topic])\n        else:\n            words, _ = zip(*self.get_topic(topic))\n\n        if word_length:\n            words = [word[:word_length] for word in words][:nr_words]\n        else:\n            words = list(words)[:nr_words]\n\n        if topic_prefix:\n            topic_label = f\"{topic}{separator}\" + separator.join(words)\n        else:\n            topic_label = separator.join(words)\n\n        topic_labels.append(topic_label)\n\n    return topic_labels\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_document_info","title":"get_document_info(self, docs, df=None, metadata=None)","text":"

        Get information about the documents on which the topic model was trained, including the documents themselves, their respective topics, the name of each topic, the top n words of each topic, whether it is a representative document, and the probability of the clustering if the cluster model supports it.

        There are also options to include other meta data, such as the topic distributions or the x and y coordinates of the reduced embeddings.

        Parameters:

        Name Type Description Default docs List[str]

        The documents on which the topic model was trained.

        required df DataFrame

        A dataframe containing the metadata and the documents on which the topic model was originally trained.

        None metadata Mapping[str, Any]

        A dictionary with meta data for each document in the form of column name (key) and the respective values (value).

        None

        Returns:

        Type Description document_info

        A dataframe with several statistics regarding the documents on which the topic model was trained.

        Usage:

        To get the document info, you will only need to pass the documents on which the topic model was trained:

        document_info = topic_model.get_document_info(docs)\n

        There are additional options to include metadata, such as the topic distributions. Moreover, we can pass the original dataframe that contains the documents and extend it with the information retrieved from BERTopic:

        from sklearn.datasets import fetch_20newsgroups\n\n# The original data in a dataframe format to include the target variable\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndf = pd.DataFrame({\"Document\": data['data'], \"Class\": data['target']})\n\n# Add information about the percentage of the document that relates to the topic\ntopic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)\ndistributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)]\n\n# Create our documents dataframe using the original dataframe and meta data about\n# the topic distributions\ndocument_info = topic_model.get_document_info(docs, df=df,\n                                              metadata={\"Topic_distribution\": distributions})\n
        Source code in bertopic\\_bertopic.py
        def get_document_info(self,\n                      docs: List[str],\n                      df: pd.DataFrame = None,\n                      metadata: Mapping[str, Any] = None) -> pd.DataFrame:\n    \"\"\" Get information about the documents on which the topic was trained\n    including the documents themselves, their respective topics, the name\n    of each topic, the top n words of each topic, whether it is a\n    representative document, and probability of the clustering if the cluster\n    model supports it.\n\n    There are also options to include other meta data, such as the topic\n    distributions or the x and y coordinates of the reduced embeddings.\n\n    Arguments:\n        docs: The documents on which the topic model was trained.\n        df: A dataframe containing the metadata and the documents on which\n            the topic model was originally trained on.\n        metadata: A dictionary with meta data for each document in the form\n                  of column name (key) and the respective values (value).\n\n    Returns:\n        document_info: A dataframe with several statistics regarding\n                       the documents on which the topic model was trained.\n\n    Usage:\n\n    To get the document info, you will only need to pass the documents on which\n    the topic model was trained:\n\n    ```python\n    document_info = topic_model.get_document_info(docs)\n    ```\n\n    There are additionally options to include meta data, such as the topic\n    distributions. Moreover, we can pass the original dataframe that contains\n    the documents and extend it with the information retrieved from BERTopic:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n\n    # The original data in a dataframe format to include the target variable\n    data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\n    df = pd.DataFrame({\"Document\": data['data'], \"Class\": data['target']})\n\n    # Add information about the percentage of the document that relates to the topic\n    topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)\n    distributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)]\n\n    # Create our documents dataframe using the original dataframe and meta data about\n    # the topic distributions\n    document_info = topic_model.get_document_info(docs, df=df,\n                                                  metadata={\"Topic_distribution\": distributions})\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    if df is not None:\n        document_info = df.copy()\n        document_info[\"Document\"] = docs\n        document_info[\"Topic\"] = self.topics_\n    else:\n        document_info = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_})\n\n    # Add topic info through `.get_topic_info()`\n    topic_info = self.get_topic_info().drop(\"Count\", axis=1)\n    document_info = pd.merge(document_info, topic_info, on=\"Topic\", how=\"left\")\n\n    # Add top n words\n    top_n_words = {topic: \" - \".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)}\n    document_info[\"Top_n_words\"] = document_info.Topic.map(top_n_words)\n\n    # Add flat probabilities\n    if self.probabilities_ is not None:\n        if len(self.probabilities_.shape) == 1:\n            document_info[\"Probability\"] = self.probabilities_\n        else:\n            document_info[\"Probability\"] = [max(probs) if topic != -1 else 1-sum(probs)\n                   
                         for topic, probs in zip(self.topics_, self.probabilities_)]\n\n    # Add representative document labels\n    repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs]\n    document_info[\"Representative_document\"] = False\n    document_info.loc[document_info.Document.isin(repr_docs), \"Representative_document\"] = True\n\n    # Add custom meta data provided by the user\n    if metadata is not None:\n        for column, values in metadata.items():\n            document_info[column] = values\n    return document_info\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_params","title":"get_params(self, deep=False)","text":"

        Get parameters for this estimator.

        Adapted from: https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178

        Parameters:

        Name Type Description Default deep bool

        If True, will return the parameters for this estimator and contained subobjects that are estimators.

        False

        Returns:

        Type Description out

        Parameter names mapped to their values.
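
        Examples:

        To inspect the parameters of a fitted topic model, a minimal sketch:

        params = topic_model.get_params()\n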

        Source code in bertopic\\_bertopic.py
        def get_params(self, deep: bool = False) -> Mapping[str, Any]:\n    \"\"\" Get parameters for this estimator.\n\n    Adapted from:\n        https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178\n\n    Arguments:\n        deep: bool, default=True\n              If True, will return the parameters for this estimator and\n              contained subobjects that are estimators.\n\n    Returns:\n        out: Parameter names mapped to their values.\n    \"\"\"\n    out = dict()\n    for key in self._get_param_names():\n        value = getattr(self, key)\n        if deep and hasattr(value, 'get_params'):\n            deep_items = value.get_params().items()\n            out.update((key + '__' + k, val) for k, val in deep_items)\n        out[key] = value\n    return out\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_representative_docs","title":"get_representative_docs(self, topic=None)","text":"

        Extract the best representing documents per topic.

        Note

        This does not extract all documents per topic, as not all documents are saved within BERTopic. To get all documents, please run the following:

        # When you used `.fit_transform`:\ndf = pd.DataFrame({\"Document\": docs, \"Topic\": topic})\n\n# When you used `.fit`:\ndf = pd.DataFrame({\"Document\": docs, \"Topic\": topic_model.topics_})\n

        Parameters:

        Name Type Description Default topic int

        A specific topic for which you want the representative documents

        None

        Returns:

        Type Description List[str]

        Representative documents of the chosen topic

        Examples:

        To extract the representative docs of all topics:

        representative_docs = topic_model.get_representative_docs()\n

        To get the representative docs of a single topic:

        representative_docs = topic_model.get_representative_docs(12)\n
        Source code in bertopic\\_bertopic.py
        def get_representative_docs(self, topic: int = None) -> List[str]:\n    \"\"\" Extract the best representing documents per topic.\n\n    NOTE:\n        This does not extract all documents per topic as all documents\n        are not saved within BERTopic. To get all documents, please\n        run the following:\n\n        ```python\n        # When you used `.fit_transform`:\n        df = pd.DataFrame({\"Document\": docs, \"Topic\": topic})\n\n        # When you used `.fit`:\n        df = pd.DataFrame({\"Document\": docs, \"Topic\": topic_model.topics_})\n        ```\n\n    Arguments:\n        topic: A specific topic for which you want\n               the representative documents\n\n    Returns:\n        Representative documents of the chosen topic\n\n    Examples:\n\n    To extract the representative docs of all topics:\n\n    ```python\n    representative_docs = topic_model.get_representative_docs()\n    ```\n\n    To get the representative docs of a single topic:\n\n    ```python\n    representative_docs = topic_model.get_representative_docs(12)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    if isinstance(topic, int):\n        if self.representative_docs_.get(topic):\n            return self.representative_docs_[topic]\n        else:\n            return None\n    else:\n        return self.representative_docs_\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic","title":"get_topic(self, topic, full=False)","text":"

        Return top n words for a specific topic and their c-TF-IDF scores

        Parameters:

        Name Type Description Default topic int

        A specific topic for which you want its representation

        required full bool

        If True, returns all different forms of topic representations for a topic, including aspects

        False

        Returns:

        Type Description Union[Mapping[str, Tuple[str, float]], bool]

        The top n words for a specific topic and their respective c-TF-IDF scores

        Examples:

        topic = topic_model.get_topic(12)\n
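
        If aspect-based representations were also computed, you can request all representation forms at once; a minimal sketch:

        all_representations = topic_model.get_topic(12, full=True)\n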
        Source code in bertopic\\_bertopic.py
        def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]:\n    \"\"\" Return top n words for a specific topic and their c-TF-IDF scores\n\n    Arguments:\n        topic: A specific topic for which you want its representation\n        full: If True, returns all different forms of topic representations\n              for a topic, including aspects\n\n    Returns:\n        The top n words for a specific word and its respective c-TF-IDF scores\n\n    Examples:\n\n    ```python\n    topic = topic_model.get_topic(12)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    if topic in self.topic_representations_:\n        if full:\n            representations = {\"Main\": self.topic_representations_[topic]}\n            aspects = {aspect: representations[topic] for aspect, representations in self.topic_aspects_.items()}\n            representations.update(aspects)\n            return representations\n        else:\n            return self.topic_representations_[topic]\n    else:\n        return False\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic_freq","title":"get_topic_freq(self, topic=None)","text":"

        Return the size of topics (descending order)

        Parameters:

        Name Type Description Default topic int

        A specific topic for which you want the frequency

        None

        Returns:

        Type Description Union[pandas.core.frame.DataFrame, int]

        Either the frequency of a single topic or a dataframe with the frequencies of all topics

        Examples:

        To extract the frequency of all topics:

        frequency = topic_model.get_topic_freq()\n

        To get the frequency of a single topic:

        frequency = topic_model.get_topic_freq(12)\n
        Source code in bertopic\\_bertopic.py
        def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:\n    \"\"\" Return the size of topics (descending order)\n\n    Arguments:\n        topic: A specific topic for which you want the frequency\n\n    Returns:\n        Either the frequency of a single topic or dataframe with\n        the frequencies of all topics\n\n    Examples:\n\n    To extract the frequency of all topics:\n\n    ```python\n    frequency = topic_model.get_topic_freq()\n    ```\n\n    To get the frequency of a single topic:\n\n    ```python\n    frequency = topic_model.get_topic_freq(12)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    if isinstance(topic, int):\n        return self.topic_sizes_[topic]\n    else:\n        return pd.DataFrame(self.topic_sizes_.items(), columns=['Topic', 'Count']).sort_values(\"Count\",\n                                                                                               ascending=False)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic_info","title":"get_topic_info(self, topic=None)","text":"

        Get information about each topic including its ID, frequency, and name.

        Parameters:

        Name Type Description Default topic int

        A specific topic for which you want the information

        None

        Returns:

        Type Description info

        The information relating to either a single topic or all topics

        Examples:

        info_df = topic_model.get_topic_info()\n
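
        To retrieve the information of a single topic instead, pass its ID; a minimal sketch using a hypothetical topic 12:

        topic_12_info = topic_model.get_topic_info(topic=12)\n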
        Source code in bertopic\\_bertopic.py
        def get_topic_info(self, topic: int = None) -> pd.DataFrame:\n    \"\"\" Get information about each topic including its ID, frequency, and name.\n\n    Arguments:\n        topic: A specific topic for which you want the frequency\n\n    Returns:\n        info: The information relating to either a single topic or all topics\n\n    Examples:\n\n    ```python\n    info_df = topic_model.get_topic_info()\n    ```\n    \"\"\"\n    check_is_fitted(self)\n\n    info = pd.DataFrame(self.topic_sizes_.items(), columns=[\"Topic\", \"Count\"]).sort_values(\"Topic\")\n    info[\"Name\"] = info.Topic.map(self.topic_labels_)\n\n    # Custom label\n    if self.custom_labels_ is not None:\n        if len(self.custom_labels_) == len(info):\n            labels = {topic - self._outliers: label for topic, label in enumerate(self.custom_labels_)}\n            info[\"CustomName\"] = info[\"Topic\"].map(labels)\n\n    # Main Keywords\n    values = {topic: list(list(zip(*values))[0]) for topic, values in self.topic_representations_.items()}\n    info[\"Representation\"] = info[\"Topic\"].map(values)\n\n    # Extract all topic aspects\n    if self.topic_aspects_:\n        for aspect, values in self.topic_aspects_.items():\n            if isinstance(list(values.values())[-1], list):\n                if isinstance(list(values.values())[-1][0], tuple) or isinstance(list(values.values())[-1][0], list):\n                    values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()}\n                elif isinstance(list(values.values())[-1][0], str):\n                    values = {topic: \" \".join(value).strip() for topic, value in values.items()}\n            info[aspect] = info[\"Topic\"].map(values)\n\n    # Representative Docs / Images\n    if self.representative_docs_ is not None:\n        info[\"Representative_Docs\"] = info[\"Topic\"].map(self.representative_docs_)\n    if self.representative_images_ is not None:\n        info[\"Representative_Images\"] = info[\"Topic\"].map(self.representative_images_)\n\n    # Select specific topic to return\n    if topic is not None:\n        info = info.loc[info.Topic == topic, :]\n\n    return info.reset_index(drop=True)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic_tree","title":"get_topic_tree(hier_topics, max_distance=None, tight_layout=False) staticmethod","text":"

        Extract the topic tree such that it can be printed

        Parameters:

        Name Type Description Default hier_topics DataFrame

        A dataframe containing the structure of the topic tree. This is the output of topic_model.hierarchical_topics()

        required max_distance float

        The maximum distance between two topics. This value is based on the Distance column in hier_topics.

        None tight_layout bool

        Whether to use a tight layout (narrow width) for easier readability if you have hundreds of topics.

        False

        Returns:

        Type Description A tree that has the following structure when printed

        . . \u2514\u2500health_medical_disease_patients_hiv \u251c\u2500patients_medical_disease_candida_health \u2502 \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48 \u2502 \u2514\u2500patients_disease_cancer_medical_doctor \u2502 \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34 \u2502 \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26 \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9

        The blocks (\u25a0) indicate that the topic is one you can directly access from topic_model.get_topic. In other words, they are the original un-grouped topics.

        Examples:

        # Train model\nfrom bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n\n# Print topic tree\ntree = topic_model.get_topic_tree(hierarchical_topics)\nprint(tree)\n
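
        With hundreds of topics, a narrower layout can make the printed tree easier to read; a minimal sketch:

        tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout=True)\nprint(tree)\n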
        Source code in bertopic\\_bertopic.py
        @staticmethod\ndef get_topic_tree(hier_topics: pd.DataFrame,\n                   max_distance: float = None,\n                   tight_layout: bool = False) -> str:\n    \"\"\" Extract the topic tree such that it can be printed\n\n    Arguments:\n        hier_topics: A dataframe containing the structure of the topic tree.\n                     This is the output of `topic_model.hierachical_topics()`\n        max_distance: The maximum distance between two topics. This value is\n                      based on the Distance column in `hier_topics`.\n        tight_layout: Whether to use a tight layout (narrow width) for\n                      easier readability if you have hundreds of topics.\n\n    Returns:\n        A tree that has the following structure when printed:\n            .\n            .\n            \u2514\u2500health_medical_disease_patients_hiv\n                \u251c\u2500patients_medical_disease_candida_health\n                \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n                \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n                \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n                \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n                \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n\n        The blocks (\u25a0) indicate that the topic is one you can directly access\n        from `topic_model.get_topic`. In other words, they are the original un-grouped topics.\n\n    Examples:\n\n    ```python\n    # Train model\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Print topic tree\n    tree = topic_model.get_topic_tree(hierarchical_topics)\n    print(tree)\n    ```\n    \"\"\"\n    width = 1 if tight_layout else 4\n    if max_distance is None:\n        max_distance = hier_topics.Distance.max() + 1\n\n    max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1\n\n    # Extract mapping from ID to name\n    topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name))\n    topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name)))\n    topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()}\n\n    # Create tree\n    tree = {str(row[1].Parent_ID): [str(row[1].Child_Left_ID), str(row[1].Child_Right_ID)]\n            for row in hier_topics.iterrows()}\n\n    def get_tree(start, tree):\n        \"\"\" Based on: https://stackoverflow.com/a/51920869/10532563 \"\"\"\n\n        def _tree(to_print, start, parent, tree, grandpa=None, indent=\"\"):\n\n            # Get distance between merged topics\n            distance = hier_topics.loc[(hier_topics.Child_Left_ID == parent) |\n                                       (hier_topics.Child_Right_ID == parent), \"Distance\"]\n            distance = distance.values[0] if len(distance) > 0 else 10\n\n            if parent != start:\n                if grandpa is None:\n                    to_print += topic_to_name[parent]\n                else:\n                    if int(parent) <= max_original_topic:\n\n                        # Do not append topic ID if they are not merged\n                        if distance < max_distance:\n                            
to_print += \"\u25a0\u2500\u2500\" + topic_to_name[parent] + f\" \u2500\u2500 Topic: {parent}\" + \"\\n\"\n                        else:\n                            to_print += \"O \\n\"\n                    else:\n                        to_print += topic_to_name[parent] + \"\\n\"\n\n            if parent not in tree:\n                return to_print\n\n            for child in tree[parent][:-1]:\n                to_print += indent + \"\u251c\" + \"\u2500\"\n                to_print = _tree(to_print, start, child, tree, parent, indent + \"\u2502\" + \" \" * width)\n\n            child = tree[parent][-1]\n            to_print += indent + \"\u2514\" + \"\u2500\"\n            to_print = _tree(to_print, start, child, tree, parent, indent + \" \" * (width+1))\n\n            return to_print\n\n        to_print = \".\" + \"\\n\"\n        to_print = _tree(to_print, start, start, tree)\n        return to_print\n\n    start = str(hier_topics.Parent_ID.astype(int).max())\n    return get_tree(start, tree)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topics","title":"get_topics(self, full=False)","text":"

        Return topics with their top n words and corresponding c-TF-IDF scores

        Parameters:

        Name Type Description Default full bool

        If True, returns all different forms of topic representations for each topic, including aspects

        False

        Returns:

        Type Description self.topic_representations_

        The top n words per topic and the corresponding c-TF-IDF score

        Examples:

        all_topics = topic_model.get_topics()\n
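
        To also retrieve the aspect-based representations per topic, assuming they were computed, a minimal sketch:

        all_representations = topic_model.get_topics(full=True)\n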
        Source code in bertopic\\_bertopic.py
        def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:\n    \"\"\" Return topics with top n words and their c-TF-IDF score\n\n    Arguments:\n        full: If True, returns all different forms of topic representations\n              for each topic, including aspects\n\n    Returns:\n        self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score\n\n    Examples:\n\n    ```python\n    all_topics = topic_model.get_topics()\n    ```\n    \"\"\"\n    check_is_fitted(self)\n\n    if full:\n        topic_representations = {\"Main\": self.topic_representations_}\n        topic_representations.update(self.topic_aspects_)\n        return topic_representations\n    else:\n        return self.topic_representations_\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.hierarchical_topics","title":"hierarchical_topics(self, docs, linkage_function=None, distance_function=None)","text":"

        Create a hierarchy of topics

        To create this hierarchy, BERTopic needs to be already fitted once. Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF representation using scipy.cluster.hierarchy.linkage.

        Based on that hierarchy, we calculate the topic representation at each merged step. This is a local representation, as we only assume that the chosen step is merged and not all others, which typically improves the topic representation.

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required linkage_function Callable[[scipy.sparse._csr.csr_matrix], numpy.ndarray]

        The linkage function to use. Default is: lambda x: sch.linkage(x, 'ward', optimal_ordering=True)

        None distance_function Callable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix]

        The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x). You can pass any function that returns either a square matrix of shape (n_samples, n_samples) with zeros on the diagonal and non-negative values, or a condensed distance matrix of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangle of the distance matrix.

        None

        Returns:

        Type Description hierarchical_topics

        A dataframe that contains a hierarchy of topics represented by their parents and their children

        Examples:

        from bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n

        A custom linkage function can be used as follows:

        from scipy.cluster import hierarchy as sch\nfrom bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Hierarchical topics\nlinkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\nhierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n
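
        Similarly, a custom distance function can be supplied, as long as it returns one of the matrix shapes described above; a minimal sketch using Euclidean distances on the c-TF-IDF matrix:

        from sklearn.metrics.pairwise import euclidean_distances\nfrom bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Hierarchical topics with a custom (Euclidean) distance function\ndistance_function = lambda x: euclidean_distances(x)\nhierarchical_topics = topic_model.hierarchical_topics(docs, distance_function=distance_function)\n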
        Source code in bertopic\\_bertopic.py
        def hierarchical_topics(self,\n                        docs: List[str],\n                        linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n                        distance_function: Callable[[csr_matrix], csr_matrix] = None) -> pd.DataFrame:\n    \"\"\" Create a hierarchy of topics\n\n    To create this hierarchy, BERTopic needs to be already fitted once.\n    Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF\n    representation using `scipy.cluster.hierarchy.linkage`.\n\n    Based on that hierarchy, we calculate the topic representation at each\n    merged step. This is a local representation, as we only assume that the\n    chosen step is merged and not all others which typically improves the\n    topic representation.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        linkage_function: The linkage function to use. Default is:\n                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n        distance_function: The distance function to use on the c-TF-IDF matrix. Default is:\n                           `lambda x: 1 - cosine_similarity(x)`.\n                           You can pass any function that returns either a square matrix of \n                           shape (n_samples, n_samples) with zeros on the diagonal and \n                           non-negative values or condensed distance matrix of shape\n                           (n_samples * (n_samples - 1) / 2,) containing the upper\n                           triangular of the distance matrix.\n\n    Returns:\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children\n\n    Examples:\n\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n    ```\n\n    A custom linkage function can be used as follows:\n\n    ```python\n    from scipy.cluster import hierarchy as sch\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n\n    # Hierarchical topics\n    linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\n    hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    if distance_function is None:\n        distance_function = lambda x: 1 - cosine_similarity(x)\n\n    if linkage_function is None:\n        linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\n\n    # Calculate distance\n    embeddings = self.c_tf_idf_[self._outliers:]\n    X = distance_function(embeddings)\n    X = validate_distance_matrix(X, embeddings.shape[0])\n\n    # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix\n    Z = linkage_function(X)\n\n    # Calculate basic bag-of-words to be iteratively merged later\n    documents = pd.DataFrame({\"Document\": docs,\n                              \"ID\": range(len(docs)),\n                              \"Topic\": self.topics_})\n    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})\n    documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :]\n    clean_documents = self._preprocess_text(documents_per_topic.Document.values)\n\n    # Scikit-Learn 
Deprecation: get_feature_names is deprecated in 1.0\n    # and will be removed in 1.2. Please use get_feature_names_out instead.\n    if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n        words = self.vectorizer_model.get_feature_names_out()\n    else:\n        words = self.vectorizer_model.get_feature_names()\n\n    bow = self.vectorizer_model.transform(clean_documents)\n\n    # Extract clusters\n    hier_topics = pd.DataFrame(columns=[\"Parent_ID\", \"Parent_Name\", \"Topics\",\n                                        \"Child_Left_ID\", \"Child_Left_Name\",\n                                        \"Child_Right_ID\", \"Child_Right_Name\"])\n    for index in tqdm(range(len(Z))):\n\n        # Find clustered documents\n        clusters = sch.fcluster(Z, t=Z[index][2], criterion='distance') - self._outliers\n        nr_clusters = len(clusters)\n\n        # Extract first topic we find to get the set of topics in a merged topic\n        topic = None\n        val = Z[index][0]\n        while topic is None:\n            if val - len(clusters) < 0:\n                topic = int(val)\n            else:\n                val = Z[int(val - len(clusters))][0]\n        clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]]\n\n        # Group bow per cluster, calculate c-TF-IDF and extract words\n        grouped = csr_matrix(bow[clustered_topics].sum(axis=0))\n        c_tf_idf = self.ctfidf_model.transform(grouped)\n        selection = documents.loc[documents.Topic.isin(clustered_topics), :]\n        selection.Topic = 0\n        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n\n        # Extract parent's name and ID\n        parent_id = index + len(clusters)\n        parent_name = \"_\".join([x[0] for x in words_per_topic[0]][:5])\n\n        # Extract child's name and ID\n        Z_id = Z[index][0]\n        child_left_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n        if Z_id - nr_clusters < 0:\n            child_left_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n        else:\n            child_left_name = hier_topics.iloc[int(child_left_id)].Parent_Name\n\n        # Extract child's name and ID\n        Z_id = Z[index][1]\n        child_right_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n        if Z_id - nr_clusters < 0:\n            child_right_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n        else:\n            child_right_name = hier_topics.iloc[int(child_right_id)].Parent_Name\n\n        # Save results\n        hier_topics.loc[len(hier_topics), :] = [parent_id, parent_name,\n                                                clustered_topics,\n                                                int(Z[index][0]), child_left_name,\n                                                int(Z[index][1]), child_right_name]\n\n    hier_topics[\"Distance\"] = Z[:, 2]\n    hier_topics = hier_topics.sort_values(\"Parent_ID\", ascending=False)\n    hier_topics[[\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]] = hier_topics[[\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]].astype(str)\n\n    return hier_topics\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.load","title":"load(path, embedding_model=None) classmethod","text":"

        Loads the model from the specified path or directory

        Parameters:

        Name Type Description Default path str

        Either load a BERTopic model from a file (.pickle) or a folder containing .safetensors or .bin files.

        required embedding_model

        Additionally load in an embedding model if it was not saved in the BERTopic model file or directory.

        None

        Examples:

        BERTopic.load(\"model_dir\")\n

        or if you did not save the embedding model:

        BERTopic.load(\"model_dir\", embedding_model=\"all-MiniLM-L6-v2\")\n
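
        If the model was saved as a single pickle file instead, the same method applies; a sketch with a hypothetical file name:

        topic_model = BERTopic.load(\"my_model.pickle\")\n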
        Source code in bertopic\\_bertopic.py
        @classmethod\ndef load(cls,\n         path: str,\n         embedding_model=None):\n    \"\"\" Loads the model from the specified path or directory\n\n    Arguments:\n        path: Either load a BERTopic model from a file (`.pickle`) or a folder containing\n              `.safetensors` or `.bin` files.\n        embedding_model: Additionally load in an embedding model if it was not saved\n                         in the BERTopic model file or directory.\n\n    Examples:\n\n    ```python\n    BERTopic.load(\"model_dir\")\n    ```\n\n    or if you did not save the embedding model:\n\n    ```python\n    BERTopic.load(\"model_dir\", embedding_model=\"all-MiniLM-L6-v2\")\n    ```\n    \"\"\"\n    file_or_dir = Path(path)\n\n    # Load from Pickle\n    if file_or_dir.is_file():\n        with open(file_or_dir, 'rb') as file:\n            if embedding_model:\n                topic_model = joblib.load(file)\n                topic_model.embedding_model = select_backend(embedding_model)\n            else:\n                topic_model = joblib.load(file)\n            return topic_model\n\n    # Load from directory or HF\n    if file_or_dir.is_dir():\n        topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_local_files(file_or_dir)\n    elif \"/\" in str(path):\n        topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path)\n    else:\n        raise ValueError(\"Make sure to either pass a valid directory or HF model.\")\n    topic_model = _create_model_from_files(topics, params, tensors, ctfidf_tensors, ctfidf_config, images,\n                                           warn_no_backend=(embedding_model is None))\n\n    # Replace embedding model if one is specifically chosen\n    if embedding_model is not None:\n        topic_model.embedding_model = select_backend(embedding_model)\n\n    return topic_model\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.merge_models","title":"merge_models(models, min_similarity=0.7, embedding_model=None) classmethod","text":"

        Merge multiple pre-trained BERTopic models into a single model.

        The models are merged as if they were all saved using pytorch or safetensors, i.e., a minimal version without c-TF-IDF.

        To do this, we choose the first model in the list of models as a baseline. Then, we check for each other model whether it contains topics that are not in the baseline. This check is based on the cosine similarity between topic embeddings. If the topic embeddings of two models are similar, the topic of the second model is re-assigned to the corresponding topic of the first. If they are dissimilar, the topic of the second model is added to the first as a new topic.

        In essence, we simply check whether sufficiently \"new\" topics emerge and add them.

        Parameters:

        Name Type Description Default models

        A list of fitted BERTopic models

        required min_similarity float

        The minimum similarity for when topics are merged.

        0.7 embedding_model

        Additionally load in an embedding model if necessary.

        None

        Returns:

        Type Description

        A new BERTopic model that was created as if you were loading a model from the HuggingFace Hub without c-TF-IDF

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# Create three separate models\ntopic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000])\ntopic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000])\ntopic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:])\n\n# Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n
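
        The similarity threshold and a replacement embedding model can also be passed explicitly; a minimal sketch:

        merged_model = BERTopic.merge_models(\n    [topic_model_1, topic_model_2, topic_model_3],\n    min_similarity=0.9,\n    embedding_model=\"all-MiniLM-L6-v2\"\n)\n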
        Source code in bertopic\\_bertopic.py
        @classmethod\ndef merge_models(cls, models, min_similarity: float = .7, embedding_model=None):\n    \"\"\" Merge multiple pre-trained BERTopic models into a single model.\n\n    The models are merged as if they were all saved using pytorch or\n    safetensors, so a minimal version without c-TF-IDF.\n\n    To do this, we choose the first model in the list of\n    models as a baseline. Then, we check each model whether\n    they contain topics that are not in the baseline.\n    This check is based on the cosine similarity between\n    topics embeddings. If topic embeddings between two models\n    are similar, then the topic of the second model is re-assigned\n    to the first. If they are dissimilar, the topic of the second\n    model is assigned to the first.\n\n    In essence, we simply check whether sufficiently \"new\"\n    topics emerge and add them.\n\n    Arguments:\n        models: A list of fitted BERTopic models\n        min_similarity: The minimum similarity for when topics are merged.\n        embedding_model: Additionally load in an embedding model if necessary.\n\n    Returns:\n        A new BERTopic model that was created as if you were\n        loading a model from the HuggingFace Hub without c-TF-IDF\n\n    Examples:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n    # Create three separate models\n    topic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000])\n    topic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000])\n    topic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:])\n\n    # Combine all models into one\n    merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n    ```\n    \"\"\"\n    import torch\n\n    # Temporarily save model and push to HF\n    with TemporaryDirectory() as tmpdir:\n\n        # Save model weights and config.\n        all_topics, all_params, all_tensors = [], [], []\n        for index, model in enumerate(models):\n            model.save(tmpdir, serialization=\"pytorch\")\n            topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir))\n            all_topics.append(topics)\n            all_params.append(params)\n            all_tensors.append(np.array(tensors[\"topic_embeddings\"]))\n\n            # Create a base set of parameters\n            if index == 0:\n                merged_topics = topics\n                merged_params = params\n                merged_tensors = np.array(tensors[\"topic_embeddings\"])\n                merged_topics[\"custom_labels\"] = None\n\n    for tensors, selected_topics in zip(all_tensors[1:], all_topics[1:]):\n        # Calculate similarity matrix\n        sim_matrix = cosine_similarity(tensors, merged_tensors)\n        sims = np.max(sim_matrix, axis=1)\n\n        # Extract new topics\n        new_topics = sorted([index - selected_topics[\"_outliers\"] for index, sim in enumerate(sims) if sim < min_similarity])\n        max_topic = max(set(merged_topics[\"topics\"]))\n\n        # Merge Topic Representations\n        new_topics_dict = {}\n        for new_topic in new_topics:\n            if new_topic != -1:\n                max_topic += 1\n                new_topics_dict[new_topic] = max_topic\n                merged_topics[\"topic_representations\"][str(max_topic)] = selected_topics[\"topic_representations\"][str(new_topic)]\n                
merged_topics[\"topic_labels\"][str(max_topic)] = selected_topics[\"topic_labels\"][str(new_topic)]\n\n                # Add new aspects\n                if selected_topics[\"topic_aspects\"]:\n                    aspects_1 = set(merged_topics[\"topic_aspects\"].keys())\n                    aspects_2 = set(selected_topics[\"topic_aspects\"].keys())\n                    aspects_diff = aspects_2.difference(aspects_1)\n                    if aspects_diff:\n                        for aspect in aspects_diff:\n                            merged_topics[\"topic_aspects\"][aspect] = {}\n\n                    # If the original model does not have topic aspects but the to be added model does\n                    if not merged_topics.get(\"topic_aspects\"):\n                        merged_topics[\"topic_aspects\"] = selected_topics[\"topic_aspects\"]\n\n                    # If they both contain topic aspects, add to the existing set of aspects\n                    else:\n                        for aspect, values in selected_topics[\"topic_aspects\"].items():\n                            merged_topics[\"topic_aspects\"][aspect][str(max_topic)] = values[str(new_topic)]\n\n                # Add new embeddings\n                new_tensors = tensors[new_topic + selected_topics[\"_outliers\"]]\n                merged_tensors = np.vstack([merged_tensors, new_tensors])\n\n        # Topic Mapper\n        merged_topics[\"topic_mapper\"] = TopicMapper(list(range(-1, max_topic+1, 1))).mappings_\n\n        # Find similar topics and re-assign those from the new models\n        sims_idx = np.argmax(sim_matrix, axis=1)\n        sims = np.max(sim_matrix, axis=1)\n        to_merge = {\n            a - selected_topics[\"_outliers\"]:\n            b - merged_topics[\"_outliers\"] for a, (b, val) in enumerate(zip(sims_idx, sims))\n            if val >= min_similarity\n        }\n        to_merge.update(new_topics_dict)\n        to_merge[-1] = -1\n        topics = [to_merge[topic] for topic in selected_topics[\"topics\"]]\n        merged_topics[\"topics\"].extend(topics)\n        merged_topics[\"topic_sizes\"] = dict(Counter(merged_topics[\"topics\"]))\n\n    # Create a new model from the merged parameters\n    merged_tensors = {\"topic_embeddings\": torch.from_numpy(merged_tensors)}\n    merged_model = _create_model_from_files(merged_topics, merged_params, merged_tensors, None, None, None, warn_no_backend=False)\n    merged_model.embedding_model = models[0].embedding_model\n\n    # Replace embedding model if one is specifically chosen\n    if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder:\n        merged_model.embedding_model = select_backend(embedding_model)\n    return merged_model\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.merge_topics","title":"merge_topics(self, docs, topics_to_merge, images=None)","text":"

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required topics_to_merge List[Union[Iterable[int], int]]

        Either a list of topics or a list of lists of topics to merge. For example, [1, 2, 3] will merge topics 1, 2 and 3, whereas [[1, 2], [3, 4]] will merge topics 1 and 2, and separately merge topics 3 and 4.

        required images List[str]

        A list of paths to the images used when calling either fit or fit_transform

        None

        Examples:

        If you want to merge topics 1, 2, and 3:

        topics_to_merge = [1, 2, 3]\ntopic_model.merge_topics(docs, topics_to_merge)\n

        or if you want to merge topics 1 and 2, and separately merge topics 3 and 4:

        topics_to_merge = [[1, 2],\n                    [3, 4]]\ntopic_model.merge_topics(docs, topics_to_merge)\n
        Source code in bertopic\\_bertopic.py
        def merge_topics(self,\n                 docs: List[str],\n                 topics_to_merge: List[Union[Iterable[int], int]],\n                 images: List[str] = None) -> None:\n    \"\"\"\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics_to_merge: Either a list of topics or a list of list of topics\n                         to merge. For example:\n                            [1, 2, 3] will merge topics 1, 2 and 3\n                            [[1, 2], [3, 4]] will merge topics 1 and 2, and\n                            separately merge topics 3 and 4.\n        images: A list of paths to the images used when calling either\n                `fit` or `fit_transform`\n\n    Examples:\n\n    If you want to merge topics 1, 2, and 3:\n\n    ```python\n    topics_to_merge = [1, 2, 3]\n    topic_model.merge_topics(docs, topics_to_merge)\n    ```\n\n    or if you want to merge topics 1 and 2, and separately\n    merge topics 3 and 4:\n\n    ```python\n    topics_to_merge = [[1, 2],\n                        [3, 4]]\n    topic_model.merge_topics(docs, topics_to_merge)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Image\": images, \"ID\": range(len(docs))})\n\n    mapping = {topic: topic for topic in set(self.topics_)}\n    if isinstance(topics_to_merge[0], int):\n        for topic in sorted(topics_to_merge):\n            mapping[topic] = topics_to_merge[0]\n    elif isinstance(topics_to_merge[0], Iterable):\n        for topic_group in sorted(topics_to_merge):\n            for topic in topic_group:\n                mapping[topic] = topic_group[0]\n    else:\n        raise ValueError(\"Make sure that `topics_to_merge` is either\"\n                         \"a list of topics or a list of list of topics.\")\n\n    # Track mappings and sizes of topics for merging topic embeddings\n    mappings = defaultdict(list)\n    for key, val in sorted(mapping.items()):\n        mappings[val].append(key)\n    mappings = {topic_from:\n                {\"topics_to\": topics_to,\n                 \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_to]}\n                for topic_from, topics_to in mappings.items()}\n\n    # Update topics\n    documents.Topic = documents.Topic.map(mapping)\n    self.topic_mapper_.add_mappings(mapping)\n    documents = self._sort_mappings_by_frequency(documents)\n    self._extract_topics(documents, mappings=mappings)\n    self._update_topic_size(documents)\n    self._save_representative_docs(documents)\n    self.probabilities_ = self._map_probabilities(self.probabilities_)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.partial_fit","title":"partial_fit(self, documents, embeddings=None, y=None)","text":"

        Fit BERTopic on a subset of the data and perform online learning with batch-like data.

        Online topic modeling in BERTopic is performed by using dimensionality reduction and cluster algorithms that support a partial_fit method in order to incrementally train the topic model.

        Likewise, the bertopic.vectorizers.OnlineCountVectorizer is used to dynamically update its vocabulary when presented with new data. It has several parameters for modeling decay and updating the representations.

        In other words, although the main algorithm stays the same, the training procedure now works as follows:

        For each subset of the data:

        1. Generate embeddings with a pre-trained language model
        2. Incrementally update the dimensionality reduction algorithm with partial_fit
        3. Incrementally update the cluster algorithm with partial_fit
        4. Incrementally update the OnlineCountVectorizer and apply some form of decay

        Note that it is advised to use partial_fit with batches and not single documents for the best performance.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents to fit on

        required embeddings ndarray

        Pre-trained document embeddings. These can be used instead of the sentence-transformer model

        None y Union[List[int], numpy.ndarray]

        The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified.

        None

        Examples:

        from sklearn.datasets import fetch_20newsgroups\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.decomposition import IncrementalPCA\nfrom bertopic.vectorizers import OnlineCountVectorizer\nfrom bertopic import BERTopic\n\n# Prepare documents\ndocs = fetch_20newsgroups(subset=subset,  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\n# Prepare sub-models that support online learning\numap_model = IncrementalPCA(n_components=5)\ncluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\ntopic_model = BERTopic(umap_model=umap_model,\n                       hdbscan_model=cluster_model,\n                       vectorizer_model=vectorizer_model)\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor index in range(0, len(docs), 1000):\n    topic_model.partial_fit(docs[index: index+1000])\n
        Source code in bertopic\\_bertopic.py
        def partial_fit(self,\n                documents: List[str],\n                embeddings: np.ndarray = None,\n                y: Union[List[int], np.ndarray] = None):\n    \"\"\" Fit BERTopic on a subset of the data and perform online learning\n    with batch-like data.\n\n    Online topic modeling in BERTopic is performed by using dimensionality\n    reduction and cluster algorithms that support a `partial_fit` method\n    in order to incrementally train the topic model.\n\n    Likewise, the `bertopic.vectorizers.OnlineCountVectorizer` is used\n    to dynamically update its vocabulary when presented with new data.\n    It has several parameters for modeling decay and updating the\n    representations.\n\n    In other words, although the main algorithm stays the same, the training\n    procedure now works as follows:\n\n    For each subset of the data:\n\n    1. Generate embeddings with a pre-traing language model\n    2. Incrementally update the dimensionality reduction algorithm with `partial_fit`\n    3. Incrementally update the cluster algorithm with `partial_fit`\n    4. Incrementally update the OnlineCountVectorizer and apply some form of decay\n\n    Note that it is advised to use `partial_fit` with batches and\n    not single documents for the best performance.\n\n    Arguments:\n        documents: A list of documents to fit on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model\n        y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n           specific instance is specified.\n\n    Examples:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sklearn.cluster import MiniBatchKMeans\n    from sklearn.decomposition import IncrementalPCA\n    from bertopic.vectorizers import OnlineCountVectorizer\n    from bertopic import BERTopic\n\n    # Prepare documents\n    docs = fetch_20newsgroups(subset=subset,  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\n    # Prepare sub-models that support online learning\n    umap_model = IncrementalPCA(n_components=5)\n    cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\n    vectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\n    topic_model = BERTopic(umap_model=umap_model,\n                           hdbscan_model=cluster_model,\n                           vectorizer_model=vectorizer_model)\n\n    # Incrementally fit the topic model by training on 1000 documents at a time\n    for index in range(0, len(docs), 1000):\n        topic_model.partial_fit(docs[index: index+1000])\n    ```\n    \"\"\"\n    # Checks\n    check_embeddings_shape(embeddings, documents)\n    if not hasattr(self.hdbscan_model, \"partial_fit\"):\n        raise ValueError(\"In order to use `.partial_fit`, the cluster model should have \"\n                         \"a `.partial_fit` function.\")\n\n    # Prepare documents\n    if isinstance(documents, str):\n        documents = [documents]\n    documents = pd.DataFrame({\"Document\": documents,\n                              \"ID\": range(len(documents)),\n                              \"Topic\": None})\n\n    # Extract embeddings\n    if embeddings is None:\n        if self.topic_representations_ is None:\n            self.embedding_model = select_backend(self.embedding_model,\n                                                  language=self.language)\n        embeddings = self._extract_embeddings(documents.Document.values.tolist(),\n     
                                         method=\"document\",\n                                              verbose=self.verbose)\n    else:\n        if self.embedding_model is not None and self.topic_representations_ is None:\n            self.embedding_model = select_backend(self.embedding_model,\n                                                  language=self.language)\n\n    # Reduce dimensionality\n    if self.seed_topic_list is not None and self.embedding_model is not None:\n        y, embeddings = self._guided_topic_modeling(embeddings)\n    umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True)\n\n    # Cluster reduced embeddings\n    documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True)\n    topics = documents.Topic.to_list()\n\n    # Map and find new topics\n    if not self.topic_mapper_:\n        self.topic_mapper_ = TopicMapper(topics)\n    mappings = self.topic_mapper_.get_mappings()\n    new_topics = set(topics).difference(set(mappings.keys()))\n    new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)}\n    self.topic_mapper_.add_new_topics(new_topic_ids)\n    updated_mappings = self.topic_mapper_.get_mappings()\n    updated_topics = [updated_mappings[topic] for topic in topics]\n    documents[\"Topic\"] = updated_topics\n\n    # Add missing topics (topics that were originally created but are now missing)\n    if self.topic_representations_:\n        missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics))\n        for missing_topic in missing_topics:\n            documents.loc[len(documents), :] = [\" \", len(documents), missing_topic]\n    else:\n        missing_topics = {}\n\n    # Prepare documents\n    documents_per_topic = documents.sort_values(\"Topic\").groupby(['Topic'], as_index=False)\n    updated_topics = documents_per_topic.first().Topic.astype(int)\n    documents_per_topic = documents_per_topic.agg({'Document': ' '.join})\n\n    # Update topic representations\n    self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True)\n    self.topic_representations_ = self._extract_words_per_topic(updated_words, documents, self.c_tf_idf_, calculate_aspects=False)\n    self._create_topic_vectors()\n    self.topic_labels_ = {key: f\"{key}_\" + \"_\".join([word[0] for word in values[:4]])\n                          for key, values in self.topic_representations_.items()}\n\n    # Update topic sizes\n    if len(missing_topics) > 0:\n        documents = documents.iloc[:-len(missing_topics)]\n\n    if self.topic_sizes_ is None:\n        self._update_topic_size(documents)\n    else:\n        sizes = documents.groupby(['Topic'], as_index=False).count()\n        for _, row in sizes.iterrows():\n            topic = int(row.Topic)\n            if self.topic_sizes_.get(topic) is not None and topic not in missing_topics:\n                self.topic_sizes_[topic] += int(row.Document)\n            elif self.topic_sizes_.get(topic) is None:\n                self.topic_sizes_[topic] = int(row.Document)\n        self.topics_ = documents.Topic.astype(int).tolist()\n\n    return self\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.push_to_hf_hub","title":"push_to_hf_hub(self, repo_id, commit_message='Add BERTopic model', token=None, revision=None, private=False, create_pr=False, model_card=True, serialization='safetensors', save_embedding_model=True, save_ctfidf=False)","text":"

        Push your BERTopic model to the HuggingFace Hub

        Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:

        • Log in to your HuggingFace account with the following command:
          huggingface-cli login\n\n# or using an environment variable\nhuggingface-cli login --token $HUGGINGFACE_TOKEN\n
        • Alternatively, you can programmatically log in using login() in a notebook or a script:
          from huggingface_hub import login\nlogin()\n
        • Or you can give a token with the token variable

        Parameters:

        Name Type Description Default repo_id str

        The name of your HuggingFace repository

        required commit_message str

        A commit message

        'Add BERTopic model' token str

        Token to add if not already logged in

        None revision str

        Repository revision

        None private bool

        Whether to create a private repository

        False create_pr bool

        Whether to upload the model as a Pull Request

        False model_card bool

        Whether to automatically create a modelcard

        True serialization str

        The type of serialization. Either safetensors or pytorch

        'safetensors' save_embedding_model Union[str, bool]

        A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. E.g., sentence-transformers/all-MiniLM-L6-v2

        True save_ctfidf bool

        Whether to save c-TF-IDF information

        False

        Examples:

        topic_model.push_to_hf_hub(\n    repo_id=\"ArXiv\",\n    save_ctfidf=True,\n    save_embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\"\n)\n
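
        Once pushed, the model can typically be loaded back from the Hub with BERTopic.load; a minimal sketch, assuming the repository ends up under your own username (shown here as a hypothetical placeholder):

        from bertopic import BERTopic\n\n# load the pushed model back from the Hub\nloaded_model = BERTopic.load(\"<username>/ArXiv\")  # hypothetical repository id\n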
        Source code in bertopic\\_bertopic.py
        def push_to_hf_hub(\n        self,\n        repo_id: str,\n        commit_message: str = 'Add BERTopic model',\n        token: str = None,\n        revision: str = None,\n        private: bool = False,\n        create_pr: bool = False,\n        model_card: bool = True,\n        serialization: str = \"safetensors\",\n        save_embedding_model: Union[str, bool] = True,\n        save_ctfidf: bool = False,\n        ):\n    \"\"\" Push your BERTopic model to a HuggingFace Hub\n\n    Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:\n\n    * Log in to your HuggingFace account with the following command:\n        ```bash\n        huggingface-cli login\n\n        # or using an environment variable\n        huggingface-cli login --token $HUGGINGFACE_TOKEN\n        ```\n    * Alternatively, you can programmatically login using login() in a notebook or a script:\n        ```python\n        from huggingface_hub import login\n        login()\n        ```\n    * Or you can give a token with the `token` variable\n\n    Arguments:\n        repo_id: The name of your HuggingFace repository\n        commit_message: A commit message\n        token: Token to add if not already logged in\n        revision: Repository revision\n        private: Whether to create a private repository\n        create_pr: Whether to upload the model as a Pull Request\n        model_card: Whether to automatically create a modelcard\n        serialization: The type of serialization.\n                       Either `safetensors` or `pytorch`\n        save_embedding_model: A pointer towards a HuggingFace model to be loaded in with\n                              SentenceTransformers. E.g.,\n                              `sentence-transformers/all-MiniLM-L6-v2`\n        save_ctfidf: Whether to save c-TF-IDF information\n\n\n    Examples:\n\n    ```python\n    topic_model.push_to_hf_hub(\n        repo_id=\"ArXiv\",\n        save_ctfidf=True,\n        save_embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\"\n    )\n    ```\n    \"\"\"\n    return save_utils.push_to_hf_hub(model=self, repo_id=repo_id, commit_message=commit_message,\n                                     token=token, revision=revision, private=private, create_pr=create_pr,\n                                     model_card=model_card, serialization=serialization,\n                                     save_embedding_model=save_embedding_model, save_ctfidf=save_ctfidf)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.reduce_outliers","title":"reduce_outliers(self, documents, topics, images=None, strategy='distributions', probabilities=None, threshold=0, embeddings=None, distributions_params={})","text":"

        Reduce outliers by merging them with their nearest topic according to one of several strategies.

        When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created that do not fall within any of the created topics. These are labeled as -1. This function allows the user to match outlier documents with their nearest topic through one of the following strategies, selected via the strategy parameter:

        • \"probabilities\": uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with calculate_probabilities=True.

        • \"distributions\": uses the topic distributions, as calculated with .approximate_distribution, to find the most frequent topic in each outlier document. You can use the distributions_params variable to tweak the parameters of .approximate_distribution.

        • \"c-tf-idf\": calculates the c-TF-IDF representation for each outlier document and finds the best matching c-TF-IDF topic representation using cosine similarity.

        • \"embeddings\": uses the embedding of each outlier document to find the best matching topic embedding using cosine similarity.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents for which we reduce or remove the outliers.

        required topics List[int]

        The topics that correspond to the documents

        required images List[str]

        A list of paths to the images used when calling either fit or fit_transform

        None strategy str

        The strategy used for reducing outliers. Options:

        • \"probabilities\": uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document.
        • \"distributions\": uses the topic distributions, as calculated with .approximate_distribution, to find the most frequent topic in each outlier document.
        • \"c-tf-idf\": calculates the c-TF-IDF representation for outlier documents and finds the best matching c-TF-IDF topic representation.
        • \"embeddings\": calculates the embeddings for outlier documents and finds the best matching topic embedding.
        'distributions' threshold float

        The threshold for assigning topics to outlier documents. This value represents the minimum probability when strategy=\"probabilities\". For all other strategies, it represents the minimum similarity.

        0 embeddings ndarray

        The pre-computed embeddings to be used when strategy=\"embeddings\". If this is None, then it will compute the embeddings for the outlier documents.

        None distributions_params Mapping[str, Any]

        The parameters used in .approximate_distribution when using the strategy \"distributions\".

        {}

        Returns:

        Type Description new_topics

        The updated topics

        Usage:

        The default setting uses the \"distributions\" strategy:

        new_topics = topic_model.reduce_outliers(docs, topics)\n

        When you use the \"probabilities\" strategy, make sure to also pass the probabilities as generated through HDBSCAN:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs)\n\nnew_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n
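
        As a further sketch (assuming you kept the pre-computed document embeddings around), the \"embeddings\" strategy can be used, and the reduced outliers are then typically written back into the model with .update_topics:

        new_topics = topic_model.reduce_outliers(docs, topics, embeddings=embeddings, strategy=\"embeddings\")\n\n# persist the new topic assignments in the fitted model\ntopic_model.update_topics(docs, topics=new_topics)\n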
        Source code in bertopic\\_bertopic.py
        def reduce_outliers(self,\n                    documents: List[str],\n                    topics: List[int],\n                    images: List[str] = None,\n                    strategy: str = \"distributions\",\n                    probabilities: np.ndarray = None,\n                    threshold: float = 0,\n                    embeddings: np.ndarray = None,\n                    distributions_params: Mapping[str, Any] = {}) -> List[int]:\n    \"\"\" Reduce outliers by merging them with their nearest topic according\n    to one of several strategies.\n\n    When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created\n    that do not fall within any of the created topics. These are labeled as -1.\n    This function allows the user to match outlier documents with their nearest topic\n    using one of the following strategies using the `strategy` parameter:\n        * \"probabilities\"\n            This uses the soft-clustering as performed by HDBSCAN to find the\n            best matching topic for each outlier document. To use this, make\n            sure to calculate the `probabilities` beforehand by instantiating\n            BERTopic with `calculate_probabilities=True`.\n        * \"distributions\"\n            Use the topic distributions, as calculated with `.approximate_distribution`\n            to find the most frequent topic in each outlier document. You can use the\n            `distributions_params` variable to tweak the parameters of\n            `.approximate_distribution`.\n        * \"c-tf-idf\"\n            Calculate the c-TF-IDF representation for each outlier document and\n            find the best matching c-TF-IDF topic representation using\n            cosine similarity.\n        * \"embeddings\"\n            Using the embeddings of each outlier documents, find the best\n            matching topic embedding using cosine similarity.\n\n    Arguments:\n        documents: A list of documents for which we reduce or remove the outliers.\n        topics: The topics that correspond to the documents\n        images: A list of paths to the images used when calling either\n                `fit` or `fit_transform`\n        strategy: The strategy used for reducing outliers.\n                Options:\n                    * \"probabilities\"\n                        This uses the soft-clustering as performed by HDBSCAN\n                        to find the best matching topic for each outlier document.\n\n                    * \"distributions\"\n                        Use the topic distributions, as calculated with `.approximate_distribution`\n                        to find the most frequent topic in each outlier document.\n\n                    * \"c-tf-idf\"\n                        Calculate the c-TF-IDF representation for outlier documents and\n                        find the best matching c-TF-IDF topic representation.\n\n                    * \"embeddings\"\n                        Calculate the embeddings for outlier documents and\n                        find the best matching topic embedding.\n        threshold: The threshold for assigning topics to outlier documents. 
This value\n                   represents the minimum probability when `strategy=\"probabilities\"`.\n                   For all other strategies, it represents the minimum similarity.\n        embeddings: The pre-computed embeddings to be used when `strategy=\"embeddings\"`.\n                    If this is None, then it will compute the embeddings for the outlier documents.\n        distributions_params: The parameters used in `.approximate_distribution` when using\n                              the strategy `\"distributions\"`.\n\n    Returns:\n        new_topics: The updated topics\n\n    Usage:\n\n    The default settings uses the `\"distributions\"` strategy:\n\n    ```python\n    new_topics = topic_model.reduce_outliers(docs, topics)\n    ```\n\n    When you use the `\"probabilities\"` strategy, make sure to also pass the probabilities\n    as generated through HDBSCAN:\n\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic(calculate_probabilities=True)\n    topics, probs = topic_model.fit_transform(docs)\n\n    new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n    ```\n    \"\"\"\n    if images is not None:\n        strategy = \"embeddings\"\n\n    # Check correct use of parameters\n    if strategy.lower() == \"probabilities\" and probabilities is None:\n        raise ValueError(\"Make sure to pass in `probabilities` in order to use the probabilities strategy\")\n\n    # Reduce outliers by extracting most likely topics through the topic-term probability matrix\n    if strategy.lower() == \"probabilities\":\n        new_topics = [np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic\n                      for topic, prob in zip(topics, probabilities)]\n\n    # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions\n    elif strategy.lower() == \"distributions\":\n        outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n        outlier_docs = [documents[index] for index in outlier_ids]\n        topic_distr, _ = self.approximate_distribution(outlier_docs, min_similarity=threshold, **distributions_params)\n        outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr])\n        new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n    # Reduce outliers by finding the most similar c-TF-IDF representations\n    elif strategy.lower() == \"c-tf-idf\":\n        outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n        outlier_docs = [documents[index] for index in outlier_ids]\n\n        # Calculate c-TF-IDF of outlier documents with all topics\n        bow_doc = self.vectorizer_model.transform(outlier_docs)\n        c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n        similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers:])\n\n        # Update topics\n        similarity[similarity < threshold] = 0\n        outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n        new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n    # Reduce outliers by finding the most similar topic embeddings\n    elif strategy.lower() == \"embeddings\":\n        if self.embedding_model is None and embeddings is None:\n            raise ValueError(\"To use this strategy, you will need to pass a model to `embedding_model`\"\n                             \"when instantiating 
BERTopic.\")\n        outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n        if images is not None:\n            outlier_docs = [images[index] for index in outlier_ids]\n        else:\n            outlier_docs = [documents[index] for index in outlier_ids]\n\n        # Extract or calculate embeddings for outlier documents\n        if embeddings is not None:\n            outlier_embeddings = np.array([embeddings[index] for index in outlier_ids])\n        elif images is not None:\n            outlier_images = [images[index] for index in outlier_ids]\n            outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose)\n        else:\n            outlier_embeddings = self.embedding_model.embed_documents(outlier_docs)\n        similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers:])\n\n        # Update topics\n        similarity[similarity < threshold] = 0\n        outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n        new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n    return new_topics\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.reduce_topics","title":"reduce_topics(self, docs, nr_topics=20, images=None)","text":"

        Reduce the number of topics to a fixed number of topics or automatically.

        If nr_topics is an integer, then the number of topics is reduced to nr_topics using AgglomerativeClustering on the cosine distance matrix of the topic embeddings.

        If nr_topics is \"auto\", then HDBSCAN is used to automatically reduce the number of topics by running it on the topic embeddings.

        The topics, their sizes, and representations are updated.

        Parameters:

        Name Type Description Default docs List[str]

        The docs you used when calling either fit or fit_transform

        required nr_topics Union[int, str]

        The number of topics you want reduced to

        20 images List[str]

        A list of paths to the images used when calling either fit or fit_transform

        None

        Updates

        topics_ : Assigns topics to their merged representations. probabilities_ : Assigns probabilities to their merged representations.

        Examples:

        You can further reduce the topics by passing the documents with their topics and probabilities (if they were calculated):

        topic_model.reduce_topics(docs, nr_topics=30)\n

        You can then access the updated topics and probabilities with:

        topics = topic_model.topics_\nprobabilities = topic_model.probabilities_\n
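
        Alternatively, a minimal sketch of letting HDBSCAN decide on the number of topics by passing \"auto\":

        topic_model.reduce_topics(docs, nr_topics=\"auto\")\n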
        Source code in bertopic\\_bertopic.py
        def reduce_topics(self,\n                  docs: List[str],\n                  nr_topics: Union[int, str] = 20,\n                  images: List[str] = None) -> None:\n    \"\"\" Reduce the number of topics to a fixed number of topics\n    or automatically.\n\n    If nr_topics is an integer, then the number of topics is reduced\n    to nr_topics using `AgglomerativeClustering` on the cosine distance matrix\n    of the topic embeddings.\n\n    If nr_topics is `\"auto\"`, then HDBSCAN is used to automatically\n    reduce the number of topics by running it on the topic embeddings.\n\n    The topics, their sizes, and representations are updated.\n\n    Arguments:\n        docs: The docs you used when calling either `fit` or `fit_transform`\n        nr_topics: The number of topics you want reduced to\n        images: A list of paths to the images used when calling either\n                `fit` or `fit_transform`\n\n    Updates:\n        topics_ : Assigns topics to their merged representations.\n        probabilities_ : Assigns probabilities to their merged representations.\n\n    Examples:\n\n    You can further reduce the topics by passing the documents with their\n    topics and probabilities (if they were calculated):\n\n    ```python\n    topic_model.reduce_topics(docs, nr_topics=30)\n    ```\n\n    You can then access the updated topics and probabilities with:\n\n    ```python\n    topics = topic_model.topics_\n    probabilities = topic_model.probabilities_\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n\n    self.nr_topics = nr_topics\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Image\": images, \"ID\": range(len(docs))})\n\n    # Reduce number of topics\n    documents = self._reduce_topics(documents)\n    self._merged_topics = None\n    self._save_representative_docs(documents)\n    self.probabilities_ = self._map_probabilities(self.probabilities_)\n\n    return self\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.save","title":"save(self, path, serialization='pickle', save_embedding_model=True, save_ctfidf=False)","text":"

        Saves the model to the specified path or folder

        When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic should not be loaded in other versions.

        Parameters:

        Name Type Description Default path

        If serialization is safetensors or pytorch, this is a directory. If serialization is pickle, then this is a file.

        required serialization Literal['safetensors', 'pickle', 'pytorch']

        If pickle, the entire model will be pickled. If safetensors or pytorch the model will be saved without the embedding, dimensionality reduction, and clustering algorithms. This is a very efficient format and typically advised.

        'pickle' save_embedding_model Union[bool, str]

        If serialization is pickle, then you can choose to skip saving the embedding model. If serialization is safetensors or pytorch, this variable can be used as a string pointing towards a huggingface model.

        True save_ctfidf bool

        Whether to save c-TF-IDF information if serialization is safetensors or pytorch

        False

        Examples:

        To save the model in an efficient and safe format (safetensors) with c-TF-IDF information:

        topic_model.save(\"model_dir\", serialization=\"safetensors\", save_ctfidf=True)\n

        If you wish to also add a pointer to the embedding model, which will be downloaded from HuggingFace upon loading:

        embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"model_dir\", serialization=\"safetensors\", save_embedding_model=embedding_model)\n

        or if you want to save the full model with pickle:

        topic_model.save(\"my_model\")\n

        NOTE: Pickle can run arbitrary code and is generally considered to be less safe than safetensors.
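
        A model saved in the safetensors or pytorch format can later be restored with BERTopic.load; a minimal sketch, assuming the same model_dir and an optional pointer to the embedding model:

        from bertopic import BERTopic\n\n# restore the saved model and re-attach an embedding model for inference\nloaded_model = BERTopic.load(\"model_dir\", embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\")\n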

        Source code in bertopic\\_bertopic.py
        def save(self,\n         path,\n         serialization: Literal[\"safetensors\", \"pickle\", \"pytorch\"] = \"pickle\",\n         save_embedding_model: Union[bool, str] = True,\n         save_ctfidf: bool = False):\n    \"\"\" Saves the model to the specified path or folder\n\n    When saving the model, make sure to also keep track of the versions\n    of dependencies and Python used. Loading and saving the model should\n    be done using the same dependencies and Python. Moreover, models\n    saved in one version of BERTopic should not be loaded in other versions.\n\n    Arguments:\n        path: If `serialization` is 'safetensors' or `pytorch`, this is a directory.\n              If `serialization` is `pickle`, then this is a file.\n        serialization: If `pickle`, the entire model will be pickled. If `safetensors`\n                       or `pytorch` the model will be saved without the embedding,\n                       dimensionality reduction, and clustering algorithms.\n                       This is a very efficient format and typically advised.\n        save_embedding_model: If serialization is `pickle`, then you can choose to skip\n                              saving the embedding model. If serialization is `safetensors`\n                              or `pytorch`, this variable can be used as a string pointing\n                              towards a huggingface model.\n        save_ctfidf: Whether to save c-TF-IDF information if serialization is `safetensors`\n                     or `pytorch`\n\n    Examples:\n\n    To save the model in an efficient and safe format (safetensors) with c-TF-IDF information:\n\n    ```python\n    topic_model.save(\"model_dir\", serialization=\"safetensors\", save_ctfidf=True)\n    ```\n\n    If you wish to also add a pointer to the embedding model, which will be downloaded from\n    HuggingFace upon loading:\n\n    ```python\n    embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\n    topic_model.save(\"model_dir\", serialization=\"safetensors\", save_embedding_model=embedding_model)\n    ```\n\n    or if you want save the full model with pickle:\n\n    ```python\n    topic_model.save(\"my_model\")\n    ```\n\n    NOTE: Pickle can run arbitrary code and is generally considered to be less safe than\n    safetensors.\n    \"\"\"\n    if serialization == \"pickle\":\n        logger.warning(\"When you use `pickle` to save/load a BERTopic model,\"\n                       \"please make sure that the environments in which you save\"\n                       \"and load the model are **exactly** the same. 
The version of BERTopic,\"\n                       \"its dependencies, and python need to remain the same.\")\n\n        with open(path, 'wb') as file:\n\n            # This prevents the vectorizer from being too large in size if `min_df` was\n            # set to a value higher than 1\n            self.vectorizer_model.stop_words_ = None\n\n            if not save_embedding_model:\n                embedding_model = self.embedding_model\n                self.embedding_model = None\n                joblib.dump(self, file)\n                self.embedding_model = embedding_model\n            else:\n                joblib.dump(self, file)\n    elif serialization == \"safetensors\" or serialization == \"pytorch\":\n\n        # Directory\n        save_directory = Path(path)\n        save_directory.mkdir(exist_ok=True, parents=True)\n\n        # Check embedding model\n        if save_embedding_model and hasattr(self.embedding_model, '_hf_model') and not isinstance(save_embedding_model, str):\n            save_embedding_model = self.embedding_model._hf_model\n        elif not save_embedding_model:\n            logger.warning(\"You are saving a BERTopic model without explicitly defining an embedding model.\"\n                           \"If you are using a sentence-transformers model or a HuggingFace model supported\"\n                           \"by sentence-transformers, please save the model by using a pointer towards that model.\"\n                           \"For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`\")\n\n        # Minimal\n        save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization)\n        save_utils.save_topics(model=self, path=save_directory / \"topics.json\")\n        save_utils.save_images(model=self, path=save_directory / \"images\")\n        save_utils.save_config(model=self, path=save_directory / 'config.json', embedding_model=save_embedding_model)\n\n        # Additional\n        if save_ctfidf:\n            save_utils.save_ctfidf(model=self, save_directory=save_directory, serialization=serialization)\n            save_utils.save_ctfidf_config(model=self, path=save_directory / 'ctfidf_config.json')\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.set_topic_labels","title":"set_topic_labels(self, topic_labels)","text":"

        Set custom topic labels in your fitted BERTopic model

        Parameters:

        Name Type Description Default topic_labels Union[List[str], Mapping[int, str]]

        If a list of topic labels, it should contain the same number of labels as there are topics. This must be ordered from the topic with the lowest ID to the highest ID, including topic -1 if it exists. If a dictionary of topic ID: topic_label, it can have any number of topics as it will only map the topics found in the dictionary.

        required

        Examples:

        First, we define our topic labels with .generate_topic_labels in which we can customize our topic labels:

        topic_labels = topic_model.generate_topic_labels(nr_words=2,\n                                            topic_prefix=True,\n                                            word_length=10,\n                                            separator=\", \")\n

        Then, we pass these topic_labels to our topic model which can be accessed at any time with .custom_labels_:

        topic_model.set_topic_labels(topic_labels)\ntopic_model.custom_labels_\n

        You might want to change only a few topic labels instead of all of them. To do so, you can pass a dictionary where the keys are the topic IDs and the values are the topic labels:

        topic_model.set_topic_labels({0: \"Space\", 1: \"Sports\", 2: \"Medicine\"})\ntopic_model.custom_labels_\n
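
        Once set, the custom labels can typically be shown in the built-in visualizations through their custom_labels parameter (an assumption about this version of the visualization functions); a minimal sketch:

        topic_model.visualize_barchart(custom_labels=True)  # show the custom labels instead of the default ones\n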
        Source code in bertopic\\_bertopic.py
        def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None:\n    \"\"\" Set custom topic labels in your fitted BERTopic model\n\n    Arguments:\n        topic_labels: If a list of topic labels, it should contain the same number\n                      of labels as there are topics. This must be ordered\n                      from the topic with the lowest ID to the highest ID,\n                      including topic -1 if it exists.\n                      If a dictionary of `topic ID`: `topic_label`, it can have\n                      any number of topics as it will only map the topics found\n                      in the dictionary.\n\n    Examples:\n\n    First, we define our topic labels with `.generate_topic_labels` in which\n    we can customize our topic labels:\n\n    ```python\n    topic_labels = topic_model.generate_topic_labels(nr_words=2,\n                                                topic_prefix=True,\n                                                word_length=10,\n                                                separator=\", \")\n    ```\n\n    Then, we pass these `topic_labels` to our topic model which\n    can be accessed at any time with `.custom_labels_`:\n\n    ```python\n    topic_model.set_topic_labels(topic_labels)\n    topic_model.custom_labels_\n    ```\n\n    You might want to change only a few topic labels instead of all of them.\n    To do so, you can pass a dictionary where the keys are the topic IDs and\n    its keys the topic labels:\n\n    ```python\n    topic_model.set_topic_labels({0: \"Space\", 1: \"Sports\", 2: \"Medicine\"})\n    topic_model.custom_labels_\n    ```\n    \"\"\"\n    unique_topics = sorted(set(self.topics_))\n\n    if isinstance(topic_labels, dict):\n        if self.custom_labels_ is not None:\n            original_labels = {topic: label for topic, label in zip(unique_topics, self.custom_labels_)}\n        else:\n            info = self.get_topic_info()\n            original_labels = dict(zip(info.Topic, info.Name))\n        custom_labels = [topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic] for topic in unique_topics]\n\n    elif isinstance(topic_labels, list):\n        if len(topic_labels) == len(unique_topics):\n            custom_labels = topic_labels\n        else:\n            raise ValueError(\"Make sure that `topic_labels` contains the same number \"\n                             \"of labels as there are topics.\")\n\n    self.custom_labels_ = custom_labels\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.topics_over_time","title":"topics_over_time(self, docs, timestamps, topics=None, nr_bins=None, datetime_format=None, evolution_tuning=True, global_tuning=True)","text":"

        Create topics over time

        To create the topics over time, BERTopic needs to have been fitted once. From the fitted model, the c-TF-IDF representations are calculated at each timestamp t. Then, the c-TF-IDF representations at timestamp t are averaged with the global c-TF-IDF representations in order to fine-tune the local representations.

        Note

        Make sure to use a limited number of unique timestamps (<100) as the c-TF-IDF representation will be calculated at each single unique timestamp. Having a large number of unique timestamps can take some time to be calculated. Moreover, there aren't many use-cases where you would like to see the difference in topic representations over more than 100 different timestamps.

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required timestamps Union[List[str], List[int]]

        The timestamp of each document. This can be either a list of strings or ints. If it is a list of strings, then the datetime format will be automatically inferred. If it is a list of ints, then the documents will be ordered in ascending order.

        required topics List[int]

        A list of topics where each topic is related to a document in docs and a timestamp in timestamps. You can use this to apply topics_over_time on a subset of the data. Make sure that docs, timestamps, and topics all correspond to one another and have the same size.

        None nr_bins int

        The number of bins you want to create for the timestamps. The left bound of each interval is chosen as the timestamp. An additional column will be created with the entire interval.

        None datetime_format str

        The datetime format of the timestamps if they are strings, e.g. \"%d/%m/%Y\". Set this to None if you want the format to be detected automatically. See the strftime documentation for more information on choices: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.

        None evolution_tuning bool

        Fine-tune each topic representation at timestamp t by averaging its c-TF-IDF matrix with the c-TF-IDF matrix at timestamp t-1. This creates evolutionary topic representations.

        True global_tuning bool

        Fine-tune each topic representation at timestamp t by averaging its c-TF-IDF matrix with the global c-TF-IDF matrix. Turn this off if you want to prevent words in topic representations that could not be found in the documents at timestamp t.

        True

        Returns:

        Type Description topics_over_time

        A dataframe that contains the topic, words, and frequency of topic at timestamp t.

        Examples:

        The timestamps variable represents the timestamp of each document. If you have over 100 unique timestamps, it is advised to bin the timestamps as shown below:

        from bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)\n
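
        The resulting dataframe is commonly inspected with .visualize_topics_over_time; a minimal sketch, assuming the topics_over_time dataframe from above:

        # plot the frequency of the 10 largest topics over time\ntopic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)\n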
        Source code in bertopic\\_bertopic.py
        def topics_over_time(self,\n                     docs: List[str],\n                     timestamps: Union[List[str],\n                                       List[int]],\n                     topics: List[int] = None,\n                     nr_bins: int = None,\n                     datetime_format: str = None,\n                     evolution_tuning: bool = True,\n                     global_tuning: bool = True) -> pd.DataFrame:\n    \"\"\" Create topics over time\n\n    To create the topics over time, BERTopic needs to be already fitted once.\n    From the fitted models, the c-TF-IDF representations are calculate at\n    each timestamp t. Then, the c-TF-IDF representations at timestamp t are\n    averaged with the global c-TF-IDF representations in order to fine-tune the\n    local representations.\n\n    NOTE:\n        Make sure to use a limited number of unique timestamps (<100) as the\n        c-TF-IDF representation will be calculated at each single unique timestamp.\n        Having a large number of unique timestamps can take some time to be calculated.\n        Moreover, there aren't many use-cases where you would like to see the difference\n        in topic representations over more than 100 different timestamps.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        timestamps: The timestamp of each document. This can be either a list of strings or ints.\n                    If it is a list of strings, then the datetime format will be automatically\n                    inferred. If it is a list of ints, then the documents will be ordered in\n                    ascending order.\n        topics: A list of topics where each topic is related to a document in `docs` and\n                a timestamp in `timestamps`. You can use this to apply topics_over_time on\n                a subset of the data. Make sure that `docs`, `timestamps`, and `topics`\n                all correspond to one another and have the same size.\n        nr_bins: The number of bins you want to create for the timestamps. The left interval will\n                 be chosen as the timestamp. An additional column will be created with the\n                 entire interval.\n        datetime_format: The datetime format of the timestamps if they are strings, eg \u201c%d/%m/%Y\u201d.\n                         Set this to None if you want to have it automatically detect the format.\n                         See strftime documentation for more information on choices:\n                         https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.\n        evolution_tuning: Fine-tune each topic representation at timestamp *t* by averaging its\n                          c-TF-IDF matrix with the c-TF-IDF matrix at timestamp *t-1*. This creates\n                          evolutionary topic representations.\n        global_tuning: Fine-tune each topic representation at timestamp *t* by averaging its c-TF-IDF matrix\n                   with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                   topic representations that could not be found in the documents at timestamp *t*.\n\n    Returns:\n        topics_over_time: A dataframe that contains the topic, words, and frequency of topic\n                          at timestamp *t*.\n\n    Examples:\n\n    The timestamps variable represents the timestamp of each document. 
If you have over\n    100 unique timestamps, it is advised to bin the timestamps as shown below:\n\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    selected_topics = topics if topics else self.topics_\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": selected_topics, \"Timestamps\": timestamps})\n    global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False)\n\n    all_topics = sorted(list(documents.Topic.unique()))\n    all_topics_indices = {topic: index for index, topic in enumerate(all_topics)}\n\n    if isinstance(timestamps[0], str):\n        infer_datetime_format = True if not datetime_format else False\n        documents[\"Timestamps\"] = pd.to_datetime(documents[\"Timestamps\"],\n                                                 infer_datetime_format=infer_datetime_format,\n                                                 format=datetime_format)\n\n    if nr_bins:\n        documents[\"Bins\"] = pd.cut(documents.Timestamps, bins=nr_bins)\n        documents[\"Timestamps\"] = documents.apply(lambda row: row.Bins.left, 1)\n\n    # Sort documents in chronological order\n    documents = documents.sort_values(\"Timestamps\")\n    timestamps = documents.Timestamps.unique()\n    if len(timestamps) > 100:\n        logger.warning(f\"There are more than 100 unique timestamps (i.e., {len(timestamps)}) \"\n                       \"which significantly slows down the application. Consider setting `nr_bins` \"\n                       \"to a value lower than 100 to speed up calculation. 
\")\n\n    # For each unique timestamp, create topic representations\n    topics_over_time = []\n    for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose):\n\n        # Calculate c-TF-IDF representation for a specific timestamp\n        selection = documents.loc[documents.Timestamps == timestamp, :]\n        documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join,\n                                                                                \"Timestamps\": \"count\"})\n        c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n        if global_tuning or evolution_tuning:\n            c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False)\n\n        # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF\n        # matrix at timestamp t-1\n        if evolution_tuning and index != 0:\n            current_topics = sorted(list(documents_per_topic.Topic.values))\n            overlapping_topics = sorted(list(set(previous_topics).intersection(set(current_topics))))\n\n            current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]\n            previous_overlap_idx = [previous_topics.index(topic) for topic in overlapping_topics]\n\n            c_tf_idf.tolil()[current_overlap_idx] = ((c_tf_idf[current_overlap_idx] +\n                                                      previous_c_tf_idf[previous_overlap_idx]) / 2.0).tolil()\n\n        # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation\n        # by simply taking the average of the two\n        if global_tuning:\n            selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]\n            c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0\n\n        # Extract the words per topic\n        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n        topic_frequency = pd.Series(documents_per_topic.Timestamps.values,\n                                    index=documents_per_topic.Topic).to_dict()\n\n        # Fill dataframe with results\n        topics_at_timestamp = [(topic,\n                                \", \".join([words[0] for words in values][:5]),\n                                topic_frequency[topic],\n                                timestamp) for topic, values in words_per_topic.items()]\n        topics_over_time.extend(topics_at_timestamp)\n\n        if evolution_tuning:\n            previous_topics = sorted(list(documents_per_topic.Topic.values))\n            previous_c_tf_idf = c_tf_idf.copy()\n\n    return pd.DataFrame(topics_over_time, columns=[\"Topic\", \"Words\", \"Frequency\", \"Timestamp\"])\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.topics_per_class","title":"topics_per_class(self, docs, classes, global_tuning=True)","text":"

        Create topics per class

        To create the topics per class, BERTopic needs to be already fitted once. From the fitted models, the c-TF-IDF representations are calculated at each class c. Then, the c-TF-IDF representations at class c are averaged with the global c-TF-IDF representations in order to fine-tune the local representations. This can be turned off if the pure representation is needed.

        Note

        Make sure to use a limited number of unique classes (<100) as the c-TF-IDF representation will be calculated at each single unique class. Having a large number of unique classes can take some time to be calculated.

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required classes Union[List[int], List[str]]

        The class of each document. This can be either a list of strings or ints.

        required global_tuning bool

        Fine-tune each topic representation for class c by averaging its c-TF-IDF matrix with the global c-TF-IDF matrix. Turn this off if you want to prevent words in topic representations that could not be found in the documents for class c.

        True

        Returns:

        Type Description topics_per_class

        A dataframe that contains the topic, words, and frequency of topics for each class.

        Examples:

        from bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_per_class = topic_model.topics_per_class(docs, classes)\n
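
        The result can then be plotted with .visualize_topics_per_class; a minimal sketch, assuming the topics_per_class dataframe from above:

        # plot the per-class frequencies of the 10 largest topics\ntopic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)\n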
        Source code in bertopic\\_bertopic.py
        def topics_per_class(self,\n                     docs: List[str],\n                     classes: Union[List[int], List[str]],\n                     global_tuning: bool = True) -> pd.DataFrame:\n    \"\"\" Create topics per class\n\n    To create the topics per class, BERTopic needs to be already fitted once.\n    From the fitted models, the c-TF-IDF representations are calculated at\n    each class c. Then, the c-TF-IDF representations at class c are\n    averaged with the global c-TF-IDF representations in order to fine-tune the\n    local representations. This can be turned off if the pure representation is\n    needed.\n\n    NOTE:\n        Make sure to use a limited number of unique classes (<100) as the\n        c-TF-IDF representation will be calculated at each single unique class.\n        Having a large number of unique classes can take some time to be calculated.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        classes: The class of each document. This can be either a list of strings or ints.\n        global_tuning: Fine-tune each topic representation for class c by averaging its c-TF-IDF matrix\n                       with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                       topic representations that could not be found in the documents for class c.\n\n    Returns:\n        topics_per_class: A dataframe that contains the topic, words, and frequency of topics\n                          for each class.\n\n    Examples:\n\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    topics_per_class = topic_model.topics_per_class(docs, classes)\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Class\": classes})\n    global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm='l1', copy=False)\n\n    # For each unique timestamp, create topic representations\n    topics_per_class = []\n    for _, class_ in tqdm(enumerate(set(classes)), disable=not self.verbose):\n\n        # Calculate c-TF-IDF representation for a specific timestamp\n        selection = documents.loc[documents.Class == class_, :]\n        documents_per_topic = selection.groupby(['Topic'], as_index=False).agg({'Document': ' '.join,\n                                                                                \"Class\": \"count\"})\n        c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n        # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation\n        # by simply taking the average of the two\n        if global_tuning:\n            c_tf_idf = normalize(c_tf_idf, axis=1, norm='l1', copy=False)\n            c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0\n\n        # Extract the words per topic\n        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n        topic_frequency = pd.Series(documents_per_topic.Class.values,\n                                    index=documents_per_topic.Topic).to_dict()\n\n        # Fill dataframe with results\n        topics_at_class = [(topic,\n                            \", \".join([words[0] for words in values][:5]),\n                            topic_frequency[topic],\n                            class_) for topic, values in words_per_topic.items()]\n      
  topics_per_class.extend(topics_at_class)\n\n    topics_per_class = pd.DataFrame(topics_per_class, columns=[\"Topic\", \"Words\", \"Frequency\", \"Class\"])\n\n    return topics_per_class\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.transform","title":"transform(self, documents, embeddings=None, images=None)","text":"

        After having fit a model, use transform to predict new instances

        Parameters:

        Name Type Description Default documents Union[str, List[str]]

        A single document or a list of documents to predict on

        required embeddings ndarray

        Pre-trained document embeddings. These can be used instead of the sentence-transformer model.

        None images List[str]

        A list of paths to the images to predict on or the images themselves

        None

        Returns:

        Type Description predictions

        Topic predictions for each document. probabilities: The topic probability distribution, which is returned by default. If calculate_probabilities in BERTopic is set to False, then the probabilities are not calculated to speed up computation and decrease memory usage.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic().fit(docs)\ntopics, probs = topic_model.transform(docs)\n

        If you want to use your own embeddings:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Create embeddings\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# Create topic model\ntopic_model = BERTopic().fit(docs, embeddings)\ntopics, probs = topic_model.transform(docs, embeddings)\n
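
        Since documents may also be a single string, a minimal sketch of predicting the topic of one new document (the text below is a hypothetical example):

        topics, probs = topic_model.transform(\"A new document about space exploration and NASA missions\")\n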
        Source code in bertopic\\_bertopic.py
        def transform(self,\n              documents: Union[str, List[str]],\n              embeddings: np.ndarray = None,\n              images: List[str] = None) -> Tuple[List[int], np.ndarray]:\n    \"\"\" After having fit a model, use transform to predict new instances\n\n    Arguments:\n        documents: A single document or a list of documents to predict on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model.\n        images: A list of paths to the images to predict on or the images themselves\n\n    Returns:\n        predictions: Topic predictions for each documents\n        probabilities: The topic probability distribution which is returned by default.\n                       If `calculate_probabilities` in BERTopic is set to False, then the\n                       probabilities are not calculated to speed up computation and\n                       decrease memory usage.\n\n    Examples:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic().fit(docs)\n    topics, probs = topic_model.transform(docs)\n    ```\n\n    If you want to use your own embeddings:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    # Create embeddings\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n    # Create topic model\n    topic_model = BERTopic().fit(docs, embeddings)\n    topics, probs = topic_model.transform(docs, embeddings)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_embeddings_shape(embeddings, documents)\n\n    if isinstance(documents, str) or documents is None:\n        documents = [documents]\n\n    if embeddings is None:\n        embeddings = self._extract_embeddings(documents,\n                                              images=images,\n                                              method=\"document\",\n                                              verbose=self.verbose)\n\n    # Check if an embedding model was found\n    if embeddings is None:\n        raise ValueError(\"No embedding model was found to embed the documents.\"\n                         \"Make sure when loading in the model using BERTopic.load()\"\n                         \"to also specify the embedding model.\")\n\n    # Transform without hdbscan_model and umap_model using only cosine similarity\n    elif type(self.hdbscan_model) == BaseCluster:\n        logger.info(\"Predicting topic assignments through cosine similarity of topic and document embeddings.\")\n        sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))\n        predictions = np.argmax(sim_matrix, axis=1) - self._outliers\n\n        if self.calculate_probabilities:\n            probabilities = sim_matrix\n        else:\n            probabilities = np.max(sim_matrix, axis=1)\n\n    # Transform with full pipeline\n    else:\n        logger.info(\"Dimensionality - Reducing dimensionality of input embeddings.\")\n        umap_embeddings = self.umap_model.transform(embeddings)\n        logger.info(\"Dimensionality - Completed \\u2713\")\n\n        # Extract predictions and probabilities if it is a HDBSCAN-like model\n        logger.info(\"Clustering - 
Approximating new points with `hdbscan_model`\")\n        if is_supported_hdbscan(self.hdbscan_model):\n            predictions, probabilities = hdbscan_delegator(self.hdbscan_model, \"approximate_predict\", umap_embeddings)\n\n            # Calculate probabilities\n            if self.calculate_probabilities:\n                logger.info(\"Probabilities - Start calculation of probabilities with HDBSCAN\")\n                probabilities = hdbscan_delegator(self.hdbscan_model, \"membership_vector\", umap_embeddings)\n                logger.info(\"Probabilities - Completed \\u2713\")\n        else:\n            predictions = self.hdbscan_model.predict(umap_embeddings)\n            probabilities = None\n        logger.info(\"Cluster - Completed \\u2713\")\n\n        # Map probabilities and predictions\n        probabilities = self._map_probabilities(probabilities, original_topics=True)\n        predictions = self._map_predictions(predictions)\n    return predictions, probabilities\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.update_topics","title":"update_topics(self, docs, images=None, topics=None, top_n_words=10, n_gram_range=None, vectorizer_model=None, ctfidf_model=None, representation_model=None)","text":"

        Updates the topic representation by recalculating c-TF-IDF with the new parameters as defined in this function.

        When you have trained a model and viewed the topics and the words that represent them, you might not be satisfied with the representation. Perhaps you forgot to remove stop_words or you want to try out a different n_gram_range. This function allows you to update the topic representation after they have been formed.

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required images List[str]

        The images you used when calling either fit or fit_transform

        None topics List[int]

        A list of topics where each topic is related to a document in docs. Use this variable to change or map the topics. NOTE: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline

        None top_n_words int

        The number of words per topic to extract. Setting this too high can negatively impact topic embeddings as topics are typically best represented by at most 10 words.

        10 n_gram_range Tuple[int, int]

        The n-gram range for the CountVectorizer.

        None vectorizer_model CountVectorizer

        Pass in your own CountVectorizer from scikit-learn

        None ctfidf_model ClassTfidfTransformer

        Pass in your own c-TF-IDF model to update the representations

        None representation_model BaseRepresentation

        Pass in a model that fine-tunes the topic representations calculated through c-TF-IDF. Models from bertopic.representation are supported.

        None

        Examples:

        In order to update the topic representation, you will need to first fit the topic model and extract topics from them. Based on these, you can update the representation:

        topic_model.update_topics(docs, n_gram_range=(2, 3))\n

        You can also use a custom vectorizer to update the representation:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\")\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        You can also use this function to change or map the topics to something else. You can update them as follows:

        topic_model.update_topics(docs, my_updated_topics)\n
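
        You can likewise fine-tune the c-TF-IDF representation by passing a representation model; a minimal sketch using KeyBERTInspired from bertopic.representation:

        from bertopic.representation import KeyBERTInspired\n\n# fine-tune the c-TF-IDF topic words with a KeyBERT-inspired representation\nrepresentation_model = KeyBERTInspired()\ntopic_model.update_topics(docs, representation_model=representation_model)\n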
        Source code in bertopic\\_bertopic.py
        def update_topics(self,\n                  docs: List[str],\n                  images: List[str] = None,\n                  topics: List[int] = None,\n                  top_n_words: int = 10,\n                  n_gram_range: Tuple[int, int] = None,\n                  vectorizer_model: CountVectorizer = None,\n                  ctfidf_model: ClassTfidfTransformer = None,\n                  representation_model: BaseRepresentation = None):\n    \"\"\" Updates the topic representation by recalculating c-TF-IDF with the new\n    parameters as defined in this function.\n\n    When you have trained a model and viewed the topics and the words that represent them,\n    you might not be satisfied with the representation. Perhaps you forgot to remove\n    stop_words or you want to try out a different n_gram_range. This function allows you\n    to update the topic representation after they have been formed.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        images: The images you used when calling either `fit` or `fit_transform`\n        topics: A list of topics where each topic is related to a document in `docs`.\n                Use this variable to change or map the topics.\n                NOTE: Using a custom list of topic assignments may lead to errors if\n                      topic reduction techniques are used afterwards. Make sure that\n                      manually assigning topics is the last step in the pipeline\n        top_n_words: The number of words per topic to extract. Setting this\n                     too high can negatively impact topic embeddings as topics\n                     are typically best represented by at most 10 words.\n        n_gram_range: The n-gram range for the CountVectorizer.\n        vectorizer_model: Pass in your own CountVectorizer from scikit-learn\n        ctfidf_model: Pass in your own c-TF-IDF model to update the representations\n        representation_model: Pass in a model that fine-tunes the topic representations\n                              calculated through c-TF-IDF. Models from `bertopic.representation`\n                              are supported.\n\n    Examples:\n\n    In order to update the topic representation, you will need to first fit the topic\n    model and extract topics from them. 
Based on these, you can update the representation:\n\n    ```python\n    topic_model.update_topics(docs, n_gram_range=(2, 3))\n    ```\n\n    You can also use a custom vectorizer to update the representation:\n\n    ```python\n    from sklearn.feature_extraction.text import CountVectorizer\n    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\")\n    topic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n    ```\n\n    You can also use this function to change or map the topics to something else.\n    You can update them as follows:\n\n    ```python\n    topic_model.update_topics(docs, my_updated_topics)\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    check_is_fitted(self)\n    if not n_gram_range:\n        n_gram_range = self.n_gram_range\n\n    if top_n_words > 100:\n        logger.warning(\"Note that extracting more than 100 words from a sparse \"\n                       \"can slow down computation quite a bit.\")\n    self.top_n_words = top_n_words\n    self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)\n    self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n    self.representation_model = representation_model\n\n    if topics is None:\n        topics = self.topics_\n    else:\n        logger.warning(\"Using a custom list of topic assignments may lead to errors if \"\n                       \"topic reduction techniques are used afterwards. Make sure that \"\n                       \"manually assigning topics is the last step in the pipeline.\"\n                       \"Note that topic embeddings will also be created through weighted\"\n                       \"c-TF-IDF embeddings instead of centroid embeddings.\")\n\n    self._outliers = 1 if -1 in set(topics) else 0\n\n    # Extract words\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": topics, \"ID\": range(len(docs)), \"Image\": images})\n    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})\n    self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)\n    self.topic_representations_ = self._extract_words_per_topic(words, documents)\n\n    # Update topic vectors\n    if set(topics) != self.topics_:\n\n        # Remove outlier topic embedding if all that has changed is the outlier class\n        same_position = all([True if old_topic == new_topic else False for old_topic, new_topic in zip(self.topics_, topics) if old_topic != -1])\n        if same_position and -1 not in topics and -1 in self.topics_:\n            self.topic_embeddings_ = self.topic_embeddings_[1:]\n        else:\n            self._create_topic_vectors()\n\n    # Update topic labels\n    self.topic_labels_ = {key: f\"{key}_\" + \"_\".join([word[0] for word in values[:4]])\n                          for key, values in\n                          self.topic_representations_.items()}\n    self._update_topic_size(documents)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_approximate_distribution","title":"visualize_approximate_distribution(self, document, topic_token_distribution, normalize=False)","text":"

        Visualize the topic distribution calculated by .approximate_distribution on a token level, indicating the extent to which a certain word or phrase belongs to a specific topic. The assumption here is that a single word can belong to multiple similar topics and, as such, can give information about the broader set of topics within a single document.

        Parameters:

        topic_model (required): A fitted BERTopic instance.

        document (str, required): The document for which you want to visualize the approximated topic distribution.

        topic_token_distribution (ndarray, required): The topic-token distribution of the document as extracted by .approximate_distribution.

        normalize (bool, default False): Whether to normalize, between 0 and 1 (summing up to 1), the topic distribution values.

        Returns:

        df: A stylized dataframe indicating the best fitting topics for each token.

        Examples:

        # Calculate the topic distributions on a token level
        # Note that we need to have `calculate_token_level=True`
        topic_distr, topic_token_distr = topic_model.approximate_distribution(
                docs, calculate_token_level=True
        )

        # Visualize the approximated topic distributions
        df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])
        df

        To revert this stylized dataframe back to a regular dataframe, you can run the following:

        df.data.columns = [column.strip() for column in df.data.columns]
        df = df.data
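
        If you would rather keep the stylized view, note that the returned object is a pandas Styler, so it can be written out as HTML. A minimal sketch, assuming a pandas version where Styler.to_html is available; the output path is purely illustrative:

        # Render the stylized dataframe to an HTML string and save it
        html = df.to_html()
        with open("path/to/token_distribution.html", "w") as f:
            f.write(html)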
        Source code in bertopic\\_bertopic.py
        def visualize_approximate_distribution(self,\n                                       document: str,\n                                       topic_token_distribution: np.ndarray,\n                                       normalize: bool = False):\n    \"\"\" Visualize the topic distribution calculated by `.approximate_topic_distribution`\n    on a token level. Thereby indicating the extent to which a certain word or phrase belongs\n    to a specific topic. The assumption here is that a single word can belong to multiple\n    similar topics and as such can give information about the broader set of topics within\n    a single document.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        document: The document for which you want to visualize\n                  the approximated topic distribution.\n        topic_token_distribution: The topic-token distribution of the document as\n                                  extracted by `.approximate_topic_distribution`\n        normalize: Whether to normalize, between 0 and 1 (summing up to 1), the\n                   topic distribution values.\n\n    Returns:\n        df: A stylized dataframe indicating the best fitting topics\n            for each token.\n\n    Examples:\n\n    ```python\n    # Calculate the topic distributions on a token level\n    # Note that we need to have `calculate_token_level=True`\n    topic_distr, topic_token_distr = topic_model.approximate_distribution(\n            docs, calculate_token_level=True\n    )\n\n    # Visualize the approximated topic distributions\n    df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])\n    df\n    ```\n\n    To revert this stylized dataframe back to a regular dataframe,\n    you can run the following:\n\n    ```python\n    df.data.columns = [column.strip() for column in df.data.columns]\n    df = df.data\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_approximate_distribution(self,\n                                                       document=document,\n                                                       topic_token_distribution=topic_token_distribution,\n                                                       normalize=normalize)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_barchart","title":"visualize_barchart(self, topics=None, top_n_topics=8, n_words=5, custom_labels=False, title='Topic Word Scores', width=250, height=250, autoscale=False)","text":"

        Visualize a barchart of selected topics

        Parameters:

        topics (List[int], default None): A selection of topics to visualize.

        top_n_topics (int, default 8): Only select the top n most frequent topics.

        n_words (int, default 5): Number of words to show in a topic.

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        title (str, default 'Topic Word Scores'): Title of the plot.

        width (int, default 250): The width of each figure.

        height (int, default 250): The height of each figure.

        autoscale (bool, default False): Whether to automatically calculate the height of the figures to fit the whole bar text.

        Returns:

        fig: A plotly figure.

        Examples:

        To visualize the barchart of selected topics simply run:

        topic_model.visualize_barchart()

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_barchart()
        fig.write_html("path/to/file.html")
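
        If you have defined custom labels, you can show those instead of the default keyword-based topic names. A minimal sketch, assuming a fitted topic_model; the label mapping below is purely illustrative:

        # Hypothetical labels; replace them with names that fit your own topics
        topic_model.set_topic_labels({0: "Space", 1: "Medicine"})
        topic_model.visualize_barchart(custom_labels=True)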
        Source code in bertopic\\_bertopic.py
        def visualize_barchart(self,\n                       topics: List[int] = None,\n                       top_n_topics: int = 8,\n                       n_words: int = 5,\n                       custom_labels: bool = False,\n                       title: str = \"Topic Word Scores\",\n                       width: int = 250,\n                       height: int = 250,\n                       autoscale: bool=False) -> go.Figure:\n    \"\"\" Visualize a barchart of selected topics\n\n    Arguments:\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_words: Number of words to show in a topic\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of each figure.\n        height: The height of each figure.\n        autoscale: Whether to automatically calculate the height of the figures to fit the whole bar text\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the barchart of selected topics\n    simply run:\n\n    ```python\n    topic_model.visualize_barchart()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_barchart()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_barchart(self,\n                                       topics=topics,\n                                       top_n_topics=top_n_topics,\n                                       n_words=n_words,\n                                       custom_labels=custom_labels,\n                                       title=title,\n                                       width=width,\n                                       height=height,\n                                       autoscale=autoscale)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_distribution","title":"visualize_distribution(self, probabilities, min_probability=0.015, custom_labels=False, title='<b>Topic Probability Distribution</b>', width=800, height=600)","text":"

        Visualize the distribution of topic probabilities

        Parameters:

        probabilities (ndarray, required): An array of probability scores.

        min_probability (float, default 0.015): The minimum probability score to visualize. All others are ignored.

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        title (str, default '<b>Topic Probability Distribution</b>'): Title of the plot.

        width (int, default 800): The width of the figure.

        height (int, default 600): The height of the figure.

        Examples:

        Make sure to fit the model beforehand and to only pass in the probabilities of a single document:

        topic_model.visualize_distribution(topic_model.probabilities_[0])

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_distribution(topic_model.probabilities_[0])
        fig.write_html("path/to/file.html")
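
        Keep in mind that topic_model.probabilities_ is only populated when the model computes topic-document probabilities. A minimal sketch of such a setup, assuming the default HDBSCAN-based pipeline:

        from bertopic import BERTopic

        # calculate_probabilities=True makes the model compute soft topic-document probabilities,
        # which populates topic_model.probabilities_ after fitting
        topic_model = BERTopic(calculate_probabilities=True)
        topics, probs = topic_model.fit_transform(docs)
        topic_model.visualize_distribution(probs[0])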
        Source code in bertopic\\_bertopic.py
        def visualize_distribution(self,\n                           probabilities: np.ndarray,\n                           min_probability: float = 0.015,\n                           custom_labels: bool = False,\n                           title: str = \"<b>Topic Probability Distribution</b>\",\n                           width: int = 800,\n                           height: int = 600) -> go.Figure:\n    \"\"\" Visualize the distribution of topic probabilities\n\n    Arguments:\n        probabilities: An array of probability scores\n        min_probability: The minimum probability score to visualize.\n                         All others are ignored.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    Make sure to fit the model before and only input the\n    probabilities of a single document:\n\n    ```python\n    topic_model.visualize_distribution(topic_model.probabilities_[0])\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_distribution(topic_model.probabilities_[0])\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_distribution(self,\n                                           probabilities=probabilities,\n                                           min_probability=min_probability,\n                                           custom_labels=custom_labels,\n                                           title=title,\n                                           width=width,\n                                           height=height)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_document_datamap","title":"visualize_document_datamap(self, docs, topics=None, embeddings=None, reduced_embeddings=None, custom_labels=False, title='Documents and Topics', sub_title=None, width=1200, height=1200, **datamap_kwds)","text":"

        Visualize documents and their topics in 2D as a static plot for publication using DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best to use a sufficiently large min_topic_size or set nr_topics when building the model.

        Parameters:

        topic_model (required): A fitted BERTopic instance.

        docs (List[str], required): The documents you used when calling either fit or fit_transform.

        embeddings (ndarray, default None): The embeddings of all documents in docs.

        reduced_embeddings (ndarray, default None): The 2D reduced embeddings of all documents in docs.

        custom_labels (Union[bool, str], default False): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1".

        title (str, default 'Documents and Topics'): Title of the plot.

        sub_title (Optional[str], default None): Sub-title of the plot.

        width (int, default 1200): The width of the figure.

        height (int, default 1200): The height of the figure.

        **datamap_kwds (default {}): All further keyword args will be passed on to DataMapPlot's create_plot function. See the DataMapPlot documentation for more details.

        Returns:

        figure: A Matplotlib Figure object.

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_document_datamap(docs)

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

        from sklearn.datasets import fetch_20newsgroups
        from sentence_transformers import SentenceTransformer
        from bertopic import BERTopic
        from umap import UMAP

        # Prepare embeddings
        docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = sentence_model.encode(docs, show_progress_bar=False)

        # Train BERTopic
        topic_model = BERTopic(min_topic_size=36).fit(docs, embeddings)

        # Reduce dimensionality of embeddings, this step is optional
        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

        # Run the visualization with the original embeddings
        topic_model.visualize_document_datamap(docs, embeddings=embeddings)

        # Or, if you have reduced the original embeddings already:
        topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
        fig.savefig("path/to/file.png", bbox_inches="tight")
        Source code in bertopic\\_bertopic.py
        def visualize_document_datamap(self,\n                               docs: List[str],\n                               topics: List[int] = None,\n                               embeddings: np.ndarray = None,\n                               reduced_embeddings: np.ndarray = None,\n                               custom_labels: Union[bool, str] = False,\n                               title: str = \"Documents and Topics\",\n                               sub_title: Union[str, None] = None,\n                               width: int = 1200,\n                               height: int = 1200,\n                               **datamap_kwds):\n    \"\"\" Visualize documents and their topics in 2D as a static plot for publication using\n    DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best\n    to use a sufficiently large `min_topic_size` or set `nr_topics` when building the model.\n\n    Arguments:\n        topic_model:  A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        embeddings:  The embeddings of all documents in `docs`.\n        reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.\n        custom_labels:  If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        sub_title: Sub-title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n        **datamap_kwds:  All further keyword args will be passed on to DataMapPlot's\n                         `create_plot` function. See the DataMapPlot documentation\n                         for more details.\n\n    Returns:\n        figure: A Matplotlib Figure object.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_document_datamap(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic(min_topic_size=36).fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    fig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    return 
plotting.visualize_document_datamap(self,\n                                               docs,\n                                               topics,\n                                               embeddings,\n                                               reduced_embeddings,\n                                               custom_labels,\n                                               title,\n                                               sub_title,\n                                               width,\n                                               height,\n                                               **datamap_kwds)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_documents","title":"visualize_documents(self, docs, topics=None, embeddings=None, reduced_embeddings=None, sample=None, hide_annotations=False, hide_document_hover=False, custom_labels=False, title='<b>Documents and Topics</b>', width=1200, height=750)","text":"

        Visualize documents and their topics in 2D

        Parameters:

        topic_model (required): A fitted BERTopic instance.

        docs (List[str], required): The documents you used when calling either fit or fit_transform.

        topics (List[int], default None): A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5].

        embeddings (ndarray, default None): The embeddings of all documents in docs.

        reduced_embeddings (ndarray, default None): The 2D reduced embeddings of all documents in docs.

        sample (float, default None): The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen.

        hide_annotations (bool, default False): Hide the names of the traces on top of each cluster.

        hide_document_hover (bool, default False): Hide the content of the documents when hovering over specific points. Helps to speed up generation of the visualization.

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        title (str, default '<b>Documents and Topics</b>'): Title of the plot.

        width (int, default 1200): The width of the figure.

        height (int, default 750): The height of the figure.

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_documents(docs)

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

        from sklearn.datasets import fetch_20newsgroups
        from sentence_transformers import SentenceTransformer
        from bertopic import BERTopic
        from umap import UMAP

        # Prepare embeddings
        docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = sentence_model.encode(docs, show_progress_bar=False)

        # Train BERTopic
        topic_model = BERTopic().fit(docs, embeddings)

        # Reduce dimensionality of embeddings, this step is optional
        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

        # Run the visualization with the original embeddings
        topic_model.visualize_documents(docs, embeddings=embeddings)

        # Or, if you have reduced the original embeddings already:
        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
        fig.write_html("path/to/file.html")
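
        For very large corpora, the sample and hide_document_hover parameters described above help keep the figure responsive. A minimal sketch, assuming a fitted topic_model and the reduced_embeddings from the pipeline above:

        # Plot roughly 5% of the documents per topic and skip the hover text to speed up rendering
        topic_model.visualize_documents(
            docs,
            reduced_embeddings=reduced_embeddings,
            sample=0.05,
            hide_document_hover=True,
        )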
        Source code in bertopic\\_bertopic.py
        def visualize_documents(self,\n                        docs: List[str],\n                        topics: List[int] = None,\n                        embeddings: np.ndarray = None,\n                        reduced_embeddings: np.ndarray = None,\n                        sample: float = None,\n                        hide_annotations: bool = False,\n                        hide_document_hover: bool = False,\n                        custom_labels: bool = False,\n                        title: str = \"<b>Documents and Topics</b>\",\n                        width: int = 1200,\n                        height: int = 750) -> go.Figure:\n    \"\"\" Visualize documents and their topics in 2D\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                            specific points. 
Helps to speed up generation of visualization.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_documents(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic().fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_documents(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../getting_started/visualization/documents.html\"\n    style=\"width:1000px; height: 800px; border: 0px;\"\"></iframe>\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    return plotting.visualize_documents(self,\n                                        docs=docs,\n                                        topics=topics,\n                                        embeddings=embeddings,\n                                        reduced_embeddings=reduced_embeddings,\n                                        sample=sample,\n                                        hide_annotations=hide_annotations,\n                                        hide_document_hover=hide_document_hover,\n                                        custom_labels=custom_labels,\n                                        title=title,\n                                        width=width,\n                                        height=height)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_heatmap","title":"visualize_heatmap(self, topics=None, top_n_topics=None, n_clusters=None, custom_labels=False, title='<b>Similarity Matrix</b>', width=800, height=800)","text":"

        Visualize a heatmap of the topics' similarity matrix

        Based on the cosine similarity matrix between topic embeddings, a heatmap is created showing the similarity between topics.

        Parameters:

        topics (List[int], default None): A selection of topics to visualize.

        top_n_topics (int, default None): Only select the top n most frequent topics.

        n_clusters (int, default None): Create n clusters and order the similarity matrix by those clusters.

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        title (str, default '<b>Similarity Matrix</b>'): Title of the plot.

        width (int, default 800): The width of the figure.

        height (int, default 800): The height of the figure.

        Returns:

        fig: A plotly figure.

        Examples:

        To visualize the similarity matrix of topics simply run:

        topic_model.visualize_heatmap()

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_heatmap()
        fig.write_html("path/to/file.html")
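
        To make block structure easier to spot, you can order the similarity matrix by clusters of topics via the n_clusters parameter described above. A minimal sketch, assuming a fitted topic_model with more than five topics; the numbers are purely illustrative:

        # Order the 20 most frequent topics into 5 clusters before drawing the heatmap
        fig = topic_model.visualize_heatmap(top_n_topics=20, n_clusters=5)
        fig.write_html("path/to/heatmap.html")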
        Source code in bertopic\\_bertopic.py
        def visualize_heatmap(self,\n                      topics: List[int] = None,\n                      top_n_topics: int = None,\n                      n_clusters: int = None,\n                      custom_labels: bool = False,\n                      title: str = \"<b>Similarity Matrix</b>\",\n                      width: int = 800,\n                      height: int = 800) -> go.Figure:\n    \"\"\" Visualize a heatmap of the topic's similarity matrix\n\n    Based on the cosine similarity matrix between topic embeddings,\n    a heatmap is created showing the similarity between topics.\n\n    Arguments:\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_clusters: Create n clusters and order the similarity\n                    matrix by those clusters.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the similarity matrix of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_heatmap()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_heatmap()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_heatmap(self,\n                                      topics=topics,\n                                      top_n_topics=top_n_topics,\n                                      n_clusters=n_clusters,\n                                      custom_labels=custom_labels,\n                                      title=title,\n                                      width=width,\n                                      height=height)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_hierarchical_documents","title":"visualize_hierarchical_documents(self, docs, hierarchical_topics, topics=None, embeddings=None, reduced_embeddings=None, sample=None, hide_annotations=False, hide_document_hover=True, nr_levels=10, level_scale='linear', custom_labels=False, title='<b>Hierarchical Documents and Topics</b>', width=1200, height=750)","text":"

        Visualize documents and their topics in 2D at different levels of hierarchy

        Parameters:

        docs (List[str], required): The documents you used when calling either fit or fit_transform.

        hierarchical_topics (DataFrame, required): A dataframe that contains a hierarchy of topics represented by their parents and their children.

        topics (List[int], default None): A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5].

        embeddings (ndarray, default None): The embeddings of all documents in docs.

        reduced_embeddings (ndarray, default None): The 2D reduced embeddings of all documents in docs.

        sample (Union[float, int], default None): The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen.

        hide_annotations (bool, default False): Hide the names of the traces on top of each cluster.

        hide_document_hover (bool, default True): Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualizations.

        nr_levels (int, default 10): The number of levels to be visualized in the hierarchy. First, the distances in hierarchical_topics.Distance are split into nr_levels lists of distances of equal length. Then, for each list of distances, the merged topics with a distance less than or equal to the maximum distance of that list are selected. NOTE: To get all possible merge steps, make sure that nr_levels is equal to the length of hierarchical_topics.

        level_scale (str, default 'linear'): Whether to apply a linear or logarithmic ('log') scale to the levels of the distance vector. Linear scaling will perform an equal number of merges at each level, while logarithmic scaling will perform more merges in earlier levels to provide more resolution at higher levels (useful when the number of topics is large).

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels. NOTE: Custom labels are only generated for the original un-merged topics.

        title (str, default '<b>Hierarchical Documents and Topics</b>'): Title of the plot.

        width (int, default 1200): The width of the figure.

        height (int, default 750): The height of the figure.

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

        from sklearn.datasets import fetch_20newsgroups
        from sentence_transformers import SentenceTransformer
        from bertopic import BERTopic
        from umap import UMAP

        # Prepare embeddings
        docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = sentence_model.encode(docs, show_progress_bar=False)

        # Train BERTopic and extract hierarchical topics
        topic_model = BERTopic().fit(docs, embeddings)
        hierarchical_topics = topic_model.hierarchical_topics(docs)

        # Reduce dimensionality of embeddings, this step is optional
        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

        # Run the visualization with the original embeddings
        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)

        # Or, if you have reduced the original embeddings already:
        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
        fig.write_html("path/to/file.html")
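
        If you want one visualization level per merge step, you can tie nr_levels to the length of hierarchical_topics, as the NOTE above suggests. A minimal sketch, assuming the docs, hierarchical_topics, and reduced_embeddings from the pipeline above:

        # One level in the plot for every merge step in the hierarchy
        topic_model.visualize_hierarchical_documents(
            docs,
            hierarchical_topics,
            reduced_embeddings=reduced_embeddings,
            nr_levels=len(hierarchical_topics),
        )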
        Source code in bertopic\\_bertopic.py
        def visualize_hierarchical_documents(self,\n                                     docs: List[str],\n                                     hierarchical_topics: pd.DataFrame,\n                                     topics: List[int] = None,\n                                     embeddings: np.ndarray = None,\n                                     reduced_embeddings: np.ndarray = None,\n                                     sample: Union[float, int] = None,\n                                     hide_annotations: bool = False,\n                                     hide_document_hover: bool = True,\n                                     nr_levels: int = 10,\n                                     level_scale: str = 'linear',\n                                     custom_labels: bool = False,\n                                     title: str = \"<b>Hierarchical Documents and Topics</b>\",\n                                     width: int = 1200,\n                                     height: int = 750) -> go.Figure:\n    \"\"\" Visualize documents and their topics in 2D at different levels of hierarchy\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                            represented by their parents and their children\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                             specific points. Helps to speed up generation of visualizations.\n        nr_levels: The number of levels to be visualized in the hierarchy. First, the distances\n                   in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with\n                   equal length. Then, for each list of distances, the merged topics, that have \n                   a distance less or equal to the maximum distance of the selected list of distances, are selected.\n                   NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to\n                   the length of `hierarchical_topics`.\n        level_scale: Whether to apply a linear or logarithmic ('log') scale levels of the distance\n                     vector. 
Linear scaling will perform an equal number of merges at each level\n                     while logarithmic scaling will perform more mergers in earlier levels to\n                     provide more resolution at higher levels (this can be used for when the number\n                     of topics is large).\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       NOTE: Custom labels are only generated for the original\n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic and extract hierarchical topics\n    topic_model = BERTopic().fit(docs, embeddings)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../getting_started/visualization/hierarchical_documents.html\"\n    style=\"width:1000px; height: 770px; border: 0px;\"\"></iframe>\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    return plotting.visualize_hierarchical_documents(self,\n                                                     docs=docs,\n                                                     hierarchical_topics=hierarchical_topics,\n                                                     topics=topics,\n                                                     embeddings=embeddings,\n                                                     reduced_embeddings=reduced_embeddings,\n                                                     sample=sample,\n                                                     hide_annotations=hide_annotations,\n                                                     hide_document_hover=hide_document_hover,\n                                                     nr_levels=nr_levels,\n                                                     level_scale=level_scale,\n                                                   
  custom_labels=custom_labels,\n                                                     title=title,\n                                                     width=width,\n                                                     height=height)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_hierarchy","title":"visualize_hierarchy(self, orientation='left', topics=None, top_n_topics=None, custom_labels=False, title='<b>Hierarchical Clustering</b>', width=1000, height=600, hierarchical_topics=None, linkage_function=None, distance_function=None, color_threshold=1)","text":"

        Visualize a hierarchical structure of the topics

        A ward linkage function is used to perform the hierarchical clustering based on the cosine distance matrix between topic embeddings.

        Parameters:

        topic_model (required): A fitted BERTopic instance.

        orientation (str, default 'left'): The orientation of the figure. Either 'left' or 'bottom'.

        topics (List[int], default None): A selection of topics to visualize.

        top_n_topics (int, default None): Only select the top n most frequent topics.

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels. NOTE: Custom labels are only generated for the original un-merged topics.

        title (str, default '<b>Hierarchical Clustering</b>'): Title of the plot.

        width (int, default 1000): The width of the figure. Only works if orientation is set to 'left'.

        height (int, default 600): The height of the figure. Only works if orientation is set to 'bottom'.

        hierarchical_topics (DataFrame, default None): A dataframe that contains a hierarchy of topics represented by their parents and their children. NOTE: The hierarchical topic names are only visualized if both topics and top_n_topics are not set.

        linkage_function (Callable[[scipy.sparse._csr.csr_matrix], numpy.ndarray], default None): The linkage function to use. Default is: lambda x: sch.linkage(x, 'ward', optimal_ordering=True). NOTE: Make sure to use the same linkage_function as used in topic_model.hierarchical_topics.

        distance_function (Callable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix], default None): The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x). NOTE: Make sure to use the same distance_function as used in topic_model.hierarchical_topics.

        color_threshold (int, default 1): Value at which the separation of clusters will be made, resulting in different colors for different clusters. A higher value will typically lead to fewer colored clusters.

        Returns:

        fig: A plotly figure.

        Examples:

        To visualize the hierarchical structure of topics simply run:

        topic_model.visualize_hierarchy()

        If you also want the labels of hierarchical topics visualized, run the following:

        # Extract hierarchical topics and their representations
        hierarchical_topics = topic_model.hierarchical_topics(docs)

        # Visualize these representations
        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

        If you want to save the resulting figure:

        fig = topic_model.visualize_hierarchy()
        fig.write_html("path/to/file.html")
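
        As the notes above stress, any custom linkage_function or distance_function should match the ones used in topic_model.hierarchical_topics. A minimal sketch that passes the documented defaults explicitly to both calls:

        import scipy.cluster.hierarchy as sch
        from sklearn.metrics.pairwise import cosine_similarity

        # Use the same functions for building and for visualizing the hierarchy
        linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)
        distance_function = lambda x: 1 - cosine_similarity(x)

        hierarchical_topics = topic_model.hierarchical_topics(
            docs, linkage_function=linkage_function, distance_function=distance_function
        )
        topic_model.visualize_hierarchy(
            hierarchical_topics=hierarchical_topics,
            linkage_function=linkage_function,
            distance_function=distance_function,
        )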
        Source code in bertopic\\_bertopic.py
        def visualize_hierarchy(self,\n                        orientation: str = \"left\",\n                        topics: List[int] = None,\n                        top_n_topics: int = None,\n                        custom_labels: bool = False,\n                        title: str = \"<b>Hierarchical Clustering</b>\",\n                        width: int = 1000,\n                        height: int = 600,\n                        hierarchical_topics: pd.DataFrame = None,\n                        linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n                        distance_function: Callable[[csr_matrix], csr_matrix] = None,\n                        color_threshold: int = 1) -> go.Figure:\n    \"\"\" Visualize a hierarchical structure of the topics\n\n    A ward linkage function is used to perform the\n    hierarchical clustering based on the cosine distance\n    matrix between topic embeddings.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        orientation: The orientation of the figure.\n                     Either 'left' or 'bottom'\n        topics: A selection of topics to visualize\n        top_n_topics: Only select the top n most frequent topics\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       NOTE: Custom labels are only generated for the original\n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure. Only works if orientation is set to 'left'\n        height: The height of the figure. Only works if orientation is set to 'bottom'\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children.\n                             NOTE: The hierarchical topic names are only visualized\n                             if both `topics` and `top_n_topics` are not set.\n        linkage_function: The linkage function to use. Default is:\n                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n                          NOTE: Make sure to use the same `linkage_function` as used\n                          in `topic_model.hierarchical_topics`.\n        distance_function: The distance function to use on the c-TF-IDF matrix. 
Default is:\n                           `lambda x: 1 - cosine_similarity(x)`\n                           NOTE: Make sure to use the same `distance_function` as used\n                           in `topic_model.hierarchical_topics`.\n        color_threshold: Value at which the separation of clusters will be made which\n                         will result in different colors for different clusters.\n                         A higher value will typically lead to less colored clusters.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the hierarchical structure of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchy()\n    ```\n\n    If you also want the labels of hierarchical topics visualized,\n    run the following:\n\n    ```python\n    # Extract hierarchical topics and their representations\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Visualize these representations\n    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n    ```\n\n    If you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchy()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../getting_started/visualization/hierarchy.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_hierarchy(self,\n                                        orientation=orientation,\n                                        topics=topics,\n                                        top_n_topics=top_n_topics,\n                                        custom_labels=custom_labels,\n                                        title=title,\n                                        width=width,\n                                        height=height,\n                                        hierarchical_topics=hierarchical_topics,\n                                        linkage_function=linkage_function,\n                                        distance_function=distance_function,\n                                        color_threshold=color_threshold\n                                        )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_term_rank","title":"visualize_term_rank(self, topics=None, log_scale=False, custom_labels=False, title='<b>Term score decline per Topic</b>', width=800, height=500)","text":"

        Visualize the ranks of all terms across all topics

        Each topic is represented by a set of words. These words, however, do not all equally represent the topic. This visualization shows how many words are needed to represent a topic and at which point the beneficial effect of adding words starts to decline.

        Parameters:

        topics (List[int], default None): A selection of topics to visualize. These will be colored red where all others will be colored black.

        log_scale (bool, default False): Whether to represent the ranking on a log scale.

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        title (str, default '<b>Term score decline per Topic</b>'): Title of the plot.

        width (int, default 800): The width of the figure.

        height (int, default 500): The height of the figure.

        Returns:

        fig: A plotly figure.

        Examples:

        To visualize the ranks of all words across all topics simply run:

        topic_model.visualize_term_rank()

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_term_rank()
        fig.write_html("path/to/file.html")
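
        When term scores differ by orders of magnitude between topics, the log_scale parameter described above makes the decline easier to compare. A minimal sketch, assuming a fitted topic_model; the highlighted topic ids are purely illustrative:

        # Highlight a few topics in red and show the ranking on a log scale
        fig = topic_model.visualize_term_rank(topics=[1, 2, 3], log_scale=True)
        fig.write_html("path/to/term_rank.html")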

        Reference:

        This visualization was heavily inspired by the "Term Probability Decline" visualization found in an analysis by the amazing tmtoolkit (https://tmtoolkit.readthedocs.io/). Reference to that specific analysis can be found at https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html.

        Source code in bertopic\\_bertopic.py
        def visualize_term_rank(self,\n                        topics: List[int] = None,\n                        log_scale: bool = False,\n                        custom_labels: bool = False,\n                        title: str = \"<b>Term score decline per Topic</b>\",\n                        width: int = 800,\n                        height: int = 500) -> go.Figure:\n    \"\"\" Visualize the ranks of all terms across all topics\n\n    Each topic is represented by a set of words. These words, however,\n    do not all equally represent the topic. This visualization shows\n    how many words are needed to represent a topic and at which point\n    the beneficial effect of adding words starts to decline.\n\n    Arguments:\n        topics: A selection of topics to visualize. These will be colored\n                red where all others will be colored black.\n        log_scale: Whether to represent the ranking on a log scale\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the ranks of all words across\n    all topics simply run:\n\n    ```python\n    topic_model.visualize_term_rank()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_term_rank()\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    Reference:\n\n    This visualization was heavily inspired by the\n    \"Term Probability Decline\" visualization found in an\n    analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).\n    Reference to that specific analysis can be found\n    [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_term_rank(self,\n                                        topics=topics,\n                                        log_scale=log_scale,\n                                        custom_labels=custom_labels,\n                                        title=title,\n                                        width=width,\n                                        height=height)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_topics","title":"visualize_topics(self, topics=None, top_n_topics=None, custom_labels=False, title='<b>Intertopic Distance Map</b>', width=650, height=650)","text":"

        Visualize topics, their sizes, and their corresponding words

        This visualization is highly inspired by LDAvis, a great visualization technique typically reserved for LDA.

        Parameters:

        topics (List[int], default None): A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5].

        top_n_topics (int, default None): Only select the top n most frequent topics.

        custom_labels (bool, default False): Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        title (str, default '<b>Intertopic Distance Map</b>'): Title of the plot.

        width (int, default 650): The width of the figure.

        height (int, default 650): The height of the figure.

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_topics()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics()\nfig.write_html(\"path/to/file.html\")\n
        Source code in bertopic\\_bertopic.py
        def visualize_topics(self,\n                     topics: List[int] = None,\n                     top_n_topics: int = None,\n                     custom_labels: bool = False,\n                     title: str = \"<b>Intertopic Distance Map</b>\",\n                     width: int = 650,\n                     height: int = 650) -> go.Figure:\n    \"\"\" Visualize topics, their sizes, and their corresponding words\n\n    This visualization is highly inspired by LDAvis, a great visualization\n    technique typically reserved for LDA.\n\n    Arguments:\n        topics: A selection of topics to visualize\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        top_n_topics: Only select the top n most frequent topics\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_topics()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_topics(self,\n                                     topics=topics,\n                                     top_n_topics=top_n_topics,\n                                     custom_labels=custom_labels,\n                                     title=title,\n                                     width=width,\n                                     height=height)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_topics_over_time","title":"visualize_topics_over_time(self, topics_over_time, top_n_topics=None, topics=None, normalize_frequency=False, custom_labels=False, title='<b>Topics over Time</b>', width=1250, height=450)","text":"

        Visualize topics over time

        Parameters:

        Name Type Description Default topics_over_time DataFrame

        The topics you would like to be visualized with the corresponding topic representation

        required top_n_topics int

        To visualize the most frequent topics instead of all

        None topics List[int]

        Select which topics you would like to be visualized

        None normalize_frequency bool

        Whether to normalize each topic's frequency individually

        False custom_labels bool

        Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        False title str

        Title of the plot.

        '<b>Topics over Time</b>' width int

        The width of the figure.

        1250 height int

        The height of the figure.

        450

        Returns:

        Type Description Figure

        A plotly.graph_objects.Figure including all traces

        Examples:

        To visualize the topics over time, simply run:

        topics_over_time = topic_model.topics_over_time(docs, timestamps)\ntopic_model.visualize_topics_over_time(topics_over_time)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics_over_time(topics_over_time)\nfig.write_html(\"path/to/file.html\")\n
        Source code in bertopic\\_bertopic.py
        def visualize_topics_over_time(self,\n                               topics_over_time: pd.DataFrame,\n                               top_n_topics: int = None,\n                               topics: List[int] = None,\n                               normalize_frequency: bool = False,\n                               custom_labels: bool = False,\n                               title: str = \"<b>Topics over Time</b>\",\n                               width: int = 1250,\n                               height: int = 450) -> go.Figure:\n    \"\"\" Visualize topics over time\n\n    Arguments:\n        topics_over_time: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n\n    To visualize the topics over time, simply run:\n\n    ```python\n    topics_over_time = topic_model.topics_over_time(docs, timestamps)\n    topic_model.visualize_topics_over_time(topics_over_time)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_over_time(topics_over_time)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_topics_over_time(self,\n                                               topics_over_time=topics_over_time,\n                                               top_n_topics=top_n_topics,\n                                               topics=topics,\n                                               normalize_frequency=normalize_frequency,\n                                               custom_labels=custom_labels,\n                                               title=title,\n                                               width=width,\n                                               height=height)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_topics_per_class","title":"visualize_topics_per_class(self, topics_per_class, top_n_topics=10, topics=None, normalize_frequency=False, custom_labels=False, title='<b>Topics per Class</b>', width=1250, height=900)","text":"

        Visualize topics per class

        Parameters:

        Name Type Description Default topics_per_class DataFrame

        The topics you would like to be visualized with the corresponding topic representation

        required top_n_topics int

        To visualize the most frequent topics instead of all

        10 topics List[int]

        Select which topics you would like to be visualized

        None normalize_frequency bool

        Whether to normalize each topic's frequency individually

        False custom_labels bool

        Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        False title str

        Title of the plot.

        '<b>Topics per Class</b>' width int

        The width of the figure.

        1250 height int

        The height of the figure.

        900

        Returns:

        Type Description Figure

        A plotly.graph_objects.Figure including all traces

        Examples:

        To visualize the topics per class, simply run:

        topics_per_class = topic_model.topics_per_class(docs, classes)\ntopic_model.visualize_topics_per_class(topics_per_class)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics_per_class(topics_per_class)\nfig.write_html(\"path/to/file.html\")\n
        Source code in bertopic\\_bertopic.py
        def visualize_topics_per_class(self,\n                               topics_per_class: pd.DataFrame,\n                               top_n_topics: int = 10,\n                               topics: List[int] = None,\n                               normalize_frequency: bool = False,\n                               custom_labels: bool = False,\n                               title: str = \"<b>Topics per Class</b>\",\n                               width: int = 1250,\n                               height: int = 900) -> go.Figure:\n    \"\"\" Visualize topics per class\n\n    Arguments:\n        topics_per_class: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n\n    To visualize the topics per class, simply run:\n\n    ```python\n    topics_per_class = topic_model.topics_per_class(docs, classes)\n    topic_model.visualize_topics_per_class(topics_per_class)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_per_class(topics_per_class)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_topics_per_class(self,\n                                               topics_per_class=topics_per_class,\n                                               top_n_topics=top_n_topics,\n                                               topics=topics,\n                                               normalize_frequency=normalize_frequency,\n                                               custom_labels=custom_labels,\n                                               title=title,\n                                               width=width,\n                                               height=height)\n
        "},{"location":"api/ctfidf.html","title":"c-TF-IDF","text":"

        A Class-based TF-IDF procedure using scikit-learn's TfidfTransformer as a base.

        c-TF-IDF can best be explained as a TF-IDF formula adapted for multiple classes by joining all documents per class. Thus, each class is converted to a single document instead of a set of documents. The frequency of each word x is extracted for each class c and is l1-normalized. This constitutes the term frequency.

        Then, the term frequency is multiplied by the IDF, which is the logarithm of 1 plus the average number of words per class A divided by the frequency of word x across all classes.
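
        As an illustrative sketch of that weighting (not the library's implementation), the computation can be written out with NumPy for a toy class-term count matrix:

        import numpy as np\n\n# Toy class-term count matrix: 2 classes x 3 terms\nX = np.array([[4, 1, 0],\n              [1, 2, 3]])\n\n# Term frequency: l1-normalize each class (row)\ntf = X / X.sum(axis=1, keepdims=True)\n\n# IDF: log(1 + average number of words per class / frequency of each word across classes)\navg_nr_samples = X.sum(axis=1).mean()\ndf = X.sum(axis=0)\nidf = np.log(1 + avg_nr_samples / df)\n\nctfidf = tf * idf\n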

        Parameters:

        Name Type Description Default bm25_weighting bool

        Uses a BM25-inspired idf-weighting procedure instead of the procedure as defined in the c-TF-IDF formula. It uses the following weighting scheme: log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))

        False reduce_frequent_words bool

        Takes the square root of the bag-of-words after normalizing the matrix. Helps to reduce the impact of words that appear too frequently.

        False seed_words List[str]

        Specific words that will have their idf value multiplied by seed_multiplier. NOTE: This only affects words that have an exact match.

        None seed_multiplier float

        The value with which the idf values of the words in seed_words are multiplied.

        2

        Examples:

        transformer = ClassTfidfTransformer()\n
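
        To use a configured transformer within BERTopic, it can be passed as the ctfidf_model (shown here with example settings):

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n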
        Source code in bertopic\\vectorizers\\_ctfidf.py
        class ClassTfidfTransformer(TfidfTransformer):\n    \"\"\"\n    A Class-based TF-IDF procedure using scikit-learns TfidfTransformer as a base.\n\n    ![](../algorithm/c-TF-IDF.svg)\n\n    c-TF-IDF can best be explained as a TF-IDF formula adopted for multiple classes\n    by joining all documents per class. Thus, each class is converted to a single document\n    instead of set of documents. The frequency of each word **x** is extracted\n    for each class **c** and is **l1** normalized. This constitutes the term frequency.\n\n    Then, the term frequency is multiplied with IDF which is the logarithm of 1 plus\n    the average number of words per class **A** divided by the frequency of word **x**\n    across all classes.\n\n    Arguments:\n        bm25_weighting: Uses BM25-inspired idf-weighting procedure instead of the procedure\n                        as defined in the c-TF-IDF formula. It uses the following weighting scheme:\n                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`\n        reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.\n                               Helps to reduce the impact of words that appear too frequently.\n        seed_words: Specific words that will have their idf value increased by \n                    the value of `seed_multiplier`. \n                    NOTE: This will only increase the value of words that have an exact match.\n        seed_multiplier: The value with which the idf values of the words in `seed_words`\n                         are multiplied.\n\n    Examples:\n\n    ```python\n    transformer = ClassTfidfTransformer()\n    ```\n    \"\"\"\n    def __init__(self, \n                 bm25_weighting: bool = False, \n                 reduce_frequent_words: bool = False,\n                 seed_words: List[str] = None,\n                 seed_multiplier: float = 2\n                 ):\n        self.bm25_weighting = bm25_weighting\n        self.reduce_frequent_words = reduce_frequent_words\n        self.seed_words = seed_words\n        self.seed_multiplier = seed_multiplier\n        super(ClassTfidfTransformer, self).__init__()\n\n    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):\n        \"\"\"Learn the idf vector (global term weights).\n\n        Arguments:\n            X: A matrix of term/token counts.\n            multiplier: A multiplier for increasing/decreasing certain IDF scores\n        \"\"\"\n        X = check_array(X, accept_sparse=('csr', 'csc'))\n        if not sp.issparse(X):\n            X = sp.csr_matrix(X)\n        dtype = np.float64\n\n        if self.use_idf:\n            _, n_features = X.shape\n\n            # Calculate the frequency of words across all classes\n            df = np.squeeze(np.asarray(X.sum(axis=0)))\n\n            # Calculate the average number of samples as regularization\n            avg_nr_samples = int(X.sum(axis=1).mean())\n\n            # BM25-inspired weighting procedure\n            if self.bm25_weighting:\n                idf = np.log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))\n\n            # Divide the average number of samples by the word frequency\n            # +1 is added to force values to be positive\n            else:\n                idf = np.log((avg_nr_samples / df)+1)\n\n            # Multiplier to increase/decrease certain idf scores\n            if multiplier is not None:\n                idf = idf * multiplier\n\n            self._idf_diag = sp.diags(idf, offsets=0,\n                               
       shape=(n_features, n_features),\n                                      format='csr',\n                                      dtype=dtype)\n\n        return self\n\n    def transform(self, X: sp.csr_matrix):\n        \"\"\"Transform a count-based matrix to c-TF-IDF\n\n        Arguments:\n            X (sparse matrix): A matrix of term/token counts.\n\n        Returns:\n            X (sparse matrix): A c-TF-IDF matrix\n        \"\"\"\n        if self.use_idf:\n            X = normalize(X, axis=1, norm='l1', copy=False)\n\n            if self.reduce_frequent_words:\n                X.data = np.sqrt(X.data)\n\n            X = X * self._idf_diag\n\n        return X\n
        "},{"location":"api/ctfidf.html#bertopic.vectorizers._ctfidf.ClassTfidfTransformer.fit","title":"fit(self, X, multiplier=None)","text":"

        Learn the idf vector (global term weights).

        Parameters:

        Name Type Description Default X csr_matrix

        A matrix of term/token counts.

        required multiplier ndarray

        A multiplier for increasing/decreasing certain IDF scores

        None Source code in bertopic\\vectorizers\\_ctfidf.py
        def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):\n    \"\"\"Learn the idf vector (global term weights).\n\n    Arguments:\n        X: A matrix of term/token counts.\n        multiplier: A multiplier for increasing/decreasing certain IDF scores\n    \"\"\"\n    X = check_array(X, accept_sparse=('csr', 'csc'))\n    if not sp.issparse(X):\n        X = sp.csr_matrix(X)\n    dtype = np.float64\n\n    if self.use_idf:\n        _, n_features = X.shape\n\n        # Calculate the frequency of words across all classes\n        df = np.squeeze(np.asarray(X.sum(axis=0)))\n\n        # Calculate the average number of samples as regularization\n        avg_nr_samples = int(X.sum(axis=1).mean())\n\n        # BM25-inspired weighting procedure\n        if self.bm25_weighting:\n            idf = np.log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))\n\n        # Divide the average number of samples by the word frequency\n        # +1 is added to force values to be positive\n        else:\n            idf = np.log((avg_nr_samples / df)+1)\n\n        # Multiplier to increase/decrease certain idf scores\n        if multiplier is not None:\n            idf = idf * multiplier\n\n        self._idf_diag = sp.diags(idf, offsets=0,\n                                  shape=(n_features, n_features),\n                                  format='csr',\n                                  dtype=dtype)\n\n    return self\n
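
        A minimal sketch of fitting with a hypothetical multiplier array (one value per term) to boost the IDF scores of selected words:

        import numpy as np\nfrom scipy.sparse import csr_matrix\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\n# Hypothetical class-term count matrix: 2 classes x 3 terms\nX = csr_matrix(np.array([[4, 1, 2],\n                         [1, 2, 3]]))\n\n# Double the idf value of the first term, leave the rest unchanged\nmultiplier = np.ones(X.shape[1])\nmultiplier[0] = 2.0\n\ntransformer = ClassTfidfTransformer().fit(X, multiplier=multiplier)\n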
        "},{"location":"api/ctfidf.html#bertopic.vectorizers._ctfidf.ClassTfidfTransformer.transform","title":"transform(self, X)","text":"

        Transform a count-based matrix to c-TF-IDF

        Parameters:

        Name Type Description Default X sparse matrix

        A matrix of term/token counts.

        required

        Returns:

        Type Description X (sparse matrix)

        A c-TF-IDF matrix

        Source code in bertopic\\vectorizers\\_ctfidf.py
        def transform(self, X: sp.csr_matrix):\n    \"\"\"Transform a count-based matrix to c-TF-IDF\n\n    Arguments:\n        X (sparse matrix): A matrix of term/token counts.\n\n    Returns:\n        X (sparse matrix): A c-TF-IDF matrix\n    \"\"\"\n    if self.use_idf:\n        X = normalize(X, axis=1, norm='l1', copy=False)\n\n        if self.reduce_frequent_words:\n            X.data = np.sqrt(X.data)\n\n        X = X * self._idf_diag\n\n    return X\n
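
        Continuing the sketch from fit() above, the fitted transformer can then turn the same count matrix into a c-TF-IDF matrix:

        ctfidf_matrix = transformer.transform(X)  # sparse c-TF-IDF matrix with the same shape as X\n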
        "},{"location":"api/onlinecv.html","title":"OnlineCountVectorizer","text":"

        An online variant of the CountVectorizer with updating vocabulary.

        At each .partial_fit, its vocabulary is updated based on any OOV words it might find. Then, .update_bow can be used to track and update the Bag-of-Words representation. These functions are separated such that the vectorizer can be used iteratively without updating the Bag-of-Words representation, which might speed up the fitting process. However, the .update_bow function is used in BERTopic to track changes in the topic representations and allow for decay.

        This class inherits its parameters and attributes from: sklearn.feature_extraction.text.CountVectorizer

        Parameters:

        Name Type Description Default decay float

        A value between 0 and 1 that determines the percentage by which the frequencies in the previous bag-of-words matrix should be decreased. For example, a value of .1 will decrease the frequencies in the bag-of-words matrix by 10% at each iteration.

        None delete_min_df float

        At each iteration, delete words from the vocabulary that are below a minimum frequency. This keeps the resulting bag-of-words matrix small such that it does not explode in size with a growing vocabulary. If decay is None then this equals min_df.

        None **kwargs

        Set of parameters inherited from: sklearn.feature_extraction.text.CountVectorizer. In practice, this means that you can still use parameters from the original CountVectorizer, like stop_words and ngram_range.

        {}

        Attributes:

        Name Type Description X_ scipy.sparse.csr_matrix

        The Bag-of-Words representation

        Examples:

        from bertopic.vectorizers import OnlineCountVectorizer\nvectorizer = OnlineCountVectorizer(stop_words=\"english\")\n\nfor index, doc in enumerate(my_docs):\n    vectorizer.partial_fit(doc)\n\n    # Update and clean the bow every 100 iterations:\n    if index % 100 == 0:\n        X = vectorizer.update_bow()\n

        To use the model in BERTopic:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

        References

        Adapted from: https://github.com/idoshlomo/online_vectorizers

        Source code in bertopic\\vectorizers\\_online_cv.py
        class OnlineCountVectorizer(CountVectorizer):\n    \"\"\" An online variant of the CountVectorizer with updating vocabulary.\n\n    At each `.partial_fit`, its vocabulary is updated based on any OOV words\n    it might find. Then, `.update_bow` can be used to track and update\n    the Bag-of-Words representation. These functions are separated such that\n    the vectorizer can be used in iteration without updating the Bag-of-Words\n    representation can might speed up the fitting process. However, the\n    `.update_bow` function is used in BERTopic to track changes in the\n    topic representations and allow for decay.\n\n    This class inherits its parameters and attributes from:\n        `sklearn.feature_extraction.text.CountVectorizer`\n\n    Arguments:\n        decay: A value between [0, 1] to weight the percentage of frequencies\n               the previous bag-of-words should be decreased. For example,\n               a value of `.1` will decrease the frequencies in the bag-of-words\n               matrix with 10% at each iteration.\n        delete_min_df: Delete words at each iteration from its vocabulary\n                       that are below a minimum frequency.\n                       This will keep the resulting bag-of-words matrix small\n                       such that it does not explode in size with increasing\n                       vocabulary. If `decay` is None then this equals `min_df`.\n        **kwargs: Set of parameters inherited from:\n                  `sklearn.feature_extraction.text.CountVectorizer`\n                  In practice, this means that you can still use parameters\n                  from the original CountVectorizer, like `stop_words` and\n                  `ngram_range`.\n\n    Attributes:\n        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation\n\n    Examples:\n\n    ```python\n    from bertopic.vectorizers import OnlineCountVectorizer\n    vectorizer = OnlineCountVectorizer(stop_words=\"english\")\n\n    for index, doc in enumerate(my_docs):\n        vectorizer.partial_fit(doc)\n\n        # Update and clean the bow every 100 iterations:\n        if index % 100 == 0:\n            X = vectorizer.update_bow()\n    ```\n\n    To use the model in BERTopic:\n\n    ```python\n    from bertopic import BERTopic\n    from bertopic.vectorizers import OnlineCountVectorizer\n\n    vectorizer_model = OnlineCountVectorizer(stop_words=\"english\")\n    topic_model = BERTopic(vectorizer_model=vectorizer_model)\n    ```\n\n    References:\n        Adapted from: https://github.com/idoshlomo/online_vectorizers\n    \"\"\"\n    def __init__(self,\n                 decay: float = None,\n                 delete_min_df: float = None,\n                 **kwargs):\n        self.decay = decay\n        self.delete_min_df = delete_min_df\n        super(OnlineCountVectorizer, self).__init__(**kwargs)\n\n    def partial_fit(self, raw_documents: List[str]) -> None:\n        \"\"\" Perform a partial fit and update vocabulary with OOV tokens\n\n        Arguments:\n            raw_documents: A list of documents\n        \"\"\"\n        if not hasattr(self, 'vocabulary_'):\n            return self.fit(raw_documents)\n\n        analyzer = self.build_analyzer()\n        analyzed_documents = [analyzer(doc) for doc in raw_documents]\n        new_tokens = set(chain.from_iterable(analyzed_documents))\n        oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))\n\n        if oov_tokens:\n            max_index = max(self.vocabulary_.values())\n            
oov_vocabulary = dict(zip(oov_tokens, list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1))))\n            self.vocabulary_.update(oov_vocabulary)\n\n        return self\n\n    def update_bow(self, raw_documents: List[str]) -> csr_matrix:\n        \"\"\" Create or update the bag-of-words matrix\n\n        Update the bag-of-words matrix by adding the newly transformed\n        documents. This may add empty columns if new words are found and/or\n        add empty rows if new topics are found.\n\n        During this process, the previous bag-of-words matrix might be\n        decayed if `self.decay` has been set during init. Similarly, words\n        that do not exceed `self.delete_min_df` are removed from its\n        vocabulary and bag-of-words matrix.\n\n        Arguments:\n            raw_documents: A list of documents\n\n        Returns:\n            X_: Bag-of-words matrix\n        \"\"\"\n        if hasattr(self, \"X_\"):\n            X = self.transform(raw_documents)\n\n            # Add empty columns if new words are found\n            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)\n            self.X_ = sparse.hstack([self.X_, columns])\n\n            # Add empty rows if new topics are found\n            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)\n            self.X_ = sparse.vstack([self.X_, rows])\n\n            # Decay of BoW matrix\n            if self.decay is not None:\n                self.X_ = self.X_ * (1 - self.decay)\n\n            self.X_ += X\n        else:\n            self.X_ = self.transform(raw_documents)\n\n        if self.delete_min_df is not None:\n            self._clean_bow()\n\n        return self.X_\n\n    def _clean_bow(self) -> None:\n        \"\"\" Remove words that do not exceed `self.delete_min_df` \"\"\"\n        # Only keep words with a minimum frequency\n        indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]\n        indices_dict = {index: index for index in indices}\n        self.X_ = self.X_[:, indices]\n\n        # Update vocabulary with new words\n        new_vocab = {}\n        vocabulary_dict = {v: k for k, v in self.vocabulary_.items()}\n        for i, index in enumerate(indices):\n            if indices_dict.get(index) is not None:\n                new_vocab[vocabulary_dict[index]] = i\n\n        self.vocabulary_ = new_vocab\n
        "},{"location":"api/onlinecv.html#bertopic.vectorizers._online_cv.OnlineCountVectorizer.partial_fit","title":"partial_fit(self, raw_documents)","text":"

        Perform a partial fit and update vocabulary with OOV tokens

        Parameters:

        Name Type Description Default raw_documents List[str]

        A list of documents

        required Source code in bertopic\\vectorizers\\_online_cv.py
        def partial_fit(self, raw_documents: List[str]) -> None:\n    \"\"\" Perform a partial fit and update vocabulary with OOV tokens\n\n    Arguments:\n        raw_documents: A list of documents\n    \"\"\"\n    if not hasattr(self, 'vocabulary_'):\n        return self.fit(raw_documents)\n\n    analyzer = self.build_analyzer()\n    analyzed_documents = [analyzer(doc) for doc in raw_documents]\n    new_tokens = set(chain.from_iterable(analyzed_documents))\n    oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))\n\n    if oov_tokens:\n        max_index = max(self.vocabulary_.values())\n        oov_vocabulary = dict(zip(oov_tokens, list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1))))\n        self.vocabulary_.update(oov_vocabulary)\n\n    return self\n
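
        A minimal sketch of iterative fitting on a few hypothetical document batches:

        from bertopic.vectorizers import OnlineCountVectorizer\n\nbatches_of_docs = [[\"topic modeling with embeddings\", \"clustering documents into topics\"],\n                   [\"online learning of topic representations\"]]\n\nvectorizer = OnlineCountVectorizer(stop_words=\"english\")\nfor batch in batches_of_docs:\n    vectorizer.partial_fit(batch)\n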
        "},{"location":"api/onlinecv.html#bertopic.vectorizers._online_cv.OnlineCountVectorizer.update_bow","title":"update_bow(self, raw_documents)","text":"

        Create or update the bag-of-words matrix

        Update the bag-of-words matrix by adding the newly transformed documents. This may add empty columns if new words are found and/or add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be decayed if self.decay has been set during init. Similarly, words that do not exceed self.delete_min_df are removed from its vocabulary and bag-of-words matrix.

        Parameters:

        Name Type Description Default raw_documents List[str]

        A list of documents

        required

        Returns:

        Type Description X_

        Bag-of-words matrix

        Source code in bertopic\\vectorizers\\_online_cv.py
        def update_bow(self, raw_documents: List[str]) -> csr_matrix:\n    \"\"\" Create or update the bag-of-words matrix\n\n    Update the bag-of-words matrix by adding the newly transformed\n    documents. This may add empty columns if new words are found and/or\n    add empty rows if new topics are found.\n\n    During this process, the previous bag-of-words matrix might be\n    decayed if `self.decay` has been set during init. Similarly, words\n    that do not exceed `self.delete_min_df` are removed from its\n    vocabulary and bag-of-words matrix.\n\n    Arguments:\n        raw_documents: A list of documents\n\n    Returns:\n        X_: Bag-of-words matrix\n    \"\"\"\n    if hasattr(self, \"X_\"):\n        X = self.transform(raw_documents)\n\n        # Add empty columns if new words are found\n        columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)\n        self.X_ = sparse.hstack([self.X_, columns])\n\n        # Add empty rows if new topics are found\n        rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)\n        self.X_ = sparse.vstack([self.X_, rows])\n\n        # Decay of BoW matrix\n        if self.decay is not None:\n            self.X_ = self.X_ * (1 - self.decay)\n\n        self.X_ += X\n    else:\n        self.X_ = self.transform(raw_documents)\n\n    if self.delete_min_df is not None:\n        self._clean_bow()\n\n    return self.X_\n
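
        Reusing the hypothetical batches from the sketch above, decay and vocabulary pruning could be combined as follows:

        vectorizer = OnlineCountVectorizer(decay=.1, delete_min_df=1, stop_words=\"english\")\n\nfor batch in batches_of_docs:\n    vectorizer.partial_fit(batch)\n    X_bow = vectorizer.update_bow(batch)\n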
        "},{"location":"api/backends/base.html","title":"BaseEmbedder","text":"

        The Base Embedder used for creating embedding models

        Parameters:

        Name Type Description Default embedding_model

        The main embedding model to be used for extracting document and word embeddings

        None word_embedding_model

        The embedding model used for extracting word embeddings only. If this model is selected, then the embedding_model is purely used for creating document embeddings.

        None Source code in bertopic\\backend\\_base.py
        class BaseEmbedder:\n    \"\"\" The Base Embedder used for creating embedding models\n\n    Arguments:\n        embedding_model: The main embedding model to be used for extracting\n                         document and word embedding\n        word_embedding_model: The embedding model used for extracting word\n                              embeddings only. If this model is selected,\n                              then the `embedding_model` is purely used for\n                              creating document embeddings.\n    \"\"\"\n    def __init__(self,\n                 embedding_model=None,\n                 word_embedding_model=None):\n        self.embedding_model = embedding_model\n        self.word_embedding_model = word_embedding_model\n\n    def embed(self,\n              documents: List[str],\n              verbose: bool = False) -> np.ndarray:\n        \"\"\" Embed a list of n documents/words into an n-dimensional\n        matrix of embeddings\n\n        Arguments:\n            documents: A list of documents or words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document/words embeddings with shape (n, m) with `n` documents/words\n            that each have an embeddings size of `m`\n        \"\"\"\n        pass\n\n    def embed_words(self,\n                    words: List[str],\n                    verbose: bool = False) -> np.ndarray:\n        \"\"\" Embed a list of n words into an n-dimensional\n        matrix of embeddings\n\n        Arguments:\n            words: A list of words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Word embeddings with shape (n, m) with `n` words\n            that each have an embeddings size of `m`\n\n        \"\"\"\n        return self.embed(words, verbose)\n\n    def embed_documents(self,\n                        document: List[str],\n                        verbose: bool = False) -> np.ndarray:\n        \"\"\" Embed a list of n words into an n-dimensional\n        matrix of embeddings\n\n        Arguments:\n            document: A list of documents to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document embeddings with shape (n, m) with `n` documents\n            that each have an embeddings size of `m`\n        \"\"\"\n        return self.embed(document, verbose)\n
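
        The class is typically subclassed to create a custom backend. A minimal sketch, assuming sentence-transformers as the underlying embedding model:

        from bertopic.backend import BaseEmbedder\nfrom sentence_transformers import SentenceTransformer\n\nclass CustomEmbedder(BaseEmbedder):\n    def __init__(self, embedding_model):\n        super().__init__()\n        self.embedding_model = embedding_model\n\n    def embed(self, documents, verbose=False):\n        # Return an (n, m) array with one embedding per document\n        return self.embedding_model.encode(documents, show_progress_bar=verbose)\n\ncustom_embedder = CustomEmbedder(SentenceTransformer(\"all-MiniLM-L6-v2\"))\n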
        "},{"location":"api/backends/base.html#bertopic.backend._base.BaseEmbedder.embed","title":"embed(self, documents, verbose=False)","text":"

        Embed a list of n documents/words into an n-dimensional matrix of embeddings

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents or words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document/words embeddings with shape (n, m) with n documents/words that each have an embeddings size of m

        Source code in bertopic\\backend\\_base.py
        def embed(self,\n          documents: List[str],\n          verbose: bool = False) -> np.ndarray:\n    \"\"\" Embed a list of n documents/words into an n-dimensional\n    matrix of embeddings\n\n    Arguments:\n        documents: A list of documents or words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document/words embeddings with shape (n, m) with `n` documents/words\n        that each have an embeddings size of `m`\n    \"\"\"\n    pass\n
        "},{"location":"api/backends/base.html#bertopic.backend._base.BaseEmbedder.embed_documents","title":"embed_documents(self, document, verbose=False)","text":"

        Embed a list of n documents into an n-dimensional matrix of embeddings

        Parameters:

        Name Type Description Default document List[str]

        A list of documents to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document embeddings with shape (n, m) with n documents that each have an embeddings size of m

        Source code in bertopic\\backend\\_base.py
        def embed_documents(self,\n                    document: List[str],\n                    verbose: bool = False) -> np.ndarray:\n    \"\"\" Embed a list of n words into an n-dimensional\n    matrix of embeddings\n\n    Arguments:\n        document: A list of documents to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document embeddings with shape (n, m) with `n` documents\n        that each have an embeddings size of `m`\n    \"\"\"\n    return self.embed(document, verbose)\n
        "},{"location":"api/backends/base.html#bertopic.backend._base.BaseEmbedder.embed_words","title":"embed_words(self, words, verbose=False)","text":"

        Embed a list of n words into an n-dimensional matrix of embeddings

        Parameters:

        Name Type Description Default words List[str]

        A list of words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Word embeddings with shape (n, m) with n words that each have an embeddings size of m

        Source code in bertopic\\backend\\_base.py
        def embed_words(self,\n                words: List[str],\n                verbose: bool = False) -> np.ndarray:\n    \"\"\" Embed a list of n words into an n-dimensional\n    matrix of embeddings\n\n    Arguments:\n        words: A list of words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Word embeddings with shape (n, m) with `n` words\n        that each have an embeddings size of `m`\n\n    \"\"\"\n    return self.embed(words, verbose)\n
        "},{"location":"api/backends/cohere.html","title":"CohereBackend","text":"

        Cohere Embedding Model

        Parameters:

        Name Type Description Default client

        A cohere client.

        required embedding_model str

        A Cohere model. Default is \"large\". For an overview of models see: https://docs.cohere.ai/docs/generation-card

        'large' delay_in_seconds float

        If a batch_size is given, use this to set the delay in seconds between batches.

        None batch_size int

        The size of each batch.

        None embed_kwargs Mapping[str, Any]

        Kwargs passed to cohere.Client.embed. Can be used to define additional parameters such as input_type

        {}

        Examples:

        import cohere\nfrom bertopic.backend import CohereBackend\n\nclient = cohere.Client(\"APIKEY\")\ncohere_model = CohereBackend(client)\n

        If you want to specify input_type:

        cohere_model = CohereBackend(\n    client,\n    embedding_model=\"embed-english-v3.0\",\n    embed_kwargs={\"input_type\": \"clustering\"}\n)\n
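
        The resulting backend can then be passed to BERTopic as its embedding model:

        from bertopic import BERTopic\n\ntopic_model = BERTopic(embedding_model=cohere_model)\n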
        Source code in bertopic\\backend\\_cohere.py
        class CohereBackend(BaseEmbedder):\n    \"\"\" Cohere Embedding Model\n\n    Arguments:\n        client: A `cohere` client.\n        embedding_model: A Cohere model. Default is \"large\".\n                         For an overview of models see:\n                         https://docs.cohere.ai/docs/generation-card\n        delay_in_seconds: If a `batch_size` is given, use this set\n                          the delay in seconds between batches.\n        batch_size: The size of each batch.\n        embed_kwargs: Kwargs passed to `cohere.Client.embed`.\n                            Can be used to define additional parameters\n                            such as `input_type`\n\n    Examples:\n\n    ```python\n    import cohere\n    from bertopic.backend import CohereBackend\n\n    client = cohere.Client(\"APIKEY\")\n    cohere_model = CohereBackend(client)\n    ```\n\n    If you want to specify `input_type`:\n\n    ```python\n    cohere_model = CohereBackend(\n        client,\n        embedding_model=\"embed-english-v3.0\",\n        embed_kwargs={\"input_type\": \"clustering\"}\n    )\n    ```\n    \"\"\"\n    def __init__(self,\n                 client,\n                 embedding_model: str = \"large\",\n                 delay_in_seconds: float = None,\n                 batch_size: int = None,\n                 embed_kwargs: Mapping[str, Any] = {}):\n        super().__init__()\n        self.client = client\n        self.embedding_model = embedding_model\n        self.delay_in_seconds = delay_in_seconds\n        self.batch_size = batch_size\n        self.embed_kwargs = embed_kwargs\n\n        if self.embed_kwargs.get(\"model\"):\n            self.embedding_model = embed_kwargs.get(\"model\")\n        else:\n            self.embed_kwargs[\"model\"] = self.embedding_model\n\n    def embed(self,\n              documents: List[str],\n              verbose: bool = False) -> np.ndarray:\n        \"\"\" Embed a list of n documents/words into an n-dimensional\n        matrix of embeddings\n\n        Arguments:\n            documents: A list of documents or words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document/words embeddings with shape (n, m) with `n` documents/words\n            that each have an embeddings size of `m`\n        \"\"\"\n        # Batch-wise embedding extraction\n        if self.batch_size is not None:\n            embeddings = []\n            for batch in tqdm(self._chunks(documents), disable=not verbose):\n                response = self.client.embed(texts=batch, **self.embed_kwargs)\n                embeddings.extend(response.embeddings)\n\n                # Delay subsequent calls\n                if self.delay_in_seconds:\n                    time.sleep(self.delay_in_seconds)\n\n        # Extract embeddings all at once\n        else:\n            response = self.client.embed(texts=documents, **self.embed_kwargs)\n            embeddings = response.embeddings\n        return np.array(embeddings)\n\n    def _chunks(self, documents):\n        for i in range(0, len(documents), self.batch_size):\n            yield documents[i:i + self.batch_size]\n
        "},{"location":"api/backends/cohere.html#bertopic.backend._cohere.CohereBackend.embed","title":"embed(self, documents, verbose=False)","text":"

        Embed a list of n documents/words into an n-dimensional matrix of embeddings

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents or words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document/words embeddings with shape (n, m) with n documents/words that each have an embeddings size of m

        Source code in bertopic\\backend\\_cohere.py
        def embed(self,\n          documents: List[str],\n          verbose: bool = False) -> np.ndarray:\n    \"\"\" Embed a list of n documents/words into an n-dimensional\n    matrix of embeddings\n\n    Arguments:\n        documents: A list of documents or words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document/words embeddings with shape (n, m) with `n` documents/words\n        that each have an embeddings size of `m`\n    \"\"\"\n    # Batch-wise embedding extraction\n    if self.batch_size is not None:\n        embeddings = []\n        for batch in tqdm(self._chunks(documents), disable=not verbose):\n            response = self.client.embed(texts=batch, **self.embed_kwargs)\n            embeddings.extend(response.embeddings)\n\n            # Delay subsequent calls\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n    # Extract embeddings all at once\n    else:\n        response = self.client.embed(texts=documents, **self.embed_kwargs)\n        embeddings = response.embeddings\n    return np.array(embeddings)\n
        "},{"location":"api/backends/openai.html","title":"OpenAIBackend","text":"

        OpenAI Embedding Model

        Parameters:

        Name Type Description Default client OpenAI

        An openai.OpenAI client.

        required embedding_model str

        An OpenAI model. Default is \"text-embedding-ada-002\". For an overview of models see: https://platform.openai.com/docs/models/embeddings

        'text-embedding-ada-002' delay_in_seconds float

        If a batch_size is given, use this to set the delay in seconds between batches.

        None batch_size int

        The size of each batch.

        None generator_kwargs Mapping[str, Any]

        Kwargs passed to openai.Embedding.create. Can be used to define custom engines or deployment_ids.

        {}

        Examples:

        import openai\nfrom bertopic.backend import OpenAIBackend\n\nclient = openai.OpenAI(api_key=\"sk-...\")\nopenai_embedder = OpenAIBackend(client, \"text-embedding-ada-002\")\n
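
        Similarly, the resulting embedder can then be used within BERTopic:

        from bertopic import BERTopic\n\ntopic_model = BERTopic(embedding_model=openai_embedder)\n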
        Source code in bertopic\\backend\\_openai.py
        class OpenAIBackend(BaseEmbedder):\n    \"\"\" OpenAI Embedding Model\n\n    Arguments:\n        client: A `openai.OpenAI` client.\n        embedding_model: An OpenAI model. Default is\n                         For an overview of models see:\n                         https://platform.openai.com/docs/models/embeddings\n        delay_in_seconds: If a `batch_size` is given, use this set\n                          the delay in seconds between batches.\n        batch_size: The size of each batch.\n        generator_kwargs: Kwargs passed to `openai.Embedding.create`.\n                          Can be used to define custom engines or\n                          deployment_ids.\n\n    Examples:\n\n    ```python\n    import openai\n    from bertopic.backend import OpenAIBackend\n\n    client = openai.OpenAI(api_key=\"sk-...\")\n    openai_embedder = OpenAIBackend(client, \"text-embedding-ada-002\")\n    ```\n    \"\"\"\n    def __init__(self,\n                 client: openai.OpenAI,\n                 embedding_model: str = \"text-embedding-ada-002\",\n                 delay_in_seconds: float = None,\n                 batch_size: int = None,\n                 generator_kwargs: Mapping[str, Any] = {}):\n        super().__init__()\n        self.client = client\n        self.embedding_model = embedding_model\n        self.delay_in_seconds = delay_in_seconds\n        self.batch_size = batch_size\n        self.generator_kwargs = generator_kwargs\n\n        if self.generator_kwargs.get(\"model\"):\n            self.embedding_model = generator_kwargs.get(\"model\")\n        elif not self.generator_kwargs.get(\"engine\"):\n            self.generator_kwargs[\"model\"] = self.embedding_model\n\n    def embed(self,\n              documents: List[str],\n              verbose: bool = False) -> np.ndarray:\n        \"\"\" Embed a list of n documents/words into an n-dimensional\n        matrix of embeddings\n\n        Arguments:\n            documents: A list of documents or words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document/words embeddings with shape (n, m) with `n` documents/words\n            that each have an embeddings size of `m`\n        \"\"\"\n        # Prepare documents, replacing empty strings with a single space\n        prepared_documents = [\" \" if doc == \"\" else doc for doc in documents]\n\n        # Batch-wise embedding extraction\n        if self.batch_size is not None:\n            embeddings = []\n            for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):\n                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)\n                embeddings.extend([r.embedding for r in response.data])\n\n                # Delay subsequent calls\n                if self.delay_in_seconds:\n                    time.sleep(self.delay_in_seconds)\n\n        # Extract embeddings all at once\n        else:\n            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)\n            embeddings = [r.embedding for r in response.data]\n        return np.array(embeddings)\n\n    def _chunks(self, documents):\n        for i in range(0, len(documents), self.batch_size):\n            yield documents[i:i + self.batch_size]\n
        "},{"location":"api/backends/openai.html#bertopic.backend._openai.OpenAIBackend.embed","title":"embed(self, documents, verbose=False)","text":"

        Embed a list of n documents/words into an n-dimensional matrix of embeddings

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents or words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document/words embeddings with shape (n, m) with n documents/words that each have an embeddings size of m

        Source code in bertopic\\backend\\_openai.py
        def embed(self,\n          documents: List[str],\n          verbose: bool = False) -> np.ndarray:\n    \"\"\" Embed a list of n documents/words into an n-dimensional\n    matrix of embeddings\n\n    Arguments:\n        documents: A list of documents or words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document/words embeddings with shape (n, m) with `n` documents/words\n        that each have an embeddings size of `m`\n    \"\"\"\n    # Prepare documents, replacing empty strings with a single space\n    prepared_documents = [\" \" if doc == \"\" else doc for doc in documents]\n\n    # Batch-wise embedding extraction\n    if self.batch_size is not None:\n        embeddings = []\n        for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):\n            response = self.client.embeddings.create(input=batch, **self.generator_kwargs)\n            embeddings.extend([r.embedding for r in response.data])\n\n            # Delay subsequent calls\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n    # Extract embeddings all at once\n    else:\n        response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)\n        embeddings = [r.embedding for r in response.data]\n    return np.array(embeddings)\n
        "},{"location":"api/backends/word_doc.html","title":"WordDocEmbedder","text":"

        Combine a document- and word-level embedder
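
        Examples:

        A minimal sketch, assuming two sentence-transformers models (one for documents, one for words):

        from bertopic.backend import WordDocEmbedder\nfrom sentence_transformers import SentenceTransformer\n\ndoc_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nword_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nword_doc_embedder = WordDocEmbedder(embedding_model=doc_model, word_embedding_model=word_model)\n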

        Source code in bertopic\\backend\\_word_doc.py
        class WordDocEmbedder(BaseEmbedder):\n    \"\"\" Combine a document- and word-level embedder\n    \"\"\"\n    def __init__(self,\n                 embedding_model,\n                 word_embedding_model):\n        super().__init__()\n\n        self.embedding_model = select_backend(embedding_model)\n        self.word_embedding_model = select_backend(word_embedding_model)\n\n    def embed_words(self,\n                    words: List[str],\n                    verbose: bool = False) -> np.ndarray:\n        \"\"\" Embed a list of n words into an n-dimensional\n        matrix of embeddings\n\n        Arguments:\n            words: A list of words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Word embeddings with shape (n, m) with `n` words\n            that each have an embeddings size of `m`\n\n        \"\"\"\n        return self.word_embedding_model.embed(words, verbose)\n\n    def embed_documents(self,\n                        document: List[str],\n                        verbose: bool = False) -> np.ndarray:\n        \"\"\" Embed a list of n words into an n-dimensional\n        matrix of embeddings\n\n        Arguments:\n            document: A list of documents to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document embeddings with shape (n, m) with `n` documents\n            that each have an embeddings size of `m`\n        \"\"\"\n        return self.embedding_model.embed(document, verbose)\n
        "},{"location":"api/backends/word_doc.html#bertopic.backend._word_doc.WordDocEmbedder.embed_documents","title":"embed_documents(self, document, verbose=False)","text":"

        Embed a list of n documents into an n-dimensional matrix of embeddings

        Parameters:

        Name Type Description Default document List[str]

        A list of documents to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document embeddings with shape (n, m) with n documents that each have an embeddings size of m

        Source code in bertopic\\backend\\_word_doc.py
        def embed_documents(self,\n                    document: List[str],\n                    verbose: bool = False) -> np.ndarray:\n    \"\"\" Embed a list of n words into an n-dimensional\n    matrix of embeddings\n\n    Arguments:\n        document: A list of documents to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document embeddings with shape (n, m) with `n` documents\n        that each have an embeddings size of `m`\n    \"\"\"\n    return self.embedding_model.embed(document, verbose)\n
        "},{"location":"api/backends/word_doc.html#bertopic.backend._word_doc.WordDocEmbedder.embed_words","title":"embed_words(self, words, verbose=False)","text":"

        Embed a list of n words into an n-dimensional matrix of embeddings

        Parameters:

        Name Type Description Default words List[str]

        A list of words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Word embeddings with shape (n, m) with n words that each have an embeddings size of m

        Source code in bertopic\\backend\\_word_doc.py
        def embed_words(self,\n                words: List[str],\n                verbose: bool = False) -> np.ndarray:\n    \"\"\" Embed a list of n words into an n-dimensional\n    matrix of embeddings\n\n    Arguments:\n        words: A list of words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Word embeddings with shape (n, m) with `n` words\n        that each have an embeddings size of `m`\n\n    \"\"\"\n    return self.word_embedding_model.embed(words, verbose)\n
        "},{"location":"api/cluster/base.html","title":"BaseCluster","text":"

        The Base Cluster class

        Using this class directly in BERTopic will make it skip over the cluster step. As a result, topics need to be passed to BERTopic in the form of its y parameter in order to create topic representations.

        Examples:

        This will skip over the cluster step in BERTopic:

        from bertopic import BERTopic\nfrom bertopic.cluster import BaseCluster\n\nempty_cluster_model = BaseCluster()\n\ntopic_model = BERTopic(hdbscan_model=empty_cluster_model)\n

        Then, this class can be used to perform manual topic modeling. That is, topic modeling on topics that were already generated before, without the need to learn them:

        topic_model.fit(docs, y=y)\n
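
        Here, y is assumed to be a list or array with one precomputed topic label per document, for example:

        # Hypothetical precomputed topic labels, one per document in `docs`\ny = [0, 1, 1, 0, 2]\ntopic_model.fit(docs, y=y)\n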
        Source code in bertopic\\cluster\\_base.py
        class BaseCluster:\n    \"\"\" The Base Cluster class\n\n    Using this class directly in BERTopic will make it skip\n    over the cluster step. As a result, topics need to be passed \n    to BERTopic in the form of its `y` parameter in order to create \n    topic representations. \n\n    Examples:    \n\n    This will skip over the cluster step in BERTopic:\n\n    ```python\n    from bertopic import BERTopic\n    from bertopic.dimensionality import BaseCluster\n\n    empty_cluster_model = BaseCluster()\n\n    topic_model = BERTopic(hdbscan_model=empty_cluster_model)\n    ```\n\n    Then, this class can be used to perform manual topic modeling. \n    That is, topic modeling on a topics that were already generated before \n    without the need to learn them:\n\n    ```python\n    topic_model.fit(docs, y=y)\n    ```\n    \"\"\"\n    def fit(self, X, y=None):\n        if y is not None:\n            self.labels_ = y\n        else:\n            self.labels_ = None\n        return self\n\n    def transform(self, X: np.ndarray) -> np.ndarray:\n        return X\n
        "},{"location":"api/dimensionality/base.html","title":"BaseDimensionalityReduction","text":"

        The Base Dimensionality Reduction class

        You can use this to skip over the dimensionality reduction step in BERTopic.

        Examples:

        This will skip over the reduction step in BERTopic:

        from bertopic import BERTopic\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\nempty_reduction_model = BaseDimensionalityReduction()\n\ntopic_model = BERTopic(umap_model=empty_reduction_model)\n
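
        Skipping the reduction step is typically combined with passing embeddings that are already low-dimensional. A sketch, assuming docs and a matching array of precalculated reduced embeddings:

        topics, probs = topic_model.fit_transform(docs, embeddings=reduced_embeddings)\n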
        Source code in bertopic\\dimensionality\\_base.py
        class BaseDimensionalityReduction:\n    \"\"\" The Base Dimensionality Reduction class\n\n    You can use this to skip over the dimensionality reduction step in BERTopic.\n\n    Examples:\n\n    This will skip over the reduction step in BERTopic:\n\n    ```python\n    from bertopic import BERTopic\n    from bertopic.dimensionality import BaseDimensionalityReduction\n\n    empty_reduction_model = BaseDimensionalityReduction()\n\n    topic_model = BERTopic(umap_model=empty_reduction_model)\n    ```\n    \"\"\"\n    def fit(self, X: np.ndarray = None):\n        return self\n\n    def transform(self, X: np.ndarray) -> np.ndarray:\n        return X\n
        "},{"location":"api/plotting/barchart.html","title":"Barchart","text":"

        Visualize a barchart of selected topics

        Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A fitted BERTopic instance. | required |
| `topics` | `List[int]` | A selection of topics to visualize. | `None` |
| `top_n_topics` | `int` | Only select the top n most frequent topics. | `8` |
| `n_words` | `int` | Number of words to show in a topic. | `5` |
| `custom_labels` | `Union[bool, str]` | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | `False` |
| `title` | `str` | Title of the plot. | `'<b>Topic Word Scores</b>'` |
| `width` | `int` | The width of each figure. | `250` |
| `height` | `int` | The height of each figure. | `250` |
| `autoscale` | `bool` | Whether to automatically calculate the height of the figures to fit the whole bar text. | `False` |

        Returns:

| Type | Description |
| --- | --- |
| `fig` | A plotly figure |

        Examples:

        To visualize the barchart of selected topics simply run:

```python
topic_model.visualize_barchart()
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_barchart()
fig.write_html("path/to/file.html")
```
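The arguments listed above can be combined to tailor the plot; a sketch with illustrative values only:

```python
fig = topic_model.visualize_barchart(
    top_n_topics=12,     # show the 12 most frequent topics
    n_words=10,          # ten words per topic
    custom_labels=True,  # only if labels were set via topic_model.set_topic_labels
    width=300,
    height=300,
    autoscale=True,      # grow figures to fit long bar labels
)
fig.write_html("path/to/file.html")
```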
        Source code in bertopic\\plotting\\_barchart.py
        def visualize_barchart(topic_model,\n                       topics: List[int] = None,\n                       top_n_topics: int = 8,\n                       n_words: int = 5,\n                       custom_labels: Union[bool, str] = False,\n                       title: str = \"<b>Topic Word Scores</b>\",\n                       width: int = 250,\n                       height: int = 250,\n                       autoscale: bool=False) -> go.Figure:\n    \"\"\" Visualize a barchart of selected topics\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_words: Number of words to show in a topic\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of each figure.\n        height: The height of each figure.\n        autoscale: Whether to automatically calculate the height of the figures to fit the whole bar text\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the barchart of selected topics\n    simply run:\n\n    ```python\n    topic_model.visualize_barchart()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_barchart()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/bar_chart.html\"\n    style=\"width:1100px; height: 660px; border: 0px;\"\"></iframe>\n    \"\"\"\n    colors = itertools.cycle([\"#D55E00\", \"#0072B2\", \"#CC79A7\", \"#E69F00\", \"#56B4E9\", \"#009E73\", \"#F0E442\"])\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list()[0:6])\n\n    # Initialize figure\n    if isinstance(custom_labels, str):\n        subplot_titles = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]\n        subplot_titles = [\"_\".join([label[0] for label in labels[:4]]) for labels in subplot_titles]\n        subplot_titles = [label if len(label) < 30 else label[:27] + \"...\" for label in subplot_titles]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        subplot_titles = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topics]\n    else:\n        subplot_titles = [f\"Topic {topic}\" for topic in topics]\n    columns = 4\n    rows = int(np.ceil(len(topics) / columns))\n    fig = make_subplots(rows=rows,\n                        cols=columns,\n                        shared_xaxes=False,\n                        horizontal_spacing=.1,\n                        vertical_spacing=.4 / rows if rows > 1 else 0,\n                        subplot_titles=subplot_titles)\n\n    # Add barchart for each topic\n    row = 1\n    column = 1\n    for topic in topics:\n        words = [word + \"  \" for word, _ in topic_model.get_topic(topic)][:n_words][::-1]\n        scores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1]\n\n        fig.add_trace(\n            
go.Bar(x=scores,\n                   y=words,\n                   orientation='h',\n                   marker_color=next(colors)),\n            row=row, col=column)\n\n        if autoscale:\n            if len(words) > 12:\n                height = 250 + (len(words) - 12) * 11\n\n            if len(words) > 9:\n                fig.update_yaxes(\n                    tickfont=dict(size=(height - 140) // len(words))\n                )\n\n        if column == columns:\n            column = 1\n            row += 1\n        else:\n            column += 1\n\n    # Stylize graph\n    fig.update_layout(\n        template=\"plotly_white\",\n        showlegend=False,\n        title={\n            'text': f\"{title}\",\n            'x': .5,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        width=width*4,\n        height=height*rows if rows > 1 else height * 1.3,\n        hoverlabel=dict(\n            bgcolor=\"white\",\n            font_size=16,\n            font_family=\"Rockwell\"\n        ),\n    )\n\n    fig.update_xaxes(showgrid=True)\n    fig.update_yaxes(showgrid=True)\n\n    return fig\n
        "},{"location":"api/plotting/distribution.html","title":"Distribution","text":"

        Visualize the distribution of topic probabilities

        Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A fitted BERTopic instance. | required |
| `probabilities` | `ndarray` | An array of probability scores. | required |
| `min_probability` | `float` | The minimum probability score to visualize. All others are ignored. | `0.015` |
| `custom_labels` | `Union[bool, str]` | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | `False` |
| `title` | `str` | Title of the plot. | `'<b>Topic Probability Distribution</b>'` |
| `width` | `int` | The width of the figure. | `800` |
| `height` | `int` | The height of the figure. | `600` |

        Examples:

Make sure to fit the model beforehand and only pass in the probabilities of a single document:

```python
topic_model.visualize_distribution(probabilities[0])
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_distribution(probabilities[0])
fig.write_html("path/to/file.html")
```
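Because this plot uses the probability of every topic for a document, the model has to be trained with `calculate_probabilities=True`; a minimal sketch, assuming `docs` already exists (the `min_probability` value is illustrative):

```python
from bertopic import BERTopic

# `calculate_probabilities=True` makes `probs` an (n_documents, n_topics) array
topic_model = BERTopic(calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs)

# Visualize the topic probabilities of the first document
topic_model.visualize_distribution(probs[0], min_probability=0.01)
```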
        Source code in bertopic\\plotting\\_distribution.py
        def visualize_distribution(topic_model,\n                           probabilities: np.ndarray,\n                           min_probability: float = 0.015,\n                           custom_labels: Union[bool, str] = False,\n                           title: str = \"<b>Topic Probability Distribution</b>\",\n                           width: int = 800,\n                           height: int = 600) -> go.Figure:\n    \"\"\" Visualize the distribution of topic probabilities\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        probabilities: An array of probability scores\n        min_probability: The minimum probability score to visualize.\n                         All others are ignored.\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    Make sure to fit the model before and only input the\n    probabilities of a single document:\n\n    ```python\n    topic_model.visualize_distribution(probabilities[0])\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_distribution(probabilities[0])\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/probabilities.html\"\n    style=\"width:1000px; height: 500px; border: 0px;\"\"></iframe>\n    \"\"\"\n    if len(probabilities.shape) != 1:\n        raise ValueError(\"This visualization cannot be used if you have set `calculate_probabilities` to False \"\n                         \"as it uses the topic probabilities of all topics. \")\n    if len(probabilities[probabilities > min_probability]) == 0:\n        raise ValueError(\"There are no values where `min_probability` is higher than the \"\n                         \"probabilities that were supplied. 
Lower `min_probability` to prevent this error.\")\n\n    # Get values and indices equal or exceed the minimum probability\n    labels_idx = np.argwhere(probabilities >= min_probability).flatten()\n    vals = probabilities[labels_idx].tolist()\n\n    # Create labels\n    if isinstance(custom_labels, str):\n        labels = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in labels_idx]\n        labels = [\"_\".join([label[0] for label in l[:4]]) for l in labels]\n        labels = [label if len(label) < 30 else label[:27] + \"...\" for label in labels]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        labels = [topic_model.custom_labels_[idx + topic_model._outliers] for idx in labels_idx]\n    else:\n        labels = []\n        for idx in labels_idx:\n            words = topic_model.get_topic(idx)\n            if words:\n                label = [word[0] for word in words[:5]]\n                label = f\"<b>Topic {idx}</b>: {'_'.join(label)}\"\n                label = label[:40] + \"...\" if len(label) > 40 else label\n                labels.append(label)\n            else:\n                vals.remove(probabilities[idx])\n\n    # Create Figure\n    fig = go.Figure(go.Bar(\n        x=vals,\n        y=labels,\n        marker=dict(\n            color='#C8D2D7',\n            line=dict(\n                color='#6E8484',\n                width=1),\n        ),\n        orientation='h')\n    )\n\n    fig.update_layout(\n        xaxis_title=\"Probability\",\n        title={\n            'text': f\"{title}\",\n            'y': .95,\n            'x': 0.5,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        template=\"simple_white\",\n        width=width,\n        height=height,\n        hoverlabel=dict(\n            bgcolor=\"white\",\n            font_size=16,\n            font_family=\"Rockwell\"\n        ),\n    )\n\n    return fig\n
        "},{"location":"api/plotting/document_datamap.html","title":"Documents with DataMapPlot","text":"

        Visualize documents and their topics in 2D as a static plot for publication using DataMapPlot.

        Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A fitted BERTopic instance. | required |
| `docs` | `List[str]` | The documents you used when calling either `fit` or `fit_transform`. | required |
| `topics` | `List[int]` | A selection of topics to visualize. Not to be confused with the topics that you get from `.fit_transform`. For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`. Documents not in these topics will be shown as noise points. | `None` |
| `embeddings` | `ndarray` | The embeddings of all documents in `docs`. | `None` |
| `reduced_embeddings` | `ndarray` | The 2D reduced embeddings of all documents in `docs`. | `None` |
| `custom_labels` | `Union[bool, str]` | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | `False` |
| `title` | `str` | Title of the plot. | `'Documents and Topics'` |
| `sub_title` | `Optional[str]` | Sub-title of the plot. | `None` |
| `width` | `int` | The width of the figure. | `1200` |
| `height` | `int` | The height of the figure. | `1200` |
| `**datamap_kwds` | | All further keyword args will be passed on to DataMapPlot's `create_plot` function. See the DataMapPlot documentation for more details. | `{}` |

        Returns:

| Type | Description |
| --- | --- |
| `figure` | A Matplotlib Figure object. |

        Examples:

        To visualize the topics simply run:

```python
topic_model.visualize_document_datamap(docs)
```

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic
topic_model = BERTopic().fit(docs, embeddings)

# Reduce dimensionality of embeddings, this step is optional
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings
topic_model.visualize_document_datamap(docs, embeddings=embeddings)

# Or, if you have reduced the original embeddings already:
topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
fig.savefig("path/to/file.png", bbox_inches="tight")
```
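A hedged sketch of combining the documented arguments, assuming `docs` and `reduced_embeddings` were prepared as in the pipeline above; the title, sub-title, and sizes are illustrative:

```python
fig = topic_model.visualize_document_datamap(
    docs,
    reduced_embeddings=reduced_embeddings,
    custom_labels=True,         # only if labels were set via topic_model.set_topic_labels
    title="Documents and Topics",
    sub_title="20 Newsgroups",  # illustrative sub-title
    width=1000,
    height=1000,
)
fig.savefig("path/to/file.png", bbox_inches="tight")
```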

        Source code in bertopic\\plotting\\_datamap.py
        def visualize_document_datamap(topic_model,\n                               docs: List[str],\n                               topics: List[int] = None,\n                               embeddings: np.ndarray = None,\n                               reduced_embeddings: np.ndarray = None,\n                               custom_labels: Union[bool, str] = False,\n                               title: str = \"Documents and Topics\",\n                               sub_title: Union[str, None] = None,\n                               width: int = 1200,\n                               height: int = 1200,\n                               **datamap_kwds) -> Figure:\n    \"\"\" Visualize documents and their topics in 2D as a static plot for publication using\n    DataMapPlot.\n\n    Arguments:\n        topic_model:  A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`. Documents not in these topics will be shown\n                as noise points.\n        embeddings:  The embeddings of all documents in `docs`.\n        reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.\n        custom_labels:  If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        sub_title: Sub-title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n        **datamap_kwds:  All further keyword args will be passed on to DataMapPlot's\n                         `create_plot` function. 
See the DataMapPlot documentation\n                         for more details.\n\n    Returns:\n        figure: A Matplotlib Figure object.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_document_datamap(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic().fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    fig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n    ```\n    <img src=\"../../getting_started/visualization/datamapplot.png\",\n         alt=\"DataMapPlot of 20-Newsgroups\", width=800, height=800></img>\n    \"\"\"\n\n    topic_per_doc = topic_model.topics_\n\n    df = pd.DataFrame({\"topic\": np.array(topic_per_doc)})\n    df[\"doc\"] = docs\n    df[\"topic\"] = topic_per_doc\n\n    # Extract embeddings if not already done\n    if embeddings is None and reduced_embeddings is None:\n        embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n    else:\n        embeddings_to_reduce = embeddings\n\n    # Reduce input embeddings\n    if reduced_embeddings is None:\n        umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric='cosine').fit(embeddings_to_reduce)\n        embeddings_2d = umap_model.embedding_\n    else:\n        embeddings_2d = reduced_embeddings\n\n    unique_topics = set(topic_per_doc)\n\n    # Prepare text and names\n    if isinstance(custom_labels, str):\n        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]\n        names = [\" \".join([label[0] for label in labels[:4]]) for labels in names]\n        names = [label if len(label) < 30 else label[:27] + \"...\" for label in names]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]\n    else:\n        names = [f\"Topic-{topic}: \" + \" \".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics]\n\n    topic_name_mapping = {topic_num: topic_name for topic_num, topic_name in zip(unique_topics, names)}\n    topic_name_mapping[-1] = \"Unlabelled\"\n\n    # If a set of topics is chosen, set everything else to \"Unlabelled\"\n    if topics is not None:\n        
selected_topics = set(topics)\n        for topic_num in topic_name_mapping:\n            if topic_num not in selected_topics:\n                topic_name_mapping[topic_num] = \"Unlabelled\"\n\n    # Map in topic names and plot\n    named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).values\n\n    figure, axes = datamapplot.create_plot(\n        embeddings_2d,\n        named_topic_per_doc,\n        figsize=(width/100, height/100),\n        dpi=100,\n        title=title,\n        sub_title=sub_title,\n        **datamap_kwds,\n    )\n\n    return figure\n
        "},{"location":"api/plotting/documents.html","title":"Documents","text":"

        Visualize documents and their topics in 2D

        Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A fitted BERTopic instance. | required |
| `docs` | `List[str]` | The documents you used when calling either `fit` or `fit_transform`. | required |
| `topics` | `List[int]` | A selection of topics to visualize. Not to be confused with the topics that you get from `.fit_transform`. For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`. | `None` |
| `embeddings` | `ndarray` | The embeddings of all documents in `docs`. | `None` |
| `reduced_embeddings` | `ndarray` | The 2D reduced embeddings of all documents in `docs`. | `None` |
| `sample` | `float` | The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen. | `None` |
| `hide_annotations` | `bool` | Hide the names of the traces on top of each cluster. | `False` |
| `hide_document_hover` | `bool` | Hide the content of the documents when hovering over specific points. Helps to speed up generation of the visualization. | `False` |
| `custom_labels` | `Union[bool, str]` | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | `False` |
| `title` | `str` | Title of the plot. | `'<b>Documents and Topics</b>'` |
| `width` | `int` | The width of the figure. | `1200` |
| `height` | `int` | The height of the figure. | `750` |

        Examples:

        To visualize the topics simply run:

```python
topic_model.visualize_documents(docs)
```

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic
topic_model = BERTopic().fit(docs, embeddings)

# Reduce dimensionality of embeddings, this step is optional
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Or, if you have reduced the original embeddings already:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig.write_html("path/to/file.html")
```
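For large corpora, the `sample` and `hide_document_hover` arguments described above help keep the figure responsive; a sketch in which the 10% sample rate is arbitrary:

```python
fig = topic_model.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings,
    sample=0.1,                # keep roughly 10% of the documents per topic
    hide_document_hover=True,  # skip hover text to speed up rendering
    hide_annotations=False,
)
fig.write_html("path/to/file.html")
```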
        Source code in bertopic\\plotting\\_documents.py
        def visualize_documents(topic_model,\n                        docs: List[str],\n                        topics: List[int] = None,\n                        embeddings: np.ndarray = None,\n                        reduced_embeddings: np.ndarray = None,\n                        sample: float = None,\n                        hide_annotations: bool = False,\n                        hide_document_hover: bool = False,\n                        custom_labels: Union[bool, str] = False,\n                        title: str = \"<b>Documents and Topics</b>\",\n                        width: int = 1200,\n                        height: int = 750):\n    \"\"\" Visualize documents and their topics in 2D\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                             specific points. 
Helps to speed up generation of visualization.\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_documents(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic().fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_documents(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../../getting_started/visualization/documents.html\"\n    style=\"width:1000px; height: 800px; border: 0px;\"\"></iframe>\n    \"\"\"\n    topic_per_doc = topic_model.topics_\n\n    # Sample the data to optimize for visualization and dimensionality reduction\n    if sample is None or sample > 1:\n        sample = 1\n\n    indices = []\n    for topic in set(topic_per_doc):\n        s = np.where(np.array(topic_per_doc) == topic)[0]\n        size = len(s) if len(s) < 100 else int(len(s) * sample)\n        indices.extend(np.random.choice(s, size=size, replace=False))\n    indices = np.array(indices)\n\n    df = pd.DataFrame({\"topic\": np.array(topic_per_doc)[indices]})\n    df[\"doc\"] = [docs[index] for index in indices]\n    df[\"topic\"] = [topic_per_doc[index] for index in indices]\n\n    # Extract embeddings if not already done\n    if sample is None:\n        if embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n        else:\n            embeddings_to_reduce = embeddings\n    else:\n        if embeddings is not None:\n            embeddings_to_reduce = embeddings[indices]\n        elif embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n\n    # Reduce input embeddings\n    if reduced_embeddings is None:\n        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce)\n        embeddings_2d = umap_model.embedding_\n    elif sample is not None and 
reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings[indices]\n    elif sample is None and reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings\n\n    unique_topics = set(topic_per_doc)\n    if topics is None:\n        topics = unique_topics\n\n    # Combine data\n    df[\"x\"] = embeddings_2d[:, 0]\n    df[\"y\"] = embeddings_2d[:, 1]\n\n    # Prepare text and names\n    if isinstance(custom_labels, str):\n        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]\n        names = [\"_\".join([label[0] for label in labels[:4]]) for labels in names]\n        names = [label if len(label) < 30 else label[:27] + \"...\" for label in names]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]\n    else:\n        names = [f\"{topic}_\" + \"_\".join([word for word, value in topic_model.get_topic(topic)][:3]) for topic in unique_topics]\n\n    # Visualize\n    fig = go.Figure()\n\n    # Outliers and non-selected topics\n    non_selected_topics = set(unique_topics).difference(topics)\n    if len(non_selected_topics) == 0:\n        non_selected_topics = [-1]\n\n    selection = df.loc[df.topic.isin(non_selected_topics), :]\n    selection[\"text\"] = \"\"\n    selection.loc[len(selection), :] = [None, None, selection.x.mean(), selection.y.mean(), \"Other documents\"]\n\n    fig.add_trace(\n        go.Scattergl(\n            x=selection.x,\n            y=selection.y,\n            hovertext=selection.doc if not hide_document_hover else None,\n            hoverinfo=\"text\",\n            mode='markers+text',\n            name=\"other\",\n            showlegend=False,\n            marker=dict(color='#CFD8DC', size=5, opacity=0.5)\n        )\n    )\n\n    # Selected topics\n    for name, topic in zip(names, unique_topics):\n        if topic in topics and topic != -1:\n            selection = df.loc[df.topic == topic, :]\n            selection[\"text\"] = \"\"\n\n            if not hide_annotations:\n                selection.loc[len(selection), :] = [None, None, selection.x.mean(), selection.y.mean(), name]\n\n            fig.add_trace(\n                go.Scattergl(\n                    x=selection.x,\n                    y=selection.y,\n                    hovertext=selection.doc if not hide_document_hover else None,\n                    hoverinfo=\"text\",\n                    text=selection.text,\n                    mode='markers+text',\n                    name=name,\n                    textfont=dict(\n                        size=12,\n                    ),\n                    marker=dict(size=5, opacity=0.5)\n                )\n            )\n\n    # Add grid in a 'plus' shape\n    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))\n    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))\n    fig.add_shape(type=\"line\",\n                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],\n                  line=dict(color=\"#CFD8DC\", width=2))\n    fig.add_shape(type=\"line\",\n                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,\n                  line=dict(color=\"#9E9E9E\", width=2))\n    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text=\"D1\", showarrow=False, yshift=10)\n    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, 
text=\"D2\", showarrow=False, xshift=10)\n\n    # Stylize layout\n    fig.update_layout(\n        template=\"simple_white\",\n        title={\n            'text': f\"{title}\",\n            'x': 0.5,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        width=width,\n        height=height\n    )\n\n    fig.update_xaxes(visible=False)\n    fig.update_yaxes(visible=False)\n    return fig\n
        "},{"location":"api/plotting/dtm.html","title":"DTM","text":"

        Visualize topics over time

        Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A fitted BERTopic instance. | required |
| `topics_over_time` | `DataFrame` | The topics you would like to be visualized with the corresponding topic representation. | required |
| `top_n_topics` | `int` | To visualize the most frequent topics instead of all. | `None` |
| `topics` | `List[int]` | Select which topics you would like to be visualized. | `None` |
| `normalize_frequency` | `bool` | Whether to normalize each topic's frequency individually. | `False` |
| `custom_labels` | `Union[bool, str]` | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | `False` |
| `title` | `str` | Title of the plot. | `'<b>Topics over Time</b>'` |
| `width` | `int` | The width of the figure. | `1250` |
| `height` | `int` | The height of the figure. | `450` |

        Returns:

| Type | Description |
| --- | --- |
| `Figure` | A plotly.graph_objects.Figure including all traces |

        Examples:

        To visualize the topics over time, simply run:

```python
topics_over_time = topic_model.topics_over_time(docs, timestamps)
topic_model.visualize_topics_over_time(topics_over_time)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_topics_over_time(topics_over_time)
fig.write_html("path/to/file.html")
```
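The other arguments documented above can restrict and rescale the plot; for instance (values are illustrative):

```python
fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,           # only the 10 most frequent topics
    normalize_frequency=True,  # normalize each topic's frequency individually
    width=1000,
    height=400,
)
fig.write_html("path/to/file.html")
```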
        Source code in bertopic\\plotting\\_topics_over_time.py
        def visualize_topics_over_time(topic_model,\n                               topics_over_time: pd.DataFrame,\n                               top_n_topics: int = None,\n                               topics: List[int] = None,\n                               normalize_frequency: bool = False,\n                               custom_labels: Union[bool, str] = False,\n                               title: str = \"<b>Topics over Time</b>\",\n                               width: int = 1250,\n                               height: int = 450) -> go.Figure:\n    \"\"\" Visualize topics over time\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics_over_time: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n\n    To visualize the topics over time, simply run:\n\n    ```python\n    topics_over_time = topic_model.topics_over_time(docs, timestamps)\n    topic_model.visualize_topics_over_time(topics_over_time)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_over_time(topics_over_time)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/trump.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    colors = [\"#E69F00\", \"#56B4E9\", \"#009E73\", \"#F0E442\", \"#D55E00\", \"#0072B2\", \"#CC79A7\"]\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        selected_topics = list(topics)\n    elif top_n_topics is not None:\n        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        selected_topics = sorted(freq_df.Topic.to_list())\n\n    # Prepare data\n    if isinstance(custom_labels, str):\n        topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]\n        topic_names = [\"_\".join([label[0] for label in labels[:4]]) for labels in topic_names]\n        topic_names = [label if len(label) < 30 else label[:27] + \"...\" for label in topic_names]\n        topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())}\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        topic_names = {key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()}\n    else:\n        topic_names = {key: value[:40] + \"...\" if len(value) > 40 else value\n                       for key, value in topic_model.topic_labels_.items()}\n    topics_over_time[\"Name\"] = topics_over_time.Topic.map(topic_names)\n    data = 
topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values([\"Topic\", \"Timestamp\"])\n\n    # Add traces\n    fig = go.Figure()\n    for index, topic in enumerate(data.Topic.unique()):\n        trace_data = data.loc[data.Topic == topic, :]\n        topic_name = trace_data.Name.values[0]\n        words = trace_data.Words.values\n        if normalize_frequency:\n            y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]\n        else:\n            y = trace_data.Frequency\n        fig.add_trace(go.Scatter(x=trace_data.Timestamp, y=y,\n                                 mode='lines',\n                                 marker_color=colors[index % 7],\n                                 hoverinfo=\"text\",\n                                 name=topic_name,\n                                 hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words]))\n\n    # Styling of the visualization\n    fig.update_xaxes(showgrid=True)\n    fig.update_yaxes(showgrid=True)\n    fig.update_layout(\n        yaxis_title=\"Normalized Frequency\" if normalize_frequency else \"Frequency\",\n        title={\n            'text': f\"{title}\",\n            'y': .95,\n            'x': 0.40,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        template=\"simple_white\",\n        width=width,\n        height=height,\n        hoverlabel=dict(\n            bgcolor=\"white\",\n            font_size=16,\n            font_family=\"Rockwell\"\n        ),\n        legend=dict(\n            title=\"<b>Global Topic Representation\",\n        )\n    )\n    return fig\n
        "},{"location":"api/plotting/heatmap.html","title":"Heatmap","text":"

Visualize a heatmap of the topics' similarity matrix

        Based on the cosine similarity matrix between topic embeddings, a heatmap is created showing the similarity between topics.

        Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `topic_model` | | A fitted BERTopic instance. | required |
| `topics` | `List[int]` | A selection of topics to visualize. | `None` |
| `top_n_topics` | `int` | Only select the top n most frequent topics. | `None` |
| `n_clusters` | `int` | Create n clusters and order the similarity matrix by those clusters. | `None` |
| `custom_labels` | `Union[bool, str]` | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | `False` |
| `title` | `str` | Title of the plot. | `'<b>Similarity Matrix</b>'` |
| `width` | `int` | The width of the figure. | `800` |
| `height` | `int` | The height of the figure. | `800` |

        Returns:

| Type | Description |
| --- | --- |
| `fig` | A plotly figure |

        Examples:

        To visualize the similarity matrix of topics simply run:

```python
topic_model.visualize_heatmap()
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_heatmap()
fig.write_html("path/to/file.html")
```
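To make blocks of related topics easier to spot, the similarity matrix can be ordered by clusters of topics; note that `n_clusters` must be lower than the number of unique topics shown. The values below are illustrative:

```python
fig = topic_model.visualize_heatmap(
    top_n_topics=30,  # restrict the matrix to the 30 most frequent topics
    n_clusters=5,     # order the matrix by 5 clusters of similar topics
)
fig.write_html("path/to/file.html")
```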
        Source code in bertopic\\plotting\\_heatmap.py
        def visualize_heatmap(topic_model,\n                      topics: List[int] = None,\n                      top_n_topics: int = None,\n                      n_clusters: int = None,\n                      custom_labels: Union[bool, str] = False,\n                      title: str = \"<b>Similarity Matrix</b>\",\n                      width: int = 800,\n                      height: int = 800) -> go.Figure:\n    \"\"\" Visualize a heatmap of the topic's similarity matrix\n\n    Based on the cosine similarity matrix between topic embeddings,\n    a heatmap is created showing the similarity between topics.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_clusters: Create n clusters and order the similarity\n                    matrix by those clusters.\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the similarity matrix of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_heatmap()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_heatmap()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/heatmap.html\"\n    style=\"width:1000px; height: 720px; border: 0px;\"\"></iframe>\n    \"\"\"\n\n    # Select topic embeddings\n    if topic_model.topic_embeddings_ is not None:\n        embeddings = np.array(topic_model.topic_embeddings_)[topic_model._outliers:]\n    else:\n        embeddings = topic_model.c_tf_idf_[topic_model._outliers:]\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list())\n\n    # Order heatmap by similar clusters of topics\n    sorted_topics = topics\n    if n_clusters:\n        if n_clusters >= len(set(topics)):\n            raise ValueError(\"Make sure to set `n_clusters` lower than \"\n                             \"the total number of unique topics.\")\n\n        distance_matrix = cosine_similarity(embeddings[topics])\n        Z = linkage(distance_matrix, 'ward')\n        clusters = fcluster(Z, t=n_clusters, criterion='maxclust')\n\n        # Extract new order of topics\n        mapping = {cluster: [] for cluster in clusters}\n        for topic, cluster in zip(topics, clusters):\n            mapping[cluster].append(topic)\n        mapping = [cluster for cluster in mapping.values()]\n        sorted_topics = [topic for cluster in mapping for topic in cluster]\n\n    # Select embeddings\n    indices = np.array([topics.index(topic) for topic in sorted_topics])\n    embeddings = embeddings[indices]\n    distance_matrix = cosine_similarity(embeddings)\n\n    # Create labels\n    if isinstance(custom_labels, str):\n        new_labels = [[[str(topic), None]] + 
topic_model.topic_aspects_[custom_labels][topic] for topic in sorted_topics]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        new_labels = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in sorted_topics]\n    else:\n        new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n\n    fig = px.imshow(distance_matrix,\n                    labels=dict(color=\"Similarity Score\"),\n                    x=new_labels,\n                    y=new_labels,\n                    color_continuous_scale='GnBu'\n                    )\n\n    fig.update_layout(\n        title={\n            'text': f\"{title}\",\n            'y': .95,\n            'x': 0.55,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        width=width,\n        height=height,\n        hoverlabel=dict(\n            bgcolor=\"white\",\n            font_size=16,\n            font_family=\"Rockwell\"\n        ),\n    )\n    fig.update_layout(showlegend=True)\n    fig.update_layout(legend_title_text='Trend')\n\n    return fig\n
        "},{"location":"api/plotting/hierarchical_documents.html","title":"Hierarchical Documents","text":"

        Visualize documents and their topics in 2D at different levels of hierarchy

        Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `docs` | `List[str]` | The documents you used when calling either `fit` or `fit_transform`. | required |
| `hierarchical_topics` | `DataFrame` | A dataframe that contains a hierarchy of topics represented by their parents and their children. | required |
| `topics` | `List[int]` | A selection of topics to visualize. Not to be confused with the topics that you get from `.fit_transform`. For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`. | `None` |
| `embeddings` | `ndarray` | The embeddings of all documents in `docs`. | `None` |
| `reduced_embeddings` | `ndarray` | The 2D reduced embeddings of all documents in `docs`. | `None` |
| `sample` | `Union[float, int]` | The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen. | `None` |
| `hide_annotations` | `bool` | Hide the names of the traces on top of each cluster. | `False` |
| `hide_document_hover` | `bool` | Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualizations. | `True` |
| `nr_levels` | `int` | The number of levels to be visualized in the hierarchy. First, the distances in `hierarchical_topics.Distance` are split into `nr_levels` lists of distances. Then, for each list of distances, the merged topics are selected that have a distance less than or equal to the maximum distance of the selected list of distances. NOTE: To get all possible merge steps, make sure that `nr_levels` is equal to the length of `hierarchical_topics`. | `10` |
| `level_scale` | `str` | Whether to apply a linear or logarithmic (log) scale to the levels of the distance vector. Linear scaling will perform an equal number of merges at each level, while logarithmic scaling will perform more merges in earlier levels to provide more resolution at higher levels (this can be used when the number of topics is large). | `'linear'` |
| `custom_labels` | `Union[bool, str]` | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". NOTE: Custom labels are only generated for the original un-merged topics. | `False` |
| `title` | `str` | Title of the plot. | `'<b>Hierarchical Documents and Topics</b>'` |
| `width` | `int` | The width of the figure. | `1200` |
| `height` | `int` | The height of the figure. | `750` |

        Examples:

        To visualize the topics simply run:

```python
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)
```

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic and extract hierarchical topics
topic_model = BERTopic().fit(docs, embeddings)
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Reduce dimensionality of embeddings, this step is optional
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)

# Or, if you have reduced the original embeddings already:
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
fig.write_html("path/to/file.html")
```
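The granularity of the hierarchy slider can be tuned with `nr_levels` and `level_scale`; for example, to show every merge step, as the note on `nr_levels` above suggests (assuming `docs`, `hierarchical_topics`, and `reduced_embeddings` were prepared as in the pipeline above):

```python
fig = topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    reduced_embeddings=reduced_embeddings,
    nr_levels=len(hierarchical_topics),  # one slider level per merge step
    level_scale="linear",                # or "log" for more resolution at higher levels
)
fig.write_html("path/to/file.html")
```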

        Note

        This visualization was inspired by the scatter plot representation of Doc2Map: https://github.com/louisgeisler/Doc2Map

        Source code in bertopic\\plotting\\_hierarchical_documents.py
        def visualize_hierarchical_documents(topic_model,\n                                     docs: List[str],\n                                     hierarchical_topics: pd.DataFrame,\n                                     topics: List[int] = None,\n                                     embeddings: np.ndarray = None,\n                                     reduced_embeddings: np.ndarray = None,\n                                     sample: Union[float, int] = None,\n                                     hide_annotations: bool = False,\n                                     hide_document_hover: bool = True,\n                                     nr_levels: int = 10,\n                                     level_scale: str = 'linear', \n                                     custom_labels: Union[bool, str] = False,\n                                     title: str = \"<b>Hierarchical Documents and Topics</b>\",\n                                     width: int = 1200,\n                                     height: int = 750) -> go.Figure:\n    \"\"\" Visualize documents and their topics in 2D at different levels of hierarchy\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                             specific points. Helps to speed up generation of visualizations.\n        nr_levels: The number of levels to be visualized in the hierarchy. First, the distances\n                   in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances. \n                   Then, for each list of distances, the merged topics are selected that have a \n                   distance less or equal to the maximum distance of the selected list of distances.\n                   NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to\n                   the length of `hierarchical_topics`.\n        level_scale: Whether to apply a linear or logarithmic (log) scale levels of the distance \n                     vector. Linear scaling will perform an equal number of merges at each level \n                     while logarithmic scaling will perform more mergers in earlier levels to \n                     provide more resolution at higher levels (this can be used for when the number \n                     of topics is large). 
\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n                       NOTE: Custom labels are only generated for the original \n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic and extract hierarchical topics\n    topic_model = BERTopic().fit(docs, embeddings)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    NOTE:\n        This visualization was inspired by the scatter plot representation of Doc2Map:\n        https://github.com/louisgeisler/Doc2Map\n\n    <iframe src=\"../../getting_started/visualization/hierarchical_documents.html\"\n    style=\"width:1000px; height: 770px; border: 0px;\"\"></iframe>\n    \"\"\"\n    topic_per_doc = topic_model.topics_\n\n    # Sample the data to optimize for visualization and dimensionality reduction\n    if sample is None or sample > 1:\n        sample = 1\n\n    indices = []\n    for topic in set(topic_per_doc):\n        s = np.where(np.array(topic_per_doc) == topic)[0]\n        size = len(s) if len(s) < 100 else int(len(s)*sample)\n        indices.extend(np.random.choice(s, size=size, replace=False))\n    indices = np.array(indices)\n\n    df = pd.DataFrame({\"topic\": np.array(topic_per_doc)[indices]})\n    df[\"doc\"] = [docs[index] for index in indices]\n    df[\"topic\"] = [topic_per_doc[index] for index in indices]\n\n    # Extract embeddings if not already done\n    if sample is None:\n        if embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n        else:\n            embeddings_to_reduce = embeddings\n    else:\n        if embeddings is not None:\n            
embeddings_to_reduce = embeddings[indices]\n        elif embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n\n    # Reduce input embeddings\n    if reduced_embeddings is None:\n        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit(embeddings_to_reduce)\n        embeddings_2d = umap_model.embedding_\n    elif sample is not None and reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings[indices]\n    elif sample is None and reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings\n\n    # Combine data\n    df[\"x\"] = embeddings_2d[:, 0]\n    df[\"y\"] = embeddings_2d[:, 1]\n\n    # Create topic list for each level, levels are created by calculating the distance\n    distances = hierarchical_topics.Distance.to_list()\n    if level_scale == 'log' or level_scale == 'logarithmic':\n        log_indices = np.round(np.logspace(start=math.log(1,10), stop=math.log(len(distances)-1,10), num=nr_levels)).astype(int).tolist()\n        log_indices.reverse()\n        max_distances = [distances[i] for i in log_indices]\n    elif level_scale == 'lin' or level_scale == 'linear':\n        max_distances = [distances[indices[-1]] for indices in np.array_split(range(len(hierarchical_topics)), nr_levels)][::-1]\n    else:\n        raise ValueError(\"level_scale needs to be one of 'log' or 'linear'\")\n\n    for index, max_distance in enumerate(max_distances):\n\n        # Get topics below `max_distance`\n        mapping = {topic: topic for topic in df.topic.unique()}\n        selection = hierarchical_topics.loc[hierarchical_topics.Distance <= max_distance, :]\n        selection.Parent_ID = selection.Parent_ID.astype(int)\n        selection = selection.sort_values(\"Parent_ID\")\n\n        for row in selection.iterrows():\n            for topic in row[1].Topics:\n                mapping[topic] = row[1].Parent_ID\n\n        # Make sure the mappings are mapped 1:1\n        mappings = [True for _ in mapping]\n        while any(mappings):\n            for i, (key, value) in enumerate(mapping.items()):\n                if value in mapping.keys() and key != value:\n                    mapping[key] = mapping[value]\n                else:\n                    mappings[i] = False\n\n        # Create new column\n        df[f\"level_{index+1}\"] = df.topic.map(mapping)\n        df[f\"level_{index+1}\"] = df[f\"level_{index+1}\"].astype(int)\n\n    # Prepare topic names of original and merged topics\n    trace_names = []\n    topic_names = {}\n    for topic in range(hierarchical_topics.Parent_ID.astype(int).max()):\n        if topic < hierarchical_topics.Parent_ID.astype(int).min():\n            if topic_model.get_topic(topic):\n                if isinstance(custom_labels, str):\n                    trace_name = f\"{topic}_\" + \"_\".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3])\n                elif topic_model.custom_labels_ is not None and custom_labels:\n                    trace_name = topic_model.custom_labels_[topic + topic_model._outliers]\n                else:\n                    trace_name = f\"{topic}_\" + \"_\".join([word[:20] for word, _ in topic_model.get_topic(topic)][:3])\n                topic_names[topic] = {\"trace_name\": trace_name[:40], \"plot_text\": trace_name[:40]}\n                trace_names.append(trace_name)\n        else:\n            trace_name = f\"{topic}_\" + 
hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), \"Parent_Name\"].values[0]\n            plot_text = \"_\".join([name[:20] for name in trace_name.split(\"_\")[:3]])\n            topic_names[topic] = {\"trace_name\": trace_name[:40], \"plot_text\": plot_text[:40]}\n            trace_names.append(trace_name)\n\n    # Prepare traces\n    all_traces = []\n    for level in range(len(max_distances)):\n        traces = []\n\n        # Outliers\n        if topic_model._outliers:\n            traces.append(\n                    go.Scattergl(\n                        x=df.loc[(df[f\"level_{level+1}\"] == -1), \"x\"],\n                        y=df.loc[df[f\"level_{level+1}\"] == -1, \"y\"],\n                        mode='markers+text',\n                        name=\"other\",\n                        hoverinfo=\"text\",\n                        hovertext=df.loc[(df[f\"level_{level+1}\"] == -1), \"doc\"] if not hide_document_hover else None,\n                        showlegend=False,\n                        marker=dict(color='#CFD8DC', size=5, opacity=0.5)\n                    )\n                )\n\n        # Selected topics\n        if topics:\n            selection = df.loc[(df.topic.isin(topics)), :]\n            unique_topics = sorted([int(topic) for topic in selection[f\"level_{level+1}\"].unique()])\n        else:\n            unique_topics = sorted([int(topic) for topic in df[f\"level_{level+1}\"].unique()])\n\n        for topic in unique_topics:\n            if topic != -1:\n                if topics:\n                    selection = df.loc[(df[f\"level_{level+1}\"] == topic) &\n                                       (df.topic.isin(topics)), :]\n                else:\n                    selection = df.loc[df[f\"level_{level+1}\"] == topic, :]\n\n                if not hide_annotations:\n                    selection.loc[len(selection), :] = None\n                    selection[\"text\"] = \"\"\n                    selection.loc[len(selection) - 1, \"x\"] = selection.x.mean()\n                    selection.loc[len(selection) - 1, \"y\"] = selection.y.mean()\n                    selection.loc[len(selection) - 1, \"text\"] = topic_names[int(topic)][\"plot_text\"]\n\n                traces.append(\n                    go.Scattergl(\n                        x=selection.x,\n                        y=selection.y,\n                        text=selection.text if not hide_annotations else None,\n                        hovertext=selection.doc if not hide_document_hover else None,\n                        hoverinfo=\"text\",\n                        name=topic_names[int(topic)][\"trace_name\"],\n                        mode='markers+text',\n                        marker=dict(size=5, opacity=0.5)\n                    )\n                )\n\n        all_traces.append(traces)\n\n    # Track and count traces\n    nr_traces_per_set = [len(traces) for traces in all_traces]\n    trace_indices = [(0, nr_traces_per_set[0])]\n    for index, nr_traces in enumerate(nr_traces_per_set[1:]):\n        start = trace_indices[index][1]\n        end = nr_traces + start\n        trace_indices.append((start, end))\n\n    # Visualization\n    fig = go.Figure()\n    for traces in all_traces:\n        for trace in traces:\n            fig.add_trace(trace)\n\n    for index in range(len(fig.data)):\n        if index >= nr_traces_per_set[0]:\n            fig.data[index].visible = False\n\n    # Create and add slider\n    steps = []\n    for index, indices in enumerate(trace_indices):\n        step = dict(\n   
         method=\"update\",\n            label=str(index),\n            args=[{\"visible\": [False] * len(fig.data)}]\n        )\n        for index in range(indices[1]-indices[0]):\n            step[\"args\"][0][\"visible\"][index+indices[0]] = True\n        steps.append(step)\n\n    sliders = [dict(\n        currentvalue={\"prefix\": \"Level: \"},\n        pad={\"t\": 20},\n        steps=steps\n    )]\n\n    # Add grid in a 'plus' shape\n    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))\n    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))\n    fig.add_shape(type=\"line\",\n                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],\n                  line=dict(color=\"#CFD8DC\", width=2))\n    fig.add_shape(type=\"line\",\n                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,\n                  line=dict(color=\"#9E9E9E\", width=2))\n    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text=\"D1\", showarrow=False, yshift=10)\n    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text=\"D2\", showarrow=False, xshift=10)\n\n    # Stylize layout\n    fig.update_layout(\n        sliders=sliders,\n        template=\"simple_white\",\n        title={\n            'text': f\"{title}\",\n            'x': 0.5,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        width=width,\n        height=height,\n    )\n\n    fig.update_xaxes(visible=False)\n    fig.update_yaxes(visible=False)\n    return fig\n
        "},{"location":"api/plotting/hierarchy.html","title":"Hierarchy","text":"

        Visualize a hierarchical structure of the topics

        A ward linkage function is used to perform the hierarchical clustering based on the cosine distance matrix between topic embeddings.

        Parameters:

        topic_model: A fitted BERTopic instance. (required)

        orientation (str): The orientation of the figure. Either 'left' or 'bottom'. Default: 'left'

        topics (List[int]): A selection of topics to visualize. Default: None

        top_n_topics (int): Only select the top n most frequent topics. Default: None

        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". NOTE: Custom labels are only generated for the original un-merged topics. Default: False

        title (str): Title of the plot. Default: '<b>Hierarchical Clustering</b>'

        width (int): The width of the figure. Only works if orientation is set to 'left'. Default: 1000

        height (int): The height of the figure. Only works if orientation is set to 'bottom'. Default: 600

        hierarchical_topics (DataFrame): A dataframe that contains a hierarchy of topics represented by their parents and their children. NOTE: The hierarchical topic names are only visualized if both topics and top_n_topics are not set. Default: None

        linkage_function (Callable[[scipy.sparse._csr.csr_matrix], numpy.ndarray]): The linkage function to use. Default is: lambda x: sch.linkage(x, 'ward', optimal_ordering=True). NOTE: Make sure to use the same linkage_function as used in topic_model.hierarchical_topics. Default: None

        distance_function (Callable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix]): The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x). You can pass any function that returns either a square matrix of shape (n_samples, n_samples) with zeros on the diagonal and non-negative values, or a condensed distance matrix of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix. NOTE: Make sure to use the same distance_function as used in topic_model.hierarchical_topics. Default: None

        color_threshold (int): Value at which the separation of clusters is made, resulting in different colors for different clusters. A higher value typically leads to fewer colored clusters. Default: 1

        Returns:

        fig: A plotly figure

        Examples:

        To visualize the hierarchical structure of topics simply run:

        topic_model.visualize_hierarchy()\n

        If you also want the labels visualized of hierarchical topics, run the following:

        # Extract hierarchical topics and their representations\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n\n# Visualize these representations\ntopic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you want to save the resulting figure:

        fig = topic_model.visualize_hierarchy()\nfig.write_html(\"path/to/file.html\")\n
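        If hierarchical_topics was computed with a non-default linkage or distance function, the same functions should be passed here so the dendrogram matches the hierarchy. A minimal sketch, assuming topic_model.hierarchical_topics accepts the same keyword arguments as implied by the NOTE above (the functions below simply restate the defaults):

        ```python
        from scipy.cluster import hierarchy as sch
        from sklearn.metrics.pairwise import cosine_similarity

        # Use the same functions for both steps
        linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)
        distance_function = lambda x: 1 - cosine_similarity(x)

        hierarchical_topics = topic_model.hierarchical_topics(docs,
                                                              linkage_function=linkage_function,
                                                              distance_function=distance_function)
        fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics,
                                              linkage_function=linkage_function,
                                              distance_function=distance_function)
        ```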
        Source code in bertopic\\plotting\\_hierarchy.py
        def visualize_hierarchy(topic_model,\n                        orientation: str = \"left\",\n                        topics: List[int] = None,\n                        top_n_topics: int = None,\n                        custom_labels: Union[bool, str] = False,\n                        title: str = \"<b>Hierarchical Clustering</b>\",\n                        width: int = 1000,\n                        height: int = 600,\n                        hierarchical_topics: pd.DataFrame = None,\n                        linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n                        distance_function: Callable[[csr_matrix], csr_matrix] = None,\n                        color_threshold: int = 1) -> go.Figure:\n    \"\"\" Visualize a hierarchical structure of the topics\n\n    A ward linkage function is used to perform the\n    hierarchical clustering based on the cosine distance\n    matrix between topic embeddings.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        orientation: The orientation of the figure.\n                     Either 'left' or 'bottom'\n        topics: A selection of topics to visualize\n        top_n_topics: Only select the top n most frequent topics\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n                       NOTE: Custom labels are only generated for the original \n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure. Only works if orientation is set to 'left'\n        height: The height of the figure. Only works if orientation is set to 'bottom'\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children.\n                             NOTE: The hierarchical topic names are only visualized\n                             if both `topics` and `top_n_topics` are not set.\n        linkage_function: The linkage function to use. Default is:\n                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n                          NOTE: Make sure to use the same `linkage_function` as used\n                          in `topic_model.hierarchical_topics`.\n        distance_function: The distance function to use on the c-TF-IDF matrix. 
Default is:\n                           `lambda x: 1 - cosine_similarity(x)`.\n                            You can pass any function that returns either a square matrix of \n                            shape (n_samples, n_samples) with zeros on the diagonal and \n                            non-negative values or condensed distance matrix of shape \n                            (n_samples * (n_samples - 1) / 2,) containing the upper \n                            triangular of the distance matrix.\n                           NOTE: Make sure to use the same `distance_function` as used\n                           in `topic_model.hierarchical_topics`.\n        color_threshold: Value at which the separation of clusters will be made which\n                         will result in different colors for different clusters.\n                         A higher value will typically lead in less colored clusters.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the hierarchical structure of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchy()\n    ```\n\n    If you also want the labels visualized of hierarchical topics,\n    run the following:\n\n    ```python\n    # Extract hierarchical topics and their representations\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Visualize these representations\n    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n    ```\n\n    If you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchy()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/hierarchy.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    if distance_function is None:\n        distance_function = lambda x: 1 - cosine_similarity(x)\n\n    if linkage_function is None:\n        linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list())\n\n    # Select embeddings\n    all_topics = sorted(list(topic_model.get_topics().keys()))\n    indices = np.array([all_topics.index(topic) for topic in topics])\n\n    # Select topic embeddings\n    if topic_model.c_tf_idf_ is not None:\n        embeddings = topic_model.c_tf_idf_[indices]\n    else:\n        embeddings = np.array(topic_model.topic_embeddings_)[indices]\n\n    # Annotations\n    if hierarchical_topics is not None and len(topics) == len(freq_df.Topic.to_list()):\n        annotations = _get_annotations(topic_model=topic_model,\n                                       hierarchical_topics=hierarchical_topics,\n                                       embeddings=embeddings,\n                                       distance_function=distance_function,\n                                       linkage_function=linkage_function,\n                                       orientation=orientation,\n                                       custom_labels=custom_labels)\n    else:\n        annotations = None\n\n    # wrap distance function to validate input and return a condensed distance matrix\n    distance_function_viz = lambda x: 
validate_distance_matrix(\n        distance_function(x), embeddings.shape[0])\n    # Create dendogram\n    fig = ff.create_dendrogram(embeddings,\n                               orientation=orientation,\n                               distfun=distance_function_viz,\n                               linkagefun=linkage_function,\n                               hovertext=annotations,\n                               color_threshold=color_threshold)\n\n    # Create nicer labels\n    axis = \"yaxis\" if orientation == \"left\" else \"xaxis\"\n    if isinstance(custom_labels, str):\n        new_labels = [[[str(x), None]] + topic_model.topic_aspects_[custom_labels][x] for x in fig.layout[axis][\"ticktext\"]]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        new_labels = [topic_model.custom_labels_[topics[int(x)] + topic_model._outliers] for x in fig.layout[axis][\"ticktext\"]]\n    else:\n        new_labels = [[[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)])\n                      for x in fig.layout[axis][\"ticktext\"]]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n\n    # Stylize layout\n    fig.update_layout(\n        plot_bgcolor='#ECEFF1',\n        template=\"plotly_white\",\n        title={\n            'text': f\"{title}\",\n            'x': 0.5,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        hoverlabel=dict(\n            bgcolor=\"white\",\n            font_size=16,\n            font_family=\"Rockwell\"\n        ),\n    )\n\n    # Stylize orientation\n    if orientation == \"left\":\n        fig.update_layout(height=200 + (15 * len(topics)),\n                          width=width,\n                          yaxis=dict(tickmode=\"array\",\n                                     ticktext=new_labels))\n\n        # Fix empty space on the bottom of the graph\n        y_max = max([trace['y'].max() + 5 for trace in fig['data']])\n        y_min = min([trace['y'].min() - 5 for trace in fig['data']])\n        fig.update_layout(yaxis=dict(range=[y_min, y_max]))\n\n    else:\n        fig.update_layout(width=200 + (15 * len(topics)),\n                          height=height,\n                          xaxis=dict(tickmode=\"array\",\n                                     ticktext=new_labels))\n\n    if hierarchical_topics is not None:\n        for index in [0, 3]:\n            axis = \"x\" if orientation == \"left\" else \"y\"\n            xs = [data[\"x\"][index] for data in fig.data if (data[\"text\"] and data[axis][index] > 0)]\n            ys = [data[\"y\"][index] for data in fig.data if (data[\"text\"] and data[axis][index] > 0)]\n            hovertext = [data[\"text\"][index] for data in fig.data if (data[\"text\"] and data[axis][index] > 0)]\n\n            fig.add_trace(go.Scatter(x=xs, y=ys, marker_color='black',\n                                     hovertext=hovertext, hoverinfo=\"text\",\n                                     mode='markers', showlegend=False))\n    return fig\n
        "},{"location":"api/plotting/term.html","title":"Term Score Decline","text":"

        Visualize the ranks of all terms across all topics

        Each topic is represented by a set of words. These words, however, do not all equally represent the topic. This visualization shows how many words are needed to represent a topic and at which point the beneficial effect of adding words starts to decline.

        Parameters:

        topic_model: A fitted BERTopic instance. (required)

        topics (List[int]): A selection of topics to visualize. These will be colored red, while all others will be colored black. Default: None

        log_scale (bool): Whether to represent the ranking on a log scale. Default: False

        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". Default: False

        title (str): Title of the plot. Default: '<b>Term score decline per Topic</b>'

        width (int): The width of the figure. Default: 800

        height (int): The height of the figure. Default: 500

        Returns:

        fig: A plotly figure

        Examples:

        To visualize the ranks of all words across all topics simply run:

        topic_model.visualize_term_rank()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_term_rank()\nfig.write_html(\"path/to/file.html\")\n
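        The parameters above can also be combined, for example to highlight a few topics in red and plot the scores on a log scale (the topic ids are illustrative):

        ```python
        fig = topic_model.visualize_term_rank(topics=[1, 2, 3], log_scale=True)
        fig.write_html("path/to/file.html")
        ```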

        Reference:

        This visualization was heavily inspired by the \"Term Probability Decline\" visualization found in an analysis by the amazing tmtoolkit (https://tmtoolkit.readthedocs.io/). That specific analysis can be found at https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html.

        Source code in bertopic\\plotting\\_term_rank.py
        def visualize_term_rank(topic_model,\n                        topics: List[int] = None,\n                        log_scale: bool = False,\n                        custom_labels: Union[bool, str] = False,\n                        title: str = \"<b>Term score decline per Topic</b>\",\n                        width: int = 800,\n                        height: int = 500) -> go.Figure:\n    \"\"\" Visualize the ranks of all terms across all topics\n\n    Each topic is represented by a set of words. These words, however,\n    do not all equally represent the topic. This visualization shows\n    how many words are needed to represent a topic and at which point\n    the beneficial effect of adding words starts to decline.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize. These will be colored\n                red where all others will be colored black.\n        log_scale: Whether to represent the ranking on a log scale\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n\n    To visualize the ranks of all words across\n    all topics simply run:\n\n    ```python\n    topic_model.visualize_term_rank()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_term_rank()\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../../getting_started/visualization/term_rank.html\"\n    style=\"width:1000px; height: 530px; border: 0px;\"\"></iframe>\n\n    <iframe src=\"../../getting_started/visualization/term_rank_log.html\"\n    style=\"width:1000px; height: 530px; border: 0px;\"\"></iframe>\n\n    Reference:\n\n    This visualization was heavily inspired by the\n    \"Term Probability Decline\" visualization found in an\n    analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).\n    Reference to that specific analysis can be found\n    [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).\n    \"\"\"\n\n    topics = [] if topics is None else topics\n\n    topic_ids = topic_model.get_topic_info().Topic.unique().tolist()\n    topic_words = [topic_model.get_topic(topic) for topic in topic_ids]\n\n    values = np.array([[value[1] for value in values] for values in topic_words])\n    indices = np.array([[value + 1 for value in range(len(values))] for values in topic_words])\n\n    # Create figure\n    lines = []\n    for topic, x, y in zip(topic_ids, indices, values):\n        if not any(y > 1.5):\n\n            # labels\n            if isinstance(custom_labels, str):\n                label = f\"{topic}_\" + \"_\".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3])\n            elif topic_model.custom_labels_ is not None and custom_labels:\n                label = topic_model.custom_labels_[topic + topic_model._outliers]\n            else:\n                label = f\"<b>Topic {topic}</b>:\" + \"_\".join([word[0] for word in topic_model.get_topic(topic)])\n                label = label[:50]\n\n            # line parameters\n            color = \"red\" if topic in topics else \"black\"\n            opacity = 1 if topic in topics else .1\n 
           if any(y == 0):\n                y[y == 0] = min(values[values > 0])\n            y = np.log10(y, out=y, where=y > 0) if log_scale else y\n\n            line = go.Scatter(x=x, y=y,\n                              name=\"\",\n                              hovertext=label,\n                              mode=\"lines+lines\",\n                              opacity=opacity,\n                              line=dict(color=color, width=1.5))\n            lines.append(line)\n\n    fig = go.Figure(data=lines)\n\n    # Stylize layout\n    fig.update_xaxes(range=[0, len(indices[0])], tick0=1, dtick=2)\n    fig.update_layout(\n        showlegend=False,\n        template=\"plotly_white\",\n        title={\n            'text': f\"{title}\",\n            'y': .9,\n            'x': 0.5,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        width=width,\n        height=height,\n        hoverlabel=dict(\n            bgcolor=\"white\",\n            font_size=16,\n            font_family=\"Rockwell\"\n        ),\n    )\n\n    fig.update_xaxes(title_text='Term Rank')\n    if log_scale:\n        fig.update_yaxes(title_text='c-TF-IDF score (log scale)')\n    else:\n        fig.update_yaxes(title_text='c-TF-IDF score')\n\n    return fig\n
        "},{"location":"api/plotting/topics.html","title":"Topics","text":"

        Visualize topics, their sizes, and their corresponding words

        This visualization is highly inspired by LDAvis, a great visualization technique typically reserved for LDA.

        Parameters:

        topic_model: A fitted BERTopic instance. (required)

        topics (List[int]): A selection of topics to visualize. Default: None

        top_n_topics (int): Only select the top n most frequent topics. Default: None

        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". Default: False

        title (str): Title of the plot. Default: '<b>Intertopic Distance Map</b>'

        width (int): The width of the figure. Default: 650

        height (int): The height of the figure. Default: 650

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_topics()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics()\nfig.write_html(\"path/to/file.html\")\n
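        To restrict the map to the most frequent topics, or to a hand-picked selection, use the parameters described above (the topic ids are illustrative):

        ```python
        # Only the 10 largest topics
        fig = topic_model.visualize_topics(top_n_topics=10)

        # Or an explicit selection of topics
        fig = topic_model.visualize_topics(topics=[0, 3, 5, 8])
        ```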
        Source code in bertopic\\plotting\\_topics.py
        def visualize_topics(topic_model,\n                     topics: List[int] = None,\n                     top_n_topics: int = None,\n                     custom_labels: Union[bool, str] = False,\n                     title: str = \"<b>Intertopic Distance Map</b>\",\n                     width: int = 650,\n                     height: int = 650) -> go.Figure:\n    \"\"\" Visualize topics, their sizes, and their corresponding words\n\n    This visualization is highly inspired by LDAvis, a great visualization\n    technique typically reserved for LDA.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize\n        top_n_topics: Only select the top n most frequent topics\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_topics()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/viz.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list())\n\n    # Extract topic words and their frequencies\n    topic_list = sorted(topics)\n    frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list]\n    if isinstance(custom_labels, str):\n        words = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topic_list]\n        words = [\"_\".join([label[0] for label in labels[:4]]) for labels in words]\n        words = [label if len(label) < 30 else label[:27] + \"...\" for label in words]\n    elif custom_labels and topic_model.custom_labels_ is not None:\n        words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list]\n    else:\n        words = [\" | \".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]\n\n    # Embed c-TF-IDF into 2D\n    all_topics = sorted(list(topic_model.get_topics().keys()))\n    indices = np.array([all_topics.index(topic) for topic in topics])\n\n    if topic_model.topic_embeddings_ is not None:\n        embeddings = topic_model.topic_embeddings_[indices]\n        embeddings = UMAP(n_neighbors=2, n_components=2, metric='cosine', random_state=42).fit_transform(embeddings)\n    else:\n        embeddings = topic_model.c_tf_idf_.toarray()[indices]\n        embeddings = MinMaxScaler().fit_transform(embeddings)\n        embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings)\n\n    # Visualize with plotly\n    df = pd.DataFrame({\"x\": embeddings[:, 0], \"y\": embeddings[:, 1],\n                       \"Topic\": topic_list, \"Words\": words, \"Size\": frequencies})\n    return 
_plotly_topic_visualization(df, topic_list, title, width, height)\n
        "},{"location":"api/plotting/topics_per_class.html","title":"Topics per Class","text":"

        Visualize topics per class

        Parameters:

        topic_model: A fitted BERTopic instance. (required)

        topics_per_class (DataFrame): The topics you would like to be visualized with the corresponding topic representation. (required)

        top_n_topics (int): To visualize the most frequent topics instead of all. Default: 10

        topics (List[int]): Select which topics you would like to be visualized. Default: None

        normalize_frequency (bool): Whether to normalize each topic's frequency individually. Default: False

        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". Default: False

        title (str): Title of the plot. Default: '<b>Topics per Class</b>'

        width (int): The width of the figure. Default: 1250

        height (int): The height of the figure. Default: 900

        Returns:

        Figure: A plotly.graph_objects.Figure including all traces

        Examples:

        To visualize the topics per class, simply run:

        topics_per_class = topic_model.topics_per_class(docs, classes)\ntopic_model.visualize_topics_per_class(topics_per_class)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics_per_class(topics_per_class)\nfig.write_html(\"path/to/file.html\")\n
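        A sketch of the full flow, assuming one class label per document; the newsgroup labels below are only an example of such metadata:

        ```python
        from sklearn.datasets import fetch_20newsgroups
        from bertopic import BERTopic

        # Illustrative data: documents plus one class label per document
        data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
        docs = data['data']
        classes = [data['target_names'][i] for i in data['target']]

        topic_model = BERTopic().fit(docs)
        topics_per_class = topic_model.topics_per_class(docs, classes=classes)
        fig = topic_model.visualize_topics_per_class(topics_per_class,
                                                     top_n_topics=10,
                                                     normalize_frequency=True)
        ```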
        Source code in bertopic\\plotting\\_topics_per_class.py
        def visualize_topics_per_class(topic_model,\n                               topics_per_class: pd.DataFrame,\n                               top_n_topics: int = 10,\n                               topics: List[int] = None,\n                               normalize_frequency: bool = False,\n                               custom_labels: Union[bool, str] = False,\n                               title: str = \"<b>Topics per Class</b>\",\n                               width: int = 1250,\n                               height: int = 900) -> go.Figure:\n    \"\"\" Visualize topics per class\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics_per_class: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: If bool, whether to use custom topic labels that were defined using \n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n\n    To visualize the topics per class, simply run:\n\n    ```python\n    topics_per_class = topic_model.topics_per_class(docs, classes)\n    topic_model.visualize_topics_per_class(topics_per_class)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_per_class(topics_per_class)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/topics_per_class.html\"\n    style=\"width:1400px; height: 1000px; border: 0px;\"\"></iframe>\n    \"\"\"\n    colors = [\"#E69F00\", \"#56B4E9\", \"#009E73\", \"#F0E442\", \"#D55E00\", \"#0072B2\", \"#CC79A7\"]\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        selected_topics = list(topics)\n    elif top_n_topics is not None:\n        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        selected_topics = sorted(freq_df.Topic.to_list())\n\n    # Prepare data\n    if isinstance(custom_labels, str):\n        topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]\n        topic_names = [\"_\".join([label[0] for label in labels[:4]]) for labels in topic_names]\n        topic_names = [label if len(label) < 30 else label[:27] + \"...\" for label in topic_names]\n        topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())}\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        topic_names = {key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()}\n    else:\n        topic_names = {key: value[:40] + \"...\" if len(value) > 40 else value\n                       for key, value in topic_model.topic_labels_.items()}\n    topics_per_class[\"Name\"] = topics_per_class.Topic.map(topic_names)\n    data = 
topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :]\n\n    # Add traces\n    fig = go.Figure()\n    for index, topic in enumerate(selected_topics):\n        if index == 0:\n            visible = True\n        else:\n            visible = \"legendonly\"\n        trace_data = data.loc[data.Topic == topic, :]\n        topic_name = trace_data.Name.values[0]\n        words = trace_data.Words.values\n        if normalize_frequency:\n            x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]\n        else:\n            x = trace_data.Frequency\n        fig.add_trace(go.Bar(y=trace_data.Class,\n                             x=x,\n                             visible=visible,\n                             marker_color=colors[index % 7],\n                             hoverinfo=\"text\",\n                             name=topic_name,\n                             orientation=\"h\",\n                             hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words]))\n\n    # Styling of the visualization\n    fig.update_xaxes(showgrid=True)\n    fig.update_yaxes(showgrid=True)\n    fig.update_layout(\n        xaxis_title=\"Normalized Frequency\" if normalize_frequency else \"Frequency\",\n        yaxis_title=\"Class\",\n        title={\n            'text': f\"{title}\",\n            'y': .95,\n            'x': 0.40,\n            'xanchor': 'center',\n            'yanchor': 'top',\n            'font': dict(\n                size=22,\n                color=\"Black\")\n        },\n        template=\"simple_white\",\n        width=width,\n        height=height,\n        hoverlabel=dict(\n            bgcolor=\"white\",\n            font_size=16,\n            font_family=\"Rockwell\"\n        ),\n        legend=dict(\n            title=\"<b>Global Topic Representation\",\n        )\n    )\n    return fig\n
        "},{"location":"api/representation/base.html","title":"BaseRepresentation","text":"

        The base representation model for fine-tuning topic representations

        Source code in bertopic\\representation\\_base.py
        class BaseRepresentation(BaseEstimator):\n    \"\"\" The base representation model for fine-tuning topic representations \"\"\"\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Each representation model that inherits this class will have\n        its arguments (topic_model, documents, c_tf_idf, topics)\n        automatically passed. Therefore, the representation model\n        will only have access to the information about topics related\n        to those arguments.\n\n        Arguments:\n            topic_model: The BERTopic model that is fitted until topic\n                         representations are calculated.\n            documents: A dataframe with columns \"Document\" and \"Topic\"\n                       that contains all documents with each corresponding\n                       topic.\n            c_tf_idf: A c-TF-IDF representation that is typically\n                      identical to `topic_model.c_tf_idf_` except for\n                      dynamic, class-based, and hierarchical topic modeling\n                      where it is calculated on a subset of the documents.\n            topics: A dictionary with topic (key) and tuple of word and\n                    weight (value) as calculated by c-TF-IDF. This is the\n                    default topics that are returned if no representation\n                    model is used.\n        \"\"\"\n        return topic_model.topic_representations_\n
        "},{"location":"api/representation/base.html#bertopic.representation._base.BaseRepresentation.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics

        Each representation model that inherits this class will have its arguments (topic_model, documents, c_tf_idf, topics) automatically passed. Therefore, the representation model will only have access to the information about topics related to those arguments.

        Parameters:

        topic_model: The BERTopic model that is fitted until topic representations are calculated. (required)

        documents (DataFrame): A dataframe with columns \"Document\" and \"Topic\" that contains all documents with each corresponding topic. (required)

        c_tf_idf (csr_matrix): A c-TF-IDF representation that is typically identical to topic_model.c_tf_idf_ except for dynamic, class-based, and hierarchical topic modeling, where it is calculated on a subset of the documents. (required)

        topics (Mapping[str, List[Tuple[str, float]]]): A dictionary with topic (key) and tuple of word and weight (value) as calculated by c-TF-IDF. These are the default topics that are returned if no representation model is used. (required)

        Source code in bertopic\\representation\\_base.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topics\n\n    Each representation model that inherits this class will have\n    its arguments (topic_model, documents, c_tf_idf, topics)\n    automatically passed. Therefore, the representation model\n    will only have access to the information about topics related\n    to those arguments.\n\n    Arguments:\n        topic_model: The BERTopic model that is fitted until topic\n                     representations are calculated.\n        documents: A dataframe with columns \"Document\" and \"Topic\"\n                   that contains all documents with each corresponding\n                   topic.\n        c_tf_idf: A c-TF-IDF representation that is typically\n                  identical to `topic_model.c_tf_idf_` except for\n                  dynamic, class-based, and hierarchical topic modeling\n                  where it is calculated on a subset of the documents.\n        topics: A dictionary with topic (key) and tuple of word and\n                weight (value) as calculated by c-TF-IDF. This is the\n                default topics that are returned if no representation\n                model is used.\n    \"\"\"\n    return topic_model.topic_representations_\n
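        As a sketch of how this base class is meant to be used: a custom representation model subclasses BaseRepresentation and returns an updated topics mapping from extract_topics. The example below, which simply keeps the top 5 c-TF-IDF words per topic and pads the rest, is illustrative and not part of BERTopic:

        ```python
        from bertopic.representation import BaseRepresentation


        class TopFiveWords(BaseRepresentation):
            """Illustrative representation model: keep only the top 5 c-TF-IDF words per topic."""

            def extract_topics(self, topic_model, documents, c_tf_idf, topics):
                # `topics` maps each topic id to its (word, weight) tuples; pad to keep 10 entries
                return {topic: words[:5] + [("", 0)] * 5 for topic, words in topics.items()}
        ```

        Such a model can then be passed as representation_model=TopFiveWords() when creating a BERTopic instance.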
        "},{"location":"api/representation/cohere.html","title":"Cohere","text":"

        Use the Cohere API to generate topic labels based on their generative model.

        Find more about their models here: https://docs.cohere.ai/docs

        Parameters:

        client: A cohere.Client. (required)

        model (str): Model to use within Cohere. Default: 'xlarge'

        prompt (str): The prompt to be used in the model. If no prompt is given, self.default_prompt_ is used instead. NOTE: Use \"[KEYWORDS]\" and \"[DOCUMENTS]\" in the prompt to decide where the keywords and documents need to be inserted. Default: None

        delay_in_seconds (float): The delay in seconds between consecutive prompts in order to prevent RateLimitErrors. Default: None

        nr_docs (int): The number of documents to pass to Cohere if a prompt with the \"[DOCUMENTS]\" tag is used. Default: 4

        diversity (float): The diversity of documents to pass to Cohere. Accepts values between 0 and 1. A higher value results in passing more diverse documents, whereas lower values pass more similar documents. Default: None

        doc_length (int): The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. Default: None

        tokenizer (Union[str, Callable]): The tokenizer used to split the document into segments, which are counted to determine the document's length. * If tokenizer is 'char', the document is split into characters, which are counted to adhere to doc_length. * If tokenizer is 'whitespace', the document is split into words separated by whitespace; these words are counted and truncated depending on doc_length. * If tokenizer is 'vectorizer', the internal CountVectorizer is used to tokenize the document; these tokens are counted and truncated depending on doc_length. * If tokenizer is a callable, that callable is used to tokenize the document; these tokens are counted and truncated depending on doc_length. Default: None

        Usage:

        To use this, you will need to install cohere first:

        pip install cohere

        Then, get yourself an API key and use Cohere's API as follows:

        import cohere\nfrom bertopic.representation import Cohere\nfrom bertopic import BERTopic\n\n# Create your representation model\nco = cohere.Client(my_api_key)\nrepresentation_model = Cohere(co)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can also use a custom prompt:

        prompt = \"I have the following documents: [DOCUMENTS]. What topic do they contain?\"\nrepresentation_model = Cohere(co, prompt=prompt)\n
        Source code in bertopic\\representation\\_cohere.py
        class Cohere(BaseRepresentation):\n    \"\"\" Use the Cohere API to generate topic labels based on their\n    generative model.\n\n    Find more about their models here:\n    https://docs.cohere.ai/docs\n\n    Arguments:\n        client: A `cohere.Client`\n        model: Model to use within Cohere, defaults to `\"xlarge\"`.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                NOTE: Use `\"[KEYWORDS]\"` and `\"[DOCUMENTS]\"` in the prompt\n                to decide where the keywords and documents need to be\n                inserted.\n        delay_in_seconds: The delay in seconds between consecutive prompts \n                                in order to prevent RateLimitErrors. \n        nr_docs: The number of documents to pass to OpenAI if a prompt\n                 with the `[\"DOCUMENTS\"]` tag is used.\n        diversity: The diversity of documents to pass to OpenAI.\n                   Accepts values between 0 and 1. A higher \n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document. \n                       * If tokenizer is 'char', then the document is split up \n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and trunctated depending on `doc_length`\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. These tokens are counted and truncated depending\n                         on `doc_length`\n\n    Usage:\n\n    To use this, you will need to install cohere first:\n\n    `pip install cohere`\n\n    Then, get yourself an API key and use Cohere's API as follows:\n\n    ```python\n    import cohere\n    from bertopic.representation import Cohere\n    from bertopic import BERTopic\n\n    # Create your representation model\n    co = cohere.Client(my_api_key)\n    representation_model = Cohere(co)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can also use a custom prompt:\n\n    ```python\n    prompt = \"I have the following documents: [DOCUMENTS]. 
What topic do they contain?\"\n    representation_model = Cohere(co, prompt=prompt)\n    ```\n    \"\"\"\n    def __init__(self,\n                 client,\n                 model: str = \"xlarge\",\n                 prompt: str = None,\n                 delay_in_seconds: float = None,\n                 nr_docs: int = 4,\n                 diversity: float = None,\n                 doc_length: int = None,\n                 tokenizer: Union[str, Callable] = None\n                 ):\n        self.client = client\n        self.model = model\n        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT\n        self.default_prompt_ = DEFAULT_PROMPT\n        self.delay_in_seconds = delay_in_seconds\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n        self.prompts_ = []\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: Not used\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top 4 representative documents per topic\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)\n\n        # Generate using Cohere's Language Model\n        updated_topics = {}\n        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n            prompt = self._create_prompt(truncated_docs, topic, topics)\n            self.prompts_.append(prompt)\n\n            # Delay\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n            request = self.client.generate(model=self.model,\n                                           prompt=prompt,\n                                           max_tokens=50,\n                                           num_generations=1,\n                                           stop_sequences=[\"\\n\"])\n            label = request.generations[0].text.strip()\n            updated_topics[topic] = [(label, 1)] + [(\"\", 0) for _ in range(9)]\n\n        return updated_topics\n\n    def _create_prompt(self, docs, topic, topics):\n        keywords = list(zip(*topics[topic]))[0]\n\n        # Use the Default Chat Prompt\n        if self.prompt == DEFAULT_PROMPT:\n            prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            prompt = self._replace_documents(prompt, docs)\n\n        # Use a custom prompt that leverages keywords, documents or both using\n        # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively\n        else:\n            prompt = self.prompt\n            if \"[KEYWORDS]\" in prompt:\n                prompt = prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            if \"[DOCUMENTS]\" in prompt:\n                prompt = self._replace_documents(prompt, docs)\n\n        return prompt\n\n    @staticmethod\n    def 
_replace_documents(prompt, docs):\n        to_replace = \"\"\n        for doc in docs:\n            to_replace += f\"- {doc}\\n\"\n        prompt = prompt.replace(\"[DOCUMENTS]\", to_replace)\n        return prompt\n
        "},{"location":"api/representation/cohere.html#bertopic.representation._cohere.Cohere.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics

        Parameters:

        topic_model: Not used. (required)

        documents (DataFrame): Not used. (required)

        c_tf_idf (csr_matrix): Not used. (required)

        topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF. (required)

        Returns:

        updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_cohere.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topics\n\n    Arguments:\n        topic_model: Not used\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top 4 representative documents per topic\n    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)\n\n    # Generate using Cohere's Language Model\n    updated_topics = {}\n    for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n        truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n        prompt = self._create_prompt(truncated_docs, topic, topics)\n        self.prompts_.append(prompt)\n\n        # Delay\n        if self.delay_in_seconds:\n            time.sleep(self.delay_in_seconds)\n\n        request = self.client.generate(model=self.model,\n                                       prompt=prompt,\n                                       max_tokens=50,\n                                       num_generations=1,\n                                       stop_sequences=[\"\\n\"])\n        label = request.generations[0].text.strip()\n        updated_topics[topic] = [(label, 1)] + [(\"\", 0) for _ in range(9)]\n\n    return updated_topics\n
        "},{"location":"api/representation/generation.html","title":"TextGeneration","text":"

        Text2Text or text generation with transformers

        Parameters:

        Name Type Description Default model Union[str, pipeline]

        A transformers pipeline that should be initialized as \"text-generation\" for gpt-like models or \"text2text-generation\" for T5-like models. For example, pipeline('text-generation', model='gpt2'). If a string is passed, \"text-generation\" will be selected by default.

        required prompt str

        The prompt to be used in the model. If no prompt is given, self.default_prompt_ is used instead. NOTE: Use \"[KEYWORDS]\" and \"[DOCUMENTS]\" in the prompt to decide where the keywords and documents need to be inserted.

        None pipeline_kwargs Mapping[str, Any]

        Kwargs that you can pass to the transformers.pipeline when it is called.

        {} random_state int

        A random state to be passed to transformers.set_seed

        42 nr_docs int

        The number of documents to pass to the text generation model if a prompt with the [DOCUMENTS] tag is used.

        4 diversity float

        The diversity of documents to pass to the text generation model. Accepts values between 0 and 1. A higher value results in passing more diverse documents, whereas lower values pass more similar documents.

        None doc_length int

        The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.

        None tokenizer Union[str, Callable]

        The tokenizer used to split the document into segments whose length is counted against doc_length. * If tokenizer is 'char', the document is split into characters, which are counted to adhere to doc_length * If tokenizer is 'whitespace', the document is split into words separated by whitespace; these words are counted and truncated depending on doc_length * If tokenizer is 'vectorizer', the internal CountVectorizer is used to tokenize the document; these tokens are counted and truncated depending on doc_length * If tokenizer is a callable, that callable is used to tokenize the document; these tokens are counted and truncated depending on doc_length

        None

        Usage:

        To use a gpt-like model:

        from bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\nfrom transformers import pipeline\n\n# Create your representation model\ngenerator = pipeline('text-generation', model='gpt2')\nrepresentation_model = TextGeneration(generator)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can use a custom prompt and decide where the keywords should be inserted by using the [KEYWORDS] tag, or where the documents should be inserted by using the [DOCUMENTS] tag:

        from bertopic.representation import TextGeneration\nfrom transformers import pipeline\n\nprompt = \"I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?\"\n\n# Create your representation model\ngenerator = pipeline('text2text-generation', model='google/flan-t5-base')\nrepresentation_model = TextGeneration(generator)\n
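        A prompt can also include the [DOCUMENTS] tag, and generation options can be forwarded through pipeline_kwargs; the sketch below assumes the gpt2 pipeline from above, with max_new_tokens chosen purely as an illustrative setting:

        from bertopic.representation import TextGeneration\nfrom transformers import pipeline\n\n# Sketch: insert both keywords and representative documents into the prompt\nprompt = \"Keywords: [KEYWORDS]. Documents: [DOCUMENTS]. What is this topic about?\"\ngenerator = pipeline('text-generation', model='gpt2')\nrepresentation_model = TextGeneration(generator, prompt=prompt, nr_docs=4, pipeline_kwargs={\"max_new_tokens\": 20})\n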
        Source code in bertopic\\representation\\_textgeneration.py
        class TextGeneration(BaseRepresentation):\n    \"\"\" Text2Text or text generation with transformers\n\n    Arguments:\n        model: A transformers pipeline that should be initialized as \"text-generation\"\n               for gpt-like models or \"text2text-generation\" for T5-like models.\n               For example, `pipeline('text-generation', model='gpt2')`. If a string\n               is passed, \"text-generation\" will be selected by default.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                NOTE: Use `\"[KEYWORDS]\"` and `\"[DOCUMENTS]\"` in the prompt\n                to decide where the keywords and documents need to be\n                inserted.\n        pipeline_kwargs: Kwargs that you can pass to the transformers.pipeline\n                         when it is called.\n        random_state: A random state to be passed to `transformers.set_seed`\n        nr_docs: The number of documents to pass to OpenAI if a prompt\n                 with the `[\"DOCUMENTS\"]` tag is used.\n        diversity: The diversity of documents to pass to OpenAI.\n                   Accepts values between 0 and 1. A higher\n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document.\n                       * If tokenizer is 'char', then the document is split up\n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and trunctated depending on `doc_length`\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. These tokens are counted and truncated depending\n                         on `doc_length`\n\n    Usage:\n\n    To use a gpt-like model:\n\n    ```python\n    from bertopic.representation import TextGeneration\n    from bertopic import BERTopic\n\n    # Create your representation model\n    generator = pipeline('text-generation', model='gpt2')\n    representation_model = TextGeneration(generator)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTo pic(representation_model=representation_model)\n    ```\n\n    You can use a custom prompt and decide where the keywords should\n    be inserted by using the `[KEYWORDS]` or documents with thte `[DOCUMENTS]` tag:\n\n    ```python\n    from bertopic.representation import TextGeneration\n\n    prompt = \"I have a topic described by the following keywords: [KEYWORDS]. 
Based on the previous keywords, what is this topic about?\"\"\n\n    # Create your representation model\n    generator = pipeline('text2text-generation', model='google/flan-t5-base')\n    representation_model = TextGeneration(generator)\n    ```\n    \"\"\"\n    def __init__(self,\n                 model: Union[str, pipeline],\n                 prompt: str = None,\n                 pipeline_kwargs: Mapping[str, Any] = {},\n                 random_state: int = 42,\n                 nr_docs: int = 4,\n                 diversity: float = None,\n                 doc_length: int = None,\n                 tokenizer: Union[str, Callable] = None\n                 ):\n        self.random_state = random_state\n        set_seed(random_state)\n        if isinstance(model, str):\n            self.model = pipeline(\"text-generation\", model=model)\n        elif isinstance(model, Pipeline):\n            self.model = model\n        else:\n            raise ValueError(\"Make sure that the HF model that you\"\n                             \"pass is either a string referring to a\"\n                             \"HF model or a `transformers.pipeline` object.\")\n        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT\n        self.default_prompt_ = DEFAULT_PROMPT\n        self.pipeline_kwargs = pipeline_kwargs\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n\n        self.prompts_ = []\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topic representations and return a single label\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top 4 representative documents per topic\n        if self.prompt != DEFAULT_PROMPT and \"[DOCUMENTS]\" in self.prompt:\n            repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n                c_tf_idf,\n                documents,\n                topics,\n                500,\n                self.nr_docs,\n                self.diversity\n            )\n        else:\n            repr_docs_mappings = {topic: None for topic in topics.keys()}\n\n        updated_topics = {}\n        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n\n            # Prepare prompt\n            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] if docs is not None else docs\n            prompt = self._create_prompt(truncated_docs, topic, topics)\n            self.prompts_.append(prompt)\n\n            # Extract result from generator and use that as label\n            topic_description = self.model(prompt, **self.pipeline_kwargs)\n            topic_description = [(description[\"generated_text\"].replace(prompt, \"\"), 1) for description in topic_description]\n\n            if len(topic_description) < 10:\n                topic_description += [(\"\", 0) for _ in range(10-len(topic_description))]\n\n            updated_topics[topic] = 
topic_description\n\n        return updated_topics\n\n    def _create_prompt(self, docs, topic, topics):\n        keywords = \", \".join(list(zip(*topics[topic]))[0])\n\n        # Use the default prompt and replace keywords\n        if self.prompt == DEFAULT_PROMPT:\n            prompt = self.prompt.replace(\"[KEYWORDS]\", keywords)\n\n        # Use a prompt that leverages either keywords or documents in\n        # a custom location\n        else:\n            prompt = self.prompt\n            if \"[KEYWORDS]\" in prompt:\n                prompt = prompt.replace(\"[KEYWORDS]\", keywords)\n            if \"[DOCUMENTS]\" in prompt:\n                to_replace = \"\"\n                for doc in docs:\n                    to_replace += f\"- {doc}\\n\"\n                prompt = prompt.replace(\"[DOCUMENTS]\", to_replace)\n\n        return prompt\n
        "},{"location":"api/representation/generation.html#bertopic.representation._textgeneration.TextGeneration.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topic representations and return a single label

        Parameters:

        Name Type Description Default topic_model

        A BERTopic model

        required documents DataFrame

        Not used

        required c_tf_idf csr_matrix

        Not used

        required topics Mapping[str, List[Tuple[str, float]]]

        The candidate topics as calculated with c-TF-IDF

        required

        Returns:

        Type Description updated_topics

        Updated topic representations

        Source code in bertopic\\representation\\_textgeneration.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topic representations and return a single label\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top 4 representative documents per topic\n    if self.prompt != DEFAULT_PROMPT and \"[DOCUMENTS]\" in self.prompt:\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n            c_tf_idf,\n            documents,\n            topics,\n            500,\n            self.nr_docs,\n            self.diversity\n        )\n    else:\n        repr_docs_mappings = {topic: None for topic in topics.keys()}\n\n    updated_topics = {}\n    for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n\n        # Prepare prompt\n        truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs] if docs is not None else docs\n        prompt = self._create_prompt(truncated_docs, topic, topics)\n        self.prompts_.append(prompt)\n\n        # Extract result from generator and use that as label\n        topic_description = self.model(prompt, **self.pipeline_kwargs)\n        topic_description = [(description[\"generated_text\"].replace(prompt, \"\"), 1) for description in topic_description]\n\n        if len(topic_description) < 10:\n            topic_description += [(\"\", 0) for _ in range(10-len(topic_description))]\n\n        updated_topics[topic] = topic_description\n\n    return updated_topics\n
        "},{"location":"api/representation/keybert.html","title":"KeyBERTInspired","text":"Source code in bertopic\\representation\\_keybert.py
        class KeyBERTInspired(BaseRepresentation):\n    def __init__(self,\n                 top_n_words: int = 10,\n                 nr_repr_docs: int = 5,\n                 nr_samples: int = 500,\n                 nr_candidate_words: int = 100,\n                 random_state: int = 42):\n        \"\"\" Use a KeyBERT-like model to fine-tune the topic representations\n\n        The algorithm follows KeyBERT but does some optimization in\n        order to speed up inference.\n\n        The steps are as follows. First, we extract the top n representative\n        documents per topic. To extract the representative documents, we\n        randomly sample a number of candidate documents per cluster\n        which is controlled by the `nr_samples` parameter. Then,\n        the top n representative documents  are extracted by calculating\n        the c-TF-IDF representation for the  candidate documents and finding,\n        through cosine similarity, which are closest to the topic c-TF-IDF representation.\n        Next, the top n words per topic are extracted based on their\n        c-TF-IDF representation, which is controlled by the `nr_repr_docs`\n        parameter.\n\n        Then, we extract the embeddings for words and representative documents\n        and create topic embeddings by averaging the representative documents.\n        Finally, the most similar words to each topic are extracted by\n        calculating the cosine similarity between word and topic embeddings.\n\n        Arguments:\n            top_n_words: The top n words to extract per topic.\n            nr_repr_docs: The number of representative documents to extract per cluster.\n            nr_samples: The number of candidate documents to extract per cluster.\n            nr_candidate_words: The number of candidate words per cluster.\n            random_state: The random state for randomly sampling candidate documents.\n\n        Usage:\n\n        ```python\n        from bertopic.representation import KeyBERTInspired\n        from bertopic import BERTopic\n\n        # Create your representation model\n        representation_model = KeyBERTInspired()\n\n        # Use the representation model in BERTopic on top of the default pipeline\n        topic_model = BERTopic(representation_model=representation_model)\n        ```\n        \"\"\"\n        self.top_n_words = top_n_words\n        self.nr_repr_docs = nr_repr_docs\n        self.nr_samples = nr_samples\n        self.nr_candidate_words = nr_candidate_words\n        self.random_state = random_state\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # We extract the top n representative documents per class\n        _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs)\n\n        # We extract the top n words per class\n        topics = 
self._extract_candidate_words(topic_model, c_tf_idf, topics)\n\n        # We calculate the similarity between word and document embeddings and create\n        # topic embeddings from the representative document embeddings\n        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)\n\n        # Find the best matching words based on the similarity matrix for each topic\n        updated_topics = self._extract_top_words(words, topics, sim_matrix)\n\n        return updated_topics\n\n    def _extract_candidate_words(self,\n                                 topic_model,\n                                 c_tf_idf: csr_matrix,\n                                 topics: Mapping[str, List[Tuple[str, float]]]\n                                 ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" For each topic, extract candidate words based on the c-TF-IDF\n        representation.\n\n        Arguments:\n            topic_model: A BERTopic model\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The top words per topic\n\n        Returns:\n            topics: The `self.top_n_words` per topic\n        \"\"\"\n        labels = [int(label) for label in sorted(list(topics.keys()))]\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = topic_model.vectorizer_model.get_feature_names_out()\n        else:\n            words = topic_model.vectorizer_model.get_feature_names()\n\n        indices = topic_model._top_n_idx_sparse(c_tf_idf, self.nr_candidate_words)\n        scores = topic_model._top_n_values_sparse(c_tf_idf, indices)\n        sorted_indices = np.argsort(scores, 1)\n        indices = np.take_along_axis(indices, sorted_indices, axis=1)\n        scores = np.take_along_axis(scores, sorted_indices, axis=1)\n\n        # Get top 30 words per topic based on c-TF-IDF score\n        topics = {label: [(words[word_index], score)\n                          if word_index is not None and score > 0\n                          else (\"\", 0.00001)\n                          for word_index, score in zip(indices[index][::-1], scores[index][::-1])\n                          ]\n                  for index, label in enumerate(labels)}\n        topics = {label: list(zip(*values[:self.nr_candidate_words]))[0] for label, values in topics.items()}\n\n        return topics\n\n    def _extract_embeddings(self,\n                            topic_model,\n                            topics: Mapping[str, List[Tuple[str, float]]],\n                            representative_docs: List[str],\n                            repr_doc_indices: List[List[int]]\n                            ) -> Union[np.ndarray, List[str]]:\n        \"\"\" Extract the representative document embeddings and create topic embeddings.\n        Then extract word embeddings and calculate the cosine similarity between topic\n        embeddings and the word embeddings. 
Topic embeddings are the average of\n        representative document embeddings.\n\n        Arguments:\n            topic_model: A BERTopic model\n            topics: The top words per topic\n            representative_docs: A flat list of representative documents\n            repr_doc_indices: The indices of representative documents\n                              that belong to each topic\n\n        Returns:\n            sim: The similarity matrix between word and topic embeddings\n            vocab: The complete vocabulary of input documents\n        \"\"\"\n        # Calculate representative docs embeddings and create topic embeddings\n        repr_embeddings = topic_model._extract_embeddings(representative_docs, method=\"document\", verbose=False)\n        topic_embeddings = [np.mean(repr_embeddings[i[0]:i[-1]+1], axis=0) for i in repr_doc_indices]\n\n        # Calculate word embeddings and extract best matching with updated topic_embeddings\n        vocab = list(set([word for words in topics.values() for word in words]))\n        word_embeddings = topic_model._extract_embeddings(vocab, method=\"document\", verbose=False)\n        sim = cosine_similarity(topic_embeddings, word_embeddings)\n\n        return sim, vocab\n\n    def _extract_top_words(self,\n                           vocab: List[str],\n                           topics: Mapping[str, List[Tuple[str, float]]],\n                           sim: np.ndarray\n                           ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract the top n words per topic based on the\n        similarity matrix between topics and words.\n\n        Arguments:\n            vocab: The complete vocabulary of input documents\n            labels: All topic labels\n            topics: The top words per topic\n            sim: The similarity matrix between word and topic embeddings\n\n        Returns:\n            updated_topics: The updated topic representations\n        \"\"\"\n        labels = [int(label) for label in sorted(list(topics.keys()))]\n        updated_topics = {}\n        for i, topic in enumerate(labels):\n            indices = [vocab.index(word) for word in topics[topic]]\n            values = sim[:, indices][i]\n            word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words:]]\n            updated_topics[topic] = [(vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words:], word_indices)][::-1]\n\n        return updated_topics\n
        "},{"location":"api/representation/keybert.html#bertopic.representation._keybert.KeyBERTInspired.__init__","title":"__init__(self, top_n_words=10, nr_repr_docs=5, nr_samples=500, nr_candidate_words=100, random_state=42) special","text":"

        Use a KeyBERT-like model to fine-tune the topic representations

        The algorithm follows KeyBERT but does some optimization in order to speed up inference.

        The steps are as follows. First, we extract the top n representative documents per topic. To extract the representative documents, we randomly sample a number of candidate documents per cluster, controlled by the nr_samples parameter. Then, the top n representative documents, controlled by the nr_repr_docs parameter, are extracted by calculating the c-TF-IDF representation of the candidate documents and finding, through cosine similarity, which are closest to the topic c-TF-IDF representation. Next, the top candidate words per topic are extracted based on their c-TF-IDF representation, controlled by the nr_candidate_words parameter.

        Then, we extract the embeddings for words and representative documents and create topic embeddings by averaging the representative documents. Finally, the most similar words to each topic are extracted by calculating the cosine similarity between word and topic embeddings.
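        As a rough illustration of that last step (not the library's internal code), the snippet below ranks candidate words for a single topic by cosine similarity between word embeddings and a topic embedding built from representative document embeddings; the embedding model and inputs are placeholders:

        import numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom sentence_transformers import SentenceTransformer\n\n# Placeholders: representative documents and candidate words for one topic\nrepr_docs = [\"first representative document ...\", \"second representative document ...\"]\ncandidate_words = [\"word_a\", \"word_b\", \"word_c\"]\n\nmodel = SentenceTransformer(\"all-MiniLM-L6-v2\")\n\n# Topic embedding = average of the representative document embeddings\ntopic_embedding = model.encode(repr_docs).mean(axis=0, keepdims=True)\nword_embeddings = model.encode(candidate_words)\n\n# Rank candidate words by cosine similarity to the topic embedding\nsims = cosine_similarity(topic_embedding, word_embeddings)[0]\ntop_words = [candidate_words[i] for i in np.argsort(sims)[::-1]]\n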

        Parameters:

        Name Type Description Default top_n_words int

        The top n words to extract per topic.

        10 nr_repr_docs int

        The number of representative documents to extract per cluster.

        5 nr_samples int

        The number of candidate documents to extract per cluster.

        500 nr_candidate_words int

        The number of candidate words per cluster.

        100 random_state int

        The random state for randomly sampling candidate documents.

        42

        Usage:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
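        The parameters listed above can be tuned in the same way; a brief sketch with values chosen only for illustration:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Illustrative, non-default settings: consider more candidate words and documents per topic\nrepresentation_model = KeyBERTInspired(top_n_words=15, nr_repr_docs=10, nr_samples=1000, nr_candidate_words=200)\ntopic_model = BERTopic(representation_model=representation_model)\n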
        Source code in bertopic\\representation\\_keybert.py
        def __init__(self,\n             top_n_words: int = 10,\n             nr_repr_docs: int = 5,\n             nr_samples: int = 500,\n             nr_candidate_words: int = 100,\n             random_state: int = 42):\n    \"\"\" Use a KeyBERT-like model to fine-tune the topic representations\n\n    The algorithm follows KeyBERT but does some optimization in\n    order to speed up inference.\n\n    The steps are as follows. First, we extract the top n representative\n    documents per topic. To extract the representative documents, we\n    randomly sample a number of candidate documents per cluster\n    which is controlled by the `nr_samples` parameter. Then,\n    the top n representative documents  are extracted by calculating\n    the c-TF-IDF representation for the  candidate documents and finding,\n    through cosine similarity, which are closest to the topic c-TF-IDF representation.\n    Next, the top n words per topic are extracted based on their\n    c-TF-IDF representation, which is controlled by the `nr_repr_docs`\n    parameter.\n\n    Then, we extract the embeddings for words and representative documents\n    and create topic embeddings by averaging the representative documents.\n    Finally, the most similar words to each topic are extracted by\n    calculating the cosine similarity between word and topic embeddings.\n\n    Arguments:\n        top_n_words: The top n words to extract per topic.\n        nr_repr_docs: The number of representative documents to extract per cluster.\n        nr_samples: The number of candidate documents to extract per cluster.\n        nr_candidate_words: The number of candidate words per cluster.\n        random_state: The random state for randomly sampling candidate documents.\n\n    Usage:\n\n    ```python\n    from bertopic.representation import KeyBERTInspired\n    from bertopic import BERTopic\n\n    # Create your representation model\n    representation_model = KeyBERTInspired()\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n    \"\"\"\n    self.top_n_words = top_n_words\n    self.nr_repr_docs = nr_repr_docs\n    self.nr_samples = nr_samples\n    self.nr_candidate_words = nr_candidate_words\n    self.random_state = random_state\n
        "},{"location":"api/representation/keybert.html#bertopic.representation._keybert.KeyBERTInspired.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics

        Parameters:

        Name Type Description Default topic_model

        A BERTopic model

        required documents DataFrame

        All input documents

        required c_tf_idf csr_matrix

        The topic c-TF-IDF representation

        required topics Mapping[str, List[Tuple[str, float]]]

        The candidate topics as calculated with c-TF-IDF

        required

        Returns:

        Type Description updated_topics

        Updated topic representations

        Source code in bertopic\\representation\\_keybert.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topics\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: The topic c-TF-IDF representation\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # We extract the top n representative documents per class\n    _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs)\n\n    # We extract the top n words per class\n    topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)\n\n    # We calculate the similarity between word and document embeddings and create\n    # topic embeddings from the representative document embeddings\n    sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)\n\n    # Find the best matching words based on the similarity matrix for each topic\n    updated_topics = self._extract_top_words(words, topics, sim_matrix)\n\n    return updated_topics\n
        "},{"location":"api/representation/langchain.html","title":"LangChain","text":"

        Using chains in langchain to generate topic labels.

        The classic example uses langchain.chains.question_answering.load_qa_chain. This returns a chain that takes a list of documents and a question as input.

        You can also use Runnables such as those composed using the LangChain Expression Language.

        Parameters:

        Name Type Description Default chain

        The langchain chain or Runnable with a batch method. Input keys must be input_documents and question. Output key must be output_text.

        required prompt str

        The prompt to be used in the model. If no prompt is given, self.default_prompt_ is used instead. NOTE: Use \"[KEYWORDS]\" in the prompt to decide where the keywords need to be inserted. Keywords won't be included unless indicated. Unlike other representation models, Langchain does not use the \"[DOCUMENTS]\" tag to insert documents into the prompt. The load_qa_chain function formats the representative documents within the prompt.

        None nr_docs int

        The number of documents to pass to LangChain

        4 diversity float

        The diversity of documents to pass to LangChain. Accepts values between 0 and 1. A higher value results in passing more diverse documents, whereas lower values pass more similar documents.

        None doc_length int

        The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.

        None tokenizer Union[str, Callable]

        The tokenizer used to split the document into segments whose length is counted against doc_length. * If tokenizer is 'char', the document is split into characters, which are counted to adhere to doc_length * If tokenizer is 'whitespace', the document is split into words separated by whitespace; these words are counted and truncated depending on doc_length * If tokenizer is 'vectorizer', the internal CountVectorizer is used to tokenize the document; these tokens are counted and truncated depending on doc_length and are decoded with whitespaces * If tokenizer is a callable, that callable is used to tokenize the document; these tokens are counted and truncated depending on doc_length

        None chain_config

        The configuration for the langchain chain. Can be used to set options like max_concurrency to avoid rate limiting errors.

        None

        Usage:

        To use this, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, like openai:

        pip install langchain
        pip install openai

        Then, you can create your chain as follows:

        from langchain.chains.question_answering import load_qa_chain\nfrom langchain.llms import OpenAI\nchain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type=\"stuff\")\n

        Finally, you can pass the chain to BERTopic as follows:

        from bertopic.representation import LangChain\n\n# Create your representation model\nrepresentation_model = LangChain(chain)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can also use a custom prompt:

        prompt = \"What are these documents about? Please give a single label.\"\nrepresentation_model = LangChain(chain, prompt=prompt)\n

        You can also use a Runnable instead of a chain. The example below uses the LangChain Expression Language:

        from bertopic.representation import LangChain\nfrom langchain.chains.question_answering import load_qa_chain\nfrom langchain.chat_models import ChatAnthropic\nfrom langchain.schema.document import Document\nfrom langchain.schema.runnable import RunnablePassthrough\nfrom langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer\n\nprompt = ...\nllm = ...\n\n# We will construct a special privacy-preserving chain using Microsoft Presidio\n\npii_handler = PresidioReversibleAnonymizer(analyzed_fields=[\"PERSON\"])\n\nchain = (\n    {\n        \"input_documents\": (\n            lambda inp: [\n                Document(\n                    page_content=pii_handler.anonymize(\n                        d.page_content,\n                        language=\"en\",\n                    ),\n                )\n                for d in inp[\"input_documents\"]\n            ]\n        ),\n        \"question\": RunnablePassthrough(),\n    }\n    | load_qa_chain(llm, chain_type=\"stuff\")\n    | (lambda output: {\"output_text\": pii_handler.deanonymize(output[\"output_text\"])})\n)\n\nrepresentation_model = LangChain(chain, prompt=prompt)\n
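        Finally, the chain_config and prompt arguments described above can be combined; in the sketch below, chain is the chain created earlier and the max_concurrency value is only an example, not a recommended setting:

        from bertopic.representation import LangChain\n\n# A custom prompt that also injects the topic keywords via the [KEYWORDS] tag\nprompt = \"The documents use these keywords: [KEYWORDS]. What are these documents about? Please give a single label.\"\n\n# Limit concurrent requests to the underlying LLM to avoid rate limiting errors\nrepresentation_model = LangChain(chain, prompt=prompt, chain_config={\"max_concurrency\": 5})\n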
        Source code in bertopic\\representation\\_langchain.py
        class LangChain(BaseRepresentation):\n    \"\"\" Using chains in langchain to generate topic labels.\n\n    The classic example uses `langchain.chains.question_answering.load_qa_chain`.\n    This returns a chain that takes a list of documents and a question as input.\n\n    You can also use Runnables such as those composed using the LangChain Expression Language.\n\n    Arguments:\n        chain: The langchain chain or Runnable with a `batch` method.\n               Input keys must be `input_documents` and `question`.\n               Output key must be `output_text`.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                 NOTE: Use `\"[KEYWORDS]\"` in the prompt\n                 to decide where the keywords need to be\n                 inserted. Keywords won't be included unless\n                 indicated. Unlike other representation models,\n                 Langchain does not use the `\"[DOCUMENTS]\"` tag\n                 to insert documents into the prompt. The load_qa_chain function\n                 formats the representative documents within the prompt.\n        nr_docs: The number of documents to pass to LangChain\n        diversity: The diversity of documents to pass to LangChain.\n                   Accepts values between 0 and 1. A higher \n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document. \n                       * If tokenizer is 'char', then the document is split up \n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and trunctated depending on `doc_length`. They are decoded with \n                         whitespaces.\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. These tokens are counted and truncated depending\n                         on `doc_length`\n        chain_config: The configuration for the langchain chain. 
Can be used to set options\n                      like max_concurrency to avoid rate limiting errors.\n    Usage:\n\n    To use this, you will need to install the langchain package first.\n    Additionally, you will need an underlying LLM to support langchain,\n    like openai:\n\n    `pip install langchain`\n    `pip install openai`\n\n    Then, you can create your chain as follows:\n\n    ```python\n    from langchain.chains.question_answering import load_qa_chain\n    from langchain.llms import OpenAI\n    chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type=\"stuff\")\n    ```\n\n    Finally, you can pass the chain to BERTopic as follows:\n\n    ```python\n    from bertopic.representation import LangChain\n\n    # Create your representation model\n    representation_model = LangChain(chain)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can also use a custom prompt:\n\n    ```python\n    prompt = \"What are these documents about? Please give a single label.\"\n    representation_model = LangChain(chain, prompt=prompt)\n    ```\n\n    You can also use a Runnable instead of a chain.\n    The example below uses the LangChain Expression Language:\n\n    ```python\n    from bertopic.representation import LangChain\n    from langchain.chains.question_answering import load_qa_chain\n    from langchain.chat_models import ChatAnthropic\n    from langchain.schema.document import Document\n    from langchain.schema.runnable import RunnablePassthrough\n    from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer\n\n    prompt = ...\n    llm = ...\n\n    # We will construct a special privacy-preserving chain using Microsoft Presidio\n\n    pii_handler = PresidioReversibleAnonymizer(analyzed_fields=[\"PERSON\"])\n\n    chain = (\n        {\n            \"input_documents\": (\n                lambda inp: [\n                    Document(\n                        page_content=pii_handler.anonymize(\n                            d.page_content,\n                            language=\"en\",\n                        ),\n                    )\n                    for d in inp[\"input_documents\"]\n                ]\n            ),\n            \"question\": RunnablePassthrough(),\n        }\n        | load_qa_chain(representation_llm, chain_type=\"stuff\")\n        | (lambda output: {\"output_text\": pii_handler.deanonymize(output[\"output_text\"])})\n    )\n\n    representation_model = LangChain(chain, prompt=representation_prompt)\n    ```\n    \"\"\"\n    def __init__(self,\n                 chain,\n                 prompt: str = None,\n                 nr_docs: int = 4,\n                 diversity: float = None,\n                 doc_length: int = None,\n                 tokenizer: Union[str, Callable] = None,\n                 chain_config = None,\n                 ):\n        self.chain = chain\n        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT\n        self.default_prompt_ = DEFAULT_PROMPT\n        self.chain_config = chain_config\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, 
List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, int]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top 4 representative documents per topic\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n            c_tf_idf=c_tf_idf,\n            documents=documents,\n            topics=topics,\n            nr_samples=500,\n            nr_repr_docs=self.nr_docs,\n            diversity=self.diversity\n        )\n\n        # Generate label using langchain's batch functionality\n        chain_docs: List[List[Document]] = [\n            [\n                Document(\n                    page_content=truncate_document(\n                        topic_model,\n                        self.doc_length,\n                        self.tokenizer,\n                        doc\n                    )\n                )\n                for doc in docs\n            ]\n            for docs in repr_docs_mappings.values()\n        ]\n\n        # `self.chain` must take `input_documents` and `question` as input keys\n        # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS]\n        if \"[KEYWORDS]\" in self.prompt:\n            prompts = []\n            for topic in topics:\n                keywords = list(zip(*topics[topic]))[0]\n                prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n                prompts.append(prompt)\n\n            inputs = [\n                {\"input_documents\": docs, \"question\": prompt}\n                for docs, prompt in zip(chain_docs, prompts)\n            ]\n\n        else:\n            inputs = [\n                {\"input_documents\": docs, \"question\": self.prompt}\n                for docs in chain_docs\n            ]\n\n        # `self.chain` must return a dict with an `output_text` key\n        # same output key as the `StuffDocumentsChain` returned by `load_qa_chain`\n        outputs = self.chain.batch(inputs=inputs, config=self.chain_config)\n        labels = [output[\"output_text\"].strip() for output in outputs]\n\n        updated_topics = {\n            topic: [(label, 1)] + [(\"\", 0) for _ in range(9)]\n            for topic, label in zip(repr_docs_mappings.keys(), labels)\n        }\n\n        return updated_topics\n
        "},{"location":"api/representation/langchain.html#bertopic.representation._langchain.LangChain.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics

        Parameters:

        Name Type Description Default topic_model

        A BERTopic model

        required documents DataFrame

        All input documents

        required c_tf_idf csr_matrix

        The topic c-TF-IDF representation

        required topics Mapping[str, List[Tuple[str, float]]]

        The candidate topics as calculated with c-TF-IDF

        required

        Returns:

        Type Description updated_topics

        Updated topic representations

        Source code in bertopic\\representation\\_langchain.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, int]]]:\n    \"\"\" Extract topics\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: The topic c-TF-IDF representation\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top 4 representative documents per topic\n    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n        c_tf_idf=c_tf_idf,\n        documents=documents,\n        topics=topics,\n        nr_samples=500,\n        nr_repr_docs=self.nr_docs,\n        diversity=self.diversity\n    )\n\n    # Generate label using langchain's batch functionality\n    chain_docs: List[List[Document]] = [\n        [\n            Document(\n                page_content=truncate_document(\n                    topic_model,\n                    self.doc_length,\n                    self.tokenizer,\n                    doc\n                )\n            )\n            for doc in docs\n        ]\n        for docs in repr_docs_mappings.values()\n    ]\n\n    # `self.chain` must take `input_documents` and `question` as input keys\n    # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS]\n    if \"[KEYWORDS]\" in self.prompt:\n        prompts = []\n        for topic in topics:\n            keywords = list(zip(*topics[topic]))[0]\n            prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            prompts.append(prompt)\n\n        inputs = [\n            {\"input_documents\": docs, \"question\": prompt}\n            for docs, prompt in zip(chain_docs, prompts)\n        ]\n\n    else:\n        inputs = [\n            {\"input_documents\": docs, \"question\": self.prompt}\n            for docs in chain_docs\n        ]\n\n    # `self.chain` must return a dict with an `output_text` key\n    # same output key as the `StuffDocumentsChain` returned by `load_qa_chain`\n    outputs = self.chain.batch(inputs=inputs, config=self.chain_config)\n    labels = [output[\"output_text\"].strip() for output in outputs]\n\n    updated_topics = {\n        topic: [(label, 1)] + [(\"\", 0) for _ in range(9)]\n        for topic, label in zip(repr_docs_mappings.keys(), labels)\n    }\n\n    return updated_topics\n
        "},{"location":"api/representation/mmr.html","title":"MaximalMarginalRelevance","text":"

        Calculate Maximal Marginal Relevance (MMR) between candidate keywords and the document.

        MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that maximizes their diversity with respect to the document.

        Parameters:

        Name Type Description Default diversity float

        How diverse the selected keywords/keyphrases are. Values range between 0 and 1, with 0 being not diverse at all and 1 being most diverse.

        0.1 top_n_words int

        The number of keywords/keyphrases to return.

        10

        Usage:

        from bertopic.representation import MaximalMarginalRelevance\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = MaximalMarginalRelevance(diversity=0.3)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
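        To make the keyword selection step concrete, here is a minimal, self-contained sketch of an MMR loop over candidate keyword embeddings; it illustrates the general technique, not the library's internal implementation:

        import numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\n\ndef mmr_sketch(doc_embedding, word_embeddings, words, diversity=0.1, top_n=10):\n    # Similarity of each keyword to the document and to every other keyword\n    word_doc_sim = cosine_similarity(word_embeddings, doc_embedding)\n    word_word_sim = cosine_similarity(word_embeddings)\n\n    # Start with the keyword most similar to the document\n    selected = [int(np.argmax(word_doc_sim))]\n    candidates = [i for i in range(len(words)) if i not in selected]\n\n    for _ in range(min(top_n, len(words)) - 1):\n        # Trade off relevance to the document against redundancy with already selected keywords\n        relevance = word_doc_sim[candidates, :].reshape(-1)\n        redundancy = np.max(word_word_sim[candidates][:, selected], axis=1)\n        mmr_scores = (1 - diversity) * relevance - diversity * redundancy\n\n        best = candidates[int(np.argmax(mmr_scores))]\n        selected.append(best)\n        candidates.remove(best)\n\n    return [words[i] for i in selected]\n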
        Source code in bertopic\\representation\\_mmr.py
        class MaximalMarginalRelevance(BaseRepresentation):\n    \"\"\" Calculate Maximal Marginal Relevance (MMR)\n    between candidate keywords and the document.\n\n    MMR considers the similarity of keywords/keyphrases with the\n    document, along with the similarity of already selected\n    keywords and keyphrases. This results in a selection of keywords\n    that maximize their within diversity with respect to the document.\n\n    Arguments:\n        diversity: How diverse the select keywords/keyphrases are.\n                    Values range between 0 and 1 with 0 being not diverse at all\n                    and 1 being most diverse.\n        top_n_words: The number of keywords/keyhprases to return\n\n    Usage:\n\n    ```python\n    from bertopic.representation import MaximalMarginalRelevance\n    from bertopic import BERTopic\n\n    # Create your representation model\n    representation_model = MaximalMarginalRelevance(diversity=0.3)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n    \"\"\"\n    def __init__(self, diversity: float = 0.1, top_n_words: int = 10):\n        self.diversity = diversity\n        self.top_n_words = top_n_words\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topic representations\n\n        Arguments:\n            topic_model: The BERTopic model\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n\n        if topic_model.embedding_model is None:\n            warnings.warn(\"MaximalMarginalRelevance can only be used BERTopic was instantiated\"\n                          \"with the `embedding_model` parameter.\")\n            return topics\n\n        updated_topics = {}\n        for topic, topic_words in topics.items():\n            words = [word[0] for word in topic_words]\n            word_embeddings = topic_model._extract_embeddings(words, method=\"word\", verbose=False)\n            topic_embedding = topic_model._extract_embeddings(\" \".join(words), method=\"word\", verbose=False).reshape(1, -1)\n            topic_words = mmr(topic_embedding, word_embeddings, words, self.diversity, self.top_n_words)\n            updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words]\n        return updated_topics\n
        "},{"location":"api/representation/mmr.html#bertopic.representation._mmr.MaximalMarginalRelevance.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topic representations

        Parameters:

        Name Type Description Default topic_model

        The BERTopic model

        required documents DataFrame

        Not used

        required c_tf_idf csr_matrix

        Not used

        required topics Mapping[str, List[Tuple[str, float]]]

        The candidate topics as calculated with c-TF-IDF

        required

        Returns:

        Type Description updated_topics

        Updated topic representations

        Source code in bertopic\\representation\\_mmr.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topic representations\n\n    Arguments:\n        topic_model: The BERTopic model\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n\n    if topic_model.embedding_model is None:\n        warnings.warn(\"MaximalMarginalRelevance can only be used BERTopic was instantiated\"\n                      \"with the `embedding_model` parameter.\")\n        return topics\n\n    updated_topics = {}\n    for topic, topic_words in topics.items():\n        words = [word[0] for word in topic_words]\n        word_embeddings = topic_model._extract_embeddings(words, method=\"word\", verbose=False)\n        topic_embedding = topic_model._extract_embeddings(\" \".join(words), method=\"word\", verbose=False).reshape(1, -1)\n        topic_words = mmr(topic_embedding, word_embeddings, words, self.diversity, self.top_n_words)\n        updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words]\n    return updated_topics\n
        "},{"location":"api/representation/openai.html","title":"OpenAI","text":"

        Using the OpenAI API to generate topic labels based on one of their Completion or ChatCompletion models.

The default method is `openai.Completion` if `chat=False`. The prompts will also need to follow a completion task. If you are looking for a more interactive chat, use `chat=True` with `model=gpt-3.5-turbo`.

For an overview see: https://platform.openai.com/docs/models

Parameters:

client: An `openai.OpenAI` client
model: Model to use within OpenAI, defaults to `\"text-ada-001\"`. NOTE: If a `gpt-3.5-turbo` model is used, make sure to set `chat` to True.
generator_kwargs: Kwargs passed to `openai.Completion.create` for fine-tuning the output.
prompt: The prompt to be used in the model. If no prompt is given, `self.default_prompt_` is used instead. NOTE: Use `\"[KEYWORDS]\"` and `\"[DOCUMENTS]\"` in the prompt to decide where the keywords and documents need to be inserted.
delay_in_seconds: The delay in seconds between consecutive prompts in order to prevent RateLimitErrors.
exponential_backoff: Retry requests with a random exponential backoff. A short sleep is used when a rate limit error is hit, then the request is retried. Increase the sleep length if errors are hit until 10 unsuccessful requests. If True, overrides `delay_in_seconds`.
chat: Set this to True if a GPT-3.5 model is used. See: https://platform.openai.com/docs/models/gpt-3-5
nr_docs: The number of documents to pass to OpenAI if a prompt with the `[\"DOCUMENTS\"]` tag is used.
diversity: The diversity of documents to pass to OpenAI. Accepts values between 0 and 1. A higher value results in passing more diverse documents whereas a lower value passes more similar documents.
doc_length: The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.
tokenizer: The tokenizer used to split the document into segments that are counted towards `doc_length`:

• If tokenizer is 'char', the document is split up into characters which are counted to adhere to `doc_length`.
• If tokenizer is 'whitespace', the document is split up into words separated by whitespace. These words are counted and truncated depending on `doc_length`.
• If tokenizer is 'vectorizer', the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on `doc_length`.
• If tokenizer is a callable, that callable is used to tokenize the document. These tokens are counted and truncated depending on `doc_length`.

Usage:

To use this, you will need to install the openai package first:

`pip install openai`

Then, get yourself an API key and use OpenAI's API as follows:

import openai\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Create your representation model\nclient = openai.OpenAI(api_key=MY_API_KEY)\nrepresentation_model = OpenAI(client, delay_in_seconds=5)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

You can also use a custom prompt:

prompt = \"I have the following documents: [DOCUMENTS] \\nThese documents are about the following topic: '\"\nrepresentation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)\n

If you want to use OpenAI's ChatGPT model:

representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n
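Since prompts can only hold a limited amount of text, it can help to truncate the representative documents before they are sent to OpenAI. A minimal sketch using the parameters documented above, with an illustrative limit of 100 whitespace-separated tokens per document:

# Truncate each representative document to 100 whitespace tokens (illustrative limit)\nrepresentation_model = OpenAI(client, delay_in_seconds=5, nr_docs=4, doc_length=100, tokenizer=\"whitespace\")\n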

        Source code in bertopic\\representation\\_openai.py
        class OpenAI(BaseRepresentation):\n    \"\"\" Using the OpenAI API to generate topic labels based\n    on one of their Completion of ChatCompletion models.\n\n    The default method is `openai.Completion` if `chat=False`.\n    The prompts will also need to follow a completion task. If you\n    are looking for a more interactive chats, use `chat=True`\n    with `model=gpt-3.5-turbo`.\n\n    For an overview see:\n    https://platform.openai.com/docs/models\n\n    Arguments:\n        client: A `openai.OpenAI` client\n        model: Model to use within OpenAI, defaults to `\"text-ada-001\"`.\n               NOTE: If a `gpt-3.5-turbo` model is used, make sure to set\n               `chat` to True.\n        generator_kwargs: Kwargs passed to `openai.Completion.create`\n                          for fine-tuning the output.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                NOTE: Use `\"[KEYWORDS]\"` and `\"[DOCUMENTS]\"` in the prompt\n                to decide where the keywords and documents need to be\n                inserted.\n        delay_in_seconds: The delay in seconds between consecutive prompts\n                          in order to prevent RateLimitErrors.\n        exponential_backoff: Retry requests with a random exponential backoff.\n                             A short sleep is used when a rate limit error is hit,\n                             then the requests is retried. Increase the sleep length\n                             if errors are hit until 10 unsuccesfull requests.\n                             If True, overrides `delay_in_seconds`.\n        chat: Set this to True if a GPT-3.5 model is used.\n              See: https://platform.openai.com/docs/models/gpt-3-5\n        nr_docs: The number of documents to pass to OpenAI if a prompt\n                 with the `[\"DOCUMENTS\"]` tag is used.\n        diversity: The diversity of documents to pass to OpenAI.\n                   Accepts values between 0 and 1. A higher\n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document.\n                       * If tokenizer is 'char', then the document is split up\n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and trunctated depending on `doc_length`\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. 
These tokens are counted and truncated depending\n                         on `doc_length`\n\n    Usage:\n\n    To use this, you will need to install the openai package first:\n\n    `pip install openai`\n\n    Then, get yourself an API key and use OpenAI's API as follows:\n\n    ```python\n    import openai\n    from bertopic.representation import OpenAI\n    from bertopic import BERTopic\n\n    # Create your representation model\n    client = openai.OpenAI(api_key=MY_API_KEY)\n    representation_model = OpenAI(client, delay_in_seconds=5)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can also use a custom prompt:\n\n    ```python\n    prompt = \"I have the following documents: [DOCUMENTS] \\nThese documents are about the following topic: '\"\n    representation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)\n    ```\n\n    If you want to use OpenAI's ChatGPT model:\n\n    ```python\n    representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n    ```\n    \"\"\"\n    def __init__(self,\n                 client,\n                 model: str = \"text-embedding-3-small\",\n                 prompt: str = None,\n                 generator_kwargs: Mapping[str, Any] = {},\n                 delay_in_seconds: float = None,\n                 exponential_backoff: bool = False,\n                 chat: bool = False,\n                 nr_docs: int = 4,\n                 diversity: float = None,\n                 doc_length: int = None,\n                 tokenizer: Union[str, Callable] = None\n                 ):\n        self.client = client\n        self.model = model\n\n        if prompt is None:\n            self.prompt = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT\n        else:\n            self.prompt = prompt\n\n        self.default_prompt_ = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT\n        self.delay_in_seconds = delay_in_seconds\n        self.exponential_backoff = exponential_backoff\n        self.chat = chat\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n        self.prompts_ = []\n\n        self.generator_kwargs = generator_kwargs\n        if self.generator_kwargs.get(\"model\"):\n            self.model = generator_kwargs.get(\"model\")\n            del self.generator_kwargs[\"model\"]\n        if self.generator_kwargs.get(\"prompt\"):\n            del self.generator_kwargs[\"prompt\"]\n        if not self.generator_kwargs.get(\"stop\") and not chat:\n            self.generator_kwargs[\"stop\"] = \"\\n\"\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top n representative documents per topic\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, 
documents, topics, 500, self.nr_docs, self.diversity)\n\n        # Generate using OpenAI's Language Model\n        updated_topics = {}\n        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n            prompt = self._create_prompt(truncated_docs, topic, topics)\n            self.prompts_.append(prompt)\n\n            # Delay\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n            if self.chat:\n                messages = [\n                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n                    {\"role\": \"user\", \"content\": prompt}\n                ]\n                kwargs = {\"model\": self.model, \"messages\": messages, **self.generator_kwargs}\n                if self.exponential_backoff:\n                    response = chat_completions_with_backoff(self.client, **kwargs)\n                else:\n                    response = self.client.chat.completions.create(**kwargs)\n\n                # Check whether content was actually generated\n                # Adresses #1570 for potential issues with OpenAI's content filter\n                if hasattr(response.choices[0].message, \"content\"):\n                    label = response.choices[0].message.content.strip().replace(\"topic: \", \"\")\n                else:\n                    label = \"No label returned\"\n            else:\n                if self.exponential_backoff:\n                    response = completions_with_backoff(self.client, model=self.model, prompt=prompt, **self.generator_kwargs)\n                else:\n                    response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)\n                label = response.choices[0].text.strip()\n\n            updated_topics[topic] = [(label, 1)]\n\n        return updated_topics\n\n    def _create_prompt(self, docs, topic, topics):\n        keywords = list(zip(*topics[topic]))[0]\n\n        # Use the Default Chat Prompt\n        if self.prompt == DEFAULT_CHAT_PROMPT or self.prompt == DEFAULT_PROMPT:\n            prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            prompt = self._replace_documents(prompt, docs)\n\n        # Use a custom prompt that leverages keywords, documents or both using\n        # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively\n        else:\n            prompt = self.prompt\n            if \"[KEYWORDS]\" in prompt:\n                prompt = prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            if \"[DOCUMENTS]\" in prompt:\n                prompt = self._replace_documents(prompt, docs)\n\n        return prompt\n\n    @staticmethod\n    def _replace_documents(prompt, docs):\n        to_replace = \"\"\n        for doc in docs:\n            to_replace += f\"- {doc}\\n\"\n        prompt = prompt.replace(\"[DOCUMENTS]\", to_replace)\n        return prompt\n
        "},{"location":"api/representation/openai.html#bertopic.representation._openai.OpenAI.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics

Parameters:

topic_model: A BERTopic model (required)
documents (DataFrame): All input documents (required)
c_tf_idf (csr_matrix): The topic c-TF-IDF representation (required)
topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

Returns:

updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_openai.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topics\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: The topic c-TF-IDF representation\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top n representative documents per topic\n    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity)\n\n    # Generate using OpenAI's Language Model\n    updated_topics = {}\n    for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n        truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n        prompt = self._create_prompt(truncated_docs, topic, topics)\n        self.prompts_.append(prompt)\n\n        # Delay\n        if self.delay_in_seconds:\n            time.sleep(self.delay_in_seconds)\n\n        if self.chat:\n            messages = [\n                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n                {\"role\": \"user\", \"content\": prompt}\n            ]\n            kwargs = {\"model\": self.model, \"messages\": messages, **self.generator_kwargs}\n            if self.exponential_backoff:\n                response = chat_completions_with_backoff(self.client, **kwargs)\n            else:\n                response = self.client.chat.completions.create(**kwargs)\n\n            # Check whether content was actually generated\n            # Adresses #1570 for potential issues with OpenAI's content filter\n            if hasattr(response.choices[0].message, \"content\"):\n                label = response.choices[0].message.content.strip().replace(\"topic: \", \"\")\n            else:\n                label = \"No label returned\"\n        else:\n            if self.exponential_backoff:\n                response = completions_with_backoff(self.client, model=self.model, prompt=prompt, **self.generator_kwargs)\n            else:\n                response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)\n            label = response.choices[0].text.strip()\n\n        updated_topics[topic] = [(label, 1)]\n\n    return updated_topics\n
        "},{"location":"api/representation/pos.html","title":"PartOfSpeech","text":"

        Extract Topic Keywords based on their Part-of-Speech

        DEFAULT_PATTERNS = [ [{'POS': 'ADJ'}, {'POS': 'NOUN'}], [{'POS': 'NOUN'}], [{'POS': 'ADJ'}] ]

        From candidate topics, as extracted with c-TF-IDF, find documents that contain keywords found in the candidate topics. These candidate documents then serve as the representative set of documents from which the Spacy model can extract a set of candidate keywords for each topic.

        These candidate keywords are first judged by whether they fall within the DEFAULT_PATTERNS or the user-defined pattern. Then, the resulting keywords are sorted by their respective c-TF-IDF values.

Parameters:

model (Union[str, spacy.language.Language]): The Spacy model to use (default: 'en_core_web_sm')
top_n_words (int): The top n words to extract (default: 10)
pos_patterns (List[str]): Patterns for Spacy to use. See https://spacy.io/usage/rule-based-matching (default: None)

        Usage:

        from bertopic.representation import PartOfSpeech\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can define custom POS patterns to be extracted:

        pos_patterns = [\n            [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n            [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]\n]\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\", pos_patterns=pos_patterns)\n
        Source code in bertopic\\representation\\_pos.py
        class PartOfSpeech(BaseRepresentation):\n    \"\"\" Extract Topic Keywords based on their Part-of-Speech\n\n    DEFAULT_PATTERNS = [\n                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n                [{'POS': 'NOUN'}],\n                [{'POS': 'ADJ'}]\n    ]\n\n    From candidate topics, as extracted with c-TF-IDF,\n    find documents that contain keywords found in the\n    candidate topics. These candidate documents then\n    serve as the representative set of documents from\n    which the Spacy model can extract a set of candidate\n    keywords for each topic.\n\n    These candidate keywords are first judged by whether\n    they fall within the DEFAULT_PATTERNS or the user-defined\n    pattern. Then, the resulting keywords are sorted by\n    their respective c-TF-IDF values.\n\n    Arguments:\n        model: The Spacy model to use\n        top_n_words: The top n words to extract\n        pos_patterns: Patterns for Spacy to use.\n                      See https://spacy.io/usage/rule-based-matching\n\n    Usage:\n\n    ```python\n    from bertopic.representation import PartOfSpeech\n    from bertopic import BERTopic\n\n    # Create your representation model\n    representation_model = PartOfSpeech(\"en_core_web_sm\")\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can define custom POS patterns to be extracted:\n\n    ```python\n    pos_patterns = [\n                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]\n    ]\n    representation_model = PartOfSpeech(\"en_core_web_sm\", pos_patterns=pos_patterns)\n    ```\n    \"\"\"\n    def __init__(self,\n                 model: Union[str, Language] = \"en_core_web_sm\",\n                 top_n_words: int = 10,\n                 pos_patterns: List[str] = None):\n        if isinstance(model, str):\n            self.model = spacy.load(model)\n        elif isinstance(model, Language):\n            self.model = model\n        else:\n            raise ValueError(\"Make sure that the Spacy model that you\"\n                             \"pass is either a string referring to a\"\n                             \"Spacy model or a Spacy nlp object.\")\n\n        self.top_n_words = top_n_words\n\n        if pos_patterns is None:\n            self.pos_patterns = [\n                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]\n            ]\n        else:\n            self.pos_patterns = pos_patterns\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        matcher = Matcher(self.model.vocab)\n        matcher.add(\"Pattern\", self.pos_patterns)\n\n        candidate_topics = {}\n        for topic, values in topics.items():\n            keywords = list(zip(*values))[0]\n\n            # Extract candidate documents\n            candidate_documents = []\n            for 
keyword in keywords:\n                selection = documents.loc[documents.Topic == topic, :]\n                selection = selection.loc[selection.Document.str.contains(keyword), \"Document\"]\n                if len(selection) > 0:\n                    for document in selection[:2]:\n                        candidate_documents.append(document)\n            candidate_documents = list(set(candidate_documents))\n\n            # Extract keywords\n            docs_pipeline = self.model.pipe(candidate_documents)\n            updated_keywords = []\n            for doc in docs_pipeline:\n                matches = matcher(doc)\n                for _, start, end in matches:\n                    updated_keywords.append(doc[start:end].text)\n            candidate_topics[topic] = list(set(updated_keywords))\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = list(topic_model.vectorizer_model.get_feature_names_out())\n        else:\n            words = list(topic_model.vectorizer_model.get_feature_names())\n\n        # Match updated keywords with c-TF-IDF values\n        words_lookup = dict(zip(words, range(len(words))))\n        updated_topics = {topic: [] for topic in topics.keys()}\n\n        for topic, candidate_keywords in candidate_topics.items():\n            word_indices = np.sort([words_lookup.get(keyword) for keyword in candidate_keywords if words_lookup.get(keyword)])\n            vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]\n            indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]\n            vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]\n            topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]\n            updated_topics[topic] = topic_words\n            if len(updated_topics[topic]) < self.top_n_words:\n                updated_topics[topic] += [(\"\", 0) for _ in range(self.top_n_words-len(updated_topics[topic]))]\n\n        return updated_topics\n
        "},{"location":"api/representation/pos.html#bertopic.representation._pos.PartOfSpeech.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics

Parameters:

topic_model: A BERTopic model (required)
documents (DataFrame): All input documents (required)
c_tf_idf (csr_matrix): Not used (required)
topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

Returns:

updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_pos.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topics\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    matcher = Matcher(self.model.vocab)\n    matcher.add(\"Pattern\", self.pos_patterns)\n\n    candidate_topics = {}\n    for topic, values in topics.items():\n        keywords = list(zip(*values))[0]\n\n        # Extract candidate documents\n        candidate_documents = []\n        for keyword in keywords:\n            selection = documents.loc[documents.Topic == topic, :]\n            selection = selection.loc[selection.Document.str.contains(keyword), \"Document\"]\n            if len(selection) > 0:\n                for document in selection[:2]:\n                    candidate_documents.append(document)\n        candidate_documents = list(set(candidate_documents))\n\n        # Extract keywords\n        docs_pipeline = self.model.pipe(candidate_documents)\n        updated_keywords = []\n        for doc in docs_pipeline:\n            matches = matcher(doc)\n            for _, start, end in matches:\n                updated_keywords.append(doc[start:end].text)\n        candidate_topics[topic] = list(set(updated_keywords))\n\n    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n    # and will be removed in 1.2. Please use get_feature_names_out instead.\n    if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n        words = list(topic_model.vectorizer_model.get_feature_names_out())\n    else:\n        words = list(topic_model.vectorizer_model.get_feature_names())\n\n    # Match updated keywords with c-TF-IDF values\n    words_lookup = dict(zip(words, range(len(words))))\n    updated_topics = {topic: [] for topic in topics.keys()}\n\n    for topic, candidate_keywords in candidate_topics.items():\n        word_indices = np.sort([words_lookup.get(keyword) for keyword in candidate_keywords if words_lookup.get(keyword)])\n        vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]\n        indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]\n        vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words:][::-1]\n        topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]\n        updated_topics[topic] = topic_words\n        if len(updated_topics[topic]) < self.top_n_words:\n            updated_topics[topic] += [(\"\", 0) for _ in range(self.top_n_words-len(updated_topics[topic]))]\n\n    return updated_topics\n
        "},{"location":"api/representation/zeroshot.html","title":"ZeroShotClassification","text":"

        Zero-shot Classification on topic keywords with candidate labels

Parameters:

candidate_topics (List[str]): A list of labels to assign to the topics if they exceed min_prob (required)
model (str): A transformers pipeline that should be initialized as \"zero-shot-classification\". For example, pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\") (default: 'facebook/bart-large-mnli')
pipeline_kwargs (Mapping[str, Any]): Kwargs that you can pass to the transformers.pipeline when it is called. NOTE: Use {\"multi_label\": True} to extract multiple labels for each topic. (default: {})
min_prob (float): The minimum probability to assign a candidate label to a topic (default: 0.8)

        Usage:

        from bertopic.representation import ZeroShotClassification\nfrom bertopic import BERTopic\n\n# Create your representation model\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
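If you want each topic to receive multiple candidate labels, you can pass the multi_label flag mentioned above through pipeline_kwargs. A minimal sketch, assuming the same candidate topics as above and an illustrative min_prob of 0.5:

from bertopic.representation import ZeroShotClassification\n\n# Keep every candidate label whose score exceeds min_prob (illustrative threshold)\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\", pipeline_kwargs={\"multi_label\": True}, min_prob=0.5)\n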
        Source code in bertopic\\representation\\_zeroshot.py
        class ZeroShotClassification(BaseRepresentation):\n    \"\"\" Zero-shot Classification on topic keywords with candidate labels\n\n    Arguments:\n        candidate_topics: A list of labels to assign to the topics if they\n                          exceed `min_prob`\n        model: A transformers pipeline that should be initialized as\n               \"zero-shot-classification\". For example,\n               `pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")`\n        pipeline_kwargs: Kwargs that you can pass to the transformers.pipeline\n                         when it is called. NOTE: Use `{\"multi_label\": True}`\n                         to extract multiple labels for each topic.\n        min_prob: The minimum probability to assign a candidate label to a topic\n\n    Usage:\n\n    ```python\n    from bertopic.representation import ZeroShotClassification\n    from bertopic import BERTopic\n\n    # Create your representation model\n    candidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\n    representation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n    \"\"\"\n    def __init__(self,\n                 candidate_topics: List[str],\n                 model: str = \"facebook/bart-large-mnli\",\n                 pipeline_kwargs: Mapping[str, Any] = {},\n                 min_prob: float = 0.8\n                 ):\n        self.candidate_topics = candidate_topics\n        if isinstance(model, str):\n            self.model = pipeline(\"zero-shot-classification\", model=model)\n        elif isinstance(model, Pipeline):\n            self.model = model\n        else:\n            raise ValueError(\"Make sure that the HF model that you\"\n                             \"pass is either a string referring to a\"\n                             \"HF model or a `transformers.pipeline` object.\")\n        self.pipeline_kwargs = pipeline_kwargs\n        self.min_prob = min_prob\n\n    def extract_topics(self,\n                       topic_model,\n                       documents: pd.DataFrame,\n                       c_tf_idf: csr_matrix,\n                       topics: Mapping[str, List[Tuple[str, float]]]\n                       ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: Not used\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Classify topics\n        topic_descriptions = [\" \".join(list(zip(*topics[topic]))[0]) for topic in topics.keys()]\n        classifications = self.model(topic_descriptions, self.candidate_topics, **self.pipeline_kwargs)\n\n        # Extract labels\n        updated_topics = {}\n        for topic, classification in zip(topics.keys(), classifications):\n            topic_description = topics[topic]\n\n            # Multi-label assignment\n            if self.pipeline_kwargs.get(\"multi_label\"):\n                topic_description = []\n                for label, score in zip(classification[\"labels\"], classification[\"scores\"]):\n                    if score > self.min_prob:\n                        topic_description.append((label, score))\n\n            # Single label 
assignment\n            elif classification[\"scores\"][0] > self.min_prob:\n                topic_description = [(classification[\"labels\"][0], classification[\"scores\"][0])]\n\n            # Make sure that 10 items are returned\n            if len(topic_description) == 0:\n                topic_description = topics[topic]\n            elif len(topic_description) < 10:\n                topic_description += [(\"\", 0) for _ in range(10-len(topic_description))]\n            updated_topics[topic] = topic_description\n\n        return updated_topics\n
        "},{"location":"api/representation/zeroshot.html#bertopic.representation._zeroshot.ZeroShotClassification.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics

Parameters:

topic_model: Not used (required)
documents (DataFrame): Not used (required)
c_tf_idf (csr_matrix): Not used (required)
topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

Returns:

updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_zeroshot.py
        def extract_topics(self,\n                   topic_model,\n                   documents: pd.DataFrame,\n                   c_tf_idf: csr_matrix,\n                   topics: Mapping[str, List[Tuple[str, float]]]\n                   ) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\" Extract topics\n\n    Arguments:\n        topic_model: Not used\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Classify topics\n    topic_descriptions = [\" \".join(list(zip(*topics[topic]))[0]) for topic in topics.keys()]\n    classifications = self.model(topic_descriptions, self.candidate_topics, **self.pipeline_kwargs)\n\n    # Extract labels\n    updated_topics = {}\n    for topic, classification in zip(topics.keys(), classifications):\n        topic_description = topics[topic]\n\n        # Multi-label assignment\n        if self.pipeline_kwargs.get(\"multi_label\"):\n            topic_description = []\n            for label, score in zip(classification[\"labels\"], classification[\"scores\"]):\n                if score > self.min_prob:\n                    topic_description.append((label, score))\n\n        # Single label assignment\n        elif classification[\"scores\"][0] > self.min_prob:\n            topic_description = [(classification[\"labels\"][0], classification[\"scores\"][0])]\n\n        # Make sure that 10 items are returned\n        if len(topic_description) == 0:\n            topic_description = topics[topic]\n        elif len(topic_description) < 10:\n            topic_description += [(\"\", 0) for _ in range(10-len(topic_description))]\n        updated_topics[topic] = topic_description\n\n    return updated_topics\n
        "},{"location":"getting_started/best_practices/best_practices.html","title":"Best Practices","text":"

        - Overview of Best Practices

Through the modular nature of BERTopic, many variations of the topic modeling technique are possible. However, during the development and usage of the package, a set of best practices has been developed that generally leads to great results.

The following are a number of steps, parameters, and settings that will generally improve the quality of the resulting topics. In other words, after going through the quick start and getting a feeling for the API, these steps should take you to the next level of performance.

        Note

Although these are called best practices, that does not necessarily mean they work perfectly across all use cases. The underlying modular nature of BERTopic is meant to take different use cases into account, so after going through these practices it is advised to fine-tune wherever necessary.

        To showcase how these \"best practices\" work, we will go through an example dataset and apply all practices to it.

        "},{"location":"getting_started/best_practices/best_practices.html#data","title":"Data","text":"

        For this example, we will use a dataset containing abstracts and metadata from ArXiv articles.

        from datasets import load_dataset\n\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\n\n# Extract abstracts to train on and corresponding titles\nabstracts = dataset[\"abstract\"]\ntitles = dataset[\"title\"]\n

        Sentence Splitter

        Whenever you have large documents, you typically want to split them up into either paragraphs or sentences. A nice way to do so is by using NLTK's sentence splitter which is nothing more than:

        from nltk.tokenize import sent_tokenize, word_tokenize\nsentences = [sent_tokenize(abstract) for abstract in abstracts]\nsentences = [sentence for doc in sentences for sentence in doc]\n
        "},{"location":"getting_started/best_practices/best_practices.html#pre-calculate-embeddings","title":"Pre-calculate Embeddings","text":"

        After having created our data, namely abstracts, we can dive into the very first best practice, pre-calculating embeddings.

        BERTopic works by converting documents into numerical values, called embeddings. This process can be very costly, especially if we want to iterate over parameters. Instead, we can calculate those embeddings once and feed them to BERTopic to skip calculating embeddings each time.

        from sentence_transformers import SentenceTransformer\n\n# Pre-calculate embeddings\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = embedding_model.encode(abstracts, show_progress_bar=True)\n

        Tip

New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the MTEB leaderboard. It is an excellent place for selecting the embedding model that works best for you. For example, if you want the best of the best, the top 5 models on the leaderboard might be the place to look.

        "},{"location":"getting_started/best_practices/best_practices.html#preventing-stochastic-behavior","title":"Preventing Stochastic Behavior","text":"

        In BERTopic, we generally use a dimensionality reduction algorithm to reduce the size of the embeddings. This is done to prevent the curse of dimensionality to a certain degree.

        As a default, this is done with UMAP which is an incredible algorithm for reducing dimensional space. However, by default, it shows stochastic behavior which creates different results each time you run it. To prevent that, we will need to set a random_state of the model before passing it to BERTopic.

        As a result, we can now fully reproduce the results each time we run the model.

        from umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n
        "},{"location":"getting_started/best_practices/best_practices.html#controlling-number-of-topics","title":"Controlling Number of Topics","text":"

There is a parameter to control the number of topics, namely nr_topics. This parameter, however, merges topics after they have been created; it can be used to arrive at a fixed number of topics, as sketched below.
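A minimal sketch of that parameter (the value 20 is only illustrative):

from bertopic import BERTopic\n\n# Merge topics down to roughly 20 after they have been created\ntopic_model = BERTopic(nr_topics=20)\n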

However, it is advised to control the number of topics through the cluster model, which is HDBSCAN by default. HDBSCAN has a parameter, namely min_cluster_size, that indirectly controls the number of topics that will be created.

        A higher min_cluster_size will generate fewer topics and a lower min_cluster_size will generate more topics.

        Here, we will go with min_cluster_size=150 to prevent too many micro-clusters from being created:

        from hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)\n
        "},{"location":"getting_started/best_practices/best_practices.html#improving-default-representation","title":"Improving Default Representation","text":"

        The default representation of topics is calculated through c-TF-IDF. However, c-TF-IDF is powered by the CountVectorizer which converts text into tokens. Using the CountVectorizer, we can do a number of things:

        • Remove stopwords
        • Ignore infrequent words
        • Increase the n-gram range

        In other words, we can preprocess the topic representations after documents are assigned to topics. This will not influence the clustering process in any way.

        Here, we will ignore English stopwords and infrequent words. Moreover, by increasing the n-gram range we will consider topic representations that are made up of one or two words.

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(stop_words=\"english\", min_df=2, ngram_range=(1, 2))\n
        "},{"location":"getting_started/best_practices/best_practices.html#additional-representations","title":"Additional Representations","text":"

        Previously, we have tuned the default representation but there are quite a number of other topic representations in BERTopic that we can choose from. From KeyBERTInspired and PartOfSpeech, to OpenAI's ChatGPT and open-source alternatives, many representations are possible.

        In BERTopic, you can model many different topic representations simultaneously to test them out and get different perspectives of topic descriptions. This is called multi-aspect topic modeling.

        Here, we will demonstrate a number of interesting and useful representations in BERTopic:

        • KeyBERTInspired
        • A method that derives inspiration from how KeyBERT works
        • PartOfSpeech
        • Using SpaCy's POS tagging to extract words
        • MaximalMarginalRelevance
        • Diversify the topic words
        • OpenAI
        • Use ChatGPT to label our topics
        import openai\nfrom bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech\n\n# KeyBERT\nkeybert_model = KeyBERTInspired()\n\n# Part-of-Speech\npos_model = PartOfSpeech(\"en_core_web_sm\")\n\n# MMR\nmmr_model = MaximalMarginalRelevance(diversity=0.3)\n\n# GPT-3.5\nclient = openai.OpenAI(api_key=\"sk-...\")\nprompt = \"\"\"\nI have a topic that contains the following documents: \n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:\ntopic: <topic label>\n\"\"\"\nopenai_model = OpenAI(client, model=\"gpt-3.5-turbo\", exponential_backoff=True, chat=True, prompt=prompt)\n\n# All representation models\nrepresentation_model = {\n    \"KeyBERT\": keybert_model,\n    # \"OpenAI\": openai_model,  # Uncomment if you will use OpenAI\n    \"MMR\": mmr_model,\n    \"POS\": pos_model\n}\n
        "},{"location":"getting_started/best_practices/best_practices.html#training","title":"Training","text":"

Now that we have a set of best practices, we can use them in our training loop. Here, several different representations, keywords, and labels for our topics will be created. If you want to iterate over the topic model, it is advised to use the pre-calculated embeddings as that significantly speeds up training.

        from bertopic import BERTopic\n\ntopic_model = BERTopic(\n\n  # Pipeline models\n  embedding_model=embedding_model,\n  umap_model=umap_model,\n  hdbscan_model=hdbscan_model,\n  vectorizer_model=vectorizer_model,\n  representation_model=representation_model,\n\n  # Hyperparameters\n  top_n_words=10,\n  verbose=True\n)\n\n# Train model\ntopics, probs = topic_model.fit_transform(abstracts, embeddings)\n\n# Show topics\ntopic_model.get_topic_info()\n

        To get all representations for a single topic, we simply run the following:

        >>> topic_model.get_topic(1, full=True)\n{'Main': [('adversarial', 0.028838938990764302),\n  ('attacks', 0.021726302042463556),\n  ('attack', 0.016803574415028524),\n  ('robustness', 0.013046135743326167),\n  ('adversarial examples', 0.01151254557995679),\n  ('examples', 0.009920962487998853),\n  ('perturbations', 0.009053305826870773),\n  ('adversarial attacks', 0.008747627064844006),\n  ('malware', 0.007675131707700338),\n  ('defense', 0.007365955840313783)],\n 'KeyBERT': [('adversarial training', 0.76427937),\n  ('adversarial attack', 0.74271905),\n  ('vulnerable adversarial', 0.73302543),\n  ('adversarial', 0.7311052),\n  ('adversarial examples', 0.7179245),\n  ('adversarial attacks', 0.7082),\n  ('adversarially', 0.7005141),\n  ('adversarial robustness', 0.69911957),\n  ('adversarial perturbations', 0.6588783),\n  ('adversary', 0.4467769)],\n 'OpenAI': [('Adversarial attacks and defense', 1)],\n 'MMR': [('adversarial', 0.028838938990764302),\n  ('attacks', 0.021726302042463556),\n  ('attack', 0.016803574415028524),\n  ('robustness', 0.013046135743326167),\n  ('adversarial examples', 0.01151254557995679),\n  ('examples', 0.009920962487998853),\n  ('perturbations', 0.009053305826870773),\n  ('adversarial attacks', 0.008747627064844006),\n  ('malware', 0.007675131707700338),\n  ('defense', 0.007365955840313783)],\n 'POS': [('adversarial', 0.028838938990764302),\n  ('attacks', 0.021726302042463556),\n  ('attack', 0.016803574415028524),\n  ('robustness', 0.013046135743326167),\n  ('adversarial examples', 0.01151254557995679),\n  ('examples', 0.009920962487998853),\n  ('perturbations', 0.009053305826870773),\n  ('adversarial attacks', 0.008747627064844006),\n  ('malware', 0.007675131707700338),\n  ('defense', 0.007365955840313783)]}\n

NOTE: The labels generated by OpenAI's ChatGPT are especially interesting to use throughout your model. Below, we will go into more detail on how to set them as custom labels.

        Parameters

        If you would like to return the topic-document probability matrix, then it is advised to use calculate_probabilities=True. Do note that this can significantly slow down training. To speed it up, use cuML's HDBSCAN instead. You could also approximate the topic-document probability matrix with .approximate_distribution which will be discussed later.
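As a minimal sketch, reusing the pipeline models defined earlier, this is what enabling the probability matrix looks like:

topic_model = BERTopic(\n  embedding_model=embedding_model,\n  umap_model=umap_model,\n  hdbscan_model=hdbscan_model,\n  vectorizer_model=vectorizer_model,\n  representation_model=representation_model,\n  calculate_probabilities=True,\n  verbose=True\n)\n\n# `probs` now holds, per document, a probability for every topic\ntopics, probs = topic_model.fit_transform(abstracts, embeddings)\n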

        "},{"location":"getting_started/best_practices/best_practices.html#custom-labels","title":"(Custom) Labels","text":"

The default label of each topic is made up of its top 3 words, combined with underscores between them.

        This, of course, might not be the best label that you can think of for a certain topic. Instead, we can use .set_topic_labels to manually label all or certain topics.

        We can also use .set_topic_labels to use one of the other topic representations that we had before, like KeyBERTInspired or even OpenAI.

        # Label the topics yourself\ntopic_model.set_topic_labels({1: \"Space Travel\", 7: \"Religion\"})\n\n# or use one of the other topic representations, like KeyBERTInspired\nkeybert_topic_labels = {topic: \" | \".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_[\"KeyBERT\"].items()}\ntopic_model.set_topic_labels(keybert_topic_labels)\n\n# or ChatGPT's labels\nchatgpt_topic_labels = {topic: \" | \".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_[\"OpenAI\"].items()}\nchatgpt_topic_labels[-1] = \"Outlier Topic\"\ntopic_model.set_topic_labels(chatgpt_topic_labels)\n

Now that we have set the updated topic labels, we can access them with the many functions used throughout BERTopic. Most notably, you can show the updated labels in visualizations with the custom_labels=True parameter.

If we now run topic_model.get_topic_info(), it will include the column CustomName, which contains the custom label that we just created for each topic.
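As a quick sanity check, you could inspect the default and custom labels side by side (a minimal sketch; column names as documented above):

# Compare the default topic names with the custom labels we just set\ntopic_model.get_topic_info()[[\"Topic\", \"Name\", \"CustomName\"]]\n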

        "},{"location":"getting_started/best_practices/best_practices.html#topic-document-distribution","title":"Topic-Document Distribution","text":"

        If using calculate_probabilities=True is not possible, then you can approximate the topic-document distributions using .approximate_distribution. It is a fast and flexible method for creating different topic-document distributions.

        # `topic_distr` contains the distribution of topics in each document\ntopic_distr, _ = topic_model.approximate_distribution(abstracts, window=8, stride=4)\n

Next, let's take a look at a specific abstract and see how the topic distribution was extracted:

# Pick an abstract to inspect (illustrative index)\nabstract_id = 0\n\n# Visualize the topic-document distribution for a single document\ntopic_model.visualize_distribution(topic_distr[abstract_id], custom_labels=True)\n

        It seems to have extracted a number of topics that are relevant and shows the distributions of these topics across the abstract. We can go one step further and visualize them on a token-level:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(abstracts[abstract_id], calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(abstracts[abstract_id], topic_token_distr[0])\ndf\n

        use_embedding_model

As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-words representation, this is quite fast. However, you might want to use the selected embedding_model instead for this comparison. Do note that, due to the many token sets, it is often computationally quite a bit slower:

        topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True)\n
        "},{"location":"getting_started/best_practices/best_practices.html#outlier-reduction","title":"Outlier Reduction","text":"

By default, HDBSCAN generates outliers, which is a helpful mechanism for creating accurate topic representations. However, you might want to assign every single document to a topic. We can use .reduce_outliers to map some or all outliers to a topic:

        # Reduce outliers\nnew_topics = topic_model.reduce_outliers(abstracts, topics)\n\n# Reduce outliers with pre-calculate embeddings instead\nnew_topics = topic_model.reduce_outliers(abstracts, topics, strategy=\"embeddings\", embeddings=embeddings)\n

        Update Topics with Outlier Reduction

        After having generated updated topic assignments, we can pass them to BERTopic in order to update the topic representations:

        topic_model.update_topics(docs, topics=new_topics)\n

It is important to realize that updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason is that when you assign one -1 document to topic 1 and another -1 document to topic 2, it becomes unclear how the -1 topic as a whole should be mapped: to topic 1 or to topic 2?

        "},{"location":"getting_started/best_practices/best_practices.html#visualize-topics","title":"Visualize Topics","text":"

With visualizations, we are entering the realm of subjective \"best practices\". These are things that I generally do because I like the representations, but your experience might differ.

        Having said that, there are two visualizations that are my go-to when visualizing the topics themselves:

        • topic_model.visualize_topics()
        • topic_model.visualize_hierarchy()
        # Visualize topics with custom labels\ntopic_model.visualize_topics(custom_labels=True)\n\n# Visualize hierarchy with custom labels\ntopic_model.visualize_hierarchy(custom_labels=True)\n
        "},{"location":"getting_started/best_practices/best_practices.html#visualize-documents","title":"Visualize Documents","text":"

        When visualizing documents, it helps to have embedded the documents beforehand to speed up computation. Fortunately, we have already done that as a \"best practice\".

        Visualizing documents in 2-dimensional space helps in understanding the underlying structure of the documents and topics.

        # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n

        The following plot is interactive which means that you can zoom in, double click on a label to only see that one and generally interact with the plot:

        # Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts\n# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset\n# NOTE: You can also hide the annotations with `hide_annotations=True` which is helpful to see the larger structure\ntopic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)\n

        2-dimensional space

Although visualizing the documents in 2-dimensional space gives an idea of their underlying structure, there is a risk involved.

Visualizing the documents in 2-dimensional space means that we have lost significant information, since the original embeddings are 384-dimensional (for all-MiniLM-L6-v2). Condensing all that information into 2 dimensions is simply not possible. In other words, it is merely an approximation, albeit quite an accurate one.

        "},{"location":"getting_started/best_practices/best_practices.html#serialization","title":"Serialization","text":"

When saving a BERTopic model, there are several ways of doing so: you can save the entire model with pickle, pytorch, or safetensors.

        Personally, I would advise going with safetensors whenever possible. The reason for this is that the format allows for a very small topic model to be saved and shared.

        When saving a model with safetensors, it skips over saving the dimensionality reduction and clustering models. The .transform function will still work without these models but instead assign topics based on the similarity between document embeddings and the topic embeddings.

        As a result, the .transform step might give different results but it is generally worth it considering the smaller and significantly faster model.

        embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"my_model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n

        Embedding Model

        Using safetensors, we are not saving the underlying embedding model but merely a pointer to the model. For example, in the above example we are saving the string \"sentence-transformers/all-MiniLM-L6-v2\" so that we can load in the embedding model alongside the topic model.

        This currently only works if you are using a sentence transformer model. If you are using a different model, you can load it in when loading the topic model like this:

        from sentence_transformers import SentenceTransformer\n\n# Define embedding model\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n\n# Load model and add embedding model\nloaded_model = BERTopic.load(\"my_model_dir\", embedding_model=embedding_model)\n
        "},{"location":"getting_started/best_practices/best_practices.html#inference","title":"Inference","text":"

To speed up inference, we can leverage a \"best practice\" that we used before, namely serialization. When you save a model as safetensors and then load it back in, the dimensionality reduction and clustering steps are removed from the pipeline.

Instead, the assignment of topics is done through cosine similarity of document embeddings and topic embeddings. This speeds up inference significantly.

        To show its effect, let's start by disabling the logger:

        from bertopic._utils import MyLogger\nlogger = MyLogger(\"ERROR\")\nloaded_model.verbose = False\ntopic_model.verbose = False\n

        Then, we run inference on both the loaded model and the non-loaded model:

        >>> %timeit loaded_model.transform(abstracts[:100])\n343 ms \u00b1 31.1 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)\n
        >>> %timeit topic_model.transform(abstracts[:100])\n1.37 s \u00b1 166 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)\n

        Based on the above, the loaded_model seems to be quite a bit faster for inference than the original topic_model.

        "},{"location":"getting_started/clustering/clustering.html","title":"3. Clustering","text":"

After reducing the dimensionality of our input embeddings, we need to cluster them into groups of similar embeddings to extract our topics. This process of clustering is quite important because the more performant our clustering technique is, the more accurate our topic representations will be.

In BERTopic, we typically use HDBSCAN as it is quite capable of capturing structures with different densities. However, there is no single perfect clustering model and you might want to use something entirely different for your use case. Moreover, what if a new state-of-the-art model is released tomorrow? We would like to be able to use that in BERTopic, right? Since BERTopic assumes some independence among steps, we can allow for this modularity:

As a result, the hdbscan_model parameter in BERTopic now allows for a variety of clustering models. To do so, the class should have the following methods and attributes:

        • .fit(X)
          • A function that can be used to fit the model
        • .predict(X)
          • A predict function that transforms the input to cluster labels
        • .labels_
          • The labels after fitting the model

        In other words, it should have the following structure:

        class ClusterModel:\n    def fit(self, X):\n        self.labels_ = None\n        return self\n\n    def predict(self, X):\n        return X\n

        In this section, we will go through several examples of clustering algorithms and how they can be implemented.

        "},{"location":"getting_started/clustering/clustering.html#hdbscan","title":"HDBSCAN","text":"

As a default, BERTopic uses HDBSCAN to perform its clustering. To use an HDBSCAN model with custom parameters, we simply define it and pass it to BERTopic:

        from bertopic import BERTopic\nfrom hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\n

        Here, we can define any parameters in HDBSCAN to optimize for the best performance based on whatever validation metrics you are using.

        "},{"location":"getting_started/clustering/clustering.html#k-means","title":"k-Means","text":"

        Although HDBSCAN works quite well in BERTopic and is typically advised, you might want to use k-Means instead. It allows you to select how many clusters you would like and forces every single point to be in a cluster. Therefore, no outliers will be created. This also has disadvantages: when you force every single point into a cluster, the cluster is likely to contain noise, which can hurt the topic representations. As a small tip, using vectorizer_model=CountVectorizer(stop_words=\"english\") helps quite a bit to improve the topic representations (see the sketch after the example below).

        Having said that, using k-Means is quite straightforward:

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\n\ncluster_model = KMeans(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n
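
        A minimal sketch combining the k-Means example with the CountVectorizer tip mentioned above:

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# k-Means for clustering, CountVectorizer to keep stop words out of the topic words\ncluster_model = KMeans(n_clusters=50)\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(hdbscan_model=cluster_model, vectorizer_model=vectorizer_model)\n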

        Note

        As you might have noticed, the cluster_model is passed to hdbscan_model which might be a bit confusing considering you are not passing an HDBSCAN model. For now, the name of the parameter is kept the same to adhere to the current state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible.

        "},{"location":"getting_started/clustering/clustering.html#agglomerative-clustering","title":"Agglomerative Clustering","text":"

        Besides k-Means, there are many more clustering algorithms in sklearn that you can use. Some of these models do not have a .predict() method but can still be used in BERTopic. However, using BERTopic's .transform() function will then give errors.

        Here, we will demonstrate Agglomerative Clustering:

        from bertopic import BERTopic\nfrom sklearn.cluster import AgglomerativeClustering\n\ncluster_model = AgglomerativeClustering(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n
        "},{"location":"getting_started/clustering/clustering.html#cuml-hdbscan","title":"cuML HDBSCAN","text":"

        Although the original HDBSCAN implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, we can use cuML to speed up HDBSCAN through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\n

        The great thing about using cuML's HDBSCAN implementation is that it supports many features of the original implementation. In other words, calculate_probabilities=True also works!

        Note

        As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., .transform) using cuML's HDBSCAN. However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., .fit and .fit_transform).

        Note

        If you want to install cuML together with BERTopic using Google Colab, you can run the following code:

        !pip install bertopic\n!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64\n
        "},{"location":"getting_started/ctfidf/ctfidf.html","title":"c-TF-IDF","text":"

        In BERTopic, in order to get an accurate representation of the topics from our bag-of-words matrix, TF-IDF was adjusted to work on a cluster/categorical/topic level instead of a document level. This adjusted TF-IDF representation is called c-TF-IDF and takes into account what makes the documents in one cluster different from documents in another cluster:

        Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word x in class c, where c refers to the cluster we created before. This results in our class-based tf representation. This representation is L1-normalized to account for the differences in topic sizes. Then, we take the logarithm of one plus the average number of words per class A divided by the frequency of word x across all classes. We add plus one within the logarithm to force values to be positive. This results in our class-based idf representation. Like with the classic TF-IDF, we then multiply tf with idf to get the importance score per word in each class. In other words, the classical TF-IDF procedure is not used here but a modified version of the algorithm that allows for a much better representation.
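
        Written out, the description above corresponds to the following weight of word x in class c (a reconstruction in LaTeX notation; the symbols follow the description above):

        W_{x,c} = \lVert tf_{x,c} \rVert \cdot \log\left(1 + \frac{A}{f_x}\right)

        Here, \lVert tf_{x,c} \rVert is the L1-normalized frequency of word x in class c, f_x is the frequency of word x across all classes, and A is the average number of words per class.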

        Since the topic representation is somewhat independent of the clustering step, we can change what the c-TF-IDF representation looks like. This can be in the form of parameter tuning, different weighting schemes, or using a diversity metric on top of it. This allows for some modularity concerning the weighting scheme:

        This class-based TF-IDF representation is enabled by default in BERTopic. However, we can explicitly pass it to BERTopic through the ctfidf_model allowing for parameter tuning and the customization of the topic extraction technique:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer()\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n
        "},{"location":"getting_started/ctfidf/ctfidf.html#parameters","title":"Parameters","text":"

        There are two parameters worth exploring in the ClassTfidfTransformer, namely bm25_weighting and reduce_frequent_words.

        "},{"location":"getting_started/ctfidf/ctfidf.html#bm25_weighting","title":"bm25_weighting","text":"

        The bm25_weighting is a boolean parameter that indicates whether a class-based BM-25 weighting measure is used instead of the default method as defined in the formula at the beginning of this page.

        Instead of using the following weighting scheme:
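
        \log\left(1 + \frac{A}{f_x}\right)  (the logarithmic idf term from the c-TF-IDF formula at the beginning of this page)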

        the class-based BM-25 weighting is used instead:
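
        \log\left(1 + \frac{A - f_x + 0.5}{f_x + 0.5}\right)  (a class-based BM25-style idf; the exact form shown here is an assumption based on the standard BM25 formulation and is not spelled out in the surrounding text)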

        For smaller datasets, this variant can be more robust to stop words that appear in your data. It can be enabled as follows:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(bm25_weighting=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n
        "},{"location":"getting_started/ctfidf/ctfidf.html#reduce_frequent_words","title":"reduce_frequent_words","text":"

        Some words appear quite often in every topic but are generally not considered stop words as found in the CountVectorizer(stop_words=\"english\") list. To further reduce these frequent words, we can use reduce_frequent_words to take the square root of the term frequency after applying the weighting scheme.

        Instead of the default term frequency:
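
        \lVert tf_{x,c} \rVert  (the L1-normalized term frequency used by default)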

        we take the square root of the term frequency after normalizing the frequency matrix:
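
        \sqrt{\lVert tf_{x,c} \rVert}  (the square root is applied after the frequency matrix has been normalized)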

        Although seemingly a small change, it can have quite a large effect on the number of stop words in the resulting topic representations. It can be enabled as follows:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n

        Tip

        Both parameters can be used simultaneously: ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

        "},{"location":"getting_started/dim_reduction/dim_reduction.html","title":"2. Dimensionality Reduction","text":"

        An important aspect of BERTopic is the dimensionality reduction of the input embeddings. As embeddings are often high in dimensionality, clustering becomes difficult due to the curse of dimensionality.

        A solution is to reduce the dimensionality of the embeddings to a workable dimensional space (e.g., 5) for clustering algorithms to work with. UMAP is used as a default in BERTopic since it can capture both the local and global structure of the high-dimensional space in lower dimensions. However, there are other solutions out there, such as PCA, that users might be interested in trying out. Since BERTopic assumes some independence between steps, we can use any other dimensionality reduction algorithm. The image below illustrates this modularity:

        As a result, the umap_model parameter in BERTopic now allows for a variety of dimensionality reduction models. To do so, the class should have the following attributes:

        • .fit(X)
          • A function that can be used to fit the model
        • .transform(X)
          • A transform function that transforms the input to a lower dimensional size

        In other words, it should have the following structure:

        class DimensionalityReduction:\n    def fit(self, X):\n        # Nothing is learned here; a real model would fit its projection on X\n        return self\n\n    def transform(self, X):\n        # Return the (reduced) embeddings; this template returns the input unchanged\n        return X\n

        In this section, we will go through several examples of dimensionality reduction techniques and how they can be implemented.

        "},{"location":"getting_started/dim_reduction/dim_reduction.html#umap","title":"UMAP","text":"

        As a default, BERTopic uses UMAP to perform its dimensionality reduction. To use a UMAP model with custom parameters, we simply define it and pass it to BERTopic:

        from bertopic import BERTopic\nfrom umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')\ntopic_model = BERTopic(umap_model=umap_model)\n

        Here, we can define any parameters in UMAP to optimize for the best performance based on whatever validation metrics you are using.

        "},{"location":"getting_started/dim_reduction/dim_reduction.html#pca","title":"PCA","text":"

        Although UMAP works quite well in BERTopic and is typically advised, you might want to use PCA instead. It can be faster to train and to run inference with. To use PCA, we can simply import it from sklearn and pass it to the umap_model parameter:

        from bertopic import BERTopic\nfrom sklearn.decomposition import PCA\n\ndim_model = PCA(n_components=5)\ntopic_model = BERTopic(umap_model=dim_model)\n

        As a small note, PCA and k-Means have worked quite well in my experiments and might be interesting to use instead of UMAP and HDBSCAN.

        Note

        As you might have noticed, the dim_model is passed to umap_model which might be a bit confusing considering you are not passing a UMAP model. For now, the name of the parameter is kept the same to adhere to the current state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible.

        "},{"location":"getting_started/dim_reduction/dim_reduction.html#truncated-svd","title":"Truncated SVD","text":"

        Besides PCA, there are many more dimensionality reduction techniques in sklearn that you can use. Here, we will demonstrate Truncated SVD, but any model can be used as long as it has both a .fit() and .transform() method:

        from bertopic import BERTopic\nfrom sklearn.decomposition import TruncatedSVD\n\ndim_model = TruncatedSVD(n_components=5)\ntopic_model = BERTopic(umap_model=dim_model)\n
        "},{"location":"getting_started/dim_reduction/dim_reduction.html#cuml-umap","title":"cuML UMAP","text":"

        Although the original UMAP implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, we can use cuML to speed up UMAP through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.manifold import UMAP\n\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\ntopic_model = BERTopic(umap_model=umap_model)\n

        Note

        If you want to install cuML together with BERTopic using Google Colab, you can run the following code:

        !pip install bertopic\n!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64\n
        "},{"location":"getting_started/dim_reduction/dim_reduction.html#skip-dimensionality-reduction","title":"Skip dimensionality reduction","text":"

        Although BERTopic applies dimensionality reduction as a default in its pipeline, this is a step that you might want to skip. To do so, we generate an \"empty\" model that simply returns the data passed to it:

        from bertopic import BERTopic\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\n# Fit BERTopic without actually performing any dimensionality reduction\nempty_dimensionality_model = BaseDimensionalityReduction()\ntopic_model = BERTopic(umap_model=empty_dimensionality_model)\n

        In other words, we go from this pipeline:

        SBERT (Embeddings) → UMAP (Dimensionality reduction) → HDBSCAN (Clustering) → c-TF-IDF (Topic representation)

        To the following pipeline:

        SBERT (Embeddings) → HDBSCAN (Clustering) → c-TF-IDF (Topic representation)

        "},{"location":"getting_started/distribution/distribution.html","title":"Topic Distributions","text":"

        BERTopic approaches topic modeling as a clustering task and attempts to cluster semantically similar documents to extract common topics. A disadvantage of this method is that each document is assigned to a single cluster and therefore also a single topic. In practice, documents may contain a mixture of topics. This can be accounted for by splitting up the documents into sentences and feeding those to BERTopic.

        Another option is to use a cluster model that can perform soft clustering, like HDBSCAN. As BERTopic focuses on modularity, we may still want to model that mixture of topics even when we are using a hard-clustering model, like k-Means, without the need to split up our documents. This is where .approximate_distribution comes in!

        (Figure: the example sentence \"Solving the right problem is difficult\" is split into token sets; the topic-token set similarities are combined into a document-topic distribution and a multi-topic assignment on a token level.)

        To perform this approximation, each document is split into tokens according to the provided tokenizer in the CountVectorizer. Then, a sliding window is applied on each document creating subsets of the document. For example, with a window size of 3 and stride of 1, the document:

        Solving the right problem is difficult.

        can be split up into solving the right, the right problem, right problem is, and problem is difficult. These are called token sets. For each of these token sets, we calculate their c-TF-IDF representation and find out how similar they are to the previously generated topics. Then, the similarities to the topics for each token set are summed to create a topic distribution for the entire document.

        Although it is often said that documents can contain a mixture of topics, these are often modeled by assigning each word to a single topic. With this approach, we take into account that there may be multiple topics for a single word.

        We can make this multiple-topic word assignment a bit more accurate by then splitting these token sets up into individual tokens and assigning the topic distributions for each token set to each individual token. That way, we can visualize the extent to which a certain word contributes to a document's topic distribution.
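
        As a minimal sketch of how such token sets can be created (the helper token_sets below is hypothetical and not part of BERTopic):

        # Sliding window over the tokens of a document; window and stride mirror the parameters described above\ndef token_sets(tokens, window=3, stride=1):\n    return [tokens[i:i + window] for i in range(0, max(len(tokens) - window + 1, 1), stride)]\n\ntokens = \"Solving the right problem is difficult\".lower().split()\nprint(token_sets(tokens))\n# [['solving', 'the', 'right'], ['the', 'right', 'problem'], ['right', 'problem', 'is'], ['problem', 'is', 'difficult']]\n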

        "},{"location":"getting_started/distribution/distribution.html#example","title":"Example","text":"

        To calculate our topic distributions, we first need to fit a basic topic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic().fit(docs)\n

        After doing so, we can approximate the topic distributions for our documents:

        topic_distr, _ = topic_model.approximate_distribution(docs)\n

        The resulting topic_distr is an n x m matrix where n is the number of documents and m the number of topics. We can then visualize the distribution of topics in a document:

        topic_model.visualize_distribution(topic_distr[1])\n

        Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])\ndf\n

        Tip

        You can also approximate the topic distributions for unseen documents. It will not be as accurate as .transform but it is quite fast and can serve you well in a production setting.

        Note

        To get the stylized dataframe for .visualize_approximate_distribution you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via pip install jinja2

        "},{"location":"getting_started/distribution/distribution.html#parameters","title":"Parameters","text":"

        There are a few parameters that are of interest which will be discussed below.

        "},{"location":"getting_started/distribution/distribution.html#batch_size","title":"batch_size","text":"

        Creating token sets for each document can result in quite a large list of token sets. The similarity of these token sets with the topics can result in a large matrix that might not fit into memory anymore. To circumvent this, we can process batches of documents instead to minimize the memory overhead. The value for batch_size indicates the number of documents that will be processed at once:

        topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=500)\n
        "},{"location":"getting_started/distribution/distribution.html#window","title":"window","text":"

        The number of tokens that are combined into token sets is defined by the window parameter. Seeing as we are performing a sliding window, we can change the size of the window. A larger window takes more tokens into account, but setting it too large can result in considering too much information. Personally, I like to have this window between 4 and 8:

        topic_distr, _ = topic_model.approximate_distribution(docs, window=4)\n
        "},{"location":"getting_started/distribution/distribution.html#stride","title":"stride","text":"

        The sliding window that is performed on a document shifts, as a default, 1 token to the right each time to create its token sets. As a result, especially with large windows, a single token gets judged several times. We can use the stride parameter to increase the number of tokens the window shifts to the right. By increasing this value, we are judging each token less frequently which often results in a much faster calculation. Combining this parameter with window is preferred. For example, if we have a very large dataset, we can set stride=4 and window=8 to judge token sets that contain 8 tokens but that are shifted with 4 steps each time. As a result, this increases the computational speed quite a bit:

        topic_distr, _ = topic_model.approximate_distribution(docs, window=8, stride=4)\n
        "},{"location":"getting_started/distribution/distribution.html#use_embedding_model","title":"use_embedding_model","text":"

        As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-words representation, this is quite fast. However, you might want to use the selected embedding_model instead to do this comparison. Do note that due to the many token sets, this is often computationally quite a bit slower:

        topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True)\n
        "},{"location":"getting_started/embeddings/embeddings.html","title":"Embedding Models","text":"

        BERTopic starts with transforming our input documents into numerical representations. Although there are many ways this can be achieved, we typically use sentence-transformers (\"all-MiniLM-L6-v2\") as it is quite capable of capturing the semantic similarity between documents.

        However, there is not one perfect embedding model and you might want to be using something entirely different for your use case. Since BERTopic assumes some independence among steps, we can allow for this modularity:

        This modularity allows us not only to choose any embedding model to convert our documents into numerical representations, but also to use essentially any data to perform our clustering. When new state-of-the-art pre-trained embedding models are released, BERTopic will be able to use them. As a result, BERTopic grows with any new models being released. Out of the box, BERTopic supports several embedding techniques. In this section, we will go through several of them and how they can be implemented.

        "},{"location":"getting_started/embeddings/embeddings.html#sentence-transformers","title":"Sentence Transformers","text":"

        You can select any model from sentence-transformers here and pass it through BERTopic with embedding_model:

        from bertopic import BERTopic\ntopic_model = BERTopic(embedding_model=\"all-MiniLM-L6-v2\")\n

        Or select a SentenceTransformer model with your parameters:

        from sentence_transformers import SentenceTransformer\n\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\ntopic_model = BERTopic(embedding_model=sentence_model)\n

        Tip 1!

        This embedding back-end was put here first for a reason: sentence-transformers works amazingly well out of the box! Playing around with different models can give you great results. Also, make sure to frequently visit this page as new models are often released.

        Tip 2!

        New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the MTEB leaderboard. It is an excellent place for selecting the embedding model that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.

        Many of these models can be used with SentenceTransformers in BERTopic, like so:

        from sentence_transformers import SentenceTransformer\n\nembedding_model = SentenceTransformer(\"BAAI/bge-base-en-v1.5\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#hugging-face-transformers","title":"\ud83e\udd17 Hugging Face Transformers","text":"

        To use a Hugging Face transformers model, load in a pipeline and point to any model found on their model hub (https://huggingface.co/models):

        from transformers.pipelines import pipeline\n\nembedding_model = pipeline(\"feature-extraction\", model=\"distilbert-base-cased\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n

        Tip!

        These transformers also work quite well using sentence-transformers, which has great optimization tricks that make using it a bit faster.

        "},{"location":"getting_started/embeddings/embeddings.html#flair","title":"Flair","text":"

        Flair allows you to choose almost any embedding model that is publicly available. Flair can be used as follows:

        from flair.embeddings import TransformerDocumentEmbeddings\n\nroberta = TransformerDocumentEmbeddings('roberta-base')\ntopic_model = BERTopic(embedding_model=roberta)\n

        You can select any \ud83e\udd17 transformers model here.

        Moreover, you can also use Flair to load word embeddings and pool them to create document embeddings. Under the hood, Flair simply averages all word embeddings in a document. Then, we can easily pass it to BERTopic to use those word embeddings as document embeddings:

        from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings\n\nglove_embedding = WordEmbeddings('crawl')\ndocument_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])\n\ntopic_model = BERTopic(embedding_model=document_glove_embeddings)\n
        "},{"location":"getting_started/embeddings/embeddings.html#spacy","title":"Spacy","text":"

        Spacy is an amazing framework for processing text. There are many models available across many languages for modeling text.

        To use Spacy's non-transformer models in BERTopic:

        import spacy\n\nnlp = spacy.load(\"en_core_web_md\", exclude=['tagger', 'parser', 'ner', \n                                            'attribute_ruler', 'lemmatizer'])\n\ntopic_model = BERTopic(embedding_model=nlp)\n

        Using spacy-transformer models:

        import spacy\n\nspacy.prefer_gpu()\nnlp = spacy.load(\"en_core_web_trf\", exclude=['tagger', 'parser', 'ner', \n                                             'attribute_ruler', 'lemmatizer'])\n\ntopic_model = BERTopic(embedding_model=nlp)\n

        If you run into memory issues with spacy-transformer models, try:

        import spacy\nfrom thinc.api import set_gpu_allocator, require_gpu\n\nnlp = spacy.load(\"en_core_web_trf\", exclude=['tagger', 'parser', 'ner', \n                                             'attribute_ruler', 'lemmatizer'])\nset_gpu_allocator(\"pytorch\")\nrequire_gpu(0)\n\ntopic_model = BERTopic(embedding_model=nlp)\n
        "},{"location":"getting_started/embeddings/embeddings.html#universal-sentence-encoder-use","title":"Universal Sentence Encoder (USE)","text":"

        The Universal Sentence Encoder encodes text into high-dimensional vectors that are used here for embedding the documents. The model is trained and optimized for greater-than-word length text, such as sentences, phrases, or short paragraphs.

        Using USE in BERTopic is rather straightforward:

        import tensorflow_hub\nembedding_model = tensorflow_hub.load(\"https://tfhub.dev/google/universal-sentence-encoder/4\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#gensim","title":"Gensim","text":"

        BERTopic supports the gensim.downloader module, which allows it to download any word embedding model supported by Gensim. Typically, these are Glove, Word2Vec, or FastText embeddings:

        import gensim.downloader as api\nft = api.load('fasttext-wiki-news-subwords-300')\ntopic_model = BERTopic(embedding_model=ft)\n

        Tip!

        Gensim is primarily used for word embedding models. This typically works best for short documents since the word embeddings are pooled.

        "},{"location":"getting_started/embeddings/embeddings.html#scikit-learn-embeddings","title":"Scikit-Learn Embeddings","text":"

        Scikit-Learn is a framework for more than just machine learning. It offers many preprocessing tools, some of which can be used to create representations for text. Many of these tools are relatively lightweight and do not require a GPU. While the representations may be less expressive than many BERT models, the fact that it runs much faster can make it a relevant candidate to consider.

        If you have a scikit-learn compatible pipeline that you'd like to use to embed text, then you can also pass it to BERTopic:

        from sklearn.pipeline import make_pipeline\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\npipe = make_pipeline(\n    TfidfVectorizer(),\n    TruncatedSVD(100)\n)\n\ntopic_model = BERTopic(embedding_model=pipe)\n

        Warning

        One caveat to be aware of is that scikit-learn's base Pipeline class does not support the .partial_fit() API. If you have a pipeline that theoretically should be able to support online learning, then you might want to explore the scikit-partial project. Moreover, since this backend does not generate representations on a word level, it does not support the bertopic.representation models.

        "},{"location":"getting_started/embeddings/embeddings.html#openai","title":"OpenAI","text":"

        To use OpenAI's external API, we need to define our key and explicitly call bertopic.backend.OpenAIBackend to be used in our topic model:

        import openai\nfrom bertopic.backend import OpenAIBackend\n\nclient = openai.OpenAI(api_key=\"sk-...\")\nembedding_model = OpenAIBackend(client, \"text-embedding-ada-002\")\n\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#cohere","title":"Cohere","text":"

        To use Cohere's external API, we need to define our key and explicitly call bertopic.backend.CohereBackend to be used in our topic model:

        import cohere\nfrom bertopic.backend import CohereBackend\n\nclient = cohere.Client(\"MY_API_KEY\")\nembedding_model = CohereBackend(client)\n\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#multimodal","title":"Multimodal","text":"

        To create embeddings for both text and images in the same vector space, we can use the MultiModalBackend. This model uses a clip-vit based model that is capable of embedding text, images, or both:

        from bertopic.backend import MultiModalBackend\nmodel = MultiModalBackend('clip-ViT-B-32', batch_size=32)\n\n# Embed documents only\ndoc_embeddings = model.embed_documents(docs)\n\n# Embed images only\nimage_embeddings = model.embed_images(images)\n\n# Embed both images and documents, then average them\ndoc_image_embeddings = model.embed(docs, images)\n
        "},{"location":"getting_started/embeddings/embeddings.html#custom-backend","title":"Custom Backend","text":"

        If your backend or model cannot be found in the ones currently available, you can use the bertopic.backend.BaseEmbedder class to create your backend. Below, you will find an example of creating a SentenceTransformer backend for BERTopic:

        from bertopic.backend import BaseEmbedder\nfrom sentence_transformers import SentenceTransformer\n\nclass CustomEmbedder(BaseEmbedder):\n    def __init__(self, embedding_model):\n        super().__init__()\n        self.embedding_model = embedding_model\n\n    def embed(self, documents, verbose=False):\n        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)\n        return embeddings \n\n# Create custom backend\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\ncustom_embedder = CustomEmbedder(embedding_model=embedding_model)\n\n# Pass custom backend to bertopic\ntopic_model = BERTopic(embedding_model=custom_embedder)\n
        "},{"location":"getting_started/embeddings/embeddings.html#custom-embeddings","title":"Custom Embeddings","text":"

        The base models in BERTopic are BERT-based models that work well with document similarity tasks. Your documents, however, might be too specific for a general pre-trained model to be used. Fortunately, you can use the embedding model in BERTopic to create document features.

        You only need to prepare the document embeddings yourself and pass them through fit_transform of BERTopic:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train our topic model using our pre-trained sentence-transformers embeddings\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n

        As you can see above, we used a SentenceTransformer model to create the embedding. You could also have used \ud83e\udd17 transformers, Doc2Vec, or any other embedding method.

        "},{"location":"getting_started/embeddings/embeddings.html#tf-idf","title":"TF-IDF","text":"

        As mentioned above, any embedding technique can be used. However, when running UMAP, the typical distance metric is cosine, which does not work quite well for a TF-IDF matrix. Instead, BERTopic will recognize that a sparse matrix is passed and use hellinger instead, which works quite well for the similarity between probability distributions.

        We simply create a TF-IDF matrix and use it as embeddings in our fit_transform method:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n\n# Create TF-IDF sparse matrix\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nvectorizer = TfidfVectorizer(min_df=5)\nembeddings = vectorizer.fit_transform(docs)\n\n# Train our topic model using TF-IDF vectors\n# (stop words are removed through the vectorizer_model; BERTopic has no stop_words parameter)\ntopic_model = BERTopic(vectorizer_model=CountVectorizer(stop_words=\"english\"))\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n

        Here, you will probably notice that creating the embeddings is quite fast, whereas fit_transform is quite slow. This is to be expected as reducing the dimensionality of a large sparse matrix takes some time. For transformer embeddings, the inverse is true: creating the embeddings is slow, whereas fit_transform is quite fast.

        "},{"location":"getting_started/guided/guided.html","title":"Guided Topic Modeling","text":"

        Guided Topic Modeling or Seeded Topic Modeling is a collection of techniques that guides the topic modeling approach by setting several seed topics to which the model will converge. These techniques allow the user to set a predefined number of topic representations that are sure to be in the documents. For example, take an IT business that has a ticket system for the software their clients use. Those tickets may typically contain information about a specific bug regarding login issues that the IT business is aware of.

        To model that bug, we can create a seed topic representation containing the words bug, login, password, and username. By defining those words, a Guided Topic Modeling approach will try to converge at least one topic to those words.

        \"drug cancer drugs doctor\" \"windows drive dos file\" \"space launch orbit lunar\" Concatenate and embed the keywords/keyphrases using the embedding model. For each document, generate labels by finding which seeded topic fits best based on cosine similarity between embeddings. Average the embedding of each document with the selected seeded topic. Define seed topics through keywords or keyphrases. \"drug\", \"cancer\", \"drugs\", \"doctor\" Seed topic 1 Seed topic 2 Seed topic 3 \"windows\", \"drive\", \"dos\", \"file\" \"space\", \"launch\", \"orbit\", \"lunar\" Seed topic 3 Seed topic 2 No seed topic match found Seed topic 2 seed topic embedding document embedding + 2 Mutiply the IDF values of the seeded keywords across all topics with 1.2. Word IDF Multiplier Adjusted IDF drug 1.2 .55 .66 1.2 doctor .78 .94 cat 1 .22 .22 1 dog .11 .11 space 1.2 .35 .42 1.2 launch .89 1.07

        Guided BERTopic has two main steps:

        First, we create embeddings for each seeded topic by joining them and passing them through the document embedder. These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label. If the document is most similar to a seeded topic, then it will get that topic's label. If it is most similar to the average document embedding, it will get the -1 label. These labels are then passed through UMAP to create a semi-supervised approach that should nudge the topic creation to the seeded topics.

        Second, we take all words in seed_topic_list and assign them a multiplier larger than 1. Those multipliers will be used to increase the IDF values of the words across all topics thereby increasing the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is now a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier.
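
        A minimal sketch of the first step, assuming a sentence-transformers model and scikit-learn's cosine similarity (an illustration, not BERTopic's internal code):

        import numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom sentence_transformers import SentenceTransformer\n\nmodel = SentenceTransformer(\"all-MiniLM-L6-v2\")\nseed_topic_list = [[\"drug\", \"cancer\", \"drugs\", \"doctor\"],\n                   [\"space\", \"launch\", \"orbit\", \"lunar\"]]\ndocs = [\"The doctor prescribed a new cancer drug.\",\n        \"The rocket reached lunar orbit after launch.\"]\n\n# Step 1: join and embed each seed topic, then label documents by their most similar seed topic\nseed_embeddings = model.encode([\" \".join(keywords) for keywords in seed_topic_list])\ndoc_embeddings = model.encode(docs)\nlabels = np.argmax(cosine_similarity(doc_embeddings, seed_embeddings), axis=1)\n# Documents most similar to the average document embedding would receive the -1 label (omitted here)\n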

        "},{"location":"getting_started/guided/guided.html#example","title":"Example","text":"

        To demonstrate Guided BERTopic, we use the 20 Newsgroups dataset as our example. We have frequently used this dataset in BERTopic examples and we sometimes see a topic generated about health with words such as drug and cancer being important. However, due to the stochastic nature of UMAP, this topic is not always found.

        In order to guide BERTopic to that topic, we create a seed topic list that we pass to our model. However, there may be several other topics that we know should be in the documents. Let's also initialize those:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\nseed_topic_list = [[\"drug\", \"cancer\", \"drugs\", \"doctor\"],\n                   [\"windows\", \"drive\", \"dos\", \"file\"],\n                   [\"space\", \"launch\", \"orbit\", \"lunar\"]]\n\ntopic_model = BERTopic(seed_topic_list=seed_topic_list)\ntopics, probs = topic_model.fit_transform(docs)\n

        As you can see above, the seed_topic_list contains a list of topic representations. By defining the above topics, BERTopic is more likely to model the defined seeded topics. However, BERTopic is merely nudged towards creating those topics. In practice, if the seeded topics do not exist or might be divided into smaller topics, then they will not be modeled. Thus, the seed topics need to be accurate for the model to converge towards them.

        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html","title":"Hierarchical Topic Modeling","text":"

        When tweaking your topic model, the number of topics that are generated has a large effect on the quality of the topic representations. Some topics could be merged, and understanding the effect of doing so will help you decide which topics should and which should not be merged.

        That is where hierarchical topic modeling comes in. It tries to model the possible hierarchical nature of the topics you have created to understand which topics are similar to each other. Moreover, you will have more insight into sub-topics that might exist in your data.

        (Figure: 1. Create a distance matrix by calculating the cosine similarity between the c-TF-IDF representations of each topic. 2. Apply a linkage function of choice on the distance matrix to model the hierarchical structure of topics. 3. Re-calculate the c-TF-IDF representation based on the collection of documents across the merged topics.)

        In BERTopic, we can approximate this potential hierarchy by making use of our topic-term matrix (the c-TF-IDF matrix). This matrix contains information about the importance of every word in every topic and makes for a nice numerical representation of our topics. The smaller the distance between two c-TF-IDF representations, the more similar we assume they are. In practice, this process of merging topics is done through the hierarchical clustering capabilities of scipy (see here). It allows for several linkage methods through which we can approximate our topic hierarchy. As a default, we are using ward linkage, but many others are available.

        Whenever we merge two topics, we can calculate the c-TF-IDF representation of the merged topic by summing their bag-of-words representations. We assume that two sets of topics are merged and that all others are kept the same, regardless of their location in the hierarchy. This helps us isolate the potential effect of merging sets of topics. As a result, we can see the topic representation at each level in the tree.
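
        A minimal sketch of this process, assuming scipy and scikit-learn and a stand-in c-TF-IDF matrix (an illustration, not BERTopic's internal code):

        import numpy as np\nfrom scipy.cluster import hierarchy as sch\nfrom scipy.spatial.distance import squareform\nfrom sklearn.metrics.pairwise import cosine_similarity\n\n# Stand-in for a (n_topics x n_words) c-TF-IDF matrix\nctfidf = np.random.rand(5, 20)\n\n# Distance matrix from cosine similarities between the topic representations\ndistance_matrix = 1 - cosine_similarity(ctfidf)\n\n# Apply a linkage function (ward as a default) to model the hierarchy\nZ = sch.linkage(squareform(distance_matrix, checks=False), method=\"ward\", optimal_ordering=True)\n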

        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#example","title":"Example","text":"

        To demonstrate hierarchical topic modeling with BERTopic, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents.

        First, we train a basic BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\n

        Next, we can use our fitted BERTopic model to extract possible hierarchies from our c-TF-IDF matrix:

        hierarchical_topics = topic_model.hierarchical_topics(docs)\n

        The resulting hierarchical_topics is a dataframe in which merged topics are described. For example, if you were to merge two topics, what would the topic representation of the new topic be?

        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#linkage-functions","title":"Linkage functions","text":"

        When creating the potential hierarchical nature of topics, we use Scipy's ward linkage function as a default to generate the hierarchy. However, you might want to use a different linkage function for your use case, such as single, complete, average, centroid, or median. In BERTopic, you can define the linkage function yourself, including the distance function that you would like to use:

        from scipy.cluster import hierarchy as sch\nfrom bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Hierarchical topics\nlinkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)\nhierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n
        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#visualizations","title":"Visualizations","text":"

        To visualize these results, we can start by running a familiar function, namely topic_model.visualize_hierarchy:

        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you hover over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes.

        Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use topic_model.get_topic_tree to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can see better which topics could be logically merged:

        >>> tree = topic_model.get_topic_tree(hierarchical_topics)\n>>> print(tree)\n.\n\u2514\u2500atheists_atheism_god_moral_atheist\n     \u251c\u2500atheists_atheism_god_atheist_argument\n     \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n     \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n     \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 Topic: 29\n
        The full topic tree:
          .\n  \u251c\u2500people_armenian_said_god_armenians\n  \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u2502    \u251c\u2500jehovah_lord_mormon_mcconkie_god\n  \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500ra_satan_thou_god_lucifer \u2500\u2500 Topic: 94\n  \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500jehovah_lord_mormon_mcconkie_unto \u2500\u2500 Topic: 78\n  \u2502    \u2502    \u2502    \u2514\u2500jesus_mary_god_hell_sin\n  \u2502    \u2502    \u2502         \u251c\u2500jesus_hell_god_eternal_heaven\n  \u2502    \u2502    \u2502         \u2502    \u251c\u2500hell_jesus_eternal_god_heaven\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500jesus_tomb_disciples_resurrection_john \u2500\u2500 Topic: 69\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500hell_eternal_god_jesus_heaven \u2500\u2500 Topic: 53\n  \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500aaron_baptism_sin_law_god \u2500\u2500 Topic: 89\n  \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500mary_sin_maria_priest_conception \u2500\u2500 Topic: 56\n  \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500marriage_married_marry_ceremony_marriages \u2500\u2500 Topic: 110\n  \u2502    \u2514\u2500people_armenian_armenians_said_mr\n  \u2502         \u251c\u2500people_armenian_armenians_said_israel\n  \u2502         \u2502    \u251c\u2500god_homosexual_homosexuality_atheists_sex\n  \u2502         \u2502    \u2502    \u251c\u2500homosexual_homosexuality_sex_gay_homosexuals\n  \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500kinsey_sex_gay_men_sexual \u2500\u2500 Topic: 44\n  \u2502         \u2502    \u2502    \u2502    \u2514\u2500homosexuality_homosexual_sin_homosexuals_gay\n  \u2502         \u2502    \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500gay_homosexual_homosexuals_sexual_cramer \u2500\u2500 Topic: 50\n  \u2502         \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500homosexuality_homosexual_sin_paul_sex \u2500\u2500 Topic: 27\n  \u2502         \u2502    \u2502    \u2514\u2500god_atheists_atheism_moral_atheist\n  \u2502         \u2502    \u2502         \u251c\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500jim_context_challenges_articles_quote \u2500\u2500 Topic: 36\n  \u2502         \u2502    \u2502         \u2502    \u2514\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500islam_quran_islamic_rushdie_muslims \u2500\u2500 Topic: 31\n  \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500judas_scripture_bible_books_greek \u2500\u2500 Topic: 33\n  \u2502         \u2502    \u2502         \u2514\u2500atheists_atheism_god_moral_atheist\n  \u2502         \u2502    \u2502              \u251c\u2500atheists_atheism_god_atheist_argument\n  \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n  \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n  \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 
Topic: 29\n  \u2502         \u2502    \u2514\u2500armenian_armenians_people_israel_said\n  \u2502         \u2502         \u251c\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502    \u251c\u2500tax_rights_government_income_taxes\n  \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500rights_right_slavery_slaves_residence \u2500\u2500 Topic: 106\n  \u2502         \u2502         \u2502    \u2502    \u2514\u2500tax_government_taxes_income_libertarians\n  \u2502         \u2502         \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500government_libertarians_libertarian_regulation_party \u2500\u2500 Topic: 58\n  \u2502         \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500tax_taxes_income_billion_deficit \u2500\u2500 Topic: 41\n  \u2502         \u2502         \u2502    \u2514\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502         \u251c\u2500gun_guns_militia_firearms_amendment\n  \u2502         \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500blacks_penalty_death_cruel_punishment \u2500\u2500 Topic: 55\n  \u2502         \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500gun_guns_militia_firearms_amendment \u2500\u2500 Topic: 7\n  \u2502         \u2502         \u2502         \u2514\u2500armenian_armenians_israel_jews_turkish\n  \u2502         \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500israel_israeli_jews_arab_jewish \u2500\u2500 Topic: 4\n  \u2502         \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500armenian_armenians_turkish_armenia_azerbaijan \u2500\u2500 Topic: 15\n  \u2502         \u2502         \u2514\u2500stephanopoulos_president_mr_myers_ms\n  \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500serbs_muslims_stephanopoulos_mr_bosnia \u2500\u2500 Topic: 35\n  \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500myers_stephanopoulos_president_ms_mr \u2500\u2500 Topic: 87\n  \u2502         \u2514\u2500batf_fbi_koresh_compound_gas\n  \u2502              \u251c\u2500\u25a0\u2500\u2500reno_workers_janet_clinton_waco \u2500\u2500 Topic: 77\n  \u2502              \u2514\u2500batf_fbi_koresh_gas_compound\n  \u2502                   \u251c\u2500batf_koresh_fbi_warrant_compound\n  \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500batf_warrant_raid_compound_fbi \u2500\u2500 Topic: 42\n  \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500koresh_batf_fbi_children_compound \u2500\u2500 Topic: 61\n  \u2502                   \u2514\u2500\u25a0\u2500\u2500fbi_gas_tear_bds_building \u2500\u2500 Topic: 23\n  \u2514\u2500use_like_just_dont_new\n      \u251c\u2500game_team_year_games_like\n      \u2502    \u251c\u2500game_team_games_25_year\n      \u2502    \u2502    \u251c\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502    \u251c\u2500window_printer_use_problem_mhz\n      \u2502    \u2502    \u2502    \u2502    \u251c\u2500mhz_wire_simms_wiring_battery\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_mhz_battery_cpu_heat\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_pds_simm_vram_lc\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500pds_nubus_lc_slot_card \u2500\u2500 Topic: 119\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500simms_simm_vram_meg_dram 
\u2500\u2500 Topic: 32\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500mhz_battery_cpu_heat_speed\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500fan_cpu_heat_sink_fans \u2500\u2500 Topic: 92\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500mhz_speed_cpu_fpu_clock \u2500\u2500 Topic: 22\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500monitor_turn_power_computer_electricity \u2500\u2500 Topic: 91\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500battery_batteries_concrete_duo_discharge\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500duo_battery_apple_230_problem \u2500\u2500 Topic: 121\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500battery_batteries_concrete_discharge_temperature \u2500\u2500 Topic: 75\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500leds_uv_blue_light_boards \u2500\u2500 Topic: 66\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500wire_wiring_ground_neutral_outlets \u2500\u2500 Topic: 120\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500scope_scopes_phone_dial_number\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500dial_number_phone_line_output \u2500\u2500 Topic: 93\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500scope_scopes_motorola_generator_oscilloscope \u2500\u2500 Topic: 113\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500celp_dsp_sampling_antenna_digital\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500antenna_antennas_receiver_cable_transmitter \u2500\u2500 Topic: 70\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500celp_dsp_sampling_speech_voice \u2500\u2500 Topic: 52\n      \u2502    \u2502    \u2502    \u2502    \u2514\u2500window_printer_xv_mouse_windows\n      \u2502    \u2502    \u2502    \u2502         \u251c\u2500window_xv_error_widget_problem\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500error_symbol_undefined_xterm_rx\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500symbol_error_undefined_doug_parse \u2500\u2500 Topic: 63\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500rx_remote_server_xdm_xterm \u2500\u2500 Topic: 45\n      \u2502    \u2502    \u2502    \u2502         \u2502    
\u2514\u2500window_xv_widget_application_expose\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500window_widget_expose_application_event\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500gc_mydisplay_draw_gxxor_drawing \u2500\u2500 Topic: 103\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500window_widget_application_expose_event \u2500\u2500 Topic: 25\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500xv_den_polygon_points_algorithm\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500den_polygon_points_algorithm_polygons \u2500\u2500 Topic: 28\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500xv_24bit_image_bit_images \u2500\u2500 Topic: 57\n      \u2502    \u2502    \u2502    \u2502         \u2514\u2500printer_fonts_print_mouse_postscript\n      \u2502    \u2502    \u2502    \u2502              \u251c\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500scanner_logitech_grayscale_ocr_scanman \u2500\u2500 Topic: 108\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u2514\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u251c\u2500\u25a0\u2500\u2500printer_print_deskjet_hp_ink \u2500\u2500 Topic: 18\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u2514\u2500\u25a0\u2500\u2500fonts_font_truetype_tt_atm \u2500\u2500 Topic: 49\n      \u2502    \u2502    \u2502    \u2502              \u2514\u2500mouse_ghostscript_midi_driver_postscript\n      \u2502    \u2502    \u2502    \u2502                   \u251c\u2500ghostscript_midi_postscript_files_file\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500ghostscript_postscript_pageview_ghostview_dsc \u2500\u2500 Topic: 104\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u2514\u2500midi_sound_file_windows_driver\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u251c\u2500\u25a0\u2500\u2500location_mar_file_host_rwrr \u2500\u2500 Topic: 83\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u2514\u2500\u25a0\u2500\u2500midi_sound_driver_blaster_soundblaster \u2500\u2500 Topic: 98\n      \u2502    \u2502    \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500mouse_driver_mice_ball_problem \u2500\u2500 Topic: 68\n      \u2502    \u2502    \u2502    \u2514\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502         \u251c\u25001st_sale_condition_comics_hulk\n      \u2502    \u2502    \u2502         \u2502    \u251c\u2500sale_condition_offer_asking_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500condition_stereo_amp_speakers_asking\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500miles_car_amfm_toyota_cassette \u2500\u2500 Topic: 62\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500amp_speakers_condition_stereo_audio \u2500\u2500 Topic: 24\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500games_sale_pom_cds_shipping\n      \u2502    \u2502    \u2502         \u2502    \u2502         
\u251c\u2500pom_cds_sale_shipping_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500size_shipping_sale_condition_mattress \u2500\u2500 Topic: 100\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500pom_cds_cd_sale_picture \u2500\u2500 Topic: 37\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500games_game_snes_sega_genesis \u2500\u2500 Topic: 40\n      \u2502    \u2502    \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u251c\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500lens_tape_camera_backup_lenses\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500tape_backup_tapes_drive_4mm \u2500\u2500 Topic: 107\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500lens_camera_lenses_zoom_pouch \u2500\u2500 Topic: 114\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u25001st_hulk_comics_art_appears \u2500\u2500 Topic: 105\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500books_book_cover_trek_chemistry \u2500\u2500 Topic: 125\n      \u2502    \u2502    \u2502         \u2502         \u2514\u2500tickets_hotel_ticket_voucher_package\n      \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500hotel_voucher_package_vacation_room \u2500\u2500 Topic: 74\n      \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500tickets_ticket_june_airlines_july \u2500\u2500 Topic: 84\n      \u2502    \u2502    \u2502         \u2514\u2500game_team_games_season_hockey\n      \u2502    \u2502    \u2502              \u251c\u2500game_hockey_team_25_550\n      \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500espn_pt_pts_game_la \u2500\u2500 Topic: 17\n      \u2502    \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500team_25_game_hockey_550 \u2500\u2500 Topic: 2\n      \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500year_game_hit_baseball_players \u2500\u2500 Topic: 0\n      \u2502    \u2502    \u2514\u2500bike_car_greek_insurance_msg\n      \u2502    \u2502         \u251c\u2500car_bike_insurance_cars_engine\n      \u2502    \u2502         \u2502    \u251c\u2500car_insurance_cars_radar_engine\n      \u2502    \u2502         \u2502    \u2502    \u251c\u2500insurance_health_private_care_canada\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500insurance_health_private_care_canada \u2500\u2500 Topic: 99\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500insurance_car_accident_rates_sue \u2500\u2500 Topic: 82\n      \u2502    \u2502         \u2502    \u2502    \u2514\u2500car_cars_radar_engine_detector\n      \u2502    \u2502         \u2502    \u2502         \u251c\u2500car_radar_cars_detector_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500radar_detector_detectors_ka_alarm \u2500\u2500 Topic: 39\n      \u2502    \u2502         \u2502    \u2502         
\u2502    \u2514\u2500car_cars_mustang_ford_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500clutch_shift_shifting_transmission_gear \u2500\u2500 Topic: 88\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500car_cars_mustang_ford_v8 \u2500\u2500 Topic: 14\n      \u2502    \u2502         \u2502    \u2502         \u2514\u2500oil_diesel_odometer_diesels_car\n      \u2502    \u2502         \u2502    \u2502              \u251c\u2500odometer_oil_sensor_car_drain\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500odometer_sensor_speedo_gauge_mileage \u2500\u2500 Topic: 96\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500oil_drain_car_leaks_taillights \u2500\u2500 Topic: 102\n      \u2502    \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500diesel_diesels_emissions_fuel_oil \u2500\u2500 Topic: 79\n      \u2502    \u2502         \u2502    \u2514\u2500bike_riding_ride_bikes_motorcycle\n      \u2502    \u2502         \u2502         \u251c\u2500bike_ride_riding_bikes_lane\n      \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500bike_ride_riding_lane_car \u2500\u2500 Topic: 11\n      \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500bike_bikes_miles_honda_motorcycle \u2500\u2500 Topic: 19\n      \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500countersteering_bike_motorcycle_rear_shaft \u2500\u2500 Topic: 46\n      \u2502    \u2502         \u2514\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u251c\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u251c\u2500greek_kuwait_greece_turkish_greeks\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500greek_greece_turkish_greeks_cyprus \u2500\u2500 Topic: 71\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kuwait_iraq_iran_gulf_arabia \u2500\u2500 Topic: 76\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2514\u2500msg_dog_drugs_drug_food\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u251c\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500clinton_bush_quayle_reagan_panicking \u2500\u2500 Topic: 101\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u2514\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500cooper_trial_weaver_spence_witnesses \u2500\u2500 Topic: 90\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500dog_dogs_bike_trained_springer \u2500\u2500 Topic: 67\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2514\u2500msg_drugs_drug_food_chinese\n      \u2502    \u2502              \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500msg_food_chinese_foods_taste \u2500\u2500 Topic: 30\n      
\u2502    \u2502              \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500drugs_drug_marijuana_cocaine_alcohol \u2500\u2500 Topic: 72\n      \u2502    \u2502              \u2502    \u2502    \u2514\u2500water_theory_universe_science_larsons\n      \u2502    \u2502              \u2502    \u2502         \u251c\u2500water_nuclear_cooling_steam_dept\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500rocketry_rockets_engines_nuclear_plutonium \u2500\u2500 Topic: 115\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u2514\u2500water_cooling_steam_dept_plants\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500water_dept_phd_environmental_atmospheric \u2500\u2500 Topic: 97\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500cooling_water_steam_towers_plants \u2500\u2500 Topic: 109\n      \u2502    \u2502              \u2502    \u2502         \u2514\u2500theory_universe_larsons_larson_science\n      \u2502    \u2502              \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500theory_universe_larsons_larson_science \u2500\u2500 Topic: 54\n      \u2502    \u2502              \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500oort_cloud_grbs_gamma_burst \u2500\u2500 Topic: 80\n      \u2502    \u2502              \u2502    \u2514\u2500helmet_kirlian_photography_lock_wax\n      \u2502    \u2502              \u2502         \u251c\u2500helmet_kirlian_photography_leaf_mask\n      \u2502    \u2502              \u2502         \u2502    \u251c\u2500kirlian_photography_leaf_pictures_deleted\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u251c\u2500deleted_joke_stuff_maddi_nickname\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500joke_maddi_nickname_nicknames_frank \u2500\u2500 Topic: 43\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500deleted_stuff_bookstore_joke_motto \u2500\u2500 Topic: 81\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kirlian_photography_leaf_pictures_aura \u2500\u2500 Topic: 85\n      \u2502    \u2502              \u2502         \u2502    \u2514\u2500helmet_mask_liner_foam_cb\n      \u2502    \u2502              \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500helmet_liner_foam_cb_helmets \u2500\u2500 Topic: 112\n      \u2502    \u2502              \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500mask_goalies_77_santore_tl \u2500\u2500 Topic: 123\n      \u2502    \u2502              \u2502         \u2514\u2500lock_wax_paint_plastic_ear\n      \u2502    \u2502              \u2502              \u251c\u2500\u25a0\u2500\u2500lock_cable_locks_bike_600 \u2500\u2500 Topic: 117\n      \u2502    \u2502              \u2502              \u2514\u2500wax_paint_ear_plastic_skin\n      \u2502    \u2502              \u2502                   \u251c\u2500\u25a0\u2500\u2500wax_paint_plastic_scratches_solvent \u2500\u2500 Topic: 65\n      \u2502    \u2502              \u2502                   \u2514\u2500\u25a0\u2500\u2500ear_wax_skin_greasy_acne \u2500\u2500 Topic: 116\n      \u2502    \u2502              \u2514\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u251c\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u2502    
\u251c\u2500\u25a0\u2500\u2500m4_mp_14_mw_mo \u2500\u2500 Topic: 111\n      \u2502    \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500test_ensign_nameless_deane_deanebinahccbrandeisedu \u2500\u2500 Topic: 118\n      \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500ites_cheek_hello_hi_ken \u2500\u2500 Topic: 3\n      \u2502    \u2514\u2500space_medical_health_disease_cancer\n      \u2502         \u251c\u2500medical_health_disease_cancer_patients\n      \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500cancer_centers_center_medical_research \u2500\u2500 Topic: 122\n      \u2502         \u2502    \u2514\u2500health_medical_disease_patients_hiv\n      \u2502         \u2502         \u251c\u2500patients_medical_disease_candida_health\n      \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n      \u2502         \u2502         \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n      \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n      \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n      \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n      \u2502         \u2514\u2500space_launch_nasa_shuttle_orbit\n      \u2502              \u251c\u2500space_moon_station_nasa_launch\n      \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500sky_advertising_billboard_billboards_space \u2500\u2500 Topic: 59\n      \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500space_station_moon_redesign_nasa \u2500\u2500 Topic: 16\n      \u2502              \u2514\u2500space_mission_hst_launch_orbit\n      \u2502                   \u251c\u2500space_launch_nasa_orbit_propulsion\n      \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500space_launch_nasa_propulsion_astronaut \u2500\u2500 Topic: 47\n      \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500orbit_km_jupiter_probe_earth \u2500\u2500 Topic: 86\n      \u2502                   \u2514\u2500\u25a0\u2500\u2500hst_mission_shuttle_orbit_arrays \u2500\u2500 Topic: 60\n      \u2514\u2500drive_file_key_windows_use\n          \u251c\u2500key_file_jpeg_encryption_image\n          \u2502    \u251c\u2500key_encryption_clipper_chip_keys\n          \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500key_clipper_encryption_chip_keys \u2500\u2500 Topic: 1\n          \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500entry_file_ripem_entries_key \u2500\u2500 Topic: 73\n          \u2502    \u2514\u2500jpeg_image_file_gif_images\n          \u2502         \u251c\u2500motif_graphics_ftp_available_3d\n          \u2502         \u2502    \u251c\u2500motif_graphics_openwindows_ftp_available\n          \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500openwindows_motif_xview_windows_mouse \u2500\u2500 Topic: 20\n          \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500graphics_widget_ray_3d_available \u2500\u2500 Topic: 95\n          \u2502         \u2502    \u2514\u2500\u25a0\u2500\u25003d_machines_version_comments_contact \u2500\u2500 Topic: 38\n          \u2502         \u2514\u2500jpeg_image_gif_images_format\n          \u2502              \u251c\u2500\u25a0\u2500\u2500gopher_ftp_files_stuffit_images \u2500\u2500 Topic: 51\n          \u2502        
      \u2514\u2500\u25a0\u2500\u2500jpeg_image_gif_format_images \u2500\u2500 Topic: 13\n          \u2514\u2500drive_db_card_scsi_windows\n              \u251c\u2500db_windows_dos_mov_os2\n              \u2502    \u251c\u2500\u25a0\u2500\u2500copy_protection_program_software_disk \u2500\u2500 Topic: 64\n              \u2502    \u2514\u2500\u25a0\u2500\u2500db_windows_dos_mov_os2 \u2500\u2500 Topic: 8\n              \u2514\u2500drive_card_scsi_drives_ide\n                      \u251c\u2500drive_scsi_drives_ide_disk\n                      \u2502    \u251c\u2500\u25a0\u2500\u2500drive_scsi_drives_ide_disk \u2500\u2500 Topic: 6\n                      \u2502    \u2514\u2500\u25a0\u2500\u2500meg_sale_ram_drive_shipping \u2500\u2500 Topic: 12\n                      \u2514\u2500card_modem_monitor_video_drivers\n                          \u251c\u2500\u25a0\u2500\u2500card_monitor_video_drivers_vga \u2500\u2500 Topic: 5\n                          \u2514\u2500\u25a0\u2500\u2500modem_port_serial_irq_com \u2500\u2500 Topic: 10\n
        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#merge-topics","title":"Merge topics","text":"

        After seeing the potential hierarchy of your topic, you might want to merge specific topics. For example, if topic 1 is 1_space_launch_moon_nasa and topic 2 is 2_spacecraft_solar_space_orbit it might make sense to merge those two topics as they are quite similar in meaning. In BERTopic, you can use .merge_topics to manually select and merge those topics. Doing so will update their topic representation which in turn updates the entire model:

        topics_to_merge = [1, 2]\ntopic_model.merge_topics(docs, topics_to_merge)\n

        If you have several groups of topics you want to merge, create a list of lists instead:

        topics_to_merge = [[1, 2],\n                   [3, 4]]\ntopic_model.merge_topics(docs, topics_to_merge)\n
        "},{"location":"getting_started/manual/manual.html","title":"Manual Topic Modeling","text":"

        Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used 20 NewsGroups dataset is already split up into 20 classes. Here, we might want to see how we can transform those 20 classes into 20 topics. Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic without actually learning them.

        We can view this as a manual topic modeling approach. There is no underlying algorithm for detecting these topics since you have already done that beforehand, whether because the labels were simply already available, as with the 20 NewsGroups dataset, or because you previously created clusters of documents using packages like human-learn, bulk, thisnotthat, or something else entirely.

        In other words, we can pass our labels to BERTopic and it will try to transform those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer.

        (Figure: Documents + Labels -> c-TF-IDF)

        To do so, we need to skip over the dimensionality reduction and clustering steps since we already know the labels for our documents. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels:

        from sklearn.datasets import fetch_20newsgroups\n\n# Get labeled data\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n

        Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process:

        from bertopic import BERTopic\nfrom bertopic.backend import BaseEmbedder\nfrom bertopic.cluster import BaseCluster\nfrom bertopic.vectorizers import ClassTfidfTransformer\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\n# Prepare our empty sub-models and reduce frequent words while we are at it.\nempty_embedding_model = BaseEmbedder()\nempty_dimensionality_model = BaseDimensionalityReduction()\nempty_cluster_model = BaseCluster()\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\n\n# Fit BERTopic without actually performing any clustering\ntopic_model= BERTopic(\n        embedding_model=empty_embedding_model,\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=empty_cluster_model,\n        ctfidf_model=ctfidf_model\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n

        Let's take a look at a few topics that we get out of training this way by running topic_model.get_topic_info():

        Topic  Count  Name\n0      999    0_game_hockey_team_25\n1      997    1_god_church_jesus_christ\n2      996    2_bike_dod_ride_bikes\n3      994    3_baseball_game_he_year\n4      991    4_key_encryption_db_clipper\n5      990    5_car_cars_engine_ford\n6      990    6_medical_patients_cancer_disease\n7      988    7_window_server_widget_motif\n8      988    8_space_launch_nasa_orbit\n

        We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship:

        # Map input `y` to topics\nmappings = topic_model.topic_mapper_.get_mappings()\nmappings = {value: data[\"target_names\"][key] for key, value in mappings.items()}\n\n# Assign original classes to our topics\ndf = topic_model.get_topic_info()\ndf[\"Class\"] = df.Topic.map(mappings)\ndf\n

        Topic  Count  Name                                Class\n0      999    0_game_hockey_team_25               rec.sport.hockey\n1      997    1_god_church_jesus_christ           soc.religion.christian\n2      996    2_bike_dod_ride_bikes               rec.motorcycles\n3      994    3_baseball_game_he_year             rec.sport.baseball\n4      991    4_key_encryption_db_clipper         sci.crypt\n5      990    5_car_cars_engine_ford              rec.autos\n6      990    6_medical_patients_cancer_disease   sci.med\n7      988    7_window_server_widget_motif        comp.windows.x\n8      988    8_space_launch_nasa_orbit           sci.space\n

        We can see that the c-TF-IDF representations extract words that nicely represent our input classes. This is all done without actually embedding and clustering the data.

        As a result, the entire \"training\" process only takes a couple of seconds. Moreover, we can still use BERTopic-specific features such as dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc.
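
        As a minimal sketch (assuming the topic_model, docs, data, and y defined above), the hierarchical and per-class variants could be used like this:

        # Hierarchical topic modeling still works on the manually created topics\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n\n# So does modeling topics per class, here using the original newsgroup names\nclasses = [data[\"target_names\"][label] for label in y]\ntopics_per_class = topic_model.topics_per_class(docs, classes=classes)\n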

        Note

        The resulting topics may use a different mapping than the original y labels. To map y to the topics, we can run the following:

        mappings = topic_model.topic_mapper_.get_mappings()\ny_mapped = [mappings[val] for val in y]\n
        "},{"location":"getting_started/merge/merge.html","title":"Merge Multiple Fitted Models","text":"

        After you have trained a new BERTopic model on your data, new data might still come in. Although you can use online BERTopic for this, you might prefer to keep using the default HDBSCAN and UMAP models, which do not support incremental learning out of the box.

        Instead, you can train a new BERTopic model on the incoming data and merge it with your base model to detect whether new topics have appeared in the unseen documents. This is a great way of detecting whether your new model contains information that was not previously found in your base topic model.

        Similarly, you might want to train multiple BERTopic models using different sets of settings, even though they might all be using the same underlying embedding model. Merging these models would also allow for a single model that you can use throughout your use cases.

        Lastly, this method also allows for a degree of federated learning, where each node trains a topic model that is then aggregated on a central server.

        "},{"location":"getting_started/merge/merge.html#example","title":"Example","text":"

        To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain.

        First, we train three separate models on different parts of the data:

        from umap import UMAP\nfrom bertopic import BERTopic\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\n\n# Extract abstracts to train on and corresponding titles\nabstracts_1 = dataset[\"abstract\"][:5_000]\nabstracts_2 = dataset[\"abstract\"][5_000:10_000]\nabstracts_3 = dataset[\"abstract\"][10_000:15_000]\n\n# Create topic models\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\ntopic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1)\ntopic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2)\ntopic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3)\n

        Then, we can combine all three models into one with .merge_models:

        # Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n

        When we inspect the first model, we can see it has 52 topics:

        >>> len(topic_model_1.get_topic_info())\n52\n

        Now, when we inspect the merged model, we can see that it has 57 topics:

        >>> len(merged_model.get_topic_info())\n57\n

        It seems that by merging these three models, there were 5 previously undiscovered topics that we could add to the very first model.

        Note

        Note that the models are merged sequentially. This means that the comparison starts with topic_model_1 and that each new topic from topic_model_2 and topic_model_3 will be added to topic_model_1.

        We can check the newly added topics in the merged_model by simply looking at the 5 most recently added topics. The order of the topics from topic_model_1 remains the same; all new topics are simply added after them.

        Let's inspect them:

        >>> merged_model.get_topic_info().tail(5)\n
        Topic  Count  Name                                      Representation                                                                                                             Representative_Docs\n51     47     50_activity_mobile_wearable_sensors       ['activity', 'mobile', 'wearable', 'sensors', 'falls', 'human', 'phone', 'recognition', 'activities', 'accelerometer']   nan\n52     48     25_music_musical_audio_chord              ['music', 'musical', 'audio', 'chord', 'and', 'we', 'to', 'that', 'of', 'for']                                           nan\n53     32     36_fairness_discrimination_fair_groups    ['fairness', 'discrimination', 'fair', 'groups', 'protected', 'decision', 'we', 'of', 'classifier', 'to']                nan\n54     30     38_traffic_driver_prediction_flow         ['traffic', 'driver', 'prediction', 'flow', 'trajectory', 'the', 'and', 'congestion', 'of', 'transportation']            nan\n55     22     50_spiking_neurons_networks_learning      ['spiking', 'neurons', 'networks', 'learning', 'neural', 'snn', 'dynamics', 'plasticity', 'snns', 'of']                  nan\n

        It seems that topics about activity, music, fairness, traffic, and spiking networks were added to the base topic model! There are two things you might have noticed. First, the representative documents were not added to the merged model. This is for privacy reasons: you might want to combine models that were trained on different stations, which allows for a degree of federated learning. Second, the names of the new topics contain topic IDs that refer to one of the old models. They were purposefully left this way so that you can identify which topics were newly added and inspect them in the original models.

        "},{"location":"getting_started/merge/merge.html#min_similarity","title":"min_similarity","text":"

        The way the models are merged is through comparison of their topic embeddings. If topics between models are similar enough, then they will be regarded as the same topics and the topic of the first model in the list will be chosen. However, if topics between models are dissimilar enough, then the topic of the latter model will be added to the former.

        This (dis)similarity can be tweaked using the min_similarity parameter. Increasing this value makes the matching between topics stricter and therefore increases the chance of adding new topics. In contrast, decreasing this value makes topics more likely to be matched and therefore decreases the chance of adding new topics. The value is set to 0.7 by default, so let's see what happens if we increase it to 0.9:

        # Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3], min_similarity=0.9)\n

        When we inspect the number of topics in our new model, we can see that they have increased quite a bit:

        >>> len(merged_model.get_topic_info())\n102\n

        This demonstrates the influence of min_similarity on the number of new topics that are added to the base model.

        "},{"location":"getting_started/multiaspect/multiaspect.html","title":"6C. Multiple Representations","text":"

        Over the course of BERTopic's development, support for many different types of representations has been added, from keywords and phrases to summaries and custom labels. There is a variety of techniques one can choose from to represent a topic and, as such, a number of interesting and creative ways to summarize topics. A topic is more than just a single representation.

        Therefore, multi-aspect topic modeling is introduced! During the .fit or .fit_transform stages, you can now get multiple representations of a single topic. In practice, it works by generating and storing all kinds of different topic representations (see image below).

        The approach is rather straightforward. We might want to represent our topics using a PartOfSpeech representation model but we might also want to try out KeyBERTInspired and compare those representation models. We can do this as follows:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic.representation import PartOfSpeech\nfrom bertopic.representation import MaximalMarginalRelevance\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Documents to train on\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# The main representation of a topic\nmain_representation = KeyBERTInspired()\n\n# Additional ways of representing a topic\naspect_model1 = PartOfSpeech(\"en_core_web_sm\")\naspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]\n\n# Add all models together to be run in a single `fit`\nrepresentation_model = {\n   \"Main\": main_representation,\n   \"Aspect1\":  aspect_model1,\n   \"Aspect2\":  aspect_model2 \n}\ntopic_model = BERTopic(representation_model=representation_model).fit(docs)\n

        As shown above, to perform multi-aspect topic modeling, we make sure that representation_model is a dictionary in which each representation model pipeline is defined. The main pipeline, which is used in most visualization options, is defined with the \"Main\" key. All other aspects can be named however you want. In the example above, the two additional aspects that we are interested in are defined as \"Aspect1\" and \"Aspect2\".

        After we have fitted our model, we can access all representations with topic_model.get_topic_info():

        As you can see, there are a number of different representations for our topics that we can inspect. All aspects are found in topic_model.topic_aspects_.
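
        As a rough sketch of what that access could look like (topic 0 and the aspect names follow the dictionary defined above and are used purely as examples):

        # The main representation of a topic\ntopic_model.get_topic(0)\n\n# One of the additional aspects for the same topic\ntopic_model.topic_aspects_[\"Aspect1\"][0]\n\n# Or view all representations at once in the overview dataframe\ntopic_model.get_topic_info()\n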

        "},{"location":"getting_started/multimodal/multimodal.html","title":"Multimodal Topic Modeling","text":"

        Text is often accompanied by imagery, and the other way around; think of social media images with captions or products with descriptions. Topic modeling has traditionally focused on creating topics from textual representations. However, as more multimodal representations are created, the need for multimodal topics increases.

        BERTopic can perform multimodal topic modeling in a number of ways during .fit and .fit_transform stages.

        "},{"location":"getting_started/multimodal/multimodal.html#text-images","title":"Text + Images","text":"

        The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that it is expected that each document has an image and vice versa. Instagram pictures, for example, almost always have some description attached to them.

        In this example, we are going to use images from flickr that each have a caption associated with them:

        # NOTE: This requires the `datasets` package which you can \n# install with `pip install datasets`\nfrom datasets import load_dataset\n\nds = load_dataset(\"maderix/flickr_bw_rgb\")\nimages = ds[\"train\"][\"image\"]\ndocs = ds[\"train\"][\"caption\"]\n

        The docs variable contains the captions for each image in images. We can now use these variables to run our multimodal example:

        Tip

        Do note that it is better to pass the paths of the images instead of the images themselves as there is no need to keep all images in memory. When passing the paths of the images, they are only opened temporarily when they are needed.

        from bertopic import BERTopic\nfrom bertopic.representation import VisualRepresentation\n\n# Additional ways of representing a topic\nvisual_model = VisualRepresentation()\n\n# Make sure to add the `visual_model` to a dictionary\nrepresentation_model = {\n   \"Visual_Aspect\":  visual_model,\n}\ntopic_model = BERTopic(representation_model=representation_model, verbose=True)\n
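
        Note that the snippet above only prepares the model. To actually cluster the captions together with their images, you would still fit it, roughly as sketched below (the same images argument is used in the images-only example later on):

        # Fit on the captions while passing the corresponding images\ntopics, probs = topic_model.fit_transform(docs, images=images)\n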

        In this example, we are clustering the documents and are then looking for the best matching images to the resulting clusters.

        We can now access our image representations for each topic with topic_model.topic_aspects_[\"Visual_Aspect\"]. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:

        import base64\nfrom io import BytesIO\nfrom IPython.display import HTML\nfrom PIL import Image\n\ndef get_thumbnail(path):\n    # Open an image from its path and shrink it to a small thumbnail\n    img = Image.open(path)\n    img.thumbnail((200, 200))\n    return img\n\ndef image_base64(im):\n    # Accept either an image path or a PIL image and return a base64-encoded JPEG\n    if isinstance(im, str):\n        im = get_thumbnail(im)\n    with BytesIO() as buffer:\n        im.save(buffer, 'jpeg')\n        return base64.b64encode(buffer.getvalue()).decode()\n\ndef image_formatter(im):\n    return f'<img src=\"data:image/jpeg;base64,{image_base64(im)}\">'\n\n# Extract dataframe without the columns we do not need here\ndf = topic_model.get_topic_info().drop(columns=[\"Representative_Docs\", \"Name\"])\n\n# Visualize the images\nHTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))\n

        Tip

        In the example above, we are clustering the documents but since you have images, you might want to cluster those or cluster an aggregation of both images and documents. For that, you can use the new MultiModalBackend to generate embeddings:

        from bertopic.backend import MultiModalBackend\nmodel = MultiModalBackend('clip-ViT-B-32', batch_size=32)\n\n# Embed documents only\ndoc_embeddings = model.embed_documents(docs)\n\n# Embedding images only\nimage_embeddings = model.embed_images(images)\n\n# Embed both images and documents, then average them\ndoc_image_embeddings = model.embed(docs, images)\n
        "},{"location":"getting_started/multimodal/multimodal.html#images-only","title":"Images Only","text":"

        Traditional topic modeling techniques can only be run on textual data, as is shown in the example above. However, there are plenty of cases where textual data is not available but images are. BERTopic allows topic modeling to be performed using only images as your input data.

        To run BERTopic on images only, we first need to embed our images and then define a model that converts images to text. To do so, we are going to need some images. We will take the same images as above, but this time save them locally and pass the paths to the images instead. As mentioned before, this makes sure that we do not hold too many images in memory whilst only a small subset is needed:

        import os\nimport glob\nimport zipfile\nimport numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nfrom sentence_transformers import util\n\n# Flickr 8k images\nimg_folder = 'photos/'\ncaps_folder = 'captions/'\nif not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:\n    os.makedirs(img_folder, exist_ok=True)\n\n    if not os.path.exists('Flickr8k_Dataset.zip'):   #Download dataset if does not exist\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip')\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip')\n\n    for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]:\n        with zipfile.ZipFile(file, 'r') as zf:\n            for member in tqdm(zf.infolist(), desc='Extracting'):\n                zf.extract(member, folder)\nimages = list(glob.glob('photos/Flicker8k_Dataset/*.jpg'))\n

        Next, we can run our pipeline:

        from bertopic.representation import KeyBERTInspired, VisualRepresentation\nfrom bertopic.backend import MultiModalBackend\n\n# Image embedding model\nembedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)\n\n# Image to text representation model\nrepresentation_model = {\n    \"Visual_Aspect\": VisualRepresentation(image_to_text_model=\"nlpconnect/vit-gpt2-image-captioning\")\n}\n

        Using these models, we can run our pipeline:

        from bertopic import BERTopic\n\n# Train our model with images only\ntopic_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model, min_topic_size=30)\ntopics, probs = topic_model.fit_transform(documents=None, images=images)\n

        We can now access our image representations for each topic with topic_model.topic_aspects_[\"Visual_Aspect\"]. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:

        import base64\nfrom io import BytesIO\nfrom IPython.display import HTML\nfrom PIL import Image\n\ndef get_thumbnail(path):\n    # Open an image from its path and shrink it to a small thumbnail\n    img = Image.open(path)\n    img.thumbnail((200, 200))\n    return img\n\ndef image_base64(im):\n    # Accept either an image path or a PIL image and return a base64-encoded JPEG\n    if isinstance(im, str):\n        im = get_thumbnail(im)\n    with BytesIO() as buffer:\n        im.save(buffer, 'jpeg')\n        return base64.b64encode(buffer.getvalue()).decode()\n\ndef image_formatter(im):\n    return f'<img src=\"data:image/jpeg;base64,{image_base64(im)}\">'\n\n# Extract dataframe without the columns we do not need here\ndf = topic_model.get_topic_info().drop(columns=[\"Representative_Docs\", \"Name\"])\n\n# Visualize the images\nHTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))\n

        "},{"location":"getting_started/online/online.html","title":"Online Topic Modeling","text":"

        Online topic modeling (sometimes called \"incremental topic modeling\") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a .partial_fit function, which is also used in BERTopic.

        Tip

        Another method for online topic modeling can be found with the .merge_models functionality of BERTopic. It allows for merging multiple BERTopic models to create a single new one. This method can be used to discover new topics by training a new model and exploring whether that new model added new topics to the original model when merging. A major benefit compared to .partial_fit is that you can keep using the original UMAP and HDBSCAN models, which tends to result in improved performance and gives you significantly more flexibility.

        In BERTopic, there are three main goals for using this technique.

        • To reduce the memory necessary for training a topic model.
        • To continuously update the topic model as new data comes in.
        • To continuously find new topics as new data comes in.

        In BERTopic, online topic modeling can be a bit tricky as there are several steps involved in which online learning needs to be made available. To recap, BERTopic consists of the following 6 steps:

        1. Extract embeddings
        2. Reduce dimensionality
        3. Cluster reduced embeddings
        4. Tokenize topics
        5. Extract topic words
        6. (Optional) Fine-tune topic words

        For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that are in less need of continuous updates. This means that we can use an embedding model like Sentence-Transformers for extracting the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If that tokenization is itself incremental, then so are steps 5 and 6.

        (Figure: SBERT embeddings -> IncrementalPCA dimensionality reduction -> MiniBatchKMeans clustering -> online CountVectorizer as an incremental bag-of-words -> c-TF-IDF topic representation.) Online variants of these steps in the main BERTopic pipeline are needed in order to enable incremental learning.

        This means that we will need online variants for steps 2 through 4. Steps 2 and 3, dimensionality reduction and clustering, can be modeled through Scikit-Learn's .partial_fit interface. In other words, BERTopic supports any algorithm that can be trained using .partial_fit, since these algorithms can be trained incrementally. For example, incremental dimensionality reduction can be achieved using Scikit-Learn's IncrementalPCA and incremental clustering with MiniBatchKMeans.

        Lastly, we need an online variant for step 4, tokenization. In this step, a bag-of-words representation is created through the CountVectorizer. However, as new data comes in, its vocabulary needs to be updated. For that purpose, bertopic.vectorizers.OnlineCountVectorizer was created; it not only adds out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. Most notably, the decay parameter is a value between 0 and 1 that weighs how much the frequencies in the previous bag-of-words matrix should be reduced. For example, a value of .1 will decrease the frequencies in the bag-of-words matrix by 10% at each iteration, making sure that recent data has more weight than previous iterations. Similarly, delete_min_df will remove words from the vocabulary if their frequency drops below a set value. This ties in with the decay parameter, as some words will decay over time if they are no longer used. For more information regarding the OnlineCountVectorizer, please see the vectorizers documentation.
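
        As a small sketch of those two parameters in action (the specific values are arbitrary examples, not recommendations):

        from bertopic.vectorizers import OnlineCountVectorizer\n\n# Decay 10% of the accumulated frequencies at every iteration and\n# drop words whose frequency falls below 5 from the vocabulary\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.1, delete_min_df=5)\n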

        "},{"location":"getting_started/online/online.html#example","title":"Example","text":"

        Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally.

        from sklearn.datasets import fetch_20newsgroups\n\n# Prepare documents\nall_docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ndoc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)]\n

        Here, we created chunks of 1000 documents to be fed into BERTopic. Then, we will need to define several sub-models that support online learning. Specifically, we are going to be using IncrementalPCA, MiniBatchKMeans, and the OnlineCountVectorizer:

        from sklearn.cluster import MiniBatchKMeans\nfrom sklearn.decomposition import IncrementalPCA\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\n# Prepare sub-models that support online learning\numap_model = IncrementalPCA(n_components=5)\ncluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n

        After having defined our sub-models, we can start training our topic model incrementally by looping over our document chunks:

        from bertopic import BERTopic\n\ntopic_model = BERTopic(umap_model=umap_model,\n                       hdbscan_model=cluster_model,\n                       vectorizer_model=vectorizer_model)\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n

        And that is it! During each iteration, you can access the predicted topics through the .topics_ attribute.

        Note

        Do note that in BERTopic it is not possible to use .partial_fit after .fit, as the two work quite differently with respect to internally updating topics, frequencies, representations, etc.

        Tip

        You can use any other dimensionality reduction and clustering algorithm as long as it has a .partial_fit function. Moreover, you can use a dimensionality reduction algorithm that does not support .partial_fit but does have a .fit function: first train it on a large amount of data and then keep it fixed while you continuously add documents. The dimensionality reduction will not be updated, but it may be trained sufficiently to properly reduce the embeddings of the incoming documents.
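
        One way this could look is a tiny wrapper that exposes a no-op .partial_fit around an already-fitted reducer; this is a rough, illustrative sketch (the wrapper class, its name, and the random stand-in sample are assumptions, not part of BERTopic):

        import numpy as np\nfrom sklearn.decomposition import PCA\n\nclass PretrainedDimensionalityReduction:\n    \"\"\"Reuse a reducer that was fitted once; partial_fit is a no-op.\"\"\"\n\n    def __init__(self, fitted_model):\n        self.fitted_model = fitted_model\n\n    def partial_fit(self, X, y=None):\n        # The reducer itself is never updated\n        return self\n\n    def transform(self, X):\n        return self.fitted_model.transform(X)\n\n# Stand-in for a large, representative sample of document embeddings\nlarge_embedding_sample = np.random.rand(1000, 384)\n\n# Fit once on the large sample...\npca = PCA(n_components=5).fit(large_embedding_sample)\n\n# ...and reuse it for every future batch via BERTopic's `umap_model`\numap_model = PretrainedDimensionalityReduction(pca)\n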

        Warning

        Only the topics of the most recent batch of documents are tracked. If you want to use online topic modeling for low-memory use cases, it is advised to also update the .topics_ attribute as shown below; otherwise, variations such as hierarchical topic modeling will not work.

        # Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration\ntopics = []\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n    topics.extend(topic_model.topics_)\n\ntopic_model.topics_ = topics\n
        "},{"location":"getting_started/online/online.html#river","title":"River","text":"

        To continuously find new topics as they come in, we can use the package river. It contains several clustering models that can create new clusters as new data comes in. To make sure we can use their models, we first need to create a class that has a .partial_fit function and the option to extract labels through .labels_:

        from river import stream\nfrom river import cluster\n\nclass River:\n    def __init__(self, model):\n        self.model = model\n\n    def partial_fit(self, umap_embeddings):\n        for umap_embedding, _ in stream.iter_array(umap_embeddings):\n            self.model.learn_one(umap_embedding)\n\n        labels = []\n        for umap_embedding, _ in stream.iter_array(umap_embeddings):\n            label = self.model.predict_one(umap_embedding)\n            labels.append(label)\n\n        self.labels_ = labels\n        return self\n

        Then, we can choose any river.cluster model that we are interested in and pass it to the River class before using it in BERTopic:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer, ClassTfidfTransformer\n\n# Using DBSTREAM to detect new topics as they come in\ncluster_model = River(cluster.DBSTREAM())\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\")\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)\n\n# Prepare model\ntopic_model = BERTopic(\n    hdbscan_model=cluster_model,\n    vectorizer_model=vectorizer_model,\n    ctfidf_model=ctfidf_model,\n)\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html","title":"Outlier reduction","text":"

        When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created that do not fall within any of the created topics. These are labeled as -1. Depending on your use case, you might want to decrease the number of documents that are labeled as outliers. Fortunately, there are a number of strategies one might use to reduce the number of outliers after you have trained your BERTopic model.

        The main way to reduce your outliers in BERTopic is by using the .reduce_outliers function. To make it work without too much tweaking, you will only need to pass the docs and their corresponding topics. You can pass outlier and non-outlier documents together since it will only try to reduce outlier documents and label them to a non-outlier topic.

        The following is a minimal example:

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers\nnew_topics = topic_model.reduce_outliers(docs, topics)\n

        Note

        You can use the threshold parameter to select the minimum distance or similarity when matching outlier documents with non-outlier topics. This allows you to control how many outlier documents get assigned to non-outlier topics.
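
        For example, a brief sketch of raising that threshold (the value 0.3 is purely illustrative):

        # Only assign an outlier document to a topic if the match exceeds the threshold\nnew_topics = topic_model.reduce_outliers(docs, topics, threshold=0.3)\n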

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#strategies","title":"Strategies","text":"

        The default method for reducing outliers is by calculating the c-TF-IDF representations of outlier documents and assigning them to the best matching c-TF-IDF representations of non-outlier topics.

        However, there are a number of other strategies, which can be used either separately or in conjunction, that are worthwhile to explore:

        • Using the topic-document probabilities to assign topics
        • Using the topic-document distributions to assign topics
        • Using c-TF-IDF representations to assign topics
        • Using document and topic embeddings to assign topics
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#probabilities","title":"Probabilities","text":"

        This strategy uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with calculate_probabilities=True.

        from bertopic import BERTopic\n\n# Train your BERTopic model and calculate the document-topic probabilities\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `probabilities` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#topic-distributions","title":"Topic Distributions","text":"

        Use the topic distributions, as calculated with .approximate_distribution, to find the most frequent topic in each outlier document. You can use the distributions_params variable to tweak the parameters of .approximate_distribution.

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `distributions` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"distributions\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#c-tf-idf","title":"c-TF-IDF","text":"

        Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `c-tf-idf` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"c-tf-idf\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#embeddings","title":"Embeddings","text":"

        Using the embedding of each outlier document, find the best matching topic embedding using cosine similarity.

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `embeddings` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"embeddings\")\n

        Note

        If you have pre-calculated the document embeddings, you can pass them to speed up the outlier reduction process for the \"embeddings\" strategy, as this prevents re-calculating the document embeddings.
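
        Assuming the pre-computed embeddings can be passed along to .reduce_outliers (as the note above suggests), that could look roughly as follows, where embeddings are the document embeddings computed before fitting:

        # Re-use previously computed document embeddings instead of recalculating them\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"embeddings\", embeddings=embeddings)\n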

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#chain-strategies","title":"Chain Strategies","text":"

        Since the .reduce_outliers function does not internally update the topics, we can easily try out different strategies but also chain them together. You might want to do a first pass with the \"c-tf-idf\" strategy as it is quite fast. Then, we can perform the \"distributions\" strategy on the outliers that are left since this method is typically much slower:

        # Use the \"c-TF-IDF\" strategy with a threshold\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"c-tf-idf\", threshold=0.1)\n\n# Reduce all outliers that are left with the \"distributions\" strategy\nnew_topics = topic_model.reduce_outliers(docs, new_topics, strategy=\"distributions\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#update-topics","title":"Update Topics","text":"

        After generating our updated topics, we can feed them back into BERTopic in one of two ways. We can either update the topic representations themselves based on the documents that now belong to new topics or we can only update the topic frequency without updating the topic representations themselves.

        Warning

        In both cases, it is important to realize that updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason is that once you have assigned one -1 document to topic 1 and another -1 document to topic 2, it becomes unclear how the -1 topic should be mapped during such a merge: to topic 1 or to topic 2?

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#update-topic-representation","title":"Update Topic Representation","text":"

        When outlier documents are generated, they are not used when modeling the topic representations. These documents are completely ignored when finding good descriptions of topics. Thus, after having reduced the number of outliers in your topic model, you might want to update the topic representations with the documents that now belong to actual topics. To do so, we can make use of the .update_topics function:

        topic_model.update_topics(docs, topics=new_topics)\n

        As seen above, you only need to pass the documents on which the model was trained, along with the new topics that were generated using one of the four strategies above.

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#exploration","title":"Exploration","text":"

        When you are reducing the number of outliers, it might be worthwhile to iteratively visualize the results in order to get an intuitive understanding of the effect of the above four strategies. Making use of .visualize_documents, we can quickly iterate over the different strategies and view their effects. Here, we show an example of how to approach such a pipeline.

        First, we train our model:

        from umap import UMAP\nfrom bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Prepare data, extract embeddings, and prepare sub-models\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\nvectorizer_model = CountVectorizer(stop_words=\"english\")\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# We reduce our embeddings to 2D as it allows us to quickly iterate later on\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, \n                          min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n# Train our topic model\ntopic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, \n                       vectorizer_model=vectorizer_model, calculate_probabilities=True, nr_topics=40)\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n

        After having trained our model, let us take a look at the 2D representation of the generated topics:

        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, \n                                hide_document_hover=True, hide_annotations=True)\n

        Next, we reduce the number of outliers using the probabilities strategy:

        new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, \n                                          threshold=0.05, strategy=\"probabilities\")\ntopic_model.update_topics(docs, topics=new_topics)\n

        And finally, we visualize the results:

        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, \n                                hide_document_hover=True, hide_annotations=True)\n
        "},{"location":"getting_started/parameter%20tuning/parametertuning.html","title":"Hyperparameter Tuning","text":"

        Although BERTopic works quite well out of the box, there are a number of hyperparameters to tune according to your use case. This section focuses on important parameters directly accessible in BERTopic, as well as hyperparameter optimization in sub-models such as HDBSCAN and UMAP.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#bertopic","title":"BERTopic","text":"

        When instantiating BERTopic, there are several hyperparameters that you can directly adjust that could significantly improve the performance of your topic model. In this section, we will go through the most impactful parameters in BERTopic and directions on how to optimize them.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#language","title":"language","text":"

        The language parameter is used to simplify the selection of models for those who are not familiar with sentence-transformers models.

        In essence, there are two options to choose from:

        • language = \"english\" or
        • language = \"multilingual\"

        The English model is \"all-MiniLM-L6-v2\" and can be found here. It is the default model that is used in BERTopic and works great for English documents.

        The multilingual model is \"paraphrase-multilingual-MiniLM-L12-v2\", supports 50+ languages, and can be found here. The model is very similar to the base model but is trained on many languages and has a slightly different architecture.
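
        As a minimal sketch (assuming docs holds your, possibly non-English, documents), selecting the multilingual model is a single parameter change:

        from bertopic import BERTopic\n\n# Use the multilingual embedding model for non-English or mixed-language corpora\ntopic_model = BERTopic(language=\"multilingual\")\ntopics, probs = topic_model.fit_transform(docs)\n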

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#top_n_words","title":"top_n_words","text":"

        top_n_words refers to the number of words per topic that you want to be extracted. In practice, I would advise you to keep this value below 30 and preferably between 10 and 20. The reasoning for this is that the more words you put in a topic the less coherent it can become. The top words are the most representative of the topic and should be focused on.
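
        For instance, a minimal sketch of keeping the representations within that range:

        from bertopic import BERTopic\n\n# Limit each topic representation to its 10 most representative words\ntopic_model = BERTopic(top_n_words=10)\n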

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#n_gram_range","title":"n_gram_range","text":"

        The n_gram_range parameter refers to the CountVectorizer used when creating the topic representation. It relates to the number of words you want in your topic representation. For example, \"New\" and \"York\" are two separate words but are often used as \"New York\" which represents an n-gram of 2. Thus, the n_gram_range should be set to (1, 2) if you want \"New York\" in your topic representation.
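
        A minimal sketch of allowing both unigrams and bigrams so that a phrase like \"new york\" can show up in the representation:

        from bertopic import BERTopic\n\n# Allow both unigrams and bigrams in the topic representations\ntopic_model = BERTopic(n_gram_range=(1, 2))\n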

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#min_topic_size","title":"min_topic_size","text":"

        min_topic_size is an important parameter! It is used to specify what the minimum size of a topic can be. The lower this value the more topics are created. If you set this value too high, then it is possible that simply no topics will be created! Set this value too low and you will get many microclusters.

        It is advised to play around with this value depending on the size of your dataset. If it nears a million documents, then it is advised to set it much higher than the default of 10, for example, 100 or even 500.
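
        As an illustrative sketch for a large corpus (the value of 100 is an example, not a recommendation for every dataset):

        from bertopic import BERTopic\n\n# For large datasets, require at least 100 documents before something is considered a topic\ntopic_model = BERTopic(min_topic_size=100)\n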

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#nr_topics","title":"nr_topics","text":"

        nr_topics can be a tricky parameter. It specifies the number of topics the model will be reduced to after training. For example, if your topic model results in 100 topics but you have set nr_topics to 20, then the topic model will try to reduce the number of topics from 100 to 20.

        This reduction can take a while as each reduction in topics activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use \"auto\" to automatically reduce topics using HDBSCAN.
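
        Both options are sketched below: reducing to a fixed number of topics, or letting the reduction be handled automatically:

        from bertopic import BERTopic\n\n# Reduce the trained model to (roughly) 20 topics\ntopic_model = BERTopic(nr_topics=20)\n\n# Or reduce the number of topics automatically\ntopic_model = BERTopic(nr_topics=\"auto\")\n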

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#low_memory","title":"low_memory","text":"

        low_memory sets UMAP's low_memory to True to make sure that less memory is used in the computation. This slows down computation but allows UMAP to be run on low-memory machines.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#calculate_probabilities","title":"calculate_probabilities","text":"

        calculate_probabilities lets you calculate the probabilities of each topic in each document. This is computationally quite expensive and is turned off by default.
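
        A minimal sketch of enabling it (assuming docs is your list of documents); probs will then contain a topic probability distribution for each document rather than only the probability of its assigned topic:

        from bertopic import BERTopic\n\n# Calculate the probability of every topic for every document (computationally more expensive)\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs)\n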

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#umap","title":"UMAP","text":"

        UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of the document embeddings into something that HDBSCAN can more easily use to create good clusters.

        However, it does have a significant number of parameters that you could take into account. As exposing all of these parameters in BERTopic would be difficult to manage, we can instead instantiate our own UMAP model and pass it to BERTopic:

        from umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False)\ntopic_model = BERTopic(umap_model=umap_model).fit(docs)\n
        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#n_neighbors","title":"n_neighbors","text":"

        n_neighbors is the number of neighboring sample points used when making the manifold approximation. Increasing this value typically results in a more global view of the embedding structure whilst smaller values result in a more local view. Increasing this value often results in larger clusters being created.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#n_components","title":"n_components","text":"

        n_components refers to the dimensionality of the embeddings after reducing them. This is set as a default to 5 to reduce dimensionality as much as possible whilst trying to maximize the information kept in the resulting embeddings. Although lowering or increasing this value influences the quality of embeddings, its effect is largest on the performance of HDBSCAN. Increase this value too much and HDBSCAN will have a hard time clustering the high-dimensional embeddings; lower it too much and the resulting embeddings retain too little information to create proper clusters. If you want to increase this value, I would advise using a metric for HDBSCAN that works well with high-dimensional data.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#metric","title":"metric","text":"

        metric refers to the method used to compute the distances in high dimensional space. The default is cosine as we are dealing with high dimensional data. However, BERTopic is also able to use any input, even regular tabular data, to cluster the documents. Thus, you might want to change the metric to something that fits your use case.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#low_memory_1","title":"low_memory","text":"

        low_memory is used when datasets may consume a lot of memory. Using millions of documents can lead to memory issues and setting this value to True might alleviate some of the issues.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#hdbscan","title":"HDBSCAN","text":"

        After reducing the embeddings with UMAP, we use HDBSCAN to cluster our documents into clusters of similar documents. Similar to UMAP, HDBSCAN has many parameters that can be tweaked to improve the quality of the clusters.

        from hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model).fit(docs)\n
        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#min_cluster_size","title":"min_cluster_size","text":"

        min_cluster_size is arguably the most important parameter in HDBSCAN. It controls the minimum size of a cluster and thereby the number of clusters that will be generated. It is set to 10 as a default. Increasing this value results in fewer clusters but of larger size whereas decreasing this value results in more micro clusters being generated. Typically, I would advise increasing this value rather than decreasing it.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#min_samples","title":"min_samples","text":"

        min_samples is automatically set to min_cluster_size and controls the number of outliers generated. Setting this value significantly lower than min_cluster_size might help you reduce the amount of noise you will get. Do note that outliers are to be expected and forcing the output to have no outliers may not properly represent the data.
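
        As a sketch of decoupling the two parameters (the exact values are illustrative), we keep clusters reasonably large while being less aggressive in marking points as outliers:

        from hdbscan import HDBSCAN\nfrom bertopic import BERTopic\n\n# min_samples lower than min_cluster_size tends to reduce the number of outliers\nhdbscan_model = HDBSCAN(min_cluster_size=30, min_samples=5, metric='euclidean', prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\n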

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#metric_1","title":"metric","text":"

        metric, like with UMAP, is used to calculate the distances. Here, we went with euclidean as, after reducing the dimensionality, we have low dimensional data and not much optimization is necessary. However, if you increase n_components in UMAP, then it would be advised to look into metrics that work with high dimensional data.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#prediction_data","title":"prediction_data","text":"

        Make sure you always set this value to True as it is needed to predict new points later on. You can set this to False if you do not wish to predict any unseen data points.

        "},{"location":"getting_started/quickstart/quickstart.html","title":"Quick Start","text":""},{"location":"getting_started/quickstart/quickstart.html#installation","title":"Installation","text":"

        Installation, with sentence-transformers, can be done using PyPI:

        pip install bertopic\n

        You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:

        # Choose an embedding backend\npip install bertopic[flair, gensim, spacy, use]\n\n# Topic modeling with images\npip install bertopic[vision]\n
        "},{"location":"getting_started/quickstart/quickstart.html#quick-start","title":"Quick Start","text":"

        We start by extracting topics from the well-known 20 newsgroups dataset, which consists of English documents:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        After generating topics, we can access the frequent topics that were generated:

        >>> topic_model.get_topic_info()\n\nTopic   Count   Name\n-1      4630    -1_can_your_will_any\n0       693     49_windows_drive_dos_file\n1       466     32_jesus_bible_christian_faith\n2       441     2_space_launch_orbit_lunar\n3       381     22_key_encryption_keys_encrypted\n

        -1 refers to all outliers and should typically be ignored. Next, let's take a look at the most frequent topic that was generated, topic 0:

        >>> topic_model.get_topic(0)\n\n[('windows', 0.006152228076250982),\n ('drive', 0.004982897610645755),\n ('dos', 0.004845038866360651),\n ('file', 0.004140142872194834),\n ('disk', 0.004131678774810884),\n ('mac', 0.003624848635985097),\n ('memory', 0.0034840976976789903),\n ('software', 0.0034415334250699077),\n ('email', 0.0034239554442333257),\n ('pc', 0.003047105930670237)]\n

        Using .get_document_info, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.:

        >>> topic_model.get_document_info(docs)\n\nDocument                               Topic    Name                        Top_n_words                     Probability    ...\nI am sure some bashers of Pens...       0       0_game_team_games_season    game - team - games...          0.200010       ...\nMy brother is in the market for...      -1     -1_can_your_will_any         can - your - will...            0.420668       ...\nFinally you said what you dream...      -1     -1_can_your_will_any         can - your - will...            0.807259       ...\nThink! It is the SCSI card doing...     49     49_windows_drive_dos_file    windows - drive - docs...       0.071746       ...\n1) I have an old Jasmine drive...       49     49_windows_drive_dos_file    windows - drive - docs...       0.038983       ...\n

        Multilingual

        Use BERTopic(language=\"multilingual\") to select a model that supports 50+ languages.

        "},{"location":"getting_started/quickstart/quickstart.html#fine-tune-topic-representations","title":"Fine-tune Topic Representations","text":"

        In BERTopic, there are a number of different topic representations that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. A great start is KeyBERTInspired, which for many users increases the coherence and reduces stopwords from the resulting topic representations:

        from bertopic.representation import KeyBERTInspired\n\n# Fine-tune your topic representations\nrepresentation_model = KeyBERTInspired()\ntopic_model = BERTopic(representation_model=representation_model)\n

        However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more:

        import openai\nfrom bertopic.representation import OpenAI\n\n# Fine-tune topic representations with GPT\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True)\ntopic_model = BERTopic(representation_model=representation_model)\n

        Multi-aspect Topic Modeling

        Instead of iterating over all of these different topic representations, you can model them simultaneously with multi-aspect topic representations in BERTopic.

        "},{"location":"getting_started/quickstart/quickstart.html#visualizations","title":"Visualizations","text":"

        After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can use one of the many visualization options in BERTopic. For example, we can visualize the topics that were generated in a way very similar to LDAvis:

        topic_model.visualize_topics()\n
        "},{"location":"getting_started/quickstart/quickstart.html#saveload-bertopic-model","title":"Save/Load BERTopic model","text":"

        There are three methods for saving BERTopic:

        1. A light model with .safetensors and config files
        2. A light model with pytorch .bin and config files
        3. A full model with .pickle

        Method 3 allows for saving the entire topic model but has several drawbacks:

        • Arbitrary code can be run from .pickle files
        • The resulting model is rather large (often > 500MB) since all sub-models need to be saved
        • Explicit and specific version control is needed as they typically only run if the environment is exactly the same

        It is advised to use methods 1 or 2 for saving.

        These methods have a number of advantages:

        • .safetensors is a relatively safe format
        • The resulting model can be very small (often < 20MB) since no sub-models need to be saved
        • Although version control is important, there is a bit more flexibility with respect to specific versions of packages
        • More easily used in production
        • Share models with the HuggingFace Hub

        Tip

        For more detail about how to load in a custom vectorizer, representation model, and more, it is highly advised to check out the serialization page. It contains more examples, details, and some tips and tricks for loading and saving your environment.

        The methods are used as follows:

        topic_model = BERTopic().fit(my_docs)\n\n# Method 1 - safetensors\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 2 - pytorch\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"pytorch\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 3 - pickle\ntopic_model.save(\"my_model\", serialization=\"pickle\")\n

        To load a model:

        # Load from directory\nloaded_model = BERTopic.load(\"path/to/my/model_dir\")\n\n# Load from file\nloaded_model = BERTopic.load(\"my_model\")\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_Wikipedia\")\n

        Warning

        When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic should not be loaded in other versions.

        "},{"location":"getting_started/representation/llm.html","title":"6B. LLM & Generative AI","text":"

        As we have seen in the previous section, the topics that you get from BERTopic can be fine-tuned using a number of approaches. Here, we are going to focus on text generation Large Language Models such as ChatGPT, GPT-4, and open-source solutions.

        Using these techniques, we can further fine-tune topics to generate labels, summaries, poems of topics, and more. To do so, we first generate a set of keywords and documents that describe a topic best using BERTopic's c-TF-IDF calculation. Then, these candidate keywords and documents are passed to the text generation model, which is asked to generate output that fits the topic best.

        A huge benefit of this is that we can describe a topic with only a few documents and therefore do not need to pass all documents to the text generation model. Not only does this speed up the generation of topic labels significantly, you also do not need a massive number of credits when using an external API, such as Cohere or OpenAI.

        "},{"location":"getting_started/representation/llm.html#prompt-engineering","title":"Prompt Engineering","text":"

        In most of the examples below, we use certain tags to customize our prompts. There are currently two tags, namely \"[KEYWORDS]\" and \"[DOCUMENTS]\". These tags indicate where in the prompt they are to be replaced with a topic's keywords and its top 4 most representative documents, respectively. For example, if we have the following prompt:

        prompt = \"\"\"\nI have topic that contains the following documents: \\n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the above information, can you give a short label of the topic?\n\"\"\"\n

        then that will be rendered as follows:

        \"\"\"\nI have a topic that contains the following documents: \n- Our videos are also made possible by your support on patreon.co.\n- If you want to help us make more videos, you can do so on patreon.com or get one of our posters from our shop.\n- If you want to help us make more videos, you can do so there.\n- And if you want to support us in our endeavor to survive in the world of online video, and make more videos, you can do so on patreon.com.\n\nThe topic is described by the following keywords: videos video you our support want this us channel patreon make on we if facebook to patreoncom can for and more watch \n\nBased on the above information, can you give a short label of the topic?\n\"\"\"\n

        Tip 1

        You can access the default prompts of these models with representation_model.default_prompt_. The prompts that were generated after training can be accessed with topic_model.representation_model.prompts_.
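
        For example, a small sketch of inspecting both attributes, assuming representation_model is one of the LLM-based models on this page and docs is your corpus:

        # The default prompt template of the representation model\nprint(representation_model.default_prompt_)\n\n# After training, the prompts that were actually generated per topic\ntopic_model = BERTopic(representation_model=representation_model).fit(docs)\nprint(topic_model.representation_model.prompts_)\n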

        "},{"location":"getting_started/representation/llm.html#selecting-documents","title":"Selecting Documents","text":"

        By default, four of the most representative documents will be passed to [DOCUMENTS]. These documents are selected by calculating their similarity (through c-TF-IDF representations) with the main c-TF-IDF representation of the topics. The four best matching documents per topic are selected.

        To increase the number of documents passed to [DOCUMENTS], we can use the nr_docs parameter which is accessible in all LLMs on this page. Using this value allows you to select the top n most representative documents instead. If you have a long enough context length, then you could even give the LLM dozens of documents.

        However, some of these documents might be very similar to one another and might be near duplicates. They will not provide much additional information about the content of the topic. Instead, we can use the diversity parameter in each LLM to only select documents that are sufficiently diverse. It takes values between 0 and 1 but a value of 0.1 already does wonders!
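
        A sketch of combining both parameters, here with the OpenAI representation (the values are illustrative):

        import openai\nfrom bertopic.representation import OpenAI\n\n# Pass up to 10 representative documents per topic, but only if they are sufficiently diverse\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True, nr_docs=10, diversity=0.1)\n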

        "},{"location":"getting_started/representation/llm.html#truncating-documents","title":"Truncating Documents","text":"

        We can truncate the input documents in [DOCUMENTS] in order to reduce the number of tokens that we have in our input prompt. To do so, all text generation modules have two parameters that we can tweak:

        • doc_length
          • The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.
        • tokenizer
          • The tokenizer used to split the document into segments, which are then counted to determine the length of the document.
            • If tokenizer is 'char', then the document is split up into characters which are counted to adhere to doc_length
            • If tokenizer is 'whitespace', the document is split up into words separated by whitespaces. These words are counted and truncated depending on doc_length
            • If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on doc_length
            • If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on doc_length

        This means that the definition of doc_length changes depending on what constitutes a token in the tokenizer parameter. If a token is a character, then doc_length refers to max length in characters. If a token is a word, then doc_length refers to the max length in words.

        Let's illustrate this with an example. In the code below, we will use tiktoken to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated.

        We start by installing the relevant packages:

        pip install tiktoken openai\n

        Then, we use bertopic.representation.OpenAI to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. Since we will put 4 documents in the prompt, they will total roughly 400 tokens:

        import openai\nimport tiktoken\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Tokenizer\ntokenizer= tiktoken.encoding_for_model(\"gpt-3.5-turbo\")\n\n# Create your representation model\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(\n    client,\n    model=\"gpt-3.5-turbo\", \n    delay_in_seconds=2, \n    chat=True,\n    nr_docs=4,\n    doc_length=100,\n    tokenizer=tokenizer\n)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"getting_started/representation/llm.html#transformers","title":"\ud83e\udd17 Transformers","text":"

        Nearly every week, there are new and improved models released on the \ud83e\udd17 Model Hub that, with some creativity, allow for further fine-tuning of our c-TF-IDF based topics. These models range from text generation to zero-shot classification. In BERTopic, wrappers around these methods are created as a way to support whatever might be released in the future.

        Using a GPT-like model from the HuggingFace Hub is rather straightforward:

        from bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = TextGeneration('gpt2')\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        GPT-2, however, is not the most accurate model on the HuggingFace Hub. You can get much better results with a flan-T5-like model:

        from transformers import pipeline\nfrom bertopic.representation import TextGeneration\n\nprompt = \"I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?\"\n\n# Create your representation model\ngenerator = pipeline('text2text-generation', model='google/flan-t5-base')\nrepresentation_model = TextGeneration(generator)\n

        [Figure: the default c-TF-IDF keyword representations compared with the labels generated by the \ud83e\udd17 Transformers flan-T5 model, such as \"beef\", \"volcanoes\", \"immune system\", \"earth\", \"european union\", and \"cotton\".]

        As can be seen from the example above, if you would like to use a text2text-generation model, you will need to pass a transformers.pipeline with the \"text2text-generation\" task. Moreover, you can use a custom prompt and decide where the keywords should be inserted using the [KEYWORDS] tag, or the documents with the [DOCUMENTS] tag.

        "},{"location":"getting_started/representation/llm.html#zephyr-mistral-7b","title":"Zephyr (Mistral 7B)","text":"

        We can go a step further with open-source Large Language Models (LLMs) that have been shown to match the performance of closed-source LLMs like ChatGPT.

        In this example, we will show you how to use Zephyr, a fine-tuned version of Mistral 7B. Mistral 7B outperforms other open-source LLMs at a much smaller scale and is a worthwhile solution for use cases such as topic modeling. We want to keep inference as fast as possible and a relatively small model helps with that. Zephyr was trained on a mix of publicly available and synthetic datasets using Direct Preference Optimization (DPO).

        To use Zephyr in BERTopic, we will first need to install and update a couple of packages that can handle quantized versions of Zephyr:

        pip install ctransformers[cuda]\npip install --upgrade git+https://github.com/huggingface/transformers\n

        Instead of loading in the full model, we can instead load a quantized model which is a compressed version of the original model:

        from ctransformers import AutoModelForCausalLM\nfrom transformers import AutoTokenizer, pipeline\n\n# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.\nmodel = AutoModelForCausalLM.from_pretrained(\n    \"TheBloke/zephyr-7B-alpha-GGUF\",\n    model_file=\"zephyr-7b-alpha.Q4_K_M.gguf\",\n    model_type=\"mistral\",\n    gpu_layers=50,\n    hf=True\n)\ntokenizer = AutoTokenizer.from_pretrained(\"HuggingFaceH4/zephyr-7b-alpha\")\n\n# Pipeline\ngenerator = pipeline(\n    model=model, tokenizer=tokenizer,\n    task='text-generation',\n    max_new_tokens=50,\n    repetition_penalty=1.1\n)\n

        This Zephyr model requires a specific prompt template in order to work:

        prompt = \"\"\"<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>\n<|user|>\nI have a topic that contains the following documents:\n[DOCUMENTS]\n\nThe topic is described by the following keywords: '[KEYWORDS]'.\n\nBased on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>\n<|assistant|>\"\"\"\n

        After creating this prompt template, we can create our representation model to be used in BERTopic:

        from bertopic.representation import TextGeneration\n\n# Text generation with Zephyr\nzephyr = TextGeneration(generator, prompt=prompt)\nrepresentation_model = {\"Zephyr\": zephyr}\n\n# Topic Modeling\ntopic_model = BERTopic(representation_model=representation_model, verbose=True)\n
        "},{"location":"getting_started/representation/llm.html#llama-2","title":"Llama 2","text":"

        Full Llama 2 Tutorial:

        Open-source LLMs are starting to become more and more popular. Here, we will go through a minimal example of using Llama 2 together with BERTopic.

        First, we need to load in our Llama 2 model:

        from torch import bfloat16\nimport transformers\n\n# set quantization configuration to load large model with less GPU memory\n# this requires the `bitsandbytes` library\nbnb_config = transformers.BitsAndBytesConfig(\n    load_in_4bit=True,  # 4-bit quantization\n    bnb_4bit_quant_type='nf4',  # Normalized float 4\n    bnb_4bit_use_double_quant=True,  # Second quantization after the first\n    bnb_4bit_compute_dtype=bfloat16  # Computation type\n)\n\n# The Llama 2 model to load from the HuggingFace Hub\nmodel_id = 'meta-llama/Llama-2-13b-chat-hf'\n\n# Llama 2 Tokenizer\ntokenizer = transformers.AutoTokenizer.from_pretrained(model_id)\n\n# Llama 2 Model\nmodel = transformers.AutoModelForCausalLM.from_pretrained(\n    model_id,\n    trust_remote_code=True,\n    quantization_config=bnb_config,\n    device_map='auto',\n)\nmodel.eval()\n\n# Our text generator\ngenerator = transformers.pipeline(\n    model=model, tokenizer=tokenizer,\n    task='text-generation',\n    temperature=0.1,\n    max_new_tokens=500,\n    repetition_penalty=1.1\n)\n

        After doing so, we will need to define a prompt that works with both Llama 2 as well as BERTopic:

        # System prompt describes information given to all conversations\nsystem_prompt = \"\"\"\n<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant for labeling topics.\n<</SYS>>\n\"\"\"\n\n# Example prompt demonstrating the output we are looking for\nexample_prompt = \"\"\"\nI have a topic that contains the following documents:\n- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.\n- Meat, but especially beef, is the worst food in terms of emissions.\n- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.\n\nThe topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.\n\nBased on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.\n\n[/INST] Environmental impacts of eating meat\n\"\"\"\n\n# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags\nmain_prompt = \"\"\"\n[INST]\nI have a topic that contains the following documents:\n[DOCUMENTS]\n\nThe topic is described by the following keywords: '[KEYWORDS]'.\n\nBased on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.\n[/INST]\n\"\"\"\n\nprompt = system_prompt + example_prompt + main_prompt\n

        Three pieces of the prompt were created:

        • system_prompt helps us guide the model during a conversation. For example, we can say that it is a helpful assistant that is specialized in labeling topics.
        • example_prompt gives an example of a correctly labeled topic to guide Llama 2
        • main_prompt contains the main question we are going to ask it, namely to label a topic. Note that it uses the [DOCUMENTS] and [KEYWORDS] tags to provide the most relevant documents and keywords as additional context

        After having generated our prompt template, we can start running our topic model:

        from bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\n\n# Text generation with Llama 2\nllama2 = TextGeneration(generator, prompt=prompt)\nrepresentation_model = {\n    \"Llama2\": llama2,\n}\n\n# Create our BERTopic model\ntopic_model = BERTopic(representation_model=representation_model,  verbose=True)\n
        "},{"location":"getting_started/representation/llm.html#llamacpp","title":"llama.cpp","text":"

        An amazing framework for using LLMs for inference is llama.cpp, which has Python bindings that we can use in BERTopic. To start with, we first need to install llama-cpp-python:

        pip install llama-cpp-python\n

        or using the following for hardware acceleration:

        CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python\n

        Note

        There are a number of installation options depending on your hardware and OS. Make sure that you select the correct one to optimize your performance.

        After installation, you need to download your LLM locally before we use it in BERTopic, like so:

        wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf\n

        Finally, we can now use the model with BERTopic in just a couple of lines:

        from bertopic import BERTopic\nfrom bertopic.representation import LlamaCPP\n\n# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha\nrepresentation_model = LlamaCPP(\"zephyr-7b-alpha.Q4_K_M.gguf\")\n\n# Create our BERTopic model\ntopic_model = BERTopic(representation_model=representation_model,  verbose=True)\n

        If you want to have more control over the LLM's parameters, you can run it like so:

        from bertopic import BERTopic\nfrom bertopic.representation import LlamaCPP\nfrom llama_cpp import Llama\n\n# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha\nllm = Llama(model_path=\"zephyr-7b-alpha.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=4096, stop=\"Q:\")\nrepresentation_model = LlamaCPP(llm)\n\n# Create our BERTopic model\ntopic_model = BERTopic(representation_model=representation_model,  verbose=True)\n

        Note

        The default template that is being used uses a \"Q: ... A: ... \" type of structure which is why the stop is set at \"Q:\". The default template is:

        \"\"\"\nQ: I have a topic that contains the following documents: \n[DOCUMENTS]\n\nThe topic is described by the following keywords: '[KEYWORDS]'.\n\nBased on the above information, can you give a short label of the topic?\nA: \n\"\"\"\n

        "},{"location":"getting_started/representation/llm.html#openai","title":"OpenAI","text":"

        Instead of using a language model from \ud83e\udd17 transformers, we can use an external API that does the work for you. Here, we can use OpenAI to extract our topic labels from the candidate documents and keywords. To use this, you will need to install openai first:

        pip install openai\n

        Then, get yourself an API key and use OpenAI's API as follows:

        import openai\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Create your representation model\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(client)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        [Figure: the default c-TF-IDF keyword representations compared with the OpenAI-generated labels, such as \"Organic vs Conventional Food: Environmental and Health Considerations\" and \"Plastic Pollution and its environmental impact\".]

        You can also use a custom prompt:

        prompt = \"I have the following documents: [DOCUMENTS] \\nThese documents are about the following topic: '\"\nrepresentation_model = OpenAI(client, prompt=prompt)\n
        "},{"location":"getting_started/representation/llm.html#chatgpt","title":"ChatGPT","text":"

        Within OpenAI's API, the ChatGPT models use a different API structure compared to the GPT-3 models. In order to use ChatGPT with BERTopic, we need to define the model and make sure to enable chat:

        representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n

        Prompting with ChatGPT is very satisfying and is customizable as follows:

        prompt = \"\"\"\nI have a topic that contains the following documents: \n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n\"\"\"\n

        Note

        Whenever you create a custom prompt, it is important to add

        Based on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n
        at the end of your prompt as BERTopic extracts everything that comes after topic:. Having said that, if topic: is not in the output, then it will simply extract the entire response, so feel free to experiment with the prompts.
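
        Passing such a custom prompt then looks, for instance, as follows:

        representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True, prompt=prompt)\ntopic_model = BERTopic(representation_model=representation_model)\n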

        "},{"location":"getting_started/representation/llm.html#summarization","title":"Summarization","text":"

        Due to the structure of the prompts in OpenAI's chat models, we can extract different types of topic representations from their GPT models. Instead of extracting a topic label, we can ask the model to generate a short description of the topic:

        summarization_prompt = \"\"\"\nI have a topic that is described by the following keywords: [KEYWORDS]\nIn this topic, the following documents are a small but representative subset of all documents in the topic:\n[DOCUMENTS]\n\nBased on the information above, please give a description of this topic in the following format:\ntopic: <description>\n\"\"\"\n\nrepresentation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True, prompt=summarization_prompt, nr_docs=5, delay_in_seconds=3)\n

        The above is not constrained to creating a short description or summary of the topic; we can extract labels, keywords, poems, example documents, extensive descriptions, and more using this method! If you want to have multiple representations of a single topic, it might be worthwhile to also check out multi-aspect topic modeling with BERTopic.

        "},{"location":"getting_started/representation/llm.html#langchain","title":"LangChain","text":"

        Langchain is a package that helps users with chaining large language models. In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this external knowledge is the set of most representative documents in each topic.

        To use langchain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, like openai:

        pip install langchain openai\n

        Then, you can create your chain as follows:

        from langchain.chains.question_answering import load_qa_chain\nfrom langchain.llms import OpenAI\nchain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type=\"stuff\")\n

        Finally, you can pass the chain to BERTopic as follows:

        from bertopic.representation import LangChain\n\n# Create your representation model\nrepresentation_model = LangChain(chain)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can also use a custom prompt:

        prompt = \"What are these documents about? Please give a single label.\"\nrepresentation_model = LangChain(chain, prompt=prompt)\n

        Note

        The prompt does not make use of [KEYWORDS] and [DOCUMENTS] tags as the documents are already used within langchain's load_qa_chain.

        "},{"location":"getting_started/representation/llm.html#cohere","title":"Cohere","text":"

        Instead of using a language model from \ud83e\udd17 transformers, we can use an external API that does the work for you. Here, we can use Cohere to extract our topic labels from the candidate documents and keywords. To use this, you will need to install cohere first:

        pip install cohere\n

        Then, get yourself an API key and use Cohere's API as follows:

        import cohere\nfrom bertopic.representation import Cohere\nfrom bertopic import BERTopic\n\n# Create your representation model\nco = cohere.Client(my_api_key)\nrepresentation_model = Cohere(co)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        [Figure: the default c-TF-IDF keyword representations compared with the Cohere-generated labels, such as \"Organic food\", \"How your immune system works\", and \"Plastic pollution\".]

        You can also use a custom prompt:

        prompt = \"\"\"\nI have topic that contains the following documents: [DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS].\nBased on the above information, can you give a short label of the topic?\n\"\"\"\nrepresentation_model = Cohere(co, prompt=prompt)\n
        "},{"location":"getting_started/representation/representation.html","title":"6A. Representation Models","text":"

        One of the core components of BERTopic is its Bag-of-Words representation and weighting with c-TF-IDF. This method is fast and can quickly generate a number of keywords for a topic without depending on the clustering task. As a result, topics can easily and quickly be updated after training the model without the need to re-train it. Although these give good topic representations, we may want to further fine-tune the topic representations.

        As such, there are a number of representation models implemented in BERTopic that allow for further fine-tuning of the topic representations. These are optional and are not used by default. You are not restricted in how the representations can be fine-tuned, from GPT-like models to fast keyword extraction with KeyBERT-like models:

        For each model below, an example will be shown on how it may change or improve upon the default topic keywords that are generated. The dataset used in these examples can be found here.

        If you want to have multiple representations of a single topic, it might be worthwhile to also check out multi-aspect topic modeling with BERTopic.

        "},{"location":"getting_started/representation/representation.html#keybertinspired","title":"KeyBERTInspired","text":"

        After having generated our topics with c-TF-IDF, we might want to do some fine-tuning based on the semantic relationship between keywords/keyphrases and the set of documents in each topic. Although we can use a centroid-based technique for this, it can be costly and does not take the structure of a cluster into account. Instead, we leverage c-TF-IDF to create a set of representative documents per topic and use those as our updated topic embedding. Then, we calculate the similarity between candidate keywords and the topic embedding using the same embedding model that embedded the documents.

        [Diagram of the KeyBERTInspired procedure: extract the top n words per topic based on their c-TF-IDF scores, sample representative documents by comparing their c-TF-IDF with the topic c-TF-IDF, embed and average those documents, embed the candidate keywords, and compare the embedded keywords with the embedded documents.]

        Thus, the algorithm follows some principles of KeyBERT but does some optimization in order to speed up inference. Usage is straightforward:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        [Figure: the default c-TF-IDF keyword representations compared with the KeyBERT-Inspired representations, e.g. \"organic | meat | foods | crops | beef\" and \"plastics | plastic | pollution | microplastics\".]

        "},{"location":"getting_started/representation/representation.html#partofspeech","title":"PartOfSpeech","text":"

        Our candidate topics, as extracted with c-TF-IDF, do not take into account a keyword's part of speech as extracting noun-phrases from all documents can be computationally quite expensive. Instead, we can leverage c-TF-IDF to perform part of speech on a subset of keywords and documents that best represent a topic.

        [Diagram of the PartOfSpeech procedure: sort keywords by their c-TF-IDF value, extract candidate keywords, extract documents that contain at least one keyword, and use the POS matcher on those documents to generate new candidate keywords.]

        More specifically, we find documents that contain the keywords from our candidate topics as calculated with c-TF-IDF. These documents serve as the representative set of documents from which the Spacy model can extract a set of candidate keywords for each topic. These candidate keywords are first put through Spacy's POS module to see whether they match with the DEFAULT_PATTERNS:

        DEFAULT_PATTERNS = [\n            [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n            [{'POS': 'NOUN'}],\n            [{'POS': 'ADJ'}]\n]\n

        These patterns follow Spacy's Rule-Based Matching. Then, the resulting keywords are sorted by their respective c-TF-IDF values.

        from bertopic.representation import PartOfSpeech\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        [Figure: the default c-TF-IDF keyword representations compared with the PartOfSpeech representations, e.g. \"meat | organic | food | beef | emissions | most | health | pesticides\".]

        You can define custom POS patterns to be extracted:

        pos_patterns = [\n            [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n            [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]\n]\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\", pos_patterns=pos_patterns)\n
        "},{"location":"getting_started/representation/representation.html#maximalmarginalrelevance","title":"MaximalMarginalRelevance","text":"

        When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like \"car\" and \"cars\" essentially represent the same information and are often redundant.

        To decrease this redundancy and improve the diversity of keywords, we can use an algorithm called Maximal Marginal Relevance (MMR). MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that are relevant to the document yet diverse with respect to one another.

        from bertopic.representation import MaximalMarginalRelevance\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = MaximalMarginalRelevance(diversity=0.3)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        [Figure: the default c-TF-IDF keyword representations compared with the MaximalMarginalRelevance representations, e.g. \"meat | organic | beef | emissions | health | pesticides | foods | farming\".]

        "},{"location":"getting_started/representation/representation.html#zero-shot-classification","title":"Zero-Shot Classification","text":"

        For some use cases, you might already have a set of candidate labels that you would like to automatically assign to some of the topics. Although we can use guided or supervised BERTopic for that, we can also use zero-shot classification to assign labels to our topics. For that, we can make use of \ud83e\udd17 transformers and the models on its Model Hub.

        To perform this classification, we feed the model with the keywords as generated through c-TF-IDF and a set of candidate labels. If, for a certain topic, we find a similar enough label, then it is assigned. If not, then we keep the original c-TF-IDF keywords.

        We use it in BERTopic as follows:

        from bertopic.representation import ZeroShotClassification\nfrom bertopic import BERTopic\n\n# Create your representation model\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        [Figure: the default c-TF-IDF keyword representations compared with the ZeroShotClassification output, where matching topics receive labels such as \"Organic food\" and \"Your immune system\" while the remaining topics keep their original keywords.]

        "},{"location":"getting_started/representation/representation.html#chain-models","title":"Chain Models","text":"

        All of the above models can make use of the candidate topics, as generated by c-TF-IDF, to further fine-tune the topic representations. For example, MaximalMarginalRelevance takes the keywords in the candidate topics and re-ranks them. Similarly, the keywords in the candidate topic can be used as the input for GPT-prompts in OpenAI.

        Although the default candidate topics are generated by c-TF-IDF, what if we were to chain these models? For example, we can use MaximalMarginalRelevance to improve upon the keywords in each topic before passing them to OpenAI.

        This is supported in BERTopic by simply passing a list of representation models when instantiating the topic model:

        from bertopic.representation import MaximalMarginalRelevance, OpenAI\nfrom bertopic import BERTopic\nimport openai\n\n# Create your representation models\nclient = openai.OpenAI(api_key=\"sk-...\")\nopenai_generator = OpenAI(client)\nmmr = MaximalMarginalRelevance(diversity=0.3)\nrepresentation_models = [mmr, openai_generator]\n\n# Use the chained models\ntopic_model = BERTopic(representation_model=representation_models)\n
        "},{"location":"getting_started/representation/representation.html#custom-model","title":"Custom Model","text":"

        Although several representation models have been implemented in BERTopic, new technologies get released often and we should not have to wait until they get implemented in BERTopic. Therefore, you can create your own representation model and use that to fine-tune the topics.

        The following is the basic structure for creating your custom model. Note that it returns the same topics as those calculated with c-TF-IDF:

        from typing import List, Mapping, Tuple\nfrom bertopic.representation._base import BaseRepresentation\n\n\nclass CustomRepresentationModel(BaseRepresentation):\n    def extract_topics(self, topic_model, documents, c_tf_idf, topics\n                      ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: The BERTopic model\n            documents: A dataframe of documents with their related topics\n            c_tf_idf: The c-TF-IDF matrix\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        updated_topics = topics.copy()\n        return updated_topics\n

        Then, we can use that model as follows:

        from bertopic import BERTopic\n\n# Create our custom representation model\nrepresentation_model = CustomRepresentationModel()\n\n# Pass our custom representation model to BERTopic\ntopic_model = BERTopic(representation_model=representation_model)\n

        There are a few things to take into account when creating your custom model:

        • It needs to have the exact same parameter input: topic_model, documents, c_tf_idf, topics.
        • Make sure that updated_topics has the exact same structure as topics:
        updated_topics = {\n    \"1\": [(\"space\", 0.9), (\"nasa\", 0.7)], \n    \"2\": [(\"science\", 0.66), (\"article\", 0.6)]\n}\n

        Tip

        You can change the __init__ however you want; it does not influence the underlying structure. This also means that you can save data/embeddings/representations/sentiment in your custom representation model.
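
        For instance, a minimal sketch of a hypothetical custom model that stores extra state in its __init__ while keeping the required extract_topics signature intact:

        from typing import List, Mapping, Tuple\nfrom bertopic.representation._base import BaseRepresentation\n\n\nclass UppercaseRepresentation(BaseRepresentation):\n    def __init__(self, top_n: int = 10):\n        # Any custom state, data, or embeddings can be stored here\n        self.top_n = top_n\n\n    def extract_topics(self, topic_model, documents, c_tf_idf, topics\n                      ) -> Mapping[str, List[Tuple[str, float]]]:\n        # Keep only the top_n words per topic and uppercase them\n        updated_topics = {topic: [(word.upper(), value) for word, value in words[:self.top_n]]\n                          for topic, words in topics.items()}\n        return updated_topics\n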

        "},{"location":"getting_started/search/search.html","title":"Search Topics","text":"

        After having created a BERTopic model, you might end up with over a hundred topics. Searching through those can be quite cumbersome, especially if you are searching for a specific topic. Fortunately, BERTopic allows you to search for topics using search terms. First, let's create and train a BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Create topics\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        After having trained our model, we can use find_topics to search for topics that are similar to an input search_term. Here, we are going to search for topics that closely relate to the search term \"motor\". Then, we extract the most similar topic and check the results:

        >>> similar_topics, similarity = topic_model.find_topics(\"motor\", top_n=5)\n>>> topic_model.get_topic(similar_topics[0])\n[('bike', 0.02275997701645559),\n ('motorcycle', 0.011391202866080292),\n ('bikes', 0.00981187573649205),\n ('dod', 0.009614623748226669),\n ('honda', 0.008247663662558535),\n ('ride', 0.0064683227888861945),\n ('harley', 0.006355502638631013),\n ('riding', 0.005766601561614182),\n ('motorcycles', 0.005596372493714447),\n ('advice', 0.005534544418830091)]\n

        It definitely seems that a topic was found that closely matches \"motor\". The topic seems to be motorcycle related and therefore matches our \"motor\" input. You can use the similarity variable to see how similar the extracted topics are to the search term.
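For example, a quick way to inspect all five matches together with their similarity scores (a small sketch re-using the variables from above):

for topic_id, score in zip(similar_topics, similarity):
    top_words = [word for word, _ in topic_model.get_topic(topic_id)][:5]
    print(f"Topic {topic_id} (similarity {score:.2f}): {top_words}")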

        Note

        You can only use this method if an embedding model was supplied to BERTopic using embedding_model.

        "},{"location":"getting_started/seed_words/seed_words.html","title":"Seed Words","text":"

When performing topic modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain-specific abbreviations, slang, short forms, acronyms, etc. For example, the \"TNM\" classification is a method for identifying the stage of most cancers. The word \"TNM\" is an abbreviation and might not be correctly captured in generic embedding models.

To make sure that certain domain-specific words are weighted higher and are more often used in topic representations, you can set any number of seed_words in the bertopic.vectorizers.ClassTfidfTransformer. The ClassTfidfTransformer is the base representation of BERTopic and essentially represents each topic as a bag of words. As such, we can choose to increase the importance of certain words, such as \"TNM\".

        To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like \"agent\" and \"robot\" should be important in such a topic were it to be found. Using the ClassTfidfTransformer, we can define those seed_words and also choose by how much their values are multiplied.

        The full example is then as follows:

from umap import UMAP\nfrom datasets import load_dataset\nfrom bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\n# Let's take a subset of ArXiv abstracts as the training data\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\nabstracts = dataset[\"abstract\"][:5_000]\n\n# For illustration purposes, we make sure the output is fixed when running this code multiple times\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n\n# We can choose any number of seed words for which we want their representation\n# to be strengthened. We increase the importance of these words as we want them to be more\n# likely to end up in the topic representations.\nctfidf_model = ClassTfidfTransformer(\n    seed_words=[\"agent\", \"robot\", \"behavior\", \"policies\", \"environment\"], \n    seed_multiplier=2\n)\n\n# We run the topic model with the seeded words\ntopic_model = BERTopic(\n    umap_model=umap_model,\n    min_topic_size=15,\n    ctfidf_model=ctfidf_model,\n).fit(abstracts)\n

        Then, when we run topic_model.get_topic(0), we get the following output:

        [('policy', 0.023413102511982354),\n ('reinforcement', 0.021796126795834238),\n ('agent', 0.021131601305431902),\n ('policies', 0.01888385271486409),\n ('environment', 0.017819874593917057),\n ('learning', 0.015321710504308708),\n ('robot', 0.013881115279230468),\n ('control', 0.013297705894983875),\n ('the', 0.013247933839985382),\n ('to', 0.013058208312484141)]\n

As we can see, the output includes some of the seed words that we assigned. However, if a word is not found to be important in a topic, then multiplying its importance will still leave it relatively unimportant. This is a great feature, as it allows you to boost the importance of domain-specific words with little risk of making words important in topics where they really should not be.

A benefit of this method is that it often influences all other representation methods, like KeyBERTInspired and OpenAI. The reason for this is that each representation model uses the words generated by the ClassTfidfTransformer as candidate words to be further optimized. In many cases, words like \"TNM\" might not end up in the candidate words. By increasing their importance, they are more likely to end up as candidate words in representation models.

Another benefit of using this method is that it increases the interpretability of topics for domain experts. Some words might be statistically important, yet mean little to a domain expert. For them, certain words, like \"TNM\", are highly descriptive, and that is difficult to capture with any generic method (embedding model, large language model, etc.).

Moreover, these seed_words can be defined together with the domain expert, as they can decide what type of words are generally important and might need a nudge from you, the algorithmic developer.

        "},{"location":"getting_started/semisupervised/semisupervised.html","title":"Semi-supervised Topic Modeling","text":"

        In BERTopic, you have several options to nudge the creation of topics toward certain pre-specified topics. Here, we will be looking at semi-supervised topic modeling with BERTopic.

        Semi-supervised modeling allows us to steer the dimensionality reduction of the embeddings into a space that closely follows any labels you might already have.

[Pipeline diagram: SBERT embeddings → dimensionality reduction with UMAP (guided by labels) → clustering with HDBSCAN → topic representation with c-TF-IDF]

        In other words, we use a semi-supervised UMAP instance to reduce the dimensionality of embeddings before clustering the documents with HDBSCAN.

        First, let us prepare the data needed for our topic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\ncategories = data[\"target\"]\ncategory_names = data[\"target_names\"]\n

We are using the popular 20 Newsgroups dataset, which contains roughly 18,000 newsgroup posts, each assigned to one of 20 categories. Using this dataset, we can try to extract its corresponding topic model whilst taking its underlying categories into account. These categories are stored in the categories variable above.

        Each document can be put into one of the following categories:

        >>> category_names\n\n['alt.atheism',\n 'comp.graphics',\n 'comp.os.ms-windows.misc',\n 'comp.sys.ibm.pc.hardware',\n 'comp.sys.mac.hardware',\n 'comp.windows.x',\n 'misc.forsale',\n 'rec.autos',\n 'rec.motorcycles',\n 'rec.sport.baseball',\n 'rec.sport.hockey',\n 'sci.crypt',\n 'sci.electronics',\n 'sci.med',\n 'sci.space',\n 'soc.religion.christian',\n 'talk.politics.guns',\n 'talk.politics.mideast',\n 'talk.politics.misc',\n 'talk.religion.misc'] \n

        To perform this semi-supervised approach, we can take in some pre-defined topics and simply pass those to the y parameter when fitting BERTopic. These labels can be pre-defined topics or simply documents that you feel belong together regardless of their content. BERTopic will nudge the creation of topics toward these categories using the pre-defined labels.

        To perform supervised topic modeling, we simply use all categories:

        topic_model = BERTopic(verbose=True).fit(docs, y=categories)\n

        The topic model will be much more attuned to the categories that were defined previously. However, this does not mean that only topics for these categories will be found. BERTopic is likely to find more specific topics in those you have already defined. This allows you to discover previously unknown topics!

        "},{"location":"getting_started/semisupervised/semisupervised.html#partial-labels","title":"Partial labels","text":"

        At times, you might only have labels for a subset of documents. Fortunately, we can still use those labels to at least nudge the documents for which those labels exist. The documents for which we do not have labels are assigned a -1. For this example, imagine we only have the labels of categories that are related to computers and we want to create a topic model using semi-supervised modeling:

        labels_to_add = ['comp.graphics', 'comp.os.ms-windows.misc',\n              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n              'comp.windows.x',]\nindices = [category_names.index(label) for label in labels_to_add]\ny = [label if label in indices else -1 for label in categories]\n

        The y variable contains many -1 values since we do not know all the categories.

Next, we use those newly constructed labels to again train BERTopic in a semi-supervised manner:

        topic_model = BERTopic(verbose=True).fit(docs, y=y)\n

        And that is it! By defining certain classes for our documents, we can steer the topic modeling towards modeling the pre-defined categories.

        "},{"location":"getting_started/serialization/serialization.html","title":"Serialization","text":"

        Saving, loading, and sharing a BERTopic model can be done in several ways. It is generally advised to go with .safetensors as that allows for a small, safe, and fast method for saving your BERTopic model. However, other formats, such as .pickle and pytorch .bin are also possible.

        "},{"location":"getting_started/serialization/serialization.html#saving","title":"Saving","text":"

        There are three methods for saving BERTopic:

        1. A light model with .safetensors and config files
        2. A light model with pytorch .bin and config files
        3. A full model with .pickle

        Tip

It is advised to use methods 1 or 2 for saving as they generate very small models. Especially method 1 (safetensors) allows for a relatively safe format compared to the other methods.

        The methods are used as follows:

        topic_model = BERTopic().fit(my_docs)\n\n# Method 1 - safetensors\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 2 - pytorch\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"pytorch\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 3 - pickle\ntopic_model.save(\"my_model\", serialization=\"pickle\")\n

        Warning

        When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic are not guaranteed to load in other versions.

        "},{"location":"getting_started/serialization/serialization.html#pickle-drawbacks","title":"Pickle Drawbacks","text":"

        Saving the model with pickle allows for saving the entire topic model, including dimensionality reduction and clustering algorithms, but has several drawbacks:

        • Arbitrary code can be run from .pickle files
        • The resulting model is rather large (often > 500MB) since all sub-models need to be saved
        • Explicit and specific version control is needed as they typically only run if the environment is exactly the same
        "},{"location":"getting_started/serialization/serialization.html#safetensors-and-pytorch-advantages","title":"Safetensors and Pytorch Advantages","text":"

Saving the topic model with .safetensors or pytorch has a number of advantages:

        • .safetensors is a relatively safe format
• The resulting model can be very small (often < 20MB) since no sub-models need to be saved
        • Although version control is important, there is a bit more flexibility with respect to specific versions of packages
        • More easily used in production
        • Share models with the HuggingFace Hub

The above image, showing a model trained on 100,000 documents, demonstrates the difference in size between safetensors, pytorch, and pickle. The difference can mostly be explained by the efficient saving procedure and by the fact that the clustering and dimensionality reduction models are not saved in safetensors/pytorch, since inference can be done based on the topic embeddings.

        "},{"location":"getting_started/serialization/serialization.html#huggingface-hub","title":"HuggingFace Hub","text":"

When you have created a BERTopic model, you can easily share it with others through the HuggingFace Hub. First, you need to log in to your HuggingFace account, which you can do in a number of ways:

        • Log in to your Hugging Face account with the command below
        huggingface-cli login\n\n# or using an environment variable\nhuggingface-cli login --token $HUGGINGFACE_TOKEN\n
        • Alternatively, you can programmatically login using login() in a notebook or a script
        from huggingface_hub import login\nlogin()\n
• Or you can pass a token directly with the token parameter, as sketched below
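For example, a minimal sketch of logging in with a token directly (the "hf_..." value below is a placeholder for your own access token):

from huggingface_hub import login

# "hf_..." is a placeholder; use your own personal access token
login(token="hf_...")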

        When you have logged in to your HuggingFace account, you can save and upload the model as follows:

        from bertopic import BERTopic\n\n# Train model\ntopic_model = BERTopic().fit(my_docs)\n\n# Push to HuggingFace Hub\ntopic_model.push_to_hf_hub(\n    repo_id=\"MaartenGr/BERTopic_ArXiv\",\n    save_ctfidf=True\n)\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_ArXiv\")\n
        "},{"location":"getting_started/serialization/serialization.html#parameters","title":"Parameters","text":"

There are a number of parameters that may be worthwhile to know:

        • private
          • Whether to create a private repository
        • serialization
          • The type of serialization. Either safetensors or pytorch. Make sure to run pip install safetensors for safetensors.
        • save_embedding_model
          • A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. E.g., sentence-transformers/all-MiniLM-L6-v2
        • save_ctfidf
          • Whether to save c-TF-IDF information
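As a sketch of how these parameters can be combined (the repo_id below is only an example name):

topic_model.push_to_hf_hub(
    repo_id="my-username/BERTopic_example",  # example repository name
    private=True,
    serialization="safetensors",
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    save_ctfidf=True
)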
        "},{"location":"getting_started/serialization/serialization.html#loading","title":"Loading","text":"

        To load a model:

        # Load from directory\nloaded_model = BERTopic.load(\"path/to/my/model_dir\")\n\n# Load from file\nloaded_model = BERTopic.load(\"my_model\")\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_Wikipedia\")\n

        The embedding model cannot always be saved using a non-pickle method if, for example, you are using OpenAI embeddings. Instead, you can load them in as follows:

        # Define embedding model\nimport openai\nfrom bertopic.backend import OpenAIBackend\n\nclient = openai.OpenAI(api_key=\"sk-...\")\nembedding_model = OpenAIBackend(client, \"text-embedding-ada-002\")\n\n# Load model and add embedding model\nloaded_model = BERTopic.load(\"path/to/my/model_dir\", embedding_model=embedding_model)\n
        "},{"location":"getting_started/supervised/supervised.html","title":"Supervised Topic Modeling","text":"

        Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used 20 NewsGroups dataset is already split up into 20 classes. Similarly, you might already have created some labels yourself through packages like human-learn, bulk, thisnotthat or something entirely different.

        Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic and try to learn the relationship between those topics and the input documents.

        In other words, we are going to be performing classification instead!

We can view this as a supervised topic modeling approach: instead of using a clustering algorithm, we are going to use a classification algorithm.

        Generally, we have the following pipeline:

[Pipeline diagram: SBERT embeddings → dimensionality reduction with UMAP → clustering with HDBSCAN → topic representation with c-TF-IDF]

        Instead, we are now going to skip over the dimensionality reduction step and replace the clustering step with a classification model:

[Pipeline diagram: SBERT embeddings → classifier (e.g., Logistic Regression) → topic representation with c-TF-IDF]

        In other words, we can pass our labels to BERTopic and it will not only learn how to predict labels for new instances, but it also transforms those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer.

        To do so, we need to skip over the dimensionality reduction step and replace the clustering step with a classification algorithm. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels:

        from sklearn.datasets import fetch_20newsgroups\n\n# Get labeled data\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n

        Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\nfrom bertopic.dimensionality import BaseDimensionalityReduction\nfrom sklearn.linear_model import LogisticRegression\n\n# Get labeled data\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n\n# Skip over dimensionality reduction, replace cluster model with classifier,\n# and reduce frequent words while we are at it.\nempty_dimensionality_model = BaseDimensionalityReduction()\nclf = LogisticRegression()\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\n\n# Create a fully supervised BERTopic instance\ntopic_model= BERTopic(\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=clf,\n        ctfidf_model=ctfidf_model\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n

        Let's take a look at a few topics that we get out of training this way by running topic_model.get_topic_info():

Topic  Count  Name
0      999    0_game_hockey_team_25
1      997    1_god_church_jesus_christ
2      996    2_bike_dod_ride_bikes
3      994    3_baseball_game_he_year
4      991    4_key_encryption_db_clipper
5      990    5_car_cars_engine_ford
6      990    6_medical_patients_cancer_disease
7      988    7_window_server_widget_motif
8      988    8_space_launch_nasa_orbit

        We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship:

        # Map input `y` to topics\nmappings = topic_model.topic_mapper_.get_mappings()\nmappings = {value: data[\"target_names\"][key] for key, value in mappings.items()}\n\n# Assign original classes to our topics\ndf = topic_model.get_topic_info()\ndf[\"Class\"] = df.Topic.map(mappings)\ndf\n
Topic  Count  Name                               Class
0      999    0_game_hockey_team_25              rec.sport.hockey
1      997    1_god_church_jesus_christ          soc.religion.christian
2      996    2_bike_dod_ride_bikes              rec.motorcycles
3      994    3_baseball_game_he_year            rec.sport.baseball
4      991    4_key_encryption_db_clipper        sci.crypt
5      990    5_car_cars_engine_ford             rec.autos
6      990    6_medical_patients_cancer_disease  sci.med
7      988    7_window_server_widget_motif       comp.windows.x
8      988    8_space_launch_nasa_orbit          sci.space

        We can see that the c-TF-IDF representations extract the words that give a good representation of our input classes. This is all done directly from the labeling. A welcome side-effect is that we now have a classification algorithm that allows us to predict the topics of unseen data:

        >>> topic, _ = topic_model.transform(\"this is a document about cars\")\n>>> topic_model.get_topic(topic)\n[('car', 0.4407600315538472),\n ('cars', 0.32348015696446325),\n ('engine', 0.28032518444946686),\n ('ford', 0.2500224508115155),\n ('oil', 0.2325984913598611),\n ('dealer', 0.2310723968585826),\n ('my', 0.22045777551991935),\n ('it', 0.21327993649430219),\n ('tires', 0.20420842634292657),\n ('brake', 0.20246902481367085)]\n

        Moreover, we can still perform BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc.

        Note

The resulting topics may use a different numbering than the y labels. To map y to the topics, we can run the following:

        mappings = topic_model.topic_mapper_.get_mappings()\ny_mapped = [mappings[val] for val in y]\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html","title":"Tips & Tricks","text":""},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#document-length","title":"Document length","text":"

As a default, we are using sentence-transformers to embed our documents. However, as the name implies, the embedding model works best for either sentences or paragraphs. This means that whenever you have a set of documents where each document contains several paragraphs, the document is truncated and the topic model is only trained on a small part of the data.

One way to solve this issue is by splitting up longer documents into either sentences or paragraphs before embedding them. Another solution is to approximate the topic distributions of documents after having trained your topic model.
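The latter can be done with .approximate_distribution after training; a minimal sketch:

# Approximate the topic distribution per (long) document after training
topic_distr, _ = topic_model.approximate_distribution(docs)

# topic_distr[i] contains the topic distribution of document i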

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#removing-stop-words","title":"Removing stop words","text":"

        At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context in order to create accurate embeddings.

Instead, we can use the CountVectorizer to preprocess our documents after having generated embeddings and clustered our documents. Personally, I have found almost no disadvantages to using the CountVectorizer to remove stopwords, and it is something I would strongly advise trying out:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

We can also use the ClassTfidfTransformer to reduce the impact of frequent words. The end result is very similar to explicitly removing stopwords, but it happens automatically:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n

        Lastly, we can use a KeyBERT-Inspired model to reduce the appearance of stop words. This also often improves the topic representation:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#diversify-topic-representation","title":"Diversify topic representation","text":"

After having calculated our top n words per topic, there might be many words that essentially mean the same thing. As a little bonus, we can use bertopic.representation.MaximalMarginalRelevance in BERTopic to diversify the words in each topic such that we limit the number of duplicate words we find in each topic. This is done using an algorithm called Maximal Marginal Relevance, which compares word embeddings with the topic embedding.

        We do this by specifying a value between 0 and 1, with 0 being not at all diverse and 1 being completely diverse:

        from bertopic import BERTopic\nfrom bertopic.representation import MaximalMarginalRelevance\n\nrepresentation_model = MaximalMarginalRelevance(diversity=0.2)\ntopic_model = BERTopic(representation_model=representation_model)\n

        Since MMR is using word embeddings to diversify the topic representations, it is necessary to pass the embedding model to BERTopic if you are using pre-computed embeddings:

        from bertopic import BERTopic\nfrom bertopic.representation import MaximalMarginalRelevance\nfrom sentence_transformers import SentenceTransformer\n\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\nrepresentation_model = MaximalMarginalRelevance(diversity=0.2)\ntopic_model = BERTopic(embedding_model=sentence_model, representation_model=representation_model)\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#topic-term-matrix","title":"Topic-term matrix","text":"

        Although BERTopic focuses on clustering our documents, the end result does contain a topic-term matrix. This topic-term matrix is calculated using c-TF-IDF, a TF-IDF procedure optimized for class-based analyses.

        To extract the topic-term matrix (or c-TF-IDF matrix) with the corresponding words, we can simply do the following:

        topic_term_matrix = topic_model.c_tf_idf_\nwords = topic_model.vectorizer_model.get_feature_names()\n
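If you prefer to inspect it as a table, you could, for example, put the (sparse) matrix into a pandas DataFrame; a small sketch:

import pandas as pd

# Each row corresponds to a topic, each column to a word in the vocabulary
df = pd.DataFrame(topic_term_matrix.toarray(), columns=words)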
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#pre-compute-embeddings","title":"Pre-compute embeddings","text":"

        Typically, we want to iterate fast over different versions of our BERTopic model whilst we are trying to optimize it to a specific use case. To speed up this process, we can pre-compute the embeddings, save them, and pass them to BERTopic so it does not need to calculate the embeddings each time:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train our topic model using our pre-trained sentence-transformers embeddings\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#speed-up-umap","title":"Speed up UMAP","text":"

At times, UMAP may take a while to fit on the embeddings that you have. This often happens when you have embeddings of millions of documents that you want to reduce in dimensionality. There is a trick that can speed up this process somewhat: initializing UMAP with rescaled PCA embeddings.

Without going into too much detail (look here for more information), you can reduce the embeddings using PCA and use that as a starting point. This can speed up the dimensionality reduction a bit:

        import numpy as np\nfrom umap import UMAP\nfrom bertopic import BERTopic\nfrom sklearn.decomposition import PCA\n\n\ndef rescale(x, inplace=False):\n    \"\"\" Rescale an embedding so optimization will not have convergence issues.\n    \"\"\"\n    if not inplace:\n        x = np.array(x, copy=True)\n\n    x /= np.std(x[:, 0]) * 10000\n\n    return x\n\n\n# Initialize and rescale PCA embeddings\npca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))\n\n# Start UMAP from PCA embeddings\numap_model = UMAP(\n    n_neighbors=15,\n    n_components=5,\n    min_dist=0.0,\n    metric=\"cosine\",\n    init=pca_embeddings,\n)\n\n# Pass the model to BERTopic:\ntopic_model = BERTopic(umap_model=umap_model)\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#gpu-acceleration","title":"GPU acceleration","text":"

        You can use cuML to speed up both UMAP and HDBSCAN through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\nfrom cuml.manifold import UMAP\n\n# Create instances of GPU-accelerated UMAP and HDBSCAN\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\n\n# Pass the above models to be used in BERTopic\ntopic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Depending on the embeddings you are using, you might want to normalize them first in order to force a cosine-related distance metric in UMAP:

        from cuml.preprocessing import normalize\nembeddings = normalize(embeddings)\n

        Note

        As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., .transform) using cuML's HDBSCAN. However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., .fit and .fit_transform).

        Note

        If you want to install cuML together with BERTopic using Google Colab, you can run the following code:

        !pip install bertopic\n!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#lightweight-installation","title":"Lightweight installation","text":"

        The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely \"all-MiniLM-L6-v2\". Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires pytorch which often results in a rather large environment, memory-wise.

        Fortunately, it is possible to install BERTopic without sentence-transformers and use it as a lightweight solution instead. The installation can be done as follows:

        pip install --no-deps bertopic\npip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml\n

Then, we can use BERTopic without sentence-transformers by relying on a CPU-based embedding technique:

from bertopic import BERTopic\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\npipe = make_pipeline(\n    TfidfVectorizer(),\n    TruncatedSVD(100)\n)\n\ntopic_model = BERTopic(embedding_model=pipe)\n

        As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary!

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#wordcloud","title":"WordCloud","text":"

        To minimize the number of dependencies in BERTopic, it is not possible to generate wordclouds out-of-the-box. However, there is a minimal script that you can use to generate wordclouds in BERTopic. First, you will need to install the wordcloud package with pip install wordcloud. Then, run the following code to generate the wordcloud for a specific topic:

        from wordcloud import WordCloud\nimport matplotlib.pyplot as plt\n\ndef create_wordcloud(model, topic):\n    text = {word: value for word, value in model.get_topic(topic)}\n    wc = WordCloud(background_color=\"white\", max_words=1000)\n    wc.generate_from_frequencies(text)\n    plt.imshow(wc, interpolation=\"bilinear\")\n    plt.axis(\"off\")\n    plt.show()\n\n# Show wordcloud\ncreate_wordcloud(topic_model, topic=1)\n

        Tip

        To increase the number of words shown in the wordcloud, you can increase the top_n_words parameter when instantiating BERTopic. You can also increase the number of words in a topic after training the model using .update_topics().
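For example, a minimal sketch of increasing the number of words per topic after training:

topic_model.update_topics(docs, top_n_words=50)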

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#finding-similar-topics-between-models","title":"Finding similar topics between models","text":"

Whenever you have trained separate BERTopic models on different datasets, it might be worthwhile to find the similarities among these models. Is there overlap between topics in model A and topics in model B? In other words, can we find topics in model A that are similar to those in model B?

        We can compare the topic representations of several models in two ways. First, by comparing the topic embeddings that are created when using the same embedding model across both fitted BERTopic instances. Second, we can compare the c-TF-IDF representations instead assuming we have fixed the vocabulary in both instances.

        This example will go into the former, using the same embedding model across two BERTopic instances. To do this comparison, let's first create an example where I trained two models, one on an English dataset and one on a Dutch dataset:

from datasets import load_dataset\nfrom bertopic import BERTopic\nfrom sentence_transformers import SentenceTransformer\nfrom umap import UMAP\n\n# The same embedding model needs to be used for both topic models\n# and since we are dealing with multiple languages, the model needs to be multi-lingual\nsentence_model = SentenceTransformer(\"paraphrase-multilingual-MiniLM-L12-v2\")\n\n# To make this example reproducible\numap_model = UMAP(n_neighbors=15, n_components=5, \n                  min_dist=0.0, metric='cosine', random_state=42)\n\n# English\nen_dataset = load_dataset(\"stsb_multi_mt\", name=\"en\", split=\"train\").to_pandas().sentence1.tolist()\nen_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model)\nen_model.fit(en_dataset)\n\n# Dutch\nnl_dataset = load_dataset(\"stsb_multi_mt\", name=\"nl\", split=\"train\").to_pandas().sentence1.tolist()\nnl_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model)\nnl_model.fit(nl_dataset)\n

        In the code above, there is one important thing to note and that is the sentence_model. This model needs to be exactly the same in all BERTopic models, otherwise, it is not possible to compare topic models.

        Next, we can calculate the similarity between topics in the English topic model en_model and the Dutch model nl_model. To do so, we can simply calculate the cosine similarity between the topic_embedding of both models:

        from sklearn.metrics.pairwise import cosine_similarity\nsim_matrix = cosine_similarity(en_model.topic_embeddings_, nl_model.topic_embeddings_)\n

        Now that we know which topics are similar to each other, we can extract the most similar topics. Let's say that we have topic 10 in the en_model which represents a topic related to trains:

        >>> topic = 10\n>>> en_model.get_topic(topic)\n[('train', 0.2588080580844999),\n ('tracks', 0.1392140438801078),\n ('station', 0.12126454635946024),\n ('passenger', 0.058057876475695866),\n ('engine', 0.05123717127783682),\n ('railroad', 0.048142847325312044),\n ('waiting', 0.04098973702226946),\n ('track', 0.03978248702913929),\n ('subway', 0.03834661195748458),\n ('steam', 0.03834661195748458)]\n

        To find the matching topic, we extract the most similar topic in the sim_matrix:

        >>> most_similar_topic = np.argmax(sim_matrix[topic + 1])-1\n>>> nl_model.get_topic(most_similar_topic)\n[('trein', 0.24186603209316418),\n ('spoor', 0.1338118418551581),\n ('sporen', 0.07683661859111401),\n ('station', 0.056990389779394225),\n ('stoommachine', 0.04905829711711234),\n ('zilveren', 0.04083879598477808),\n ('treinen', 0.03534099197032758),\n ('treinsporen', 0.03534099197032758),\n ('staat', 0.03481332997324445),\n ('zwarte', 0.03179591746822408)]\n

It seems to be working: for example, trein is a translation of train and sporen a translation of tracks! You can do this for every single topic to find out which topic in the en_model corresponds to which topic in the nl_model.
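As a rough sketch, you could loop over all English topics and look up their most similar Dutch topic (the +1/-1 offset accounts for the outlier topic -1):

import numpy as np

for topic in sorted(en_model.get_topics().keys()):
    most_similar = np.argmax(sim_matrix[topic + 1]) - 1
    print(f"EN topic {topic} -> NL topic {most_similar}")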

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#multimodal-data","title":"Multimodal data","text":"

        Concept is a variation of BERTopic for multimodal data, such as images with captions. Although we can use that package for multimodal data, we can perform a small trick with BERTopic to have a similar feature.

        BERTopic is a relatively modular approach that attempts to isolate steps from one another. This means, for example, that you can use k-Means instead of HDBSCAN or PCA instead of UMAP as it does not make any assumptions with respect to the nature of the clustering.

Similarly, you can pass pre-calculated embeddings to BERTopic that represent the documents that you have. However, it does not make any assumption with respect to the relationship between those embeddings and the documents. This means that we could pass any metadata to BERTopic to cluster on instead of document embeddings. In this example, we can separate our embeddings from our documents so that the embeddings are generated from images instead of their corresponding captions. Thus, we will cluster image embeddings but create the topic representation from the related captions.

        In this example, we first need to fetch our data, namely the Flickr 8k dataset that contains images with captions:

        import os\nimport glob\nimport zipfile\nimport numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nfrom PIL import Image\nfrom sentence_transformers import SentenceTransformer, util\n\n# Flickr 8k images\nimg_folder = 'photos/'\ncaps_folder = 'captions/'\nif not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:\n    os.makedirs(img_folder, exist_ok=True)\n\n    if not os.path.exists('Flickr8k_Dataset.zip'):   #Download dataset if does not exist\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip')\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip')\n\n    for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]:\n        with zipfile.ZipFile(file, 'r') as zf:\n            for member in tqdm(zf.infolist(), desc='Extracting'):\n                zf.extract(member, folder)\nimages = list(glob.glob('photos/Flicker8k_Dataset/*.jpg'))\n\n# Prepare dataframe\ncaptions = pd.read_csv(\"captions/Flickr8k.lemma.token.txt\",sep='\\t',names=[\"img_id\",\"img_caption\"])\ncaptions.img_id = captions.apply(lambda row: \"photos/Flicker8k_Dataset/\" + row.img_id.split(\".jpg\")[0] + \".jpg\", 1)\ncaptions = captions.groupby([\"img_id\"])[\"img_caption\"].apply(','.join).reset_index()\ncaptions = pd.merge(captions, pd.Series(images, name=\"img_id\"), on=\"img_id\")\n\n# Extract images together with their documents/captions\nimages = captions.img_id.to_list()\ndocs = captions.img_caption.to_list()\n

        Now that we have our images and captions, we need to generate our image embeddings:

        model = SentenceTransformer('clip-ViT-B-32')\n\n# Prepare images\nbatch_size = 32\nnr_iterations = int(np.ceil(len(images) / batch_size))\n\n# Embed images per batch\nembeddings = []\nfor i in tqdm(range(nr_iterations)):\n    start_index = i * batch_size\n    end_index = (i * batch_size) + batch_size\n\n    images_to_embed = [Image.open(filepath) for filepath in images[start_index:end_index]]\n    img_emb = model.encode(images_to_embed, show_progress_bar=False)\n    embeddings.extend(img_emb.tolist())\n\n    # Close images\n    for image in images_to_embed:\n        image.close()\nembeddings = np.array(embeddings)\n

        Finally, we can fit BERTopic the way we are used to, with documents and embeddings:

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\ntopics, probs = topic_model.fit_transform(docs, embeddings)\ncaptions[\"Topic\"] = topics\n

        After fitting our model, let's inspect a topic about skateboarders:

        >>> topic_model.get_topic(2)\n[('skateboard', 0.09592033177340711),\n ('skateboarder', 0.07792520092546491),\n ('trick', 0.07481578896400298),\n ('ramp', 0.056952605147927216),\n ('skate', 0.03745127816149923),\n ('perform', 0.036546213623432654),\n ('bicycle', 0.03453483070441857),\n ('bike', 0.033233021253898994),\n ('jump', 0.026709362981948037),\n ('air', 0.025422798170830936)]\n

        Based on the above output, we can take an image to see if the representation makes sense:

        image = captions.loc[captions.Topic == 2, \"img_id\"].values.tolist()[0]\nImage.open(image)\n

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#keybert-bertopic","title":"KeyBERT & BERTopic","text":"

Although BERTopic focuses on topic extraction methods that do not assume specific structures for the generated clusters, it is possible to do this on a more local level. More specifically, we can use KeyBERT to generate a number of keywords for each document and then build a vocabulary on top of that as the input for BERTopic. This way, we can select words that we know have meaning to a topic, without focusing on the centroid of that cluster. This also allows more frequent words to pop up regardless of the structure and density of a cluster.

        To do this, we first need to run KeyBERT on our data and create our vocabulary:

        from sklearn.datasets import fetch_20newsgroups\nfrom keybert import KeyBERT\n\n# Prepare documents \ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# Extract keywords\nkw_model = KeyBERT()\nkeywords = kw_model.extract_keywords(docs)\n\n# Create our vocabulary\nvocabulary = [k[0] for keyword in keywords for k in keyword]\nvocabulary = list(set(vocabulary))\n

        Then, we pass our vocabulary to BERTopic and train the model:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model= CountVectorizer(vocabulary=vocabulary)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\ntopics, probs = topic_model.fit_transform(docs)\n
        "},{"location":"getting_started/topicreduction/topicreduction.html","title":"Topic Reduction","text":"

BERTopic uses HDBSCAN for clustering the data, which does not allow you to specify the number of clusters you would want. To a certain extent, this is an advantage, as we can trust HDBSCAN to be better at finding the number of clusters than we are. Instead, we can try to reduce the number of topics that have been created. Below, you will find three methods of doing so.

        "},{"location":"getting_started/topicreduction/topicreduction.html#manual-topic-reduction","title":"Manual Topic Reduction","text":"

        Each resulting topic has its feature vector constructed from c-TF-IDF. Using those feature vectors, we can find the most similar topics and merge them. If we do this iteratively, starting from the least frequent topic, we can reduce the number of topics quite easily. We do this until we reach the value of nr_topics:

        from bertopic import BERTopic\ntopic_model = BERTopic(nr_topics=20)\n

        It is also possible to manually select certain topics that you believe should be merged. For example, if topic 1 is 1_space_launch_moon_nasa and topic 2 is 2_spacecraft_solar_space_orbit it might make sense to merge those two topics:

        topics_to_merge = [1, 2]\ntopic_model.merge_topics(docs, topics_to_merge)\n

        If you have several groups of topics you want to merge, create a list of lists instead:

topics_to_merge = [[1, 2],\n                   [3, 4]]\ntopic_model.merge_topics(docs, topics_to_merge)\n
        "},{"location":"getting_started/topicreduction/topicreduction.html#automatic-topic-reduction","title":"Automatic Topic Reduction","text":"

        One issue with the approach above is that it will merge topics regardless of whether they are very similar. They are simply the most similar out of all options. This can be resolved by reducing the number of topics automatically. To do this, we can use HDBSCAN to cluster our topics using each c-TF-IDF representation. Then, we merge topics that are clustered together. Another benefit of HDBSCAN is that it generates outliers. These outliers prevent topics from being merged if no other topics are similar.

        To use this option, we simply set nr_topics to \"auto\":

        from bertopic import BERTopic\ntopic_model = BERTopic(nr_topics=\"auto\")\n
        "},{"location":"getting_started/topicreduction/topicreduction.html#topic-reduction-after-training","title":"Topic Reduction after Training","text":"

Finally, we can also reduce the number of topics after having trained a BERTopic model. The advantage of doing so is that you can decide the number of topics after knowing how many are created. It is difficult to predict, before training your model, how many topics are in your documents and how many will be extracted. Instead, we can decide afterward how many topics seem realistic:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Create topics -> Typically over 50 topics\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Further reduce topics\ntopic_model.reduce_topics(docs, nr_topics=30)\n\n# Access updated topics\ntopics = topic_model.topics_\n

        The reasoning for putting docs as a parameter is that the documents are not saved within BERTopic on purpose. If you were to have a million documents, it is very inefficient to save those in BERTopic instead of a dedicated database.

        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html","title":"Update Topic Representations","text":"

        The topics that are extracted from BERTopic are represented by words. These words are extracted from the documents occupying their topics using a class-based TF-IDF. This allows us to extract words that are interesting to a topic but less so to another.

        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html#update-topic-representation-after-training","title":"Update Topic Representation after Training","text":"

        When you have trained a model and viewed the topics and the words that represent them, you might not be satisfied with the representation. Perhaps you forgot to remove stop_words or you want to try out a different n_gram_range. We can use the function update_topics to update the topic representation with new parameters for c-TF-IDF:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Create topics\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic(n_gram_range=(2, 3))\ntopics, probs = topic_model.fit_transform(docs)\n

        From the model created above, one of the most frequent topics is the following:

        >>> topic_model.get_topic(31)[:10]\n[('clipper chip', 0.007240771542316232),\n ('key escrow', 0.004601603973377443),\n ('law enforcement', 0.004277247929596332),\n ('intercon com', 0.0035961920238955824),\n ('amanda walker', 0.003474856425297157),\n ('serial number', 0.0029876119137150358),\n ('com amanda', 0.002789303096817983),\n ('intercon com amanda', 0.0027386688593327084),\n ('amanda intercon', 0.002585262048515583),\n ('amanda intercon com', 0.002585262048515583)]\n

Although there does seem to be some relation between the words, it is difficult, at least for me, to intuitively understand what the topic is about. Instead, let's simplify the topic representation by setting n_gram_range to (1, 3) to also allow for single words.

        >>> topic_model.update_topics(docs, n_gram_range=(1, 3))\n>>> topic_model.get_topic(31)[:10]\n[('encryption', 0.008021846079148017),\n ('clipper', 0.00789642647602742),\n ('chip', 0.00637127942464045),\n ('key', 0.006363124787175884),\n ('escrow', 0.005030980365244285),\n ('clipper chip', 0.0048271268437973395),\n ('keys', 0.0043245812747907545),\n ('crypto', 0.004311198708675516),\n ('intercon', 0.0038772934659295076),\n ('amanda', 0.003516026493904586)]\n

To me, the combination of the words above seems a bit more intuitive than the words we previously had! You can play around with n_gram_range or use your own custom sklearn.feature_extraction.text.CountVectorizer and pass that instead:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(stop_words=\"english\", ngram_range=(1, 5))\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        Tip!

        If you want to change the topics to something else, whether that is merging them or removing outliers, you can pass a custom list of topics to update them: topic_model.update_topics(docs, topics=my_updated_topics)

        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html#custom-labels","title":"Custom labels","text":"

The topic labels are currently automatically generated by taking the top 3 words and combining them using the _ separator. Although this is an informative label, in practice it is definitely not the prettiest nor necessarily the most accurate label. For example, although the topic label 1_space_nasa_orbit is informative, we would prefer a somewhat more intuitive label, such as space travel. The difficulty with creating such topic labels is that much of the interpretation is left to the user. Would space travel be more accurate, or perhaps space exploration? To truly understand which labels are most suited, going into some of the documents in topics is especially helpful.

        Although we can go through every single topic ourselves and try to label them, we can start by creating an overview of labels that have the length and number of words that we are looking for. To do so, we can generate our list of topic labels with .generate_topic_labels and define the number of words, the separator, word length, etc:

        topic_labels = topic_model.generate_topic_labels(nr_words=3,\n                                                 topic_prefix=False,\n                                                 word_length=10,\n                                                 separator=\", \")\n

        Tip

        If you created multiple topic representations or aspects, you can choose one of these aspects with aspect=\"Aspect1\" or whatever you named the aspect.

        In the above example, 1_space_nasa_orbit would turn into space, nasa, orbit since we selected 3 words, no topic prefix, and the , separator. We can then either change our topic_labels to whatever we want or directly pass them to .set_topic_labels so that they can be used across most visualization functions:

        topic_model.set_topic_labels(topic_labels)\n

        It is also possible to only change a few topic labels at a time by passing a dictionary where the key represents the topic ID and the value is the topic label:

        topic_model.set_topic_labels({1: \"Space Travel\", 7: \"Religion\"})\n

        Then, to make use of those custom topic labels across visualizations, such as .visualize_hierarchy(), we can use the custom_labels=True parameter that is found in most visualizations.

        fig = topic_model.visualize_barchart(custom_labels=True)\n
        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html#optimize-labels","title":"Optimize labels","text":"

The great advantage of passing custom labels to BERTopic is that when more accurate zero-shot models are released, we can simply use those on top of BERTopic to further fine-tune the labeling. For example, let's say you have a set of potential topic labels that you want to use instead of the ones generated by BERTopic. You could use the bart-large-mnli model to find which user-defined labels best represent the BERTopic-generated labels:

        from transformers import pipeline\nclassifier = pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")\n\n# A selected topic representation\n# 'god jesus atheists atheism belief atheist believe exist beliefs existence'\nsequence_to_classify =  \" \".join([word for word, _ in topic_model.get_topic(1)])\n\n# Our set of potential topic labels\ncandidate_labels = ['cooking', 'dancing', 'religion']\nclassifier(sequence_to_classify, candidate_labels)\n\n#{'labels': ['cooking', 'dancing', 'religion'],\n# 'scores': [0.086, 0.063, 0.850],\n# 'sequence': 'god jesus atheists atheism belief atheist believe exist beliefs existence'}\n
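You could then, for example, keep the highest-scoring candidate per topic and pass those to .set_topic_labels; a sketch re-using classifier and candidate_labels from above:

zeroshot_labels = {}
for topic in topic_model.get_topics():
    if topic == -1:  # skip the outlier topic
        continue
    sequence = " ".join([word for word, _ in topic_model.get_topic(topic)])
    result = classifier(sequence, candidate_labels)
    zeroshot_labels[topic] = result["labels"][0]  # labels are sorted by score

topic_model.set_topic_labels(zeroshot_labels)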
        "},{"location":"getting_started/topicsovertime/topicsovertime.html","title":"Dynamic Topic Modeling","text":"

        Dynamic topic modeling (DTM) is a collection of techniques aimed at analyzing the evolution of topics over time. These methods allow you to understand how a topic is represented across different times. For example, in 1995 people may talk differently about environmental awareness than those in 2015. Although the topic itself remains the same, environmental awareness, the exact representation of that topic might differ.

BERTopic allows for DTM by calculating the topic representation at each timestep without the need to run the entire model several times. To do this, we first need to fit BERTopic as if there were no temporal aspect in the data. Thus, a general topic model will be created. We use this global representation as the main set of topics, which are most likely found across different timesteps. For each topic and timestep, we calculate the c-TF-IDF representation. This results in a specific topic representation at each timestep without the need to create clusters from embeddings, as they were already created.

[Diagram: documents are first split by topic, then by topic and timestep; the pre-fitted c-TF-IDF is applied to each subset of documents. Optionally, the representation at each timestep t is tuned by averaging it with the global representation (global tuning) or with the representation at t-1 (evolutionary tuning).]

        Next, there are two main ways to further fine-tune these specific topic representations, namely globally and evolutionary.

        A topic representation at timestep t can be fine-tuned globally by averaging its c-TF-IDF representation with that of the global representation. This allows each topic representation to move slightly towards the global representation whilst still keeping some of its specific words.

        A topic representation at timestep t can be fine-tuned evolutionary by averaging its c-TF-IDF representation with that of the c-TF-IDF representation at timestep t-1. This is done for each topic representation allowing for the representations to evolve over time.

        Both fine-tuning methods are set to True as a default and allow for interesting representations to be created.
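Conceptually, both tunings boil down to averaging two c-TF-IDF vectors; a rough numpy sketch with illustrative values (these variables are not BERTopic internals):

import numpy as np

# Illustrative c-TF-IDF vectors for a single topic
global_c_tf_idf = np.array([0.4, 0.1, 0.0, 0.2])   # global representation
c_tf_idf_prev   = np.array([0.3, 0.0, 0.1, 0.2])   # representation at t-1
c_tf_idf_t      = np.array([0.1, 0.3, 0.2, 0.0])   # representation at t

globally_tuned  = (c_tf_idf_t + global_c_tf_idf) / 2   # global tuning
evolution_tuned = (c_tf_idf_t + c_tf_idf_prev) / 2     # evolutionary tuning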

        "},{"location":"getting_started/topicsovertime/topicsovertime.html#example","title":"Example","text":"

        To demonstrate DTM in BERTopic, we first need to prepare our data. A good example of where DTM is useful is topic modeling on Twitter data. We can analyze how certain people have talked about certain topics in the years they have been on Twitter. Due to the controversial nature of his tweets, we are going to be using all tweets by Donald Trump.

        First, we need to load the data and do some very basic cleaning. For example, I am not interested in his re-tweets for this use-case:

        import re\nimport pandas as pd\n\n# Prepare data\ntrump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')\ntrump.text = trump.apply(lambda row: re.sub(r\"http\\S+\", \"\", row.text).lower(), 1)\ntrump.text = trump.apply(lambda row: \" \".join(filter(lambda x:x[0]!=\"@\", row.text.split())), 1)\ntrump.text = trump.apply(lambda row: \" \".join(re.sub(\"[^a-zA-Z]+\", \" \", row.text).split()), 1)\ntrump = trump.loc[(trump.isRetweet == \"f\") & (trump.text != \"\"), :]\ntimestamps = trump.date.to_list()\ntweets = trump.text.to_list()\n

        Then, we need to extract the global topic representations by simply creating and training a BERTopic model:

        from bertopic import BERTopic\n\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(tweets)\n

        From this global model, we generate the topic representation at each timestamp for each topic. We do this by simply calling topics_over_time and passing in the tweets and their corresponding timestamps:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20)\n

        And that is it! Aside from what you always need for BERTopic, you now only need to add timestamps to quickly calculate the topics over time.
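        The result is a regular pandas DataFrame containing, per bin, each topic together with its words, frequency, and timestamp, so you can inspect it directly. Column names such as Topic are as returned by BERTopic but may differ slightly between versions:

        # Inspect the per-timestep topic representations\ntopics_over_time.head()\n\n# Or follow a single topic over time (assuming topic 0 exists)\ntopics_over_time[topics_over_time.Topic == 0]\n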

        "},{"location":"getting_started/topicsovertime/topicsovertime.html#parameters","title":"Parameters","text":"

        There are a few parameters of interest, which are discussed below.

        "},{"location":"getting_started/topicsovertime/topicsovertime.html#tuning","title":"Tuning","text":"

        Both global_tuning and evolution_tuning are set to True by default but can easily be changed. Perhaps you do not want the representations to be influenced by the global representation and merely want to see how they evolved over time:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, \n                                                global_tuning=False, evolution_tuning=True, nr_bins=20)\n
        "},{"location":"getting_started/topicsovertime/topicsovertime.html#bins","title":"Bins","text":"

        If you have more than 100 unique timestamps, then a topic representation will be created for each of those timestamps, which can negatively affect their quality. It is advised to keep the number of unique timestamps below 50. To do this, you can simply set the number of bins that are created when calculating the topic representations. The timestamps will be taken and put into equal-sized bins:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20)\n
        "},{"location":"getting_started/topicsovertime/topicsovertime.html#datetime-format","title":"Datetime format","text":"

        If you are passing strings (dates) instead of integers, then BERTopic will try to automatically detect which datetime format your strings have. Unfortunately, this will not always work if they are in an unexpected format. We can use datetime_format to pass the format the timestamps have, for example %b %Y for strings such as Sep 2020:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, datetime_format=\"%b %Y\", nr_bins=20)\n
        "},{"location":"getting_started/topicsovertime/topicsovertime.html#visualization","title":"Visualization","text":"

        To me, DTM becomes truly interesting when you have a good way of visualizing how topics have changed over time. A nice way of doing so is by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call visualize_topics_over_time with the newly created topics over time:

        topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)\n

        I used top_n_topics to only show the top 20 most frequent topics. If I were to visualize all topics, which is possible by leaving top_n_topics empty, there is a chance that hundreds of lines will fill the plot.

        You can also use topics to show specific topics:

        topic_model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])\n
        "},{"location":"getting_started/topicsperclass/topicsperclass.html","title":"Topics per Class","text":"

        In some cases, you might be interested in how certain topics are represented over certain categories. Perhaps there are specific groups of users for which you want to see how they talk about certain topics.

        Instead of running the topic model per class, we can simply create a topic model and then extract, for each topic, its representation per class. This allows you to see how certain topics, calculated over all documents, are represented for certain subgroups.

        [Figure: documents are split by topic, then by topic and class, and the pre-fitted c-TF-IDF is applied to each subset of documents to create a per-class topic representation.]

        To do so, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents.

        First, let's prepare the data:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\ntargets = data[\"target\"]\ntarget_names = data[\"target_names\"]\nclasses = [data[\"target_names\"][i] for i in data[\"target\"]]\n

        Next, we want to extract the topics across all documents without taking the categories into account:

        topic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\n

        Now that we have created our global topic model, let us calculate the topic representations across each category:

        topics_per_class = topic_model.topics_per_class(docs, classes=classes)\n

        The classes variable contains the class for each document. Then, we simply visualize these topics per class:

        topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)\n

        You can hover over the bars to see the topic representation per class.

        As you can see in the visualization above, the topics 93_homosexual_homosexuality_sex and 58_bike_bikes_motorcycle are somewhat distributed over all classes.

        You can see that the topic representation between rec.motorcycles and rec.autos in 58_bike_bikes_motorcycle clearly differs from one another. It seems that BERTopic has tried to combine those two categories into a single topic. However, since they do contain two separate topics, the topic representation in those two categories differs.

        We see something similar for 93_homosexual_homosexuality_sex, where the topic is distributed among several categories and is represented slightly differently.

        Thus, you can see that although in certain categories the topic is similar, the way the topic is represented can differ.

        "},{"location":"getting_started/vectorizers/vectorizers.html","title":"4. Vectorizers","text":"

        In topic modeling, the quality of the topic representations is key for interpreting the topics, communicating results, and understanding patterns. It is of utmost importance to make sure that the topic representations fit with your use case.

        In practice, there is not one correct way of creating topic representations. Some use cases might opt for higher n-grams, whereas others might focus more on single words without any stop words. The diversity in use cases also means that we need some flexibility in BERTopic to make sure it can be used across most of them.

        In this section, we will go through several examples of vectorization algorithms and how they can be implemented.

        "},{"location":"getting_started/vectorizers/vectorizers.html#countvectorizer","title":"CountVectorizer","text":"

        One often underestimated component of BERTopic is the CountVectorizer and c-TF-IDF calculation. Together, they are responsible for creating the topic representations and luckily can be quite flexible in parameter tuning. Here, we will go through tips and tricks for tuning your CountVectorizer and see how they might affect the topic representations.

        Before starting, it should be noted that you can pass the CountVectorizer before and after training your topic model. Passing it before training allows you to minimize the size of the resulting c-TF-IDF matrix:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Train BERTopic with a custom CountVectorizer\nvectorizer_model = CountVectorizer(min_df=10)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Passing it after training allows you to fine-tune the topic representations by using .update_topics():

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Train a BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Fine-tune topic representations after training BERTopic\nvectorizer_model = CountVectorizer(stop_words=\"english\", ngram_range=(1, 3), min_df=10)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        The great thing about using .update_topics() is that it allows you to tweak the topic representations without re-training your model! Thus, here we will be focusing on fine-tuning our topic representations after training our model.

        Note

        The great thing about processing our topic representations with the CountVectorizer is that it does not influence the quality of the clusters, as clustering is performed before the topic representations are generated.

        "},{"location":"getting_started/vectorizers/vectorizers.html#basic-usage","title":"Basic Usage","text":"

        First, let's start with defining our documents and training our topic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Prepare documents\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# Train a BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        Now, let's see the top 10 most frequent topics that have been generated:

        >>> topic_model.get_topic_info()[1:11]\nTopic   Count   Name\n1   0   1822    0_game_team_games_he\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_ites_hello_cheek_hi\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_you_your_post_jim\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_fire_gas\n10  9   174 9_amp_audio_condition_asking\n

        The topic representations generated already seem quite interpretable! However, I am quite sure we can do much better without having to re-train our model. Next, we will go through common parameters in CountVectorizer and focus on the effects that they might have. As a baseline, we will be comparing them to the topic representations above.

        "},{"location":"getting_started/vectorizers/vectorizers.html#parameters","title":"Parameters","text":"

        There are several basic parameters in the CountVectorizer that we can use to improve upon the quality of the resulting topic representations.

        "},{"location":"getting_started/vectorizers/vectorizers.html#ngram_range","title":"ngram_range","text":"

        The ngram_range parameter allows us to decide how many tokens each term in a topic representation may contain. For example, we have words like game and team with a length of 1 in a topic, but it would also make sense to have terms like hockey league with a length of 2. To allow for these terms to be generated, we can set the ngram_range parameter:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=\"english\")\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        As you might have noticed, I also added stop_words=\"english\". This is necessary as longer n-grams tend to contain many stop words, and removing those allows for nicer topic representations:

        >>> topic_model.get_topic_info()[1:11]\n    Topic   Count   Name\n1   0   1822    0_game_team_games_players\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_hello ites_forget hello_ites 15_huh hi\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_post_jim_context_forged\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_gas_compound\n10  9   174 9_amp_audio_condition_asking\n

        Although they look very similar, if we zoom in on topic 8, we can see longer words in our representation:

        >>> topic_model.get_topic(8)\n[('fbi', 0.019637149205975653),\n ('koresh', 0.019054514637064403),\n ('gas', 0.014156057632897179),\n ('compound', 0.012381224868591681),\n ('batf', 0.010349992314076047),\n ('children', 0.009336408916322387),\n ('tear gas', 0.008941747802855279),\n ('tear', 0.008446786597564537),\n ('davidians', 0.007911119583253022),\n ('started', 0.007398687505638955)]\n

        tear and gas have now been combined into the single term tear gas. This helps us understand what those individual words might have been representing.

        "},{"location":"getting_started/vectorizers/vectorizers.html#stop_words","title":"stop_words","text":"

        In some of the topics, we can see stop words appearing like he or the. Stop words are something we typically want to prevent in our topic representations as they do not give additional information to the topic. To prevent those stop words, we can use the stop_words parameter in the CountVectorizer to remove them from the representations:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        After running the above, we get the following output:

        >>> topic_model.get_topic_info()[1:11]\n    Topic   Count   Name\n1   0   1822    0_game_team_games_players\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_ites_cheek_hello_hi\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_monitor_card_video_vga\n6   5   438 5_post_jim_context_forged\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_tobacco\n9   8   218 8_fbi_koresh_gas_compound\n10  9   174 9_amp_audio_condition_stereo\n

        As you can see, the topic representations already look much better! Stop words are removed and the representations are more interpretable. You can also pass in a custom list of stop words, for example when you have multiple languages to take into account.
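        As a minimal sketch of that last point, you could extend scikit-learn's built-in English stop words with your own list before passing it to the CountVectorizer; the extra words below are purely illustrative:

        from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS\n\n# Combine the built-in English stop words with a few custom (illustrative) additions\ncustom_stop_words = list(ENGLISH_STOP_WORDS) + [\"de\", \"het\", \"een\"]\n\nvectorizer_model = CountVectorizer(stop_words=custom_stop_words)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n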

        "},{"location":"getting_started/vectorizers/vectorizers.html#min_df","title":"min_df","text":"

        One important parameter to keep in mind is min_df. This is typically an integer representing how frequently a word must occur before it is added to our representation. You can imagine that if we have a million documents and a certain word only appears a single time across all of them, then it is highly unlikely to be representative of a topic. Typically, the c-TF-IDF calculation removes that word from the topic representation, but when you have millions of documents, keeping every such word also leads to a very large topic-term matrix. To prevent a huge vocabulary, we can set the min_df to only accept words that have a minimum frequency.

        When you have millions of documents or run into errors, I would advise increasing the value of min_df as long as the topic representations still make sense:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(min_df=10)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        With the following topic representation:

        >>> topic_model.get_topic_info()[1:11]\n    Topic   Count   Name\n1   0   1822    0_game_team_games_he\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_hello_hi_yep_huh\n4   3   493 3_israel_jews_jewish_peace\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_you_your_post_jim\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_fire_gas\n10  9   174 9_audio_condition_stereo_asking\n

        As you can see, the output is nearly the same, which is what we would like to achieve. All words that appear fewer than 10 times are now removed from our topic-term matrix (i.e., the c-TF-IDF matrix), which drastically reduces its size.

        "},{"location":"getting_started/vectorizers/vectorizers.html#max_features","title":"max_features","text":"

        A parameter similar to min_df is max_features which allows you to select the top n most frequent words to be used in the topic representation. Setting this, for example, to 10_000 creates a topic-term matrix with 10_000 terms. This helps you control the size of the topic-term matrix directly without having to fiddle around with the min_df parameter:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(max_features=10_000)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        With the following representation:

        >>> topic_model.get_topic_info()[1:11]\nTopic   Count   Name\n1   0   1822    0_game_team_games_he\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_hello_hi_yep_huh\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_you_your_post_jim\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_fire_gas\n10  9   174 9_amp_audio_condition_asking\n

        As with min_df, the topic representations remain very similar to the baseline, which is what we would like to achieve.

        "},{"location":"getting_started/vectorizers/vectorizers.html#tokenizer","title":"tokenizer","text":"

        The default tokenizer in the CountVectorizer works well for Western languages but fails to tokenize some non-Western languages, like Chinese. Fortunately, we can use the tokenizer parameter in the CountVectorizer to plug in jieba, a package for Chinese text segmentation. Using it is straightforward:

        from sklearn.feature_extraction.text import CountVectorizer\nimport jieba\n\ndef tokenize_zh(text):\n    words = jieba.lcut(text)\n    return words\n\nvectorizer_model = CountVectorizer(tokenizer=tokenize_zh)\n

        Then, we can simply pass the vectorizer to update our topic representations:

        topic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n
        "},{"location":"getting_started/vectorizers/vectorizers.html#onlinecountvectorizer","title":"OnlineCountVectorizer","text":"

        When using the online/incremental variant of BERTopic, we need a CountVectorizer that can incrementally update its representation. For that purpose, OnlineCountVectorizer was created, which not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. It is a class that can be found in bertopic.vectorizers and extends sklearn.feature_extraction.text.CountVectorizer. In other words, you can use the exact same parameters in OnlineCountVectorizer as found in Scikit-Learn's CountVectorizer. We can use it as follows:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\n# Train BERTopic with a custom OnlineCountVectorizer\nvectorizer_model = OnlineCountVectorizer()\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n
        "},{"location":"getting_started/vectorizers/vectorizers.html#parameters_1","title":"Parameters","text":"

        Other than the parameters found in CountVectorizer, such as stop_words and ngram_range, we can use two parameters in OnlineCountVectorizer to adjust the way old data is processed and kept.

        "},{"location":"getting_started/vectorizers/vectorizers.html#decay","title":"decay","text":"

        At each iteration, we sum the bag-of-words representation of the new documents with the bag-of-words representation of all documents processed thus far. In other words, the bag-of-words matrix keeps increasing with each iteration. However, especially in a streaming setting, older documents may become less and less relevant as time goes on. Therefore, a decay parameter was implemented that decays the bag-of-words frequencies at each iteration before adding the document frequencies of new documents. The decay parameter is a value between 0 and 1 and indicates the percentage by which the frequencies in the previous bag-of-words matrix are reduced at each iteration. For example, a value of .1 will decrease the frequencies in the bag-of-words matrix by 10% at each iteration before adding the new bag-of-words matrix. This makes sure that recent data has more weight than previous iterations.
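        For example, a decay of 10% per iteration can be set directly when creating the vectorizer:

        from bertopic.vectorizers import OnlineCountVectorizer\n\n# Reduce the frequencies of the previous bag-of-words matrix by 10%\n# at each iteration before adding the counts of new documents\nvectorizer_model = OnlineCountVectorizer(decay=.1)\n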

        "},{"location":"getting_started/vectorizers/vectorizers.html#delete_min_df","title":"delete_min_df","text":"

        In BERTopic, we might want to remove words from the topic representation that appear infrequently. The min_df in the CountVectorizer works quite well for that. However, when we have a streaming setting, the min_df does not work as well since a word's frequency might start below min_df but will end up higher than that over time. Setting that value high might not always be advised.

        As a result, the vocabulary of the resulting bag-of-words matrix can become quite large. Similarly, if we implement the decay parameter, then some values will decrease over time until they drop below min_df. For these reasons, the delete_min_df parameter was implemented. The parameter takes a positive integer and indicates the minimum total frequency a word needs in order to be kept. If the value is set to 5, it will check after each iteration whether the total frequency of a word has dropped below that value. If so, the word will be removed in its entirety from the bag-of-words matrix. This helps to keep the bag-of-words matrix at a manageable size.
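        Below is a minimal sketch combining both parameters when training the online variant of BERTopic; the exact values are illustrative rather than recommendations:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\n# Decay old counts by 10% per iteration and remove words whose\n# total frequency drops below 5 after an iteration\nvectorizer_model = OnlineCountVectorizer(decay=.1, delete_min_df=5)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n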

        Note

        Although the delete_min_df parameter removes words from the bag-of-words matrix, it is not permanent. If new documents come in where those previously deleted words are used frequently, they get added back to the matrix.

        "},{"location":"getting_started/visualization/visualization.html","title":"Visualization","text":"

        Visualizing BERTopic and its derivatives is important in understanding the model, how it works, and, more importantly, where it works. Since topic modeling can be quite a subjective field, it is difficult for users to validate their models. Looking at the topics and seeing if they make sense is an important factor in alleviating this issue.

        "},{"location":"getting_started/visualization/visualization.html#visualize-topics","title":"Visualize Topics","text":"

        After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can visualize the topics that were generated in a way very similar to LDAvis.

        We embed our c-TF-IDF representation of the topics in 2D using UMAP and then visualize the two dimensions using Plotly so that we can create an interactive view.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs) \n

        Then, we can call .visualize_topics to create a 2D representation of our topics. The resulting graph is an interactive Plotly graph which can be converted to HTML:

        topic_model.visualize_topics()\n
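        Since the result is a regular Plotly figure, you can, for example, save it as a standalone HTML file; the filename below is just an example:

        fig = topic_model.visualize_topics()\nfig.write_html(\"topics.html\")\n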

        You can use the slider to select the topic which then lights up red. If you hover over a topic, then general information is given about the topic, including the size of the topic and its corresponding words.

        "},{"location":"getting_started/visualization/visualization.html#visualize-documents","title":"Visualize Documents","text":"

        Using the previous method, we can visualize the topics and get insight into their relationships. However, you might want a more fine-grained approach where we can visualize the documents inside the topics to see if they were assigned correctly or whether they make sense. To do so, we can use the topic_model.visualize_documents() function. This function recalculates the document embeddings and reduces them to 2-dimensional space for easier visualization purposes. This process can be quite expensive, so it is advised to adhere to the following pipeline:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic\ntopic_model = BERTopic().fit(docs, embeddings)\n\n# Run the visualization with the original embeddings\ntopic_model.visualize_documents(docs, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True which disables the option to hover over the individual points and see the content of the documents. This was done for demonstration purposes as saving all those documents in the visualization can be quite expensive and result in large files. However, it might be interesting to set hide_document_hover=False in order to hover over the points and see the content of the documents.
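        If you do want the document contents on hover, the parameter can simply be flipped, assuming the reduced embeddings from the pipeline above are available:

        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, hide_document_hover=False)\n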

        "},{"location":"getting_started/visualization/visualization.html#custom-hover","title":"Custom Hover","text":"

        When you visualize the documents, you might not always want to see the complete document on hover. Many documents have shorter pieces of information that might be more interesting to visualize, such as their titles. To create the hover based on a document's title instead of its content, you can simply pass a variable (titles) containing the title for each document:

        topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-topic-hierarchy","title":"Visualize Topic Hierarchy","text":"

        The topics that were created can be hierarchically reduced. In order to understand the potential hierarchical structure of the topics, we can use scipy.cluster.hierarchy to create clusters and visualize how they relate to one another. This might help to select an appropriate nr_topics when reducing the number of topics that you have created. To visualize this hierarchy, run the following:

        topic_model.visualize_hierarchy()\n

        Note

        Do note that this is not the actual procedure of .reduce_topics() when nr_topics is set to auto since HDBSCAN is used to automatically extract topics. The visualization above closely resembles the actual procedure of .reduce_topics() when any number of nr_topics is selected.

        "},{"location":"getting_started/visualization/visualization.html#hierarchical-labels","title":"Hierarchical labels","text":"

        Although visualizing this hierarchy gives us information about the structure, it would be helpful to see what happens to the topic representations when merging topics. To do so, we first need to calculate the representations of the hierarchical topics:

        First, we train a basic BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n

        To visualize these results, we simply need to pass the resulting hierarchical_topics to our .visualize_hierarchy function:

        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you hover over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes.

        "},{"location":"getting_started/visualization/visualization.html#text-based-topic-tree","title":"Text-based topic tree","text":"

        Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use topic_model.get_topic_tree to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can see better which topics could be logically merged:

        >>> tree = topic_model.get_topic_tree(hierarchical_topics)\n>>> print(tree)\n.\n\u2514\u2500atheists_atheism_god_moral_atheist\n     \u251c\u2500atheists_atheism_god_atheist_argument\n     \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n     \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n     \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 Topic: 29\n
        The full tree looks as follows:
          .\n  \u251c\u2500people_armenian_said_god_armenians\n  \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u2502    \u251c\u2500jehovah_lord_mormon_mcconkie_god\n  \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500ra_satan_thou_god_lucifer \u2500\u2500 Topic: 94\n  \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500jehovah_lord_mormon_mcconkie_unto \u2500\u2500 Topic: 78\n  \u2502    \u2502    \u2502    \u2514\u2500jesus_mary_god_hell_sin\n  \u2502    \u2502    \u2502         \u251c\u2500jesus_hell_god_eternal_heaven\n  \u2502    \u2502    \u2502         \u2502    \u251c\u2500hell_jesus_eternal_god_heaven\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500jesus_tomb_disciples_resurrection_john \u2500\u2500 Topic: 69\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500hell_eternal_god_jesus_heaven \u2500\u2500 Topic: 53\n  \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500aaron_baptism_sin_law_god \u2500\u2500 Topic: 89\n  \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500mary_sin_maria_priest_conception \u2500\u2500 Topic: 56\n  \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500marriage_married_marry_ceremony_marriages \u2500\u2500 Topic: 110\n  \u2502    \u2514\u2500people_armenian_armenians_said_mr\n  \u2502         \u251c\u2500people_armenian_armenians_said_israel\n  \u2502         \u2502    \u251c\u2500god_homosexual_homosexuality_atheists_sex\n  \u2502         \u2502    \u2502    \u251c\u2500homosexual_homosexuality_sex_gay_homosexuals\n  \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500kinsey_sex_gay_men_sexual \u2500\u2500 Topic: 44\n  \u2502         \u2502    \u2502    \u2502    \u2514\u2500homosexuality_homosexual_sin_homosexuals_gay\n  \u2502         \u2502    \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500gay_homosexual_homosexuals_sexual_cramer \u2500\u2500 Topic: 50\n  \u2502         \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500homosexuality_homosexual_sin_paul_sex \u2500\u2500 Topic: 27\n  \u2502         \u2502    \u2502    \u2514\u2500god_atheists_atheism_moral_atheist\n  \u2502         \u2502    \u2502         \u251c\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500jim_context_challenges_articles_quote \u2500\u2500 Topic: 36\n  \u2502         \u2502    \u2502         \u2502    \u2514\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500islam_quran_islamic_rushdie_muslims \u2500\u2500 Topic: 31\n  \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500judas_scripture_bible_books_greek \u2500\u2500 Topic: 33\n  \u2502         \u2502    \u2502         \u2514\u2500atheists_atheism_god_moral_atheist\n  \u2502         \u2502    \u2502              \u251c\u2500atheists_atheism_god_atheist_argument\n  \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n  \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n  \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 
Topic: 29\n  \u2502         \u2502    \u2514\u2500armenian_armenians_people_israel_said\n  \u2502         \u2502         \u251c\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502    \u251c\u2500tax_rights_government_income_taxes\n  \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500rights_right_slavery_slaves_residence \u2500\u2500 Topic: 106\n  \u2502         \u2502         \u2502    \u2502    \u2514\u2500tax_government_taxes_income_libertarians\n  \u2502         \u2502         \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500government_libertarians_libertarian_regulation_party \u2500\u2500 Topic: 58\n  \u2502         \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500tax_taxes_income_billion_deficit \u2500\u2500 Topic: 41\n  \u2502         \u2502         \u2502    \u2514\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502         \u251c\u2500gun_guns_militia_firearms_amendment\n  \u2502         \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500blacks_penalty_death_cruel_punishment \u2500\u2500 Topic: 55\n  \u2502         \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500gun_guns_militia_firearms_amendment \u2500\u2500 Topic: 7\n  \u2502         \u2502         \u2502         \u2514\u2500armenian_armenians_israel_jews_turkish\n  \u2502         \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500israel_israeli_jews_arab_jewish \u2500\u2500 Topic: 4\n  \u2502         \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500armenian_armenians_turkish_armenia_azerbaijan \u2500\u2500 Topic: 15\n  \u2502         \u2502         \u2514\u2500stephanopoulos_president_mr_myers_ms\n  \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500serbs_muslims_stephanopoulos_mr_bosnia \u2500\u2500 Topic: 35\n  \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500myers_stephanopoulos_president_ms_mr \u2500\u2500 Topic: 87\n  \u2502         \u2514\u2500batf_fbi_koresh_compound_gas\n  \u2502              \u251c\u2500\u25a0\u2500\u2500reno_workers_janet_clinton_waco \u2500\u2500 Topic: 77\n  \u2502              \u2514\u2500batf_fbi_koresh_gas_compound\n  \u2502                   \u251c\u2500batf_koresh_fbi_warrant_compound\n  \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500batf_warrant_raid_compound_fbi \u2500\u2500 Topic: 42\n  \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500koresh_batf_fbi_children_compound \u2500\u2500 Topic: 61\n  \u2502                   \u2514\u2500\u25a0\u2500\u2500fbi_gas_tear_bds_building \u2500\u2500 Topic: 23\n  \u2514\u2500use_like_just_dont_new\n      \u251c\u2500game_team_year_games_like\n      \u2502    \u251c\u2500game_team_games_25_year\n      \u2502    \u2502    \u251c\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502    \u251c\u2500window_printer_use_problem_mhz\n      \u2502    \u2502    \u2502    \u2502    \u251c\u2500mhz_wire_simms_wiring_battery\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_mhz_battery_cpu_heat\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_pds_simm_vram_lc\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500pds_nubus_lc_slot_card \u2500\u2500 Topic: 119\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500simms_simm_vram_meg_dram 
\u2500\u2500 Topic: 32\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500mhz_battery_cpu_heat_speed\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500fan_cpu_heat_sink_fans \u2500\u2500 Topic: 92\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500mhz_speed_cpu_fpu_clock \u2500\u2500 Topic: 22\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500monitor_turn_power_computer_electricity \u2500\u2500 Topic: 91\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500battery_batteries_concrete_duo_discharge\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500duo_battery_apple_230_problem \u2500\u2500 Topic: 121\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500battery_batteries_concrete_discharge_temperature \u2500\u2500 Topic: 75\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500leds_uv_blue_light_boards \u2500\u2500 Topic: 66\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500wire_wiring_ground_neutral_outlets \u2500\u2500 Topic: 120\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500scope_scopes_phone_dial_number\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500dial_number_phone_line_output \u2500\u2500 Topic: 93\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500scope_scopes_motorola_generator_oscilloscope \u2500\u2500 Topic: 113\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500celp_dsp_sampling_antenna_digital\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500antenna_antennas_receiver_cable_transmitter \u2500\u2500 Topic: 70\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500celp_dsp_sampling_speech_voice \u2500\u2500 Topic: 52\n      \u2502    \u2502    \u2502    \u2502    \u2514\u2500window_printer_xv_mouse_windows\n      \u2502    \u2502    \u2502    \u2502         \u251c\u2500window_xv_error_widget_problem\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500error_symbol_undefined_xterm_rx\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500symbol_error_undefined_doug_parse \u2500\u2500 Topic: 63\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500rx_remote_server_xdm_xterm \u2500\u2500 Topic: 45\n      \u2502    \u2502    \u2502    \u2502         \u2502    
\u2514\u2500window_xv_widget_application_expose\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500window_widget_expose_application_event\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500gc_mydisplay_draw_gxxor_drawing \u2500\u2500 Topic: 103\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500window_widget_application_expose_event \u2500\u2500 Topic: 25\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500xv_den_polygon_points_algorithm\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500den_polygon_points_algorithm_polygons \u2500\u2500 Topic: 28\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500xv_24bit_image_bit_images \u2500\u2500 Topic: 57\n      \u2502    \u2502    \u2502    \u2502         \u2514\u2500printer_fonts_print_mouse_postscript\n      \u2502    \u2502    \u2502    \u2502              \u251c\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500scanner_logitech_grayscale_ocr_scanman \u2500\u2500 Topic: 108\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u2514\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u251c\u2500\u25a0\u2500\u2500printer_print_deskjet_hp_ink \u2500\u2500 Topic: 18\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u2514\u2500\u25a0\u2500\u2500fonts_font_truetype_tt_atm \u2500\u2500 Topic: 49\n      \u2502    \u2502    \u2502    \u2502              \u2514\u2500mouse_ghostscript_midi_driver_postscript\n      \u2502    \u2502    \u2502    \u2502                   \u251c\u2500ghostscript_midi_postscript_files_file\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500ghostscript_postscript_pageview_ghostview_dsc \u2500\u2500 Topic: 104\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u2514\u2500midi_sound_file_windows_driver\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u251c\u2500\u25a0\u2500\u2500location_mar_file_host_rwrr \u2500\u2500 Topic: 83\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u2514\u2500\u25a0\u2500\u2500midi_sound_driver_blaster_soundblaster \u2500\u2500 Topic: 98\n      \u2502    \u2502    \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500mouse_driver_mice_ball_problem \u2500\u2500 Topic: 68\n      \u2502    \u2502    \u2502    \u2514\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502         \u251c\u25001st_sale_condition_comics_hulk\n      \u2502    \u2502    \u2502         \u2502    \u251c\u2500sale_condition_offer_asking_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500condition_stereo_amp_speakers_asking\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500miles_car_amfm_toyota_cassette \u2500\u2500 Topic: 62\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500amp_speakers_condition_stereo_audio \u2500\u2500 Topic: 24\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500games_sale_pom_cds_shipping\n      \u2502    \u2502    \u2502         \u2502    \u2502         
\u251c\u2500pom_cds_sale_shipping_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500size_shipping_sale_condition_mattress \u2500\u2500 Topic: 100\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500pom_cds_cd_sale_picture \u2500\u2500 Topic: 37\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500games_game_snes_sega_genesis \u2500\u2500 Topic: 40\n      \u2502    \u2502    \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u251c\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500lens_tape_camera_backup_lenses\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500tape_backup_tapes_drive_4mm \u2500\u2500 Topic: 107\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500lens_camera_lenses_zoom_pouch \u2500\u2500 Topic: 114\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u25001st_hulk_comics_art_appears \u2500\u2500 Topic: 105\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500books_book_cover_trek_chemistry \u2500\u2500 Topic: 125\n      \u2502    \u2502    \u2502         \u2502         \u2514\u2500tickets_hotel_ticket_voucher_package\n      \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500hotel_voucher_package_vacation_room \u2500\u2500 Topic: 74\n      \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500tickets_ticket_june_airlines_july \u2500\u2500 Topic: 84\n      \u2502    \u2502    \u2502         \u2514\u2500game_team_games_season_hockey\n      \u2502    \u2502    \u2502              \u251c\u2500game_hockey_team_25_550\n      \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500espn_pt_pts_game_la \u2500\u2500 Topic: 17\n      \u2502    \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500team_25_game_hockey_550 \u2500\u2500 Topic: 2\n      \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500year_game_hit_baseball_players \u2500\u2500 Topic: 0\n      \u2502    \u2502    \u2514\u2500bike_car_greek_insurance_msg\n      \u2502    \u2502         \u251c\u2500car_bike_insurance_cars_engine\n      \u2502    \u2502         \u2502    \u251c\u2500car_insurance_cars_radar_engine\n      \u2502    \u2502         \u2502    \u2502    \u251c\u2500insurance_health_private_care_canada\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500insurance_health_private_care_canada \u2500\u2500 Topic: 99\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500insurance_car_accident_rates_sue \u2500\u2500 Topic: 82\n      \u2502    \u2502         \u2502    \u2502    \u2514\u2500car_cars_radar_engine_detector\n      \u2502    \u2502         \u2502    \u2502         \u251c\u2500car_radar_cars_detector_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500radar_detector_detectors_ka_alarm \u2500\u2500 Topic: 39\n      \u2502    \u2502         \u2502    \u2502         
\u2502    \u2514\u2500car_cars_mustang_ford_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500clutch_shift_shifting_transmission_gear \u2500\u2500 Topic: 88\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500car_cars_mustang_ford_v8 \u2500\u2500 Topic: 14\n      \u2502    \u2502         \u2502    \u2502         \u2514\u2500oil_diesel_odometer_diesels_car\n      \u2502    \u2502         \u2502    \u2502              \u251c\u2500odometer_oil_sensor_car_drain\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500odometer_sensor_speedo_gauge_mileage \u2500\u2500 Topic: 96\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500oil_drain_car_leaks_taillights \u2500\u2500 Topic: 102\n      \u2502    \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500diesel_diesels_emissions_fuel_oil \u2500\u2500 Topic: 79\n      \u2502    \u2502         \u2502    \u2514\u2500bike_riding_ride_bikes_motorcycle\n      \u2502    \u2502         \u2502         \u251c\u2500bike_ride_riding_bikes_lane\n      \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500bike_ride_riding_lane_car \u2500\u2500 Topic: 11\n      \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500bike_bikes_miles_honda_motorcycle \u2500\u2500 Topic: 19\n      \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500countersteering_bike_motorcycle_rear_shaft \u2500\u2500 Topic: 46\n      \u2502    \u2502         \u2514\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u251c\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u251c\u2500greek_kuwait_greece_turkish_greeks\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500greek_greece_turkish_greeks_cyprus \u2500\u2500 Topic: 71\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kuwait_iraq_iran_gulf_arabia \u2500\u2500 Topic: 76\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2514\u2500msg_dog_drugs_drug_food\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u251c\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500clinton_bush_quayle_reagan_panicking \u2500\u2500 Topic: 101\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u2514\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500cooper_trial_weaver_spence_witnesses \u2500\u2500 Topic: 90\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500dog_dogs_bike_trained_springer \u2500\u2500 Topic: 67\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2514\u2500msg_drugs_drug_food_chinese\n      \u2502    \u2502              \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500msg_food_chinese_foods_taste \u2500\u2500 Topic: 30\n      
\u2502    \u2502              \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500drugs_drug_marijuana_cocaine_alcohol \u2500\u2500 Topic: 72\n      \u2502    \u2502              \u2502    \u2502    \u2514\u2500water_theory_universe_science_larsons\n      \u2502    \u2502              \u2502    \u2502         \u251c\u2500water_nuclear_cooling_steam_dept\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500rocketry_rockets_engines_nuclear_plutonium \u2500\u2500 Topic: 115\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u2514\u2500water_cooling_steam_dept_plants\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500water_dept_phd_environmental_atmospheric \u2500\u2500 Topic: 97\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500cooling_water_steam_towers_plants \u2500\u2500 Topic: 109\n      \u2502    \u2502              \u2502    \u2502         \u2514\u2500theory_universe_larsons_larson_science\n      \u2502    \u2502              \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500theory_universe_larsons_larson_science \u2500\u2500 Topic: 54\n      \u2502    \u2502              \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500oort_cloud_grbs_gamma_burst \u2500\u2500 Topic: 80\n      \u2502    \u2502              \u2502    \u2514\u2500helmet_kirlian_photography_lock_wax\n      \u2502    \u2502              \u2502         \u251c\u2500helmet_kirlian_photography_leaf_mask\n      \u2502    \u2502              \u2502         \u2502    \u251c\u2500kirlian_photography_leaf_pictures_deleted\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u251c\u2500deleted_joke_stuff_maddi_nickname\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500joke_maddi_nickname_nicknames_frank \u2500\u2500 Topic: 43\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500deleted_stuff_bookstore_joke_motto \u2500\u2500 Topic: 81\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kirlian_photography_leaf_pictures_aura \u2500\u2500 Topic: 85\n      \u2502    \u2502              \u2502         \u2502    \u2514\u2500helmet_mask_liner_foam_cb\n      \u2502    \u2502              \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500helmet_liner_foam_cb_helmets \u2500\u2500 Topic: 112\n      \u2502    \u2502              \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500mask_goalies_77_santore_tl \u2500\u2500 Topic: 123\n      \u2502    \u2502              \u2502         \u2514\u2500lock_wax_paint_plastic_ear\n      \u2502    \u2502              \u2502              \u251c\u2500\u25a0\u2500\u2500lock_cable_locks_bike_600 \u2500\u2500 Topic: 117\n      \u2502    \u2502              \u2502              \u2514\u2500wax_paint_ear_plastic_skin\n      \u2502    \u2502              \u2502                   \u251c\u2500\u25a0\u2500\u2500wax_paint_plastic_scratches_solvent \u2500\u2500 Topic: 65\n      \u2502    \u2502              \u2502                   \u2514\u2500\u25a0\u2500\u2500ear_wax_skin_greasy_acne \u2500\u2500 Topic: 116\n      \u2502    \u2502              \u2514\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u251c\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u2502    
\u251c\u2500\u25a0\u2500\u2500m4_mp_14_mw_mo \u2500\u2500 Topic: 111\n      \u2502    \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500test_ensign_nameless_deane_deanebinahccbrandeisedu \u2500\u2500 Topic: 118\n      \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500ites_cheek_hello_hi_ken \u2500\u2500 Topic: 3\n      \u2502    \u2514\u2500space_medical_health_disease_cancer\n      \u2502         \u251c\u2500medical_health_disease_cancer_patients\n      \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500cancer_centers_center_medical_research \u2500\u2500 Topic: 122\n      \u2502         \u2502    \u2514\u2500health_medical_disease_patients_hiv\n      \u2502         \u2502         \u251c\u2500patients_medical_disease_candida_health\n      \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n      \u2502         \u2502         \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n      \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n      \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n      \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n      \u2502         \u2514\u2500space_launch_nasa_shuttle_orbit\n      \u2502              \u251c\u2500space_moon_station_nasa_launch\n      \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500sky_advertising_billboard_billboards_space \u2500\u2500 Topic: 59\n      \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500space_station_moon_redesign_nasa \u2500\u2500 Topic: 16\n      \u2502              \u2514\u2500space_mission_hst_launch_orbit\n      \u2502                   \u251c\u2500space_launch_nasa_orbit_propulsion\n      \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500space_launch_nasa_propulsion_astronaut \u2500\u2500 Topic: 47\n      \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500orbit_km_jupiter_probe_earth \u2500\u2500 Topic: 86\n      \u2502                   \u2514\u2500\u25a0\u2500\u2500hst_mission_shuttle_orbit_arrays \u2500\u2500 Topic: 60\n      \u2514\u2500drive_file_key_windows_use\n          \u251c\u2500key_file_jpeg_encryption_image\n          \u2502    \u251c\u2500key_encryption_clipper_chip_keys\n          \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500key_clipper_encryption_chip_keys \u2500\u2500 Topic: 1\n          \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500entry_file_ripem_entries_key \u2500\u2500 Topic: 73\n          \u2502    \u2514\u2500jpeg_image_file_gif_images\n          \u2502         \u251c\u2500motif_graphics_ftp_available_3d\n          \u2502         \u2502    \u251c\u2500motif_graphics_openwindows_ftp_available\n          \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500openwindows_motif_xview_windows_mouse \u2500\u2500 Topic: 20\n          \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500graphics_widget_ray_3d_available \u2500\u2500 Topic: 95\n          \u2502         \u2502    \u2514\u2500\u25a0\u2500\u25003d_machines_version_comments_contact \u2500\u2500 Topic: 38\n          \u2502         \u2514\u2500jpeg_image_gif_images_format\n          \u2502              \u251c\u2500\u25a0\u2500\u2500gopher_ftp_files_stuffit_images \u2500\u2500 Topic: 51\n          \u2502        
      \u2514\u2500\u25a0\u2500\u2500jpeg_image_gif_format_images \u2500\u2500 Topic: 13\n          \u2514\u2500drive_db_card_scsi_windows\n              \u251c\u2500db_windows_dos_mov_os2\n              \u2502    \u251c\u2500\u25a0\u2500\u2500copy_protection_program_software_disk \u2500\u2500 Topic: 64\n              \u2502    \u2514\u2500\u25a0\u2500\u2500db_windows_dos_mov_os2 \u2500\u2500 Topic: 8\n              \u2514\u2500drive_card_scsi_drives_ide\n                      \u251c\u2500drive_scsi_drives_ide_disk\n                      \u2502    \u251c\u2500\u25a0\u2500\u2500drive_scsi_drives_ide_disk \u2500\u2500 Topic: 6\n                      \u2502    \u2514\u2500\u25a0\u2500\u2500meg_sale_ram_drive_shipping \u2500\u2500 Topic: 12\n                      \u2514\u2500card_modem_monitor_video_drivers\n                          \u251c\u2500\u25a0\u2500\u2500card_monitor_video_drivers_vga \u2500\u2500 Topic: 5\n                          \u2514\u2500\u25a0\u2500\u2500modem_port_serial_irq_com \u2500\u2500 Topic: 10\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-hierarchical-documents","title":"Visualize Hierarchical Documents","text":"

        We can extend the previous method by calculating the topic representations at different levels of the hierarchy and plotting them on a 2D plane. To do so, we first need to calculate the hierarchical topics:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic and extract hierarchical topics\ntopic_model = BERTopic().fit(docs, embeddings)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n
        Then, we can visualize the hierarchical documents by either supplying the function with our embeddings or by reducing their dimensionality ourselves beforehand:

        # Run the visualization with the original embeddings\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True, which disables the option to hover over the individual points and see the content of the documents. This keeps the resulting visualization smaller so that it fits into your RAM. However, it might be interesting to set hide_document_hover=False to hover over the points and see the content of the documents.
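
        For instance, a minimal sketch of that call using the hierarchical_topics and reduced_embeddings computed above (the setting shown is the one mentioned in the note):

        # Disable the document hover to keep the resulting figure small\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings, hide_document_hover=True)\n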

        "},{"location":"getting_started/visualization/visualization.html#visualize-terms","title":"Visualize Terms","text":"

        We can visualize the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores for each topic representation. Insights can be gained from the relative c-TF-IDF scores between and within topics. Moreover, you can easily compare topic representations to each other. To visualize these terms, run the following:

        topic_model.visualize_barchart()\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-topic-similarity","title":"Visualize Topic Similarity","text":"

        Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarity to those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other. To visualize the heatmap, run the following:

        topic_model.visualize_heatmap()\n

        Note

        You can set n_clusters in visualize_heatmap to order the topics by their similarity. This will result in blocks being formed in the heatmap indicating which clusters of topics are similar to each other. This step is very much recommended as it will make reading the heatmap easier.
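
        For instance, a minimal sketch of that setting (the value 20 is only illustrative):

        # Order the topics by similarity and form blocks of similar topics in the heatmap\ntopic_model.visualize_heatmap(n_clusters=20)\n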

        "},{"location":"getting_started/visualization/visualization.html#visualize-term-score-decline","title":"Visualize Term Score Decline","text":"

        Topics are represented by a number of words starting with the best representative word. Each word is represented by a c-TF-IDF score. The higher the score, the more representative a word is of the topic. Since the topic words are sorted by their c-TF-IDF score, the scores slowly decline with each word that is added. At some point, adding words to the topic representation only marginally increases the total c-TF-IDF score and would not be beneficial for its representation.

        To visualize this effect, we can plot the c-TF-IDF scores for each topic by the term rank of each word. In other words, the position of the words (term rank), where the words with the highest c-TF-IDF score have a rank of 1, is put on the x-axis, whereas the y-axis is populated by the c-TF-IDF scores. The result is a visualization that shows you the decline of the c-TF-IDF score when adding words to the topic representation. It allows you, using the elbow method, to select the best number of words in a topic.

        To visualize the c-TF-IDF score decline, run the following:

        topic_model.visualize_term_rank()\n

        To enable the log scale on the y-axis for a better view of individual topics, run the following:

        topic_model.visualize_term_rank(log_scale=True)\n

        This visualization was heavily inspired by the \"Term Probability Decline\" visualization found in an analysis by the amazing tmtoolkit. Reference to that specific analysis can be found here.

        "},{"location":"getting_started/visualization/visualization.html#visualize-topics-over-time","title":"Visualize Topics over Time","text":"

        After creating topics over time with Dynamic Topic Modeling, we can visualize these topics by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call .visualize_topics_over_time with the newly created topics over time:

        import re\nimport pandas as pd\nfrom bertopic import BERTopic\n\n# Prepare data\ntrump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')\ntrump.text = trump.apply(lambda row: re.sub(r\"http\\S+\", \"\", row.text).lower(), 1)\ntrump.text = trump.apply(lambda row: \" \".join(filter(lambda x:x[0]!=\"@\", row.text.split())), 1)\ntrump.text = trump.apply(lambda row: \" \".join(re.sub(\"[^a-zA-Z]+\", \" \", row.text).split()), 1)\ntrump = trump.loc[(trump.isRetweet == \"f\") & (trump.text != \"\"), :]\ntimestamps = trump.date.to_list()\ntweets = trump.text.to_list()\n\n# Create topics over time\nmodel = BERTopic(verbose=True)\ntopics, probs = model.fit_transform(tweets)\ntopics_over_time = model.topics_over_time(tweets, timestamps)\n

        Then, we visualize some interesting topics:

        model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-topics-per-class","title":"Visualize Topics per Class","text":"

        You might want to extract and visualize the topic representation per class. For example, if you have specific groups of users that might approach topics differently, then extracting them would help you understand how these users talk about certain topics. In other words, this is simply creating a topic representation for certain classes that you might have in your data.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Prepare data and classes\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\nclasses = [data[\"target_names\"][i] for i in data[\"target\"]]\n\n# Create topic model and calculate topics per class\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_per_class = topic_model.topics_per_class(docs, classes=classes)\n

        Then, we visualize the topic representation of major topics per class:

        topic_model.visualize_topics_per_class(topics_per_class)\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-probablities-or-distribution","title":"Visualize Probabilities or Distribution","text":"

        We can generate the topic-document probability matrix by simply setting calculate_probabilities=True if an HDBSCAN model is used:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs) \n

        The resulting probs variable contains the soft-clustering probabilities as calculated by HDBSCAN.

        If a non-HDBSCAN model is used, we can estimate the topic distributions after training our model:

        from bertopic import BERTopic\n\ntopic_model = BERTopic()\ntopics, _ = topic_model.fit_transform(docs) \ntopic_distr, _ = topic_model.approximate_distribution(docs, min_similarity=0)\n

        Then, we either pass the probs or topic_distr variable to .visualize_distribution to visualize either the probability distributions or the topic distributions:

        # To visualize the probabilities of topic assignment\ntopic_model.visualize_distribution(probs[0])\n\n# To visualize the topic distributions in a document\ntopic_model.visualize_distribution(topic_distr[0])\n

        Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])\ndf\n

        Note

        To get the stylized dataframe for .visualize_approximate_distribution, you will need to have Jinja installed. If you do not have it installed, an unstylized dataframe will be returned instead. You can install Jinja via pip install jinja2.

        Note

        The distribution of the probabilities does not give an indication of the distribution of topic frequencies across a document. It merely shows how confident BERTopic is that certain topics can be found in a document.

        "},{"location":"getting_started/visualization/visualize_documents.html","title":"Documents","text":""},{"location":"getting_started/visualization/visualize_documents.html#visualize-documents-with-plotly","title":"Visualize documents with Plotly","text":"

        Using .visualize_topics, we can visualize the topics and get insight into their relationships. However, you might want a more fine-grained approach where we can visualize the documents inside the topics to see whether they were assigned correctly and whether they make sense. To do so, we can use the topic_model.visualize_documents() function. This function recalculates the document embeddings and reduces them to a 2-dimensional space for easier visualization. This process can be quite expensive, so it is advised to adhere to the following pipeline:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic\ntopic_model = BERTopic().fit(docs, embeddings)\n\n# Run the visualization with the original embeddings\ntopic_model.visualize_documents(docs, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True, which disables the option to hover over the individual points and see the content of the documents. This was done for demonstration purposes, as saving all those documents in the visualization can be quite expensive and result in large files. However, it might be interesting to set hide_document_hover=False in order to hover over the points and see the content of the documents.
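
        As a small sketch of that setting for .visualize_documents, using the reduced_embeddings computed above:

        # Disable the document hover to keep the saved visualization small\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, hide_document_hover=True)\n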

        "},{"location":"getting_started/visualization/visualize_documents.html#custom-hover","title":"Custom Hover","text":"

        When you visualize the documents, you might not always want to see the complete document on hover. Many documents have shorter pieces of information that might be more interesting to visualize, such as their titles. To create the hover based on a document's title instead of its content, you can simply pass a variable (titles) containing the title for each document:

        topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)\n
        "},{"location":"getting_started/visualization/visualize_documents.html#visualize-documents-with-datamapplot","title":"Visualize documents with DataMapPlot","text":"

        .visualize_document_datamap provides an alternative way to visualize the documents inside the topics as a static DataMapPlot. Using the same pipeline as above, you can generate a DataMapPlot by running:

        # with the original embeddings\ntopic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n# with the reduced embeddings\ntopic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\nfig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n
        "},{"location":"getting_started/visualization/visualize_documents.html#visualize-probablities-or-distribution","title":"Visualize Probabilities or Distribution","text":"

        We can generate the topic-document probability matrix by simply setting calculate_probabilities=True if an HDBSCAN model is used:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs) \n

        The resulting probs variable contains the soft-clustering probabilities as calculated by HDBSCAN.

        If a non-HDBSCAN model is used, we can estimate the topic distributions after training our model:

        from bertopic import BERTopic\n\ntopic_model = BERTopic()\ntopics, _ = topic_model.fit_transform(docs) \ntopic_distr, _ = topic_model.approximate_distribution(docs, min_similarity=0)\n

        Then, we either pass the probs or topic_distr variable to .visualize_distribution to visualize either the probability distributions or the topic distributions:

        # To visualize the probabilities of topic assignment\ntopic_model.visualize_distribution(probs[0])\n\n# To visualize the topic distributions in a document\ntopic_model.visualize_distribution(topic_distr[0])\n

        Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])\ndf\n

        Note

        To get the stylized dataframe for .visualize_approximate_distribution, you will need to have Jinja installed. If you do not have it installed, an unstylized dataframe will be returned instead. You can install Jinja via pip install jinja2.

        Note

        The distribution of the probabilities does not give an indication of the distribution of topic frequencies across a document. It merely shows how confident BERTopic is that certain topics can be found in a document.

        "},{"location":"getting_started/visualization/visualize_hierarchy.html","title":"Hierarchy","text":"

        The topics that you create can be hierarchically reduced. In order to understand the potential hierarchical structure of the topics, we can use scipy.cluster.hierarchy to create clusters and visualize how they relate to one another. This might help to select an appropriate nr_topics when reducing the number of topics that you have created. To visualize this hierarchy, run the following:

        topic_model.visualize_hierarchy()\n

        Note

        Do note that this is not the actual procedure of .reduce_topics() when nr_topics is set to auto, since HDBSCAN is then used to automatically extract topics. The visualization above closely resembles the actual procedure of .reduce_topics() when a specific number of topics is selected through nr_topics.
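
        For reference, a minimal sketch of the actual reduction call (the value 30 is only illustrative):

        # Reduce to a specific number of topics; nr_topics=\"auto\" instead lets HDBSCAN decide\ntopic_model.reduce_topics(docs, nr_topics=30)\n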

        "},{"location":"getting_started/visualization/visualize_hierarchy.html#hierarchical-labels","title":"Hierarchical labels","text":"

        Although visualizing this hierarchy gives us information about the structure, it would be helpful to see what happens to the topic representations when merging topics. To do so, we first need to calculate the representations of the hierarchical topics:

        First, we train a basic BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n

        To visualize these results, we simply need to pass the resulting hierarchical_topics to our .visualize_hierarchy function:

        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you hover over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes.

        "},{"location":"getting_started/visualization/visualize_hierarchy.html#text-based-topic-tree","title":"Text-based topic tree","text":"

        Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use topic_model.get_topic_tree to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can more easily see which topics could logically be merged:

        >>> tree = topic_model.get_topic_tree(hierarchical_topics)\n>>> print(tree)\n.\n\u2514\u2500atheists_atheism_god_moral_atheist\n     \u251c\u2500atheists_atheism_god_atheist_argument\n     \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n     \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n     \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 Topic: 29\n
        Click here to view the full tree.
          .\n  \u251c\u2500people_armenian_said_god_armenians\n  \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u2502    \u251c\u2500jehovah_lord_mormon_mcconkie_god\n  \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500ra_satan_thou_god_lucifer \u2500\u2500 Topic: 94\n  \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500jehovah_lord_mormon_mcconkie_unto \u2500\u2500 Topic: 78\n  \u2502    \u2502    \u2502    \u2514\u2500jesus_mary_god_hell_sin\n  \u2502    \u2502    \u2502         \u251c\u2500jesus_hell_god_eternal_heaven\n  \u2502    \u2502    \u2502         \u2502    \u251c\u2500hell_jesus_eternal_god_heaven\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500jesus_tomb_disciples_resurrection_john \u2500\u2500 Topic: 69\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500hell_eternal_god_jesus_heaven \u2500\u2500 Topic: 53\n  \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500aaron_baptism_sin_law_god \u2500\u2500 Topic: 89\n  \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500mary_sin_maria_priest_conception \u2500\u2500 Topic: 56\n  \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500marriage_married_marry_ceremony_marriages \u2500\u2500 Topic: 110\n  \u2502    \u2514\u2500people_armenian_armenians_said_mr\n  \u2502         \u251c\u2500people_armenian_armenians_said_israel\n  \u2502         \u2502    \u251c\u2500god_homosexual_homosexuality_atheists_sex\n  \u2502         \u2502    \u2502    \u251c\u2500homosexual_homosexuality_sex_gay_homosexuals\n  \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500kinsey_sex_gay_men_sexual \u2500\u2500 Topic: 44\n  \u2502         \u2502    \u2502    \u2502    \u2514\u2500homosexuality_homosexual_sin_homosexuals_gay\n  \u2502         \u2502    \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500gay_homosexual_homosexuals_sexual_cramer \u2500\u2500 Topic: 50\n  \u2502         \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500homosexuality_homosexual_sin_paul_sex \u2500\u2500 Topic: 27\n  \u2502         \u2502    \u2502    \u2514\u2500god_atheists_atheism_moral_atheist\n  \u2502         \u2502    \u2502         \u251c\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500jim_context_challenges_articles_quote \u2500\u2500 Topic: 36\n  \u2502         \u2502    \u2502         \u2502    \u2514\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500islam_quran_islamic_rushdie_muslims \u2500\u2500 Topic: 31\n  \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500judas_scripture_bible_books_greek \u2500\u2500 Topic: 33\n  \u2502         \u2502    \u2502         \u2514\u2500atheists_atheism_god_moral_atheist\n  \u2502         \u2502    \u2502              \u251c\u2500atheists_atheism_god_atheist_argument\n  \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n  \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n  \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 
Topic: 29\n  \u2502         \u2502    \u2514\u2500armenian_armenians_people_israel_said\n  \u2502         \u2502         \u251c\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502    \u251c\u2500tax_rights_government_income_taxes\n  \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500rights_right_slavery_slaves_residence \u2500\u2500 Topic: 106\n  \u2502         \u2502         \u2502    \u2502    \u2514\u2500tax_government_taxes_income_libertarians\n  \u2502         \u2502         \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500government_libertarians_libertarian_regulation_party \u2500\u2500 Topic: 58\n  \u2502         \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500tax_taxes_income_billion_deficit \u2500\u2500 Topic: 41\n  \u2502         \u2502         \u2502    \u2514\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502         \u251c\u2500gun_guns_militia_firearms_amendment\n  \u2502         \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500blacks_penalty_death_cruel_punishment \u2500\u2500 Topic: 55\n  \u2502         \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500gun_guns_militia_firearms_amendment \u2500\u2500 Topic: 7\n  \u2502         \u2502         \u2502         \u2514\u2500armenian_armenians_israel_jews_turkish\n  \u2502         \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500israel_israeli_jews_arab_jewish \u2500\u2500 Topic: 4\n  \u2502         \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500armenian_armenians_turkish_armenia_azerbaijan \u2500\u2500 Topic: 15\n  \u2502         \u2502         \u2514\u2500stephanopoulos_president_mr_myers_ms\n  \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500serbs_muslims_stephanopoulos_mr_bosnia \u2500\u2500 Topic: 35\n  \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500myers_stephanopoulos_president_ms_mr \u2500\u2500 Topic: 87\n  \u2502         \u2514\u2500batf_fbi_koresh_compound_gas\n  \u2502              \u251c\u2500\u25a0\u2500\u2500reno_workers_janet_clinton_waco \u2500\u2500 Topic: 77\n  \u2502              \u2514\u2500batf_fbi_koresh_gas_compound\n  \u2502                   \u251c\u2500batf_koresh_fbi_warrant_compound\n  \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500batf_warrant_raid_compound_fbi \u2500\u2500 Topic: 42\n  \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500koresh_batf_fbi_children_compound \u2500\u2500 Topic: 61\n  \u2502                   \u2514\u2500\u25a0\u2500\u2500fbi_gas_tear_bds_building \u2500\u2500 Topic: 23\n  \u2514\u2500use_like_just_dont_new\n      \u251c\u2500game_team_year_games_like\n      \u2502    \u251c\u2500game_team_games_25_year\n      \u2502    \u2502    \u251c\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502    \u251c\u2500window_printer_use_problem_mhz\n      \u2502    \u2502    \u2502    \u2502    \u251c\u2500mhz_wire_simms_wiring_battery\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_mhz_battery_cpu_heat\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_pds_simm_vram_lc\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500pds_nubus_lc_slot_card \u2500\u2500 Topic: 119\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500simms_simm_vram_meg_dram 
\u2500\u2500 Topic: 32\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500mhz_battery_cpu_heat_speed\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500fan_cpu_heat_sink_fans \u2500\u2500 Topic: 92\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500mhz_speed_cpu_fpu_clock \u2500\u2500 Topic: 22\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500monitor_turn_power_computer_electricity \u2500\u2500 Topic: 91\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500battery_batteries_concrete_duo_discharge\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500duo_battery_apple_230_problem \u2500\u2500 Topic: 121\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500battery_batteries_concrete_discharge_temperature \u2500\u2500 Topic: 75\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500leds_uv_blue_light_boards \u2500\u2500 Topic: 66\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500wire_wiring_ground_neutral_outlets \u2500\u2500 Topic: 120\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500scope_scopes_phone_dial_number\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500dial_number_phone_line_output \u2500\u2500 Topic: 93\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500scope_scopes_motorola_generator_oscilloscope \u2500\u2500 Topic: 113\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500celp_dsp_sampling_antenna_digital\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500antenna_antennas_receiver_cable_transmitter \u2500\u2500 Topic: 70\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500celp_dsp_sampling_speech_voice \u2500\u2500 Topic: 52\n      \u2502    \u2502    \u2502    \u2502    \u2514\u2500window_printer_xv_mouse_windows\n      \u2502    \u2502    \u2502    \u2502         \u251c\u2500window_xv_error_widget_problem\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500error_symbol_undefined_xterm_rx\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500symbol_error_undefined_doug_parse \u2500\u2500 Topic: 63\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500rx_remote_server_xdm_xterm \u2500\u2500 Topic: 45\n      \u2502    \u2502    \u2502    \u2502         \u2502    
\u2514\u2500window_xv_widget_application_expose\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500window_widget_expose_application_event\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500gc_mydisplay_draw_gxxor_drawing \u2500\u2500 Topic: 103\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500window_widget_application_expose_event \u2500\u2500 Topic: 25\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500xv_den_polygon_points_algorithm\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500den_polygon_points_algorithm_polygons \u2500\u2500 Topic: 28\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500xv_24bit_image_bit_images \u2500\u2500 Topic: 57\n      \u2502    \u2502    \u2502    \u2502         \u2514\u2500printer_fonts_print_mouse_postscript\n      \u2502    \u2502    \u2502    \u2502              \u251c\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500scanner_logitech_grayscale_ocr_scanman \u2500\u2500 Topic: 108\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u2514\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u251c\u2500\u25a0\u2500\u2500printer_print_deskjet_hp_ink \u2500\u2500 Topic: 18\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u2514\u2500\u25a0\u2500\u2500fonts_font_truetype_tt_atm \u2500\u2500 Topic: 49\n      \u2502    \u2502    \u2502    \u2502              \u2514\u2500mouse_ghostscript_midi_driver_postscript\n      \u2502    \u2502    \u2502    \u2502                   \u251c\u2500ghostscript_midi_postscript_files_file\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500ghostscript_postscript_pageview_ghostview_dsc \u2500\u2500 Topic: 104\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u2514\u2500midi_sound_file_windows_driver\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u251c\u2500\u25a0\u2500\u2500location_mar_file_host_rwrr \u2500\u2500 Topic: 83\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u2514\u2500\u25a0\u2500\u2500midi_sound_driver_blaster_soundblaster \u2500\u2500 Topic: 98\n      \u2502    \u2502    \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500mouse_driver_mice_ball_problem \u2500\u2500 Topic: 68\n      \u2502    \u2502    \u2502    \u2514\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502         \u251c\u25001st_sale_condition_comics_hulk\n      \u2502    \u2502    \u2502         \u2502    \u251c\u2500sale_condition_offer_asking_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500condition_stereo_amp_speakers_asking\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500miles_car_amfm_toyota_cassette \u2500\u2500 Topic: 62\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500amp_speakers_condition_stereo_audio \u2500\u2500 Topic: 24\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500games_sale_pom_cds_shipping\n      \u2502    \u2502    \u2502         \u2502    \u2502         
\u251c\u2500pom_cds_sale_shipping_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500size_shipping_sale_condition_mattress \u2500\u2500 Topic: 100\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500pom_cds_cd_sale_picture \u2500\u2500 Topic: 37\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500games_game_snes_sega_genesis \u2500\u2500 Topic: 40\n      \u2502    \u2502    \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u251c\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500lens_tape_camera_backup_lenses\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500tape_backup_tapes_drive_4mm \u2500\u2500 Topic: 107\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500lens_camera_lenses_zoom_pouch \u2500\u2500 Topic: 114\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u25001st_hulk_comics_art_appears \u2500\u2500 Topic: 105\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500books_book_cover_trek_chemistry \u2500\u2500 Topic: 125\n      \u2502    \u2502    \u2502         \u2502         \u2514\u2500tickets_hotel_ticket_voucher_package\n      \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500hotel_voucher_package_vacation_room \u2500\u2500 Topic: 74\n      \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500tickets_ticket_june_airlines_july \u2500\u2500 Topic: 84\n      \u2502    \u2502    \u2502         \u2514\u2500game_team_games_season_hockey\n      \u2502    \u2502    \u2502              \u251c\u2500game_hockey_team_25_550\n      \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500espn_pt_pts_game_la \u2500\u2500 Topic: 17\n      \u2502    \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500team_25_game_hockey_550 \u2500\u2500 Topic: 2\n      \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500year_game_hit_baseball_players \u2500\u2500 Topic: 0\n      \u2502    \u2502    \u2514\u2500bike_car_greek_insurance_msg\n      \u2502    \u2502         \u251c\u2500car_bike_insurance_cars_engine\n      \u2502    \u2502         \u2502    \u251c\u2500car_insurance_cars_radar_engine\n      \u2502    \u2502         \u2502    \u2502    \u251c\u2500insurance_health_private_care_canada\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500insurance_health_private_care_canada \u2500\u2500 Topic: 99\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500insurance_car_accident_rates_sue \u2500\u2500 Topic: 82\n      \u2502    \u2502         \u2502    \u2502    \u2514\u2500car_cars_radar_engine_detector\n      \u2502    \u2502         \u2502    \u2502         \u251c\u2500car_radar_cars_detector_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500radar_detector_detectors_ka_alarm \u2500\u2500 Topic: 39\n      \u2502    \u2502         \u2502    \u2502         
\u2502    \u2514\u2500car_cars_mustang_ford_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500clutch_shift_shifting_transmission_gear \u2500\u2500 Topic: 88\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500car_cars_mustang_ford_v8 \u2500\u2500 Topic: 14\n      \u2502    \u2502         \u2502    \u2502         \u2514\u2500oil_diesel_odometer_diesels_car\n      \u2502    \u2502         \u2502    \u2502              \u251c\u2500odometer_oil_sensor_car_drain\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500odometer_sensor_speedo_gauge_mileage \u2500\u2500 Topic: 96\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500oil_drain_car_leaks_taillights \u2500\u2500 Topic: 102\n      \u2502    \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500diesel_diesels_emissions_fuel_oil \u2500\u2500 Topic: 79\n      \u2502    \u2502         \u2502    \u2514\u2500bike_riding_ride_bikes_motorcycle\n      \u2502    \u2502         \u2502         \u251c\u2500bike_ride_riding_bikes_lane\n      \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500bike_ride_riding_lane_car \u2500\u2500 Topic: 11\n      \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500bike_bikes_miles_honda_motorcycle \u2500\u2500 Topic: 19\n      \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500countersteering_bike_motorcycle_rear_shaft \u2500\u2500 Topic: 46\n      \u2502    \u2502         \u2514\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u251c\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u251c\u2500greek_kuwait_greece_turkish_greeks\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500greek_greece_turkish_greeks_cyprus \u2500\u2500 Topic: 71\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kuwait_iraq_iran_gulf_arabia \u2500\u2500 Topic: 76\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2514\u2500msg_dog_drugs_drug_food\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u251c\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500clinton_bush_quayle_reagan_panicking \u2500\u2500 Topic: 101\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u2514\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500cooper_trial_weaver_spence_witnesses \u2500\u2500 Topic: 90\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500dog_dogs_bike_trained_springer \u2500\u2500 Topic: 67\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2514\u2500msg_drugs_drug_food_chinese\n      \u2502    \u2502              \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500msg_food_chinese_foods_taste \u2500\u2500 Topic: 30\n      
\u2502    \u2502              \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500drugs_drug_marijuana_cocaine_alcohol \u2500\u2500 Topic: 72\n      \u2502    \u2502              \u2502    \u2502    \u2514\u2500water_theory_universe_science_larsons\n      \u2502    \u2502              \u2502    \u2502         \u251c\u2500water_nuclear_cooling_steam_dept\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500rocketry_rockets_engines_nuclear_plutonium \u2500\u2500 Topic: 115\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u2514\u2500water_cooling_steam_dept_plants\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500water_dept_phd_environmental_atmospheric \u2500\u2500 Topic: 97\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500cooling_water_steam_towers_plants \u2500\u2500 Topic: 109\n      \u2502    \u2502              \u2502    \u2502         \u2514\u2500theory_universe_larsons_larson_science\n      \u2502    \u2502              \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500theory_universe_larsons_larson_science \u2500\u2500 Topic: 54\n      \u2502    \u2502              \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500oort_cloud_grbs_gamma_burst \u2500\u2500 Topic: 80\n      \u2502    \u2502              \u2502    \u2514\u2500helmet_kirlian_photography_lock_wax\n      \u2502    \u2502              \u2502         \u251c\u2500helmet_kirlian_photography_leaf_mask\n      \u2502    \u2502              \u2502         \u2502    \u251c\u2500kirlian_photography_leaf_pictures_deleted\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u251c\u2500deleted_joke_stuff_maddi_nickname\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500joke_maddi_nickname_nicknames_frank \u2500\u2500 Topic: 43\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500deleted_stuff_bookstore_joke_motto \u2500\u2500 Topic: 81\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kirlian_photography_leaf_pictures_aura \u2500\u2500 Topic: 85\n      \u2502    \u2502              \u2502         \u2502    \u2514\u2500helmet_mask_liner_foam_cb\n      \u2502    \u2502              \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500helmet_liner_foam_cb_helmets \u2500\u2500 Topic: 112\n      \u2502    \u2502              \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500mask_goalies_77_santore_tl \u2500\u2500 Topic: 123\n      \u2502    \u2502              \u2502         \u2514\u2500lock_wax_paint_plastic_ear\n      \u2502    \u2502              \u2502              \u251c\u2500\u25a0\u2500\u2500lock_cable_locks_bike_600 \u2500\u2500 Topic: 117\n      \u2502    \u2502              \u2502              \u2514\u2500wax_paint_ear_plastic_skin\n      \u2502    \u2502              \u2502                   \u251c\u2500\u25a0\u2500\u2500wax_paint_plastic_scratches_solvent \u2500\u2500 Topic: 65\n      \u2502    \u2502              \u2502                   \u2514\u2500\u25a0\u2500\u2500ear_wax_skin_greasy_acne \u2500\u2500 Topic: 116\n      \u2502    \u2502              \u2514\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u251c\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u2502    
\u251c\u2500\u25a0\u2500\u2500m4_mp_14_mw_mo \u2500\u2500 Topic: 111\n      \u2502    \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500test_ensign_nameless_deane_deanebinahccbrandeisedu \u2500\u2500 Topic: 118\n      \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500ites_cheek_hello_hi_ken \u2500\u2500 Topic: 3\n      \u2502    \u2514\u2500space_medical_health_disease_cancer\n      \u2502         \u251c\u2500medical_health_disease_cancer_patients\n      \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500cancer_centers_center_medical_research \u2500\u2500 Topic: 122\n      \u2502         \u2502    \u2514\u2500health_medical_disease_patients_hiv\n      \u2502         \u2502         \u251c\u2500patients_medical_disease_candida_health\n      \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n      \u2502         \u2502         \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n      \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n      \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n      \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n      \u2502         \u2514\u2500space_launch_nasa_shuttle_orbit\n      \u2502              \u251c\u2500space_moon_station_nasa_launch\n      \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500sky_advertising_billboard_billboards_space \u2500\u2500 Topic: 59\n      \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500space_station_moon_redesign_nasa \u2500\u2500 Topic: 16\n      \u2502              \u2514\u2500space_mission_hst_launch_orbit\n      \u2502                   \u251c\u2500space_launch_nasa_orbit_propulsion\n      \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500space_launch_nasa_propulsion_astronaut \u2500\u2500 Topic: 47\n      \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500orbit_km_jupiter_probe_earth \u2500\u2500 Topic: 86\n      \u2502                   \u2514\u2500\u25a0\u2500\u2500hst_mission_shuttle_orbit_arrays \u2500\u2500 Topic: 60\n      \u2514\u2500drive_file_key_windows_use\n          \u251c\u2500key_file_jpeg_encryption_image\n          \u2502    \u251c\u2500key_encryption_clipper_chip_keys\n          \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500key_clipper_encryption_chip_keys \u2500\u2500 Topic: 1\n          \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500entry_file_ripem_entries_key \u2500\u2500 Topic: 73\n          \u2502    \u2514\u2500jpeg_image_file_gif_images\n          \u2502         \u251c\u2500motif_graphics_ftp_available_3d\n          \u2502         \u2502    \u251c\u2500motif_graphics_openwindows_ftp_available\n          \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500openwindows_motif_xview_windows_mouse \u2500\u2500 Topic: 20\n          \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500graphics_widget_ray_3d_available \u2500\u2500 Topic: 95\n          \u2502         \u2502    \u2514\u2500\u25a0\u2500\u25003d_machines_version_comments_contact \u2500\u2500 Topic: 38\n          \u2502         \u2514\u2500jpeg_image_gif_images_format\n          \u2502              \u251c\u2500\u25a0\u2500\u2500gopher_ftp_files_stuffit_images \u2500\u2500 Topic: 51\n          \u2502        
      \u2514\u2500\u25a0\u2500\u2500jpeg_image_gif_format_images \u2500\u2500 Topic: 13\n          \u2514\u2500drive_db_card_scsi_windows\n              \u251c\u2500db_windows_dos_mov_os2\n              \u2502    \u251c\u2500\u25a0\u2500\u2500copy_protection_program_software_disk \u2500\u2500 Topic: 64\n              \u2502    \u2514\u2500\u25a0\u2500\u2500db_windows_dos_mov_os2 \u2500\u2500 Topic: 8\n              \u2514\u2500drive_card_scsi_drives_ide\n                      \u251c\u2500drive_scsi_drives_ide_disk\n                      \u2502    \u251c\u2500\u25a0\u2500\u2500drive_scsi_drives_ide_disk \u2500\u2500 Topic: 6\n                      \u2502    \u2514\u2500\u25a0\u2500\u2500meg_sale_ram_drive_shipping \u2500\u2500 Topic: 12\n                      \u2514\u2500card_modem_monitor_video_drivers\n                          \u251c\u2500\u25a0\u2500\u2500card_monitor_video_drivers_vga \u2500\u2500 Topic: 5\n                          \u2514\u2500\u25a0\u2500\u2500modem_port_serial_irq_com \u2500\u2500 Topic: 10\n
        "},{"location":"getting_started/visualization/visualize_hierarchy.html#visualize-hierarchical-documents","title":"Visualize Hierarchical Documents","text":"

        We can extend the previous method by calculating the topic representations at different levels of the hierarchy and plotting them on a 2D plane. To do so, we first need to calculate the hierarchical topics:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic and extract hierarchical topics\ntopic_model = BERTopic().fit(docs, embeddings)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n
        Then, we can visualize the hierarchical documents by either supplying the function with our embeddings or by reducing their dimensionality ourselves beforehand:

        # Run the visualization with the original embeddings\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True, which disables the option to hover over the individual points and see the content of the documents. This keeps the resulting visualization smaller so that it fits into your RAM. However, it might be interesting to set hide_document_hover=False to hover over the points and see the content of the documents.

        "},{"location":"getting_started/visualization/visualize_terms.html","title":"Terms","text":"

        We can visualize the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores for each topic representation. Insights can be gained from the relative c-TF-IDF scores between and within topics. Moreover, you can easily compare topic representations to each other. To visualize these terms, run the following:

        topic_model.visualize_barchart()\n
        "},{"location":"getting_started/visualization/visualize_terms.html#visualize-term-score-decline","title":"Visualize Term Score Decline","text":"

        Topics are represented by a number of words starting with the best representative word. Each word is represented by a c-TF-IDF score. The higher the score, the more representative a word is of the topic. Since the topic words are sorted by their c-TF-IDF score, the scores slowly decline with each word that is added. At some point, adding words to the topic representation only marginally increases the total c-TF-IDF score and would not be beneficial for its representation.

        To visualize this effect, we can plot the c-TF-IDF scores for each topic by the term rank of each word. In other words, the position of the words (term rank), where the words with the highest c-TF-IDF score have a rank of 1, is put on the x-axis, whereas the y-axis is populated by the c-TF-IDF scores. The result is a visualization that shows you the decline of the c-TF-IDF score when adding words to the topic representation. It allows you, using the elbow method, to select the best number of words in a topic.

        To visualize the c-TF-IDF score decline, run the following:

        topic_model.visualize_term_rank()\n

        To enable the log scale on the y-axis for a better view of individual topics, run the following:

        topic_model.visualize_term_rank(log_scale=True)\n

        This visualization was heavily inspired by the \"Term Probability Decline\" visualization found in an analysis by the amazing tmtoolkit. Reference to that specific analysis can be found here.

        "},{"location":"getting_started/visualization/visualize_topics.html","title":"Topics","text":"

        Visualizing BERTopic and its derivatives is important in understanding the model, how it works, and, more importantly, where it works. Since topic modeling can be quite a subjective field, it is difficult for users to validate their models. Looking at the topics and seeing if they make sense is an important factor in alleviating this issue.

        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topics","title":"Visualize Topics","text":"

        After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can visualize the topics that were generated in a way very similar to LDAvis.

        We embed our c-TF-IDF representation of the topics in 2D using UMAP and then visualize the two dimensions using Plotly so that we can create an interactive view.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs) \n

        Then, we can call .visualize_topics to create a 2D representation of your topics. The resulting graph is an interactive Plotly graph, which can be converted to HTML:

        topic_model.visualize_topics()\n
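
        For instance, a minimal sketch of exporting the interactive figure to HTML (the filename is only illustrative):

        # The visualization returns a Plotly figure that can be written to a standalone HTML file\nfig = topic_model.visualize_topics()\nfig.write_html(\"topics.html\")\n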

        You can use the slider to select a topic, which then lights up red. If you hover over a topic, general information is given about the topic, including its size and corresponding words.

        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topic-similarity","title":"Visualize Topic Similarity","text":"

        Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarity to those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other. To visualize the heatmap, run the following:

        topic_model.visualize_heatmap()\n

        Note

        You can set n_clusters in visualize_heatmap to order the topics by their similarity. This will result in blocks being formed in the heatmap indicating which clusters of topics are similar to each other. This step is very much recommended as it will make reading the heatmap easier.

        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topics-over-time","title":"Visualize Topics over Time","text":"

        After creating topics over time with Dynamic Topic Modeling, we can visualize these topics by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call .visualize_topics_over_time with the newly created topics over time:

        import re\nimport pandas as pd\nfrom bertopic import BERTopic\n\n# Prepare data\ntrump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')\ntrump.text = trump.apply(lambda row: re.sub(r\"http\\S+\", \"\", row.text).lower(), 1)\ntrump.text = trump.apply(lambda row: \" \".join(filter(lambda x:x[0]!=\"@\", row.text.split())), 1)\ntrump.text = trump.apply(lambda row: \" \".join(re.sub(\"[^a-zA-Z]+\", \" \", row.text).split()), 1)\ntrump = trump.loc[(trump.isRetweet == \"f\") & (trump.text != \"\"), :]\ntimestamps = trump.date.to_list()\ntweets = trump.text.to_list()\n\n# Create topics over time\nmodel = BERTopic(verbose=True)\ntopics, probs = model.fit_transform(tweets)\ntopics_over_time = model.topics_over_time(tweets, timestamps)\n

        Then, we visualize some interesting topics:

        model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])\n
        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topics-per-class","title":"Visualize Topics per Class","text":"

        You might want to extract and visualize the topic representation per class. For example, if you have specific groups of users that might approach topics differently, then extracting these representations helps in understanding how these users talk about certain topics. In other words, this simply creates a topic representation for certain classes that you might have in your data.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Prepare data and classes\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\nclasses = [data[\"target_names\"][i] for i in data[\"target\"]]\n\n# Create topic model and calculate topics per class\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_per_class = topic_model.topics_per_class(docs, classes=classes)\n

        Then, we visualize the topic representation of major topics per class:

        topic_model.visualize_topics_per_class(topics_per_class)\n
        "},{"location":"getting_started/zeroshot/zeroshot.html","title":"Zero-shot Topic Modeling","text":"

        Zero-shot Topic Modeling is a technique that allows you to find predefined topics in large amounts of documents. When faced with many documents, you often have an idea of which topics will definitely be in there, whether that comes from simply knowing your data or from a domain expert who helps define those topics.

        This method allows you to not only find those specific topics but also create new topics for documents that do not fit with your predefined topics. This allows for extensive flexibility as there are three scenarios to explore.

        First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.

        Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.

        Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.

        \"Clustering\" the labels Embed cosine similaritydocumentzeroshot For each document, assign topics based on between and embeddings ManualBERTopicBERTopic Create two models: (zeroshot documents) (non-zeroshot documents) the models into one Merge zeroshot topicslabels Define through . \"Clustering\" Zeroshot topic 1 \"Topic Modeling\" Zeroshot topic 2 \"Large Language Models (LLM)\" Zeroshot topic 3 \"Topic Modeling\" \"Large Language Models\" Topic Modeling BERTopic + Topic X Topic Y Topic Z Manual BERTopic Topic Modeling LLM Clustering Merged BERTopic Topic Modeling LLM Clustering Topic X Topic Y Topic Z Large Language Models No match found Clustering

        This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with those of the predefined labels using cosine similarity. If the similarity passes a user-defined threshold, the zero-shot topic is assigned to the document. If it does not, that document, along with the other unassigned documents, is put through a regular BERTopic model.

        This creates two models. One for the zero-shot topics and one for the non-zero-shot topics. We combine these two BERTopic models to create a single model that contains both zero-shot and non-zero-shot topics.

        "},{"location":"getting_started/zeroshot/zeroshot.html#example","title":"Example","text":"

        In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers. We know the data and believe there to be at least the following topics: clustering, topic modeling, and large language models. However, we are not sure whether other topics exist and want to explore those.

        Zero-shot BERTopic needs two parameters:
        • zeroshot_topic_list - The names of the topics to assign documents to. Making these as descriptive as possible helps improve the assignment, since it is based on cosine similarities between embeddings.
        • zeroshot_min_similarity - The minimum cosine similarity needed to match a document to a zero-shot topic. It is a value between 0 and 1.

        Using this feature is straightforward:

        from datasets import load_dataset\n\nfrom bertopic import BERTopic\nfrom bertopic.representation import KeyBERTInspired\n\n# We select a subsample of 5000 abstracts from ArXiv\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\ndocs = dataset[\"abstract\"][:5_000]\n\n# We define a number of topics that we know are in the documents\nzeroshot_topic_list = [\"Clustering\", \"Topic Modeling\", \"Large Language Models\"]\n\n# We fit our model using the zero-shot topics\n# and we define a minimum similarity. For each document,\n# if the similarity does not exceed that value, it will be used\n# for clustering instead.\ntopic_model = BERTopic(\n    embedding_model=\"thenlper/gte-small\", \n    min_topic_size=15,\n    zeroshot_topic_list=zeroshot_topic_list,\n    zeroshot_min_similarity=.85,\n    representation_model=KeyBERTInspired()\n)\ntopics, _ = topic_model.fit_transform(docs)\n

        When you run topic_model.get_topic_info(), you will see something like this:

        The zeroshot_min_similarity parameter controls how many of the documents are assigned to the predefined zero-shot topics. Lower this value and more documents will be assigned to zero-shot topics while fewer documents will be clustered. Increase this value and fewer documents will be assigned to zero-shot topics while more documents will be clustered.

        Note

        Setting the zeroshot_min_similarity parameter requires a bit of experimentation. Some embedding models have different similarity distributions, so trying out the values manually and exploring the results is highly advised.
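
        A rough sketch of such an experiment, assuming docs and zeroshot_topic_list from the example above (the candidate thresholds are illustrative and every value requires refitting the model):

        for similarity in [0.7, 0.8, 0.85, 0.9]:\n    topic_model = BERTopic(\n        embedding_model=\"thenlper/gte-small\",\n        zeroshot_topic_list=zeroshot_topic_list,\n        zeroshot_min_similarity=similarity,\n    )\n    topics, _ = topic_model.fit_transform(docs)\n\n    # Inspect how many documents were matched to the predefined topics\n    print(similarity, topic_model.get_topic_info().head())\n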

        Tip

        Because zero-shot topic modeling is essentially merging two different topic models, the probs will be empty initially. If you want to have the probabilities of topics across documents, you can run topic_model.transform on your documents to extract the updated probs.
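
        A minimal sketch of extracting those updated probabilities after fitting:

        # Re-run the fitted model over the same documents to obtain probabilities\ntopics, probs = topic_model.transform(docs)\n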

        "}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"index.html","title":"BERTopic","text":"

        BERTopic is a topic modeling technique that leverages \ud83e\udd17 transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

        BERTopic supports all kinds of topic modeling techniques:

        Guided, Supervised, Semi-supervised, Manual, Multi-topic distributions, Hierarchical, Class-based, Dynamic, Online/Incremental, Multimodal, Multi-aspect, Text Generation/LLM, Zero-shot (new!), Merge Models (new!), and Seed Words (new!)

        Corresponding medium posts can be found here, here and here. For a more detailed overview, you can read the paper or see a brief overview.

        "},{"location":"index.html#installation","title":"Installation","text":"

        Installation, with sentence-transformers, can be done using PyPI:

        pip install bertopic\n

        You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:

        # Choose an embedding backend\npip install bertopic[flair,gensim,spacy,use]\n\n# Topic modeling with images\npip install bertopic[vision]\n
        "},{"location":"index.html#quick-start","title":"Quick Start","text":"

        We start by extracting topics from the well-known 20 newsgroups dataset containing English documents:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        After generating topics and their probabilities, we can access the frequent topics that were generated:

        >>> topic_model.get_topic_info()\n\nTopic   Count   Name\n-1      4630    -1_can_your_will_any\n0       693     49_windows_drive_dos_file\n1       466     32_jesus_bible_christian_faith\n2       441     2_space_launch_orbit_lunar\n3       381     22_key_encryption_keys_encrypted\n

        -1 refers to all outliers and should typically be ignored. Next, let's take a look at the most frequent topic that was generated, topic 0:

        >>> topic_model.get_topic(0)\n\n[('windows', 0.006152228076250982),\n ('drive', 0.004982897610645755),\n ('dos', 0.004845038866360651),\n ('file', 0.004140142872194834),\n ('disk', 0.004131678774810884),\n ('mac', 0.003624848635985097),\n ('memory', 0.0034840976976789903),\n ('software', 0.0034415334250699077),\n ('email', 0.0034239554442333257),\n ('pc', 0.003047105930670237)]\n

        Using .get_document_info, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.:

        >>> topic_model.get_document_info(docs)\n\nDocument                               Topic    Name                        Top_n_words                     Probability    ...\nI am sure some bashers of Pens...       0       0_game_team_games_season    game - team - games...          0.200010       ...\nMy brother is in the market for...      -1     -1_can_your_will_any         can - your - will...            0.420668       ...\nFinally you said what you dream...      -1     -1_can_your_will_any         can - your - will...            0.807259       ...\nThink! It is the SCSI card doing...     49     49_windows_drive_dos_file    windows - drive - docs...       0.071746       ...\n1) I have an old Jasmine drive...       49     49_windows_drive_dos_file    windows - drive - docs...       0.038983       ...\n

        Multilingual

        Use BERTopic(language=\"multilingual\") to select a model that supports 50+ languages.
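
        For example:

        from bertopic import BERTopic\n\n# Select an embedding model that supports 50+ languages\ntopic_model = BERTopic(language=\"multilingual\")\n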

        "},{"location":"index.html#fine-tune-topic-representations","title":"Fine-tune Topic Representations","text":"

        In BERTopic, there are a number of different topic representations that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. A great start is KeyBERTInspired, which for many users increases the coherence and reduces the number of stopwords in the resulting topic representations:

        from bertopic.representation import KeyBERTInspired\n\n# Fine-tune your topic representations\nrepresentation_model = KeyBERTInspired()\ntopic_model = BERTopic(representation_model=representation_model)\n

        However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more:

        import openai\nfrom bertopic.representation import OpenAI\n\n# Fine-tune topic representations with GPT\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True)\ntopic_model = BERTopic(representation_model=representation_model)\n

        Multi-aspect Topic Modeling

        Instead of iterating over all of these different topic representations, you can model them simultaneously with multi-aspect topic representations in BERTopic.

        "},{"location":"index.html#modularity","title":"Modularity","text":"

        By default, the main steps for topic modeling with BERTopic are sentence-transformers, UMAP, HDBSCAN, and c-TF-IDF run in sequence. However, BERTopic assumes some independence between these steps, which makes it quite modular. In other words, BERTopic not only allows you to build your own topic model but also to explore several topic modeling techniques on top of your customized topic model:

        You can swap out any of these models or even remove them entirely (see the sketch after this list). The following steps are completely modular:

        1. Embedding documents
        2. Reducing dimensionality of embeddings
        3. Clustering reduced embeddings into topics
        4. Tokenization of topics
        5. Weight tokens
        6. Represent topics with one or multiple representations
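
        As a rough sketch of this modularity, the dimensionality reduction and clustering sub-models can be swapped for your own instances (the parameter values below are illustrative):

        from bertopic import BERTopic\nfrom umap import UMAP\nfrom hdbscan import HDBSCAN\n\n# Swap in your own dimensionality reduction and clustering models\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')\nhdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', prediction_data=True)\n\ntopic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)\n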

        You can find more about the underlying algorithm and its assumptions here.

        "},{"location":"index.html#overview","title":"Overview","text":"

        BERTopic has many functions, which can quickly become overwhelming. To alleviate this issue, below you will find an overview of all methods and a short description of their purpose.

        "},{"location":"index.html#common","title":"Common","text":"

        Below, you will find an overview of common functions in BERTopic.

        • Fit the model: .fit(docs)
        • Fit the model and predict documents: .fit_transform(docs)
        • Predict new documents: .transform([new_doc])
        • Access single topic: .get_topic(topic=12)
        • Access all topics: .get_topics()
        • Get topic freq: .get_topic_freq()
        • Get all topic information: .get_topic_info()
        • Get all document information: .get_document_info(docs)
        • Get representative docs per topic: .get_representative_docs()
        • Update topic representation: .update_topics(docs, n_gram_range=(1, 3))
        • Generate topic labels: .generate_topic_labels()
        • Set topic labels: .set_topic_labels(my_custom_labels)
        • Merge topics: .merge_topics(docs, topics_to_merge)
        • Reduce nr of topics: .reduce_topics(docs, nr_topics=30)
        • Reduce outliers: .reduce_outliers(docs, topics)
        • Find topics: .find_topics(\"vehicle\")
        • Save model: .save(\"my_model\", serialization=\"safetensors\")
        • Load model: BERTopic.load(\"my_model\")
        • Get parameters: .get_params()
        "},{"location":"index.html#attributes","title":"Attributes","text":"

        After having trained your BERTopic model, several attributes are saved within it. These attributes, in part, refer to how model information is stored on an estimator during fitting. The attributes that you see below all end in _ and are public attributes that can be used to access model information.

        • .topics_ : The topics that are generated for each document after training or updating the topic model.
        • .probabilities_ : The probabilities that are generated for each document if HDBSCAN is used.
        • .topic_sizes_ : The size of each topic.
        • .topic_mapper_ : A class for tracking topics and their mappings anytime they are merged/reduced.
        • .topic_representations_ : The top n terms per topic and their respective c-TF-IDF values.
        • .c_tf_idf_ : The topic-term matrix as calculated through c-TF-IDF.
        • .topic_aspects_ : The different aspects, or representations, of each topic.
        • .topic_labels_ : The default labels for each topic.
        • .custom_labels_ : Custom labels for each topic as generated through .set_topic_labels.
        • .topic_embeddings_ : The embeddings for each topic if embedding_model was used.
        • .representative_docs_ : The representative documents for each topic if HDBSCAN is used.
        "},{"location":"index.html#variations","title":"Variations","text":"

        There are many different use cases in which topic modeling can be used. As such, several variations of BERTopic have been developed such that one package can be used across many use cases.

        • Topic Distribution Approximation: .approximate_distribution(docs)
        • Online Topic Modeling: .partial_fit(doc)
        • Semi-supervised Topic Modeling: .fit(docs, y=y)
        • Supervised Topic Modeling: .fit(docs, y=y)
        • Manual Topic Modeling: .fit(docs, y=y)
        • Multimodal Topic Modeling: .fit(docs, images=images)
        • Topic Modeling per Class: .topics_per_class(docs, classes)
        • Dynamic Topic Modeling: .topics_over_time(docs, timestamps)
        • Hierarchical Topic Modeling: .hierarchical_topics(docs)
        • Guided Topic Modeling: BERTopic(seed_topic_list=seed_topic_list)
        • Zero-shot Topic Modeling: BERTopic(zeroshot_topic_list=zeroshot_topic_list)
        • Merge Multiple Models: BERTopic.merge_models([topic_model_1, topic_model_2])
        "},{"location":"index.html#visualizations","title":"Visualizations","text":"

        Evaluating topic models can be rather difficult due to the somewhat subjective nature of evaluation. Visualizing different aspects of the topic model helps in understanding the model and makes it easier to tweak the model to your liking.

        • Visualize Topics: .visualize_topics()
        • Visualize Documents: .visualize_documents()
        • Visualize Document with DataMapPlot: .visualize_document_datamap()
        • Visualize Document Hierarchy: .visualize_hierarchical_documents()
        • Visualize Topic Hierarchy: .visualize_hierarchy()
        • Visualize Topic Tree: .get_topic_tree(hierarchical_topics)
        • Visualize Topic Terms: .visualize_barchart()
        • Visualize Topic Similarity: .visualize_heatmap()
        • Visualize Term Score Decline: .visualize_term_rank()
        • Visualize Topic Probability Distribution: .visualize_distribution(probs[0])
        • Visualize Topics over Time: .visualize_topics_over_time(topics_over_time)
        • Visualize Topics per Class: .visualize_topics_per_class(topics_per_class)
        "},{"location":"index.html#citation","title":"Citation","text":"

        To cite the BERTopic paper, please use the following bibtex reference:

        @article{grootendorst2022bertopic,\n  title={BERTopic: Neural topic modeling with a class-based TF-IDF procedure},\n  author={Grootendorst, Maarten},\n  journal={arXiv preprint arXiv:2203.05794},\n  year={2022}\n}\n
        "},{"location":"changelog.html","title":"Changelog","text":""},{"location":"changelog.html#version-0163","title":"Version 0.16.3","text":"

        Release date: 22 July, 2024

        Highlights:
        • Simplify zero-shot topic modeling by @ianrandman in #2060
        • Option to choose between c-TF-IDF and Topic Embeddings in many functions by @azikoss in #1894
          • Use the use_ctfidf parameter in the following function to choose between c-TF-IDF and topic embeddings:
            • hierarchical_topics, reduce_topics, visualize_hierarchy, visualize_heatmap, visualize_topics
        • Linting with Ruff by @afuetterer in #2033
        • Switch from setup.py to pyproject.toml by @afuetterer in #1978
        • In multi-aspect context, allow Main model to be chained by @ddicato in #2002
        Fixes:
        • Added templates for issues and pull requests
        • Update River documentation example by @Proteusiq in #2004
        • Fix PartOfSpeech reproducibility by @Greenpp in #1996
        • Fix PartOfSpeech ignoring first word by @Greenpp in #2024
        • Make sklearn embedding backend auto-select more cautious by @freddyheppell in #1984
        • Fix typos by @afuetterer in #1974
        • Fix hierarchical_topics(...) when the distances between three clusters are the same by @azikoss in #1929
        • Fixes to chain strategy example in outlier_reduction.md by @reuning in #2065
        • Remove obsolete flake8 config and update line length by @afuetterer in #2066
        "},{"location":"changelog.html#version-0162","title":"Version 0.16.2","text":"

        Release date: 12 May, 2024

        Fixes:
        • Fix issue with zeroshot topic modeling missing outlier #1957
        • Bump github actions versions by @afuetterer in #1941
        • Drop support for python 3.7 by @afuetterer in #1949
        • Add testing python 3.10+ in Github actions by @afuetterer in #1968
        • Speed up fitting CountVectorizer by @dannywhuang in #1938
        • Fix transform when using cuML HDBSCAN by @beckernick in #1960
        • Fix wrong link in algorithm documentation by @naeyn in #1970
        "},{"location":"changelog.html#version-0161","title":"Version 0.16.1","text":"

        Release date: 21 April, 2024

        Highlights:
        • Add Quantized LLM Tutorial
        • Add optional datamapplot visualization using topic_model.visualize_document_datamap by @lmcinnes in #1750
        • Migrated OpenAIBackend to openai>=1 by @peguerosdc in #1724
        • Add automatic height scaling and font resize by @ir2718 in #1863
        • Use [KEYWORDS] tags with the LangChain representation model by @mcantimmy in #1871
        Fixes:
        • Fixed issue with .merge_models seemingly skipping topic #1898
        • Fixed Cohere client.embed TypeError #1904
        • Fixed AttributeError: 'TextGeneration' object has no attribute 'random_state' #1870
        • Fixed topic embeddings not properly updated if all outliers were removed #1838
        • Fixed issue with representation models not properly merging #1762
        • Fixed Embeddings not ordered correctly when using .merge_models #1804
        • Fixed Outlier topic not in the 0th position when using zero-shot topic modeling causing prediction issues (amongst others) #1804
        • Fixed Incorrect label in ZeroShot doc SVG #1732
        • Fixed MultiModalBackend throws error with clip-ViT-B-32-multilingual-v1 #1670
        • Fixed AuthenticationError while using OpenAI() #1678

        • Update FAQ on Apple Silicon by @benz0li in #1901

        • Add documentation DataMapPlot + FAQ for running on Apple Silicon by @dkapitan in #1854
        • Remove commas from pip install reference in readme by @luisoala in #1850
        • Spelling corrections by @joouha in #1801
        • Replacing the deprecated text-ada-001 model with the latest text-embedding-3-small from OpenAI by @atmb4u in #1800
        • Prevent invalid empty input error when retrieving embeddings with openai backend by @liaoelton in #1827
        • Remove spurious warning about missing embedding model by @sliedes in #1774
        • Fix type hint in ClassTfidfTransformer constructor by @snape in #1803
        • Fix typo and simplify wording in OnlineCountVectorizer docstring by @chrisji in #1802
        • Fixed warning when saving a topic model without an embedding model by @zilch42 in #1740
        • Fix bug in TextGeneration by @manveersadhal in #1726
        • Fix an incorrect link to usecases.md by @nicholsonjf in #1731
        • Prevent model argument being passed twice when using generator_kwargs in OpenAI by @ninavandiermen in #1733
        • Several fixes to the docstrings by @arpadikuma in #1719
        • Remove unused cluster_df variable in hierarchical_topics by @shadiakiki1986 in #1701
        • Removed redundant quotation mark by @LawrenceFulton in #1695
        • Fix typo in merge models docs by @zilch42 in #1660
        "},{"location":"changelog.html#version-0160","title":"Version 0.16.0","text":"

        Release date: 26 November, 2023

        Highlights:
        • Merge pre-trained BERTopic models with .merge_models
          • Combine models with different representations together!
          • Use this for incremental/online topic modeling to detect new incoming topics
          • First step towards federated learning with BERTopic
        • Zero-shot Topic Modeling
          • Use a predefined list of topics to assign documents
          • If needed, allows for further exploration of undefined topics
        • Seed (domain-specific) words with ClassTfidfTransformer
          • Make sure selected words are more likely to end up in the representation without influencing the clustering process
        • Added params to truncate documents to length when using LLMs
        • Added LlamaCPP as a representation model
        • LangChain: Support for LCEL Runnables by @joshuasundance-swca in #1586
        • Added topics parameter to .topics_over_time to select a subset of documents and topics
        • Documentation:
          • Best practices Guide
          • Llama 2 Tutorial
          • Zephyr Tutorial
          • Improved embeddings guidance (MTEB)
          • Improved logging throughout the package
        • Added support for Cohere's Embed v3:
          cohere_model = CohereBackend(\n    client,\n    embedding_model=\"embed-english-v3.0\",\n    embed_kwargs={\"input_type\": \"clustering\"}\n)\n
        Fixes:
        • Fixed n-gram Keywords need delimiting in OpenAI() #1546
        • Fixed OpenAI v1.0 issues #1629
        • Improved documentation/logging to address #1589, #1591
        • Fixed engine support for Azure OpenAI embeddings #1577
        • Fixed OpenAI Representation: KeyError: 'content' #1570
        • Fixed Loading topic model with multiple topic aspects changes their format #1487
        • Fix expired link in algorithm.md by @burugaria7 in #1396
        • Fix guided topic modeling in cuML's UMAP by @stevetracvc in #1326
        • OpenAI: Allow retrying on Service Unavailable errors by @agamble in #1407
        • Fixed parameter naming for HDBSCAN in best practices by @rnckp in #1408
        • Fixed typo in tips_and_tricks.md by @aronnoordhoek in #1446
        • Fix typos in documentation by @bobchien in #1481
        • Fix IndexError when all outliers are removed by reduce_outliers by @Aratako in #1466
        • Fix TypeError on reduce_outliers \"probabilities\" by @ananaphasia in #1501
        • Add new line to fix markdown bullet point formatting by @saeedesmaili in #1519
        • Update typo in topicrepresentation.md by @oliviercaron in #1537
        • Fix typo in FAQ by @sandijou in #1542
        • Fixed typos in best practices documentation by @poomkusa in #1557
        • Correct TopicMapper doc example by @chrisji in #1637
        • Fix typing in hierarchical_topics by @dschwalm in #1364
        • Fixed typing issue with threshold parameter in reduce_outliers by @dschwalm in #1380
        • Fix several typos by @mertyyanik in #1307 (#1307)
        • Fix inconsistent naming by @rolanderdei in #1073
        Merge Pre-trained BERTopic Models

        The new .merge_models feature allows for any number of fitted BERTopic models to be merged. Doing so allows for a number of use cases:

        • Incremental topic modeling -- Continuously merge models together to detect whether new topics have appeared
        • Federated Learning - Train BERTopic models on different clients and combine them on a central server
        • Minimal compute - We can essentially batch the training process into multiple instances to reduce compute
        • Different datasets - When you have different datasets that you want to train separately on, for example with different languages, you can train each model separately and join them after training

        To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain.

        First, we train three separate models on different parts of the data:

        from umap import UMAP\nfrom bertopic import BERTopic\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\n\n# Extract abstracts to train on and corresponding titles\nabstracts_1 = dataset[\"abstract\"][:5_000]\nabstracts_2 = dataset[\"abstract\"][5_000:10_000]\nabstracts_3 = dataset[\"abstract\"][10_000:15_000]\n\n# Create topic models\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\ntopic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1)\ntopic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2)\ntopic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3)\n

        Then, we can combine all three models into one with .merge_models:

        # Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n
        Zero-shot Topic Modeling

        Zero-shot Topic Modeling is a technique that allows you to find pre-defined topics in large amounts of documents. This method allows you to not only find those specific topics but also create new topics for documents that do not fit with your predefined topics. This allows for extensive flexibility as there are three scenarios to explore.

        • No zeroshot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
        • Only zeroshot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.
        • Both zeroshot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.

        In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers. We know the data and believe there to be at least the following topics: clustering, topic modeling, and large language models. However, we are not sure whether other topics exist and want to explore those.

        Using this feature is straightforward:

        from datasets import load_dataset\n\nfrom bertopic import BERTopic\nfrom bertopic.representation import KeyBERTInspired\n\n# We select a subsample of 5000 abstracts from ArXiv\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\ndocs = dataset[\"abstract\"][:5_000]\n\n# We define a number of topics that we know are in the documents\nzeroshot_topic_list = [\"Clustering\", \"Topic Modeling\", \"Large Language Models\"]\n\n# We fit our model using the zero-shot topics\n# and we define a minimum similarity. For each document,\n# if the similarity does not exceed that value, it will be used\n# for clustering instead.\ntopic_model = BERTopic(\n    embedding_model=\"thenlper/gte-small\", \n    min_topic_size=15,\n    zeroshot_topic_list=zeroshot_topic_list,\n    zeroshot_min_similarity=.85,\n    representation_model=KeyBERTInspired()\n)\ntopics, _ = topic_model.fit_transform(docs)\n

        When you run topic_model.get_topic_info(), you will see something like this:

        Seed (Domain-specific) Words

        When performing topic modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain-specific abbreviations, slang, short forms, acronyms, etc. For example, the \"TNM\" classification is a method for identifying the stage of most cancers. The word \"TNM\" is an abbreviation and might not be correctly captured in generic embedding models.

        To make sure that certain domain-specific words are weighted higher and are used more often in topic representations, you can set any number of seed_words in the bertopic.vectorizers.ClassTfidfTransformer. To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should generally be important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like \"agent\" and \"robot\" should be important in such a topic were it to be found. Using the ClassTfidfTransformer, we can define those seed_words and also choose by how much their values are multiplied.

        The full example is then as follows:

        from umap import UMAP\nfrom datasets import load_dataset\nfrom bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\n# Let's take a subset of ArXiv abstracts as the training data\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\nabstracts = dataset[\"abstract\"][:5_000]\n\n# For illustration purposes, we make sure the output is fixed when running this code multiple times\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n\n# We can choose any number of seed words for which we want their representation\n# to be strengthen. We increase the importance of these words as we want them to be more\n# likely to end up in the topic representations.\nctfidf_model = ClassTfidfTransformer(\n    seed_words=[\"agent\", \"robot\", \"behavior\", \"policies\", \"environment\"], \n    seed_multiplier=2\n)\n\n# We run the topic model with the seeded words\ntopic_model = BERTopic(\n    umap_model=umap_model,\n    min_topic_size=15,\n    ctfidf_model=ctfidf_model,\n).fit(abstracts)\n
        Truncate Documents in LLMs

        When using LLMs with BERTopic, we can truncate the input documents in [DOCUMENTS] in order to reduce the number of tokens that we have in our input prompt. To do so, all text generation modules have two parameters that we can tweak:

        • doc_length - The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.
        • tokenizer - The tokenizer used to split the document into tokens, which are then used to count the length of a document.
          • Options include 'char', 'whitespace', 'vectorizer', and a callable

        This means that the definition of doc_length changes depending on what constitutes a token in the tokenizer parameter. If a token is a character, then doc_length refers to max length in characters. If a token is a word, then doc_length refers to the max length in words.

        Let's illustrate this with an example. In the code below, we will use tiktoken to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated.

        We use bertopic.representation.OpenAI to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. Since we will put 4 documents in the prompt, they will total roughly 400 tokens:

        import openai\nimport tiktoken\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Tokenizer\ntokenizer= tiktoken.encoding_for_model(\"gpt-3.5-turbo\")\n\n# Create your representation model\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(\n    client,\n    model=\"gpt-3.5-turbo\",\n    delay_in_seconds=2, \n    chat=True,\n    nr_docs=4,\n    doc_length=100,\n    tokenizer=tokenizer\n)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"changelog.html#version-0150","title":"Version 0.15.0","text":"

        Release date: 29 May, 2023

        Highlights:
        • Multimodal Topic Modeling
          • Train your topic modeling on text, images, or images and text!
          • Use the bertopic.backend.MultiModalBackend to embed images, text, both or even caption images!
        • Multi-Aspect Topic Modeling
          • Create multiple topic representations simultaneously
        • Improved Serialization options
          • Push your model to the HuggingFace Hub with .push_to_hf_hub
          • Safer, smaller and more flexible serialization options with safetensors
          • Thanks to a great collaboration with HuggingFace and the authors of BERTransfer!
        • Added new embedding models
          • OpenAI: bertopic.backend.OpenAIBackend
          • Cohere: bertopic.backend.CohereBackend
        • Added example of summarizing topics with OpenAI's GPT-models
        • Added nr_docs and diversity parameters to OpenAI and Cohere representation models
        • Use custom_labels=\"Aspect1\" to use the aspect labels for visualizations instead
        • Added cuML support for probability calculation in .transform
        • Updated topic embeddings
          • Centroids by default and c-TF-IDF weighted embeddings for partial_fit and .update_topics
        • Added exponential_backoff parameter to OpenAI model
        Fixes:
        • Fixed custom prompt not working in TextGeneration
        • Fixed #1142
        • Add additional logic to handle cupy arrays by @metasyn in #1179
        • Fix hierarchy viz and handle any form of distance matrix by @elashrry in #1173
        • Updated languages list by @sam9111 in #1099
        • Added level_scale argument to visualize_hierarchical_documents by @zilch42 in #1106
        • Fix inconsistent naming by @rolanderdei in #1073
        Multimodal Topic Modeling

        With v0.15, we can now perform multimodal topic modeling in BERTopic! The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that it is expected that each document has an image and vice versa. Instagram pictures, for example, almost always have some description attached to them.

        In this example, we are going to use images from Flickr that each have a caption associated with them:

        # NOTE: This requires the `datasets` package which you can \n# install with `pip install datasets`\nfrom datasets import load_dataset\n\nds = load_dataset(\"maderix/flickr_bw_rgb\")\nimages = ds[\"train\"][\"image\"]\ndocs = ds[\"train\"][\"caption\"]\n

        The docs variable contains the captions for each image in images. We can now use these variables to run our multimodal example:

        from bertopic import BERTopic\nfrom bertopic.representation import VisualRepresentation\n\n# Additional ways of representing a topic\nvisual_model = VisualRepresentation()\n\n# Make sure to add the `visual_model` to a dictionary\nrepresentation_model = {\n   \"Visual_Aspect\":  visual_model,\n}\ntopic_model = BERTopic(representation_model=representation_model, verbose=True)\n\n# Train on both the captions and their corresponding images\ntopics, probs = topic_model.fit_transform(docs, images=images)\n

        We can now access our image representations for each topic with topic_model.topic_aspects_[\"Visual_Aspect\"]. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:

        import base64\nfrom io import BytesIO\nfrom IPython.display import HTML\n\ndef image_base64(im):\n    # Encode a PIL image as a base64 string\n    with BytesIO() as buffer:\n        im.save(buffer, 'jpeg')\n        return base64.b64encode(buffer.getvalue()).decode()\n\n\ndef image_formatter(im):\n    return f'<img src=\"data:image/jpeg;base64,{image_base64(im)}\">'\n\n# Extract dataframe without the textual columns we do not need here\ndf = topic_model.get_topic_info().drop(columns=[\"Representative_Docs\", \"Name\"])\n\n# Visualize the images\nHTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))\n

        Multi-aspect Topic Modeling

        In this new release, we introduce multi-aspect topic modeling! During the .fit or .fit_transform stages, you can now get multiple representations of a single topic. In practice, it works by generating and storing all kinds of different topic representations (see image below).

        The approach is rather straightforward. We might want to represent our topics using a PartOfSpeech representation model but we might also want to try out KeyBERTInspired and compare those representation models. We can do this as follows:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic.representation import PartOfSpeech\nfrom bertopic.representation import MaximalMarginalRelevance\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Documents to train on\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# The main representation of a topic\nmain_representation = KeyBERTInspired()\n\n# Additional ways of representing a topic\naspect_model1 = PartOfSpeech(\"en_core_web_sm\")\naspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]\n\n# Add all models together to be run in a single `fit`\nrepresentation_model = {\n   \"Main\": main_representation,\n   \"Aspect1\":  aspect_model1,\n   \"Aspect2\":  aspect_model2 \n}\ntopic_model = BERTopic(representation_model=representation_model).fit(docs)\n

        As shown above, to perform multi-aspect topic modeling, we make sure that representation_model is a dictionary where each representation model pipeline is defined. The main pipeline, which is used in most visualization options, is defined with the \"Main\" key. All other aspects can be defined however you want. In the example above, the two additional aspects that we are interested in are defined as \"Aspect1\" and \"Aspect2\".

        After we have fitted our model, we can access all representations with topic_model.get_topic_info():

        As you can see, there are a number of different representations for our topics that we can inspect. All aspects are found in topic_model.topic_aspects_.
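
        For example, a minimal sketch of inspecting one of the aspects defined above:

        # Returns the \"Aspect1\" representation for each topic\ntopic_model.topic_aspects_[\"Aspect1\"]\n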

        Serialization

        Saving, loading, and sharing a BERTopic model can be done in several ways. With this new release, it is now advised to go with .safetensors as that allows for a small, safe, and fast method for saving your BERTopic model. However, other formats, such as .pickle and pytorch .bin are also possible.

        The methods are used as follows:

        topic_model = BERTopic().fit(my_docs)\n\n# Method 1 - safetensors\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 2 - pytorch\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"pytorch\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 3 - pickle\ntopic_model.save(\"my_model\", serialization=\"pickle\")\n
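
        Loading works the same way for each serialization method; a minimal sketch, assuming the model directory saved above:

        from bertopic import BERTopic\n\n# Load the model saved with safetensors/pytorch (or the pickled model)\nloaded_model = BERTopic.load(\"path/to/my/model_dir\")\n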

        Saving the topic modeling with .safetensors or pytorch has a number of advantages:

        • .safetensors is a relatively safe format
        • The resulting model can be very small (often < 20MB) since no sub-models need to be saved
        • Although version control is important, there is a bit more flexibility with respect to specific versions of packages
        • More easily used in production
        • Share models with the HuggingFace Hub

        The above image, showing a model trained on 100,000 documents, demonstrates the differences in size between safetensors, pytorch, and pickle. The difference in sizes can mostly be explained by the efficient saving procedure and the fact that the clustering and dimensionality reduction models are not saved in safetensors/pytorch, since inference can be done based on the topic embeddings.

        HuggingFace Hub

        When you have created a BERTopic model, you can easily share it with others through the HuggingFace Hub. First, you need to log in to your HuggingFace account:

        from huggingface_hub import login\nlogin()\n

        When you have logged in to your HuggingFace account, you can save and upload the model as follows:

        from bertopic import BERTopic\n\n# Train model\ntopic_model = BERTopic().fit(my_docs)\n\n# Push to HuggingFace Hub\ntopic_model.push_to_hf_hub(\n    repo_id=\"MaartenGr/BERTopic_ArXiv\",\n    save_ctfidf=True\n)\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_ArXiv\")\n
        "},{"location":"changelog.html#version-0141","title":"Version 0.14.1","text":"

        Release date: 2 March, 2023

        Highlights:
        • Use ChatGPT to create topic representations!:
        • Added delay_in_seconds parameter to OpenAI and Cohere representation models for throttling the API
          • Setting this between 5 and 10 allows trial users to use the API more easily without hitting RateLimitErrors
        • Fixed missing title param to visualization methods
        • Fixed probabilities not correctly aligning (#1024)
        • Fix typo in textgenerator @dkopljar27 in #1002
        ChatGPT

        Within OpenAI's API, the ChatGPT models use a different API structure compared to the GPT-3 models. In order to use ChatGPT with BERTopic, we need to define the model and make sure to set chat=True:

        import openai\nfrom bertopic import BERTopic\nfrom bertopic.representation import OpenAI\n\n# Create your representation model\nopenai.api_key = MY_API_KEY\nrepresentation_model = OpenAI(model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Prompting with ChatGPT is very satisfying and can be customized in BERTopic by using certain tags. There are currently two tags, namely \"[KEYWORDS]\" and \"[DOCUMENTS]\". These tags indicate where in the prompt they are to be replaced with a topic's keywords and its top 4 most representative documents, respectively. For example, if we have the following prompt:

        prompt = \"\"\"\nI have topic that contains the following documents: \\n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n\"\"\"\n

        then that will be rendered as follows and passed to OpenAI's API:

        \"\"\"\nI have a topic that contains the following documents: \n- Our videos are also made possible by your support on patreon.co.\n- If you want to help us make more videos, you can do so on patreon.com or get one of our posters from our shop.\n- If you want to help us make more videos, you can do so there.\n- And if you want to support us in our endeavor to survive in the world of online video, and make more videos, you can do so on patreon.com.\n\nThe topic is described by the following keywords: videos video you our support want this us channel patreon make on we if facebook to patreoncom can for and more watch \n\nBased on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n\"\"\"\n

        Note

        Whenever you create a custom prompt, it is important to add

        Based on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n
        at the end of your prompt as BERTopic extracts everything that comes after topic:. Having said that, if topic: is not in the output, then it will simply extract the entire response, so feel free to experiment with the prompts.

        "},{"location":"changelog.html#version-0140","title":"Version 0.14.0","text":"

        Release date: 14 February, 2023

        Highlights:
        • Fine-tune topic representations with bertopic.representation
          • Diverse range of models, including KeyBERT, MMR, POS, Transformers, OpenAI, and more!
          • Create your own prompts for text generation models, like GPT3:
            • Use \"[KEYWORDS]\" and \"[DOCUMENTS]\" in the prompt to decide where the keywords and set of representative documents need to be inserted.
          • Chain models to perform fine-grained fine-tuning
          • Create and customize your representation model
        • Improved the topic reduction technique when using nr_topics=int
        • Added title parameters for all graphs (#800)
        Fixes:
        • Improve documentation (#837, #769, #954, #912, #911)
        • Bump pyyaml (#903)
        • Fix large number of representative docs (#965)
        • Prevent stochastic behavior in .visualize_topics (#952)
        • Add custom labels parameter to .visualize_topics (#976)
        • Fix cuML HDBSCAN type checks by @FelSiq in #981
        API Changes:
        • The diversity parameter was removed in favor of bertopic.representation.MaximalMarginalRelevance
        • The representation_model parameter was added to bertopic.BERTopic

        Representation Models

        Fine-tune the c-TF-IDF representation with a variety of models. Whether that is through a KeyBERT-Inspired model or GPT-3, the choice is up to you!

        KeyBERTInspired

        The algorithm follows some principles of KeyBERT but does some optimization in order to speed up inference. Usage is straightforward:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        PartOfSpeech

        Our candidate topics, as extracted with c-TF-IDF, do not take into account a keyword's part of speech as extracting noun-phrases from all documents can be computationally quite expensive. Instead, we can leverage c-TF-IDF to perform part of speech on a subset of keywords and documents that best represent a topic.

        from bertopic.representation import PartOfSpeech\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        MaximalMarginalRelevance

        When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like \"car\" and \"cars\" essentially represent the same information and are often redundant. We can use MaximalMarginalRelevance to improve the diversity of our candidate topics:

        from bertopic.representation import MaximalMarginalRelevance\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = MaximalMarginalRelevance(diversity=0.3)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Zero-Shot Classification

        To perform zero-shot classification, we feed the model with the keywords as generated through c-TF-IDF and a set of candidate labels. If, for a certain topic, we find a similar enough label, then it is assigned. If not, then we keep the original c-TF-IDF keywords.

        We use it in BERTopic as follows:

        from bertopic.representation import ZeroShotClassification\nfrom bertopic import BERTopic\n\n# Create your representation model\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: \ud83e\udd17 Transformers

        Nearly every week, there are new and improved models released on the \ud83e\udd17 Model Hub that, with some creativity, allow for further fine-tuning of our c-TF-IDF based topics. These models range from text generation to zero-shot classification. In BERTopic, wrappers around these methods are created as a way to support whatever might be released in the future.

        Using a GPT-like model from the huggingface hub is rather straightforward:

        from bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = TextGeneration('gpt2')\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: Cohere

        Instead of using a language model from \ud83e\udd17 transformers, we can use external APIs that do the work for you. Here, we can use Cohere to extract our topic labels from the candidate documents and keywords. To use this, you will need to install cohere first:

        pip install cohere\n

        Then, get yourself an API key and use Cohere's API as follows:

        import cohere\nfrom bertopic.representation import Cohere\nfrom bertopic import BERTopic\n\n# Create your representation model\nco = cohere.Client(my_api_key)\nrepresentation_model = Cohere(co)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: OpenAI

        Instead of using a language model from \ud83e\udd17 transformers, we can use external APIs that do the work for you. Here, we can use OpenAI to extract our topic labels from the candidate documents and keywords. To use this, you will need to install openai first:

        pip install openai\n

        Then, get yourself an API key and use OpenAI's API as follows:

        import openai\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Create your representation model\nopenai.api_key = MY_API_KEY\nrepresentation_model = OpenAI()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        Text Generation: LangChain

        LangChain is a package that helps users with chaining large language models. In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this external knowledge consists of the most representative documents in each topic.

        To use langchain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, like openai:

        pip install langchain openai\n

        Then, you can create your chain as follows:

        from langchain.chains.question_answering import load_qa_chain\nfrom langchain.llms import OpenAI\nchain = load_qa_chain(OpenAI(temperature=0, openai_api_key=MY_API_KEY), chain_type=\"stuff\")\n

        Finally, you can pass the chain to BERTopic as follows:

        from bertopic.representation import LangChain\n\n# Create your representation model\nrepresentation_model = LangChain(chain)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"changelog.html#version-0130","title":"Version 0.13.0","text":"

        Release date: 4 January, 2023

        Highlights:
        • Calculate topic distributions with .approximate_distribution regardless of the cluster model used
          • Generates topic distributions on a document- and token-levels
          • Can be used for any document regardless of its size!
        • Fully supervised BERTopic
          • You can now use a classification model for the clustering step instead to create a fully supervised topic model
        • Manual topic modeling
          • Generate topic representations from labels directly
          • Allows for skipping the embedding and clustering steps in order to go directly to the topic representation step
        • Reduce outliers with 4 different strategies using .reduce_outliers
        • Install BERTopic without SentenceTransformers for a lightweight package:
          • pip install --no-deps bertopic
          • pip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml
        • Get metadata of trained documents such as topics and probabilities using .get_document_info(docs)
        • Added more support for cuML's HDBSCAN
          • Calculate and predict probabilities during fit_transform and transform respectively
          • This should give a major speed-up when setting calculate_probabilities=True
        • More images to the documentation and a lot of changes/updates/clarifications
        • Get representative documents for non-HDBSCAN models by comparing document and topic c-TF-IDF representations
        • Sklearn Pipeline Embedder by @koaning in #791
        Fixes:
        • Improve .partial_fit documentation (#837)
        • Fixed scipy linkage usage (#807)
        • Fixed shifted heatmap (#782)
        • Fixed SpaCy backend (#744)
        • Fixed representative docs with small clusters (<3) (#703)
        • Typo fixed by @timpal0l in #734
        • Typo fixed by @srulikbd in #842
        • Correcting iframe urls by @Mustapha-AJEGHRIR in #798
        • Refactor embedding methods by @zachschillaci27 in #855
        • Added diversity parameter to update_topics() function by @anubhabdaserrr in #887
        Documentation

        Personally, I believe that documentation can be seen as a feature and is an often underestimated aspect of open-source. So I went a bit overboard\ud83d\ude05... and created an animation about the three pillars of BERTopic using Manim. There are many other visualizations added, one of each variation of BERTopic, and many smaller changes.

        Topic Distributions

        The difficulty with a cluster-based topic modeling technique is that it does not directly consider that documents may contain multiple topics. With the new release, we can now model the distributions of topics! We even consider that a single word might be related to multiple topics. If a document is a mixture of topics, what is preventing a single word from being the same?

        To do so, we approximate the distribution of topics in a document by calculating and summing the similarities of tokensets (achieved by applying a sliding window) with the topics:

        # After fitting your model run the following for either your trained documents or even unseen documents\ntopic_distr, _ = topic_model.approximate_distribution(docs)\n

        To calculate and visualize the topic distributions in a document on a token-level, we can run the following:

        # We need to calculate the topic distributions on a token level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Create a visualization using a styled dataframe if Jinja2 is installed\ndf = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0]); df\n
        Supervised Topic Modeling

        BERTopic now supports fully-supervised classification! Instead of using a clustering algorithm, like HDBSCAN, we can replace it with a classifier, like Logistic Regression:

        from bertopic import BERTopic\nfrom bertopic.dimensionality import BaseDimensionalityReduction\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.linear_model import LogisticRegression\n\n# Get labeled data\ndata= fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n\n# Allows us to skip over the dimensionality reduction step\nempty_dimensionality_model = BaseDimensionalityReduction()\n\n# Create a classifier to be used instead of the cluster model\nclf= LogisticRegression()\n\n# Create a fully supervised BERTopic instance\ntopic_model= BERTopic(\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=clf\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n
        Manual Topic Modeling

        When you already have a bunch of labels and simply want to extract topic representations from them, you might not need to actually learn how those can be predicted. We can bypass the embeddings -> dimensionality reduction -> clustering steps and go straight to the c-TF-IDF representation of our labels:

        from bertopic import BERTopic\nfrom bertopic.backend import BaseEmbedder\nfrom bertopic.cluster import BaseCluster\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\n# Prepare our empty sub-models and reduce frequent words while we are at it.\nempty_embedding_model = BaseEmbedder()\nempty_dimensionality_model = BaseDimensionalityReduction()\nempty_cluster_model = BaseCluster()\n\n# Fit BERTopic without actually performing any clustering\ntopic_model= BERTopic(\n        embedding_model=empty_embedding_model,\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=empty_cluster_model,\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n
        Outlier Reduction

        Outlier reduction is a frequently discussed topic in BERTopic as its default cluster model, HDBSCAN, has a tendency to generate many outliers. This often helps in the topic representation steps, as we do not consider documents that are less relevant, but you might still want to assign those outliers to actual topics. In the modular philosophy of BERTopic, keeping training times in mind, it is now possible to perform outlier reduction after having trained your topic model. This allows for ease of iteration and prevents having to train BERTopic many times to find the parameters you are searching for. There are 4 different strategies that you can use, so make sure to check out the documentation!

        Using it is rather straightforward:

        new_topics = topic_model.reduce_outliers(docs, topics)\n
        Lightweight BERTopic

        The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely \"all-MiniLM-L6-v2\". Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires pytorch which often results in a rather large environment, memory-wise.

        Fortunately, it is possible to install BERTopic without sentence-transformers and use it as a lightweight solution instead. The installation can be done as follows:

        pip install --no-deps bertopic\npip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml\n

        Then, we can use BERTopic without sentence-transformers by using a CPU-based embedding technique:

        from sklearn.pipeline import make_pipeline\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\npipe = make_pipeline(\n    TfidfVectorizer(),\n    TruncatedSVD(100)\n)\n\ntopic_model = BERTopic(embedding_model=pipe)\n

        As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary!

        Document Information

        Get information about the documents on which the topic model was trained, including the documents themselves, their respective topics, the name of each topic, the top n words of each topic, whether it is a representative document, and the probability of the clustering if the cluster model supports it. There are also options to include other metadata, such as the topic distributions or the x and y coordinates of the reduced embeddings, which you can learn more about here.

        To get the document info, you will only need to pass the documents on which the topic model was trained:

        >>> topic_model.get_document_info(docs)\n\nDocument                               Topic    Name                        Top_n_words                     Probability    ...\nI am sure some bashers of Pens...       0       0_game_team_games_season    game - team - games...          0.200010       ...\nMy brother is in the market for...      -1     -1_can_your_will_any         can - your - will...            0.420668       ...\nFinally you said what you dream...      -1     -1_can_your_will_any         can - your - will...            0.807259       ...\nThink! It is the SCSI card doing...     49     49_windows_drive_dos_file    windows - drive - docs...       0.071746       ...\n1) I have an old Jasmine drive...       49     49_windows_drive_dos_file    windows - drive - docs...       0.038983       ...\n
        "},{"location":"changelog.html#version-0120","title":"Version 0.12.0","text":"

        Release date: 5 September, 2022

        Highlights:

        • Perform online/incremental topic modeling with .partial_fit
        • Expose c-TF-IDF model for customization with bertopic.vectorizers.ClassTfidfTransformer
          • The parameters bm25_weighting and reduce_frequent_words were added to potentially improve representations
        • Expose attributes for easier access to internal data
        • Major changes to the Algorithm page of the documentation, which now contains three overviews of the algorithm:
          • Visualize Overview
          • Code Overview
          • Detailed Overview
        • Added an example of combining BERTopic with KeyBERT
        • Added many tests with the intention of making development a bit more stable

        Fixes:

        • Fixed iteratively merging topics (#632 and #648)
        • Fixed 0th topic not showing up in visualizations (#667)
        • Fixed lowercasing not being optional (#682)
        • Fixed spelling (#664 and #673)
        • Fixed 0th topic not shown in .get_topic_info by @oxymor0n in #660
        • Fixed spelling by @domenicrosati in #674
        • Add custom labels and title options to barchart @leloykun in #694

        Online/incremental topic modeling:

        Online topic modeling (sometimes called \"incremental topic modeling\") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a .partial_fit function, which is also used in BERTopic.

        At a minimum, the cluster model needs to support a .partial_fit function in order to use this feature. The default HDBSCAN model will not work as it does not support online updating.

        from sklearn.datasets import fetch_20newsgroups\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.decomposition import IncrementalPCA\nfrom bertopic.vectorizers import OnlineCountVectorizer\nfrom bertopic import BERTopic\n\n# Prepare documents\nall_docs = fetch_20newsgroups(subset=\"all\",  remove=('headers', 'footers', 'quotes'))[\"data\"]\ndoc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)]\n\n# Prepare sub-models that support online learning\numap_model = IncrementalPCA(n_components=5)\ncluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\ntopic_model = BERTopic(umap_model=umap_model,\n                       hdbscan_model=cluster_model,\n                       vectorizer_model=vectorizer_model)\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n

        Only the topics for the most recent batch of documents are tracked. If you want to use online topic modeling not for a streaming setting but merely for low-memory use cases, it is advised to also update the .topics_ attribute, as variations such as hierarchical topic modeling will otherwise not work afterward:

        # Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration\ntopics = []\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n    topics.extend(topic_model.topics_)\n\ntopic_model.topics_ = topics\n

        c-TF-IDF:

        Explicitly define, use, and adjust the ClassTfidfTransformer with new parameters, bm25_weighting and reduce_frequent_words, to potentially improve the topic representation:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(bm25_weighting=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n

        Attributes:

        After having fitted your BERTopic instance, you can use the following attributes to have quick access to certain information, such as the topic assignment for each document in topic_model.topics_.
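
        As a minimal sketch (assuming docs holds the documents the model was trained on), accessing a few of the attributes listed below could look like this:

        from bertopic import BERTopic

        topic_model = BERTopic()
        topics, probs = topic_model.fit_transform(docs)

        # Quick access to internal data after fitting
        assigned_topics = topic_model.topics_            # topic assignment per document
        topic_sizes = topic_model.topic_sizes_           # number of documents per topic
        top_words = topic_model.topic_representations_   # top n words and their c-TF-IDF values per topic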

        • topics_ (List[int]): The topics that are generated for each document after training or updating the topic model. The most recent topics are tracked.
        • probabilities_ (List[float]): The probability of the assigned topic per document. These are only calculated if a HDBSCAN model is used for the clustering step. When calculate_probabilities=True, then it is the probabilities of all topics per document.
        • topic_sizes_ (Mapping[int, int]): The size of each topic.
        • topic_mapper_ (TopicMapper): A class for tracking topics and their mappings anytime they are merged, reduced, added, or removed.
        • topic_representations_ (Mapping[int, Tuple[int, float]]): The top n terms per topic and their respective c-TF-IDF values.
        • c_tf_idf_ (csr_matrix): The topic-term matrix as calculated through c-TF-IDF. To access its respective words, run .vectorizer_model.get_feature_names() or .vectorizer_model.get_feature_names_out().
        • topic_labels_ (Mapping[int, str]): The default labels for each topic.
        • custom_labels_ (List[str]): Custom labels for each topic as generated through .set_topic_labels.
        • topic_embeddings_ (np.ndarray): The embeddings for each topic. They are calculated by taking the weighted average of word embeddings in a topic based on their c-TF-IDF values.
        • representative_docs_ (Mapping[int, str]): The representative documents for each topic if HDBSCAN is used.
        "},{"location":"changelog.html#version-0110","title":"Version 0.11.0","text":"

        Release date: 11 July, 2022

        Highlights:

        • Perform hierarchical topic modeling with .hierarchical_topics
        hierarchical_topics = topic_model.hierarchical_topics(docs, topics) \n
        • Visualize hierarchical topic representations with .visualize_hierarchy
        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n
        • Extract a text-based hierarchical topic representation with .get_topic_tree
        tree = topic_model.get_topic_tree(hierarchical_topics)\n
        • Visualize 2D documents with .visualize_documents()
        # Use input embeddings\ntopic_model.visualize_documents(docs, embeddings=embeddings)\n\n# or use 2D reduced embeddings through a method of your own (e.g., PCA, t-SNE, UMAP, etc.)\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n
        • Visualize 2D hierarchical documents with .visualize_hierarchical_documents()
        # Run the visualization with the original embeddings\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n# Or, if you have reduced the original embeddings already which speed things up quite a bit:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n
        • Create custom labels to the topics throughout most visualizations
        # Generate topic labels\ntopic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, word_length=10, separator=\", \")\n\n# Set them internally in BERTopic\ntopic_model.set_topic_labels(topic_labels)\n
        • Manually merge topics with .merge_topics()
        # Merge topics 1, 2, and 3\ntopics_to_merge = [1, 2, 3]\ntopic_model.merge_topics(docs, topics, topics_to_merge)\n\n# Merge topics 1 and 2, and separately merge topics 3 and 4\ntopics_to_merge = [[1, 2], [3, 4]]\ntopic_model.merge_topics(docs, topics, topics_to_merge)\n
        • Added example for finding similar topics between two models in the tips & tricks page
        • Added a multi-modal example in the tips & tricks page
        • Added native Hugging Face transformers support
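
        As a minimal sketch of that native support (the distilbert-base-cased checkpoint is only an illustrative choice), a 🤗 transformers feature-extraction pipeline can be passed directly as the embedding model:

        from transformers.pipelines import pipeline
        from bertopic import BERTopic

        # Use a Hugging Face feature-extraction pipeline as the embedding backend
        embedding_model = pipeline("feature-extraction", model="distilbert-base-cased")
        topic_model = BERTopic(embedding_model=embedding_model)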

        Fixes:

        • Fix support for k-Means in .visualize_heatmap (#532)
        • Fix missing topic 0 in .visualize_topics (#533)
        • Fix inconsistencies in .get_topic_info (#572) and (#581)
        • Add optimal_ordering parameter to .visualize_hierarchy by @rafaelvalero in #390
        • Fix RuntimeError when used as sklearn estimator by @simonfelding in #448
        • Fix typo in visualization documentation by @dwhdai in #475
        • Fix typo in docstrings by @xwwwwww in #549
        • Support higher Flair versions
        "},{"location":"changelog.html#version-0100","title":"Version 0.10.0","text":"

        Release date: 30 April, 2022

        Highlights:

        • Use any dimensionality reduction technique instead of UMAP:
        from bertopic import BERTopic\nfrom sklearn.decomposition import PCA\n\ndim_model = PCA(n_components=5)\ntopic_model = BERTopic(umap_model=dim_model)\n
        • Use any clustering technique instead of HDBSCAN:
        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\n\ncluster_model = KMeans(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n

        Documentation:

        • Add a CountVectorizer page with tips and tricks on how to create topic representations that fit your use case
        • Added pages on how to use other dimensionality reduction and clustering algorithms
        • Additional instructions on how to reduce outliers in the FAQ:
        import numpy as np\nprobability_threshold = 0.01\nnew_topics = [np.argmax(prob) if max(prob) >= probability_threshold else -1 for prob in probs] \n

        Fixes:

        • Fixed None being returned for probabilities when transforming unseen documents
        • Replaced all instances of arg: with Arguments: for consistency
        • Before saving a fitted BERTopic instance, we remove the stop words in the fitted CountVectorizer model as it can get quite large due to the number of words that end up in stop_words_ when min_df is set to a value larger than 1
        • Set \"hdbscan>=0.8.28\" to prevent numpy issues
        • Although this was already fixed by the new release of HDBSCAN, it is technically still possible to install 0.8.27 with BERTopic which leads to these numpy issues
        • Update gensim dependency to >=4.0.0 (#371)
        • Fix topic 0 not appearing in visualizations (#472)
        • Fix (#506)
        • Fix (#429)
        • Fix typo in DTM documentation by @hp0404 in #386
        "},{"location":"changelog.html#version-094","title":"Version 0.9.4","text":"

        Release date: 14 December, 2021

        A number of fixes, documentation updates, and small features:

        • Expose diversity parameter
          • Use BERTopic(diversity=0.1) to change how diverse the words in a topic representation are (ranges from 0 to 1)
        • Improve stability of topic reduction by only computing the cosine similarity within c-TF-IDF and not the topic embeddings
        • Added property to c-TF-IDF that all IDF values should be positive (#351)
        • Improve stability of .visualize_barchart() and .visualize_hierarchy()
        • Major documentation overhaul (mkdocs, tutorials, FAQ, images, etc.) (#330)
        • Drop python 3.6 (#333)
        • Relax plotly dependency (#88)
        • Additional logging for .transform (#356)
        "},{"location":"changelog.html#version-093","title":"Version 0.9.3","text":"

        Release date: 17 October, 2021

        • Fix #282
          • As it turns out the old implementation of topic mapping was still found in the transform function
        • Fix #285
          • Fix getting all representative docs
        • Fix #288
          • A recent issue with the package pyyaml that can be found in Google Colab
        "},{"location":"changelog.html#version-092","title":"Version 0.9.2","text":"

        Release date: 12 October, 2021

        A release focused on algorithmic optimization and fixing several issues:

        Highlights:

        • Update the non-multilingual paraphrase-* models to the all-* models due to their improved performance
        • Reduce necessary RAM in c-TF-IDF top 30 word extraction

        Fixes:

        • Fix topic mapping
          • When reducing the number of topics, these need to be mapped to the correct input/output which had some issues in the previous version
          • A new class was created as a way to track these mappings regardless of how many times they were executed
          • In other words, you can iteratively reduce the number of topics after training the model without the need to continuously train the model
        • Fix typo in embeddings page (#200)
        • Fix link in README (#233)
        • Fix documentation .visualize_term_rank() (#253)
        • Fix getting correct representative docs (#258)
        • Update memory FAQ with HDBSCAN pr
        "},{"location":"changelog.html#version-091","title":"Version 0.9.1","text":"

        Release date: 1 September, 2021

        A release focused on fixing several issues:

        Fixes:

        • Fix TypeError when auto-reducing topics (#210)
        • Fix mapping representative docs when reducing topics (#208)
        • Fix visualization issues with probabilities (#205)
        • Fix missing normalize_frequency param in plots (#213)
        "},{"location":"changelog.html#version-090","title":"Version 0.9.0","text":"

        Release date: 9 August, 2021

        Highlights:

        • Implemented a Guided BERTopic -> Use seeds to steer the Topic Modeling
        • Get the most representative documents per topic: topic_model.get_representative_docs(topic=1)
          • This allows users to see which documents are good representations of a topic and better understand the topics that were created
        • Added normalize_frequency parameter to visualize_topics_per_class and visualize_topics_over_time in order to better compare the relative topic frequencies between topics
        • Return flat probabilities as default, only calculate the probabilities of all topics per document if calculate_probabilities is True
        • Added several FAQs

        Fixes:

        • Fix loading pre-trained BERTopic model
        • Fix mapping of probabilities
        • Fix #190

        Guided BERTopic:

        Guided BERTopic works in two ways:

        First, we create embeddings for each seeded topic by joining them and passing them through the document embedder. These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label. If the document is most similar to a seeded topic, then it will get that topic's label. If it is most similar to the average document embedding, it will get the -1 label. These labels are then passed through UMAP to create a semi-supervised approach that should nudge the topic creation to the seeded topics.

        Second, we take all words in seed_topic_list and assign them a multiplier larger than 1. Those multipliers will be used to increase the IDF values of the words across all topics thereby increasing the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is now a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier.

        seed_topic_list = [[\"company\", \"billion\", \"quarter\", \"shrs\", \"earnings\"],\n                   [\"acquisition\", \"procurement\", \"merge\"],\n                   [\"exchange\", \"currency\", \"trading\", \"rate\", \"euro\"],\n                   [\"grain\", \"wheat\", \"corn\"],\n                   [\"coffee\", \"cocoa\"],\n                   [\"natural\", \"gas\", \"oil\", \"fuel\", \"products\", \"petrol\"]]\n\ntopic_model = BERTopic(seed_topic_list=seed_topic_list)\ntopics, probs = topic_model.fit_transform(docs)\n
        "},{"location":"changelog.html#version-081","title":"Version 0.8.1","text":"

        Release date: 8 June, 2021

        Highlights:

        • Improved models:
          • For English documents the default is now: \"paraphrase-MiniLM-L6-v2\"
          • For Non-English or multi-lingual documents the default is now: \"paraphrase-multilingual-MiniLM-L12-v2\"
          • Both models not only show great performance but are also much faster!
        • Add interactive visualizations to the plotting API documentation

        For better performance, please use the following models:

        • English: \"paraphrase-mpnet-base-v2\"
        • Non-English or multi-lingual: \"paraphrase-multilingual-mpnet-base-v2\"

        Fixes:

        • Improved unit testing for more stability
        • Set transformers version for Flair
        "},{"location":"changelog.html#version-080","title":"Version 0.8.0","text":"

        Release date: 31 May, 2021

        Highlights:

        • Additional visualizations:
          • Topic Hierarchy: topic_model.visualize_hierarchy()
          • Topic Similarity Heatmap: topic_model.visualize_heatmap()
          • Topic Representation Barchart: topic_model.visualize_barchart()
          • Term Score Decline: topic_model.visualize_term_rank()
        • Created bertopic.plotting library to easily extend visualizations
        • Improved automatic topic reduction by using HDBSCAN to detect similar topics
        • Sort topic ids by their frequency. -1 is the outlier class and typically contains the most documents. After that, 0 is the largest topic, 1 the second largest, etc.

        Fixes:

        • Fix typo #113, #117
        • Fix #121 by removing these two lines
        • Fix mapping of topics after reduction (it now excludes 0) (#103)
        "},{"location":"changelog.html#version-070","title":"Version 0.7.0","text":"

        Release date: 26 April, 2021

        The two main features are (semi-)supervised topic modeling and several backends to use instead of Flair and SentenceTransformers!

        Highlights:

        • (semi-)supervised topic modeling by leveraging supervised options in UMAP
          • model.fit(docs, y=target_classes)
        • Backends:
          • Added Spacy, Gensim, USE (TFHub)
          • Use a different backend for document embeddings and word embeddings
          • Create your own backends with bertopic.backend.BaseEmbedder (see the sketch after this list)
          • Click here for an overview of all new backends
        • Calculate and visualize topics per class
          • Calculate: topics_per_class = topic_model.topics_per_class(docs, topics, classes)
          • Visualize: topic_model.visualize_topics_per_class(topics_per_class)
        • Several tutorials were updated and added:
          • Topic Modeling with BERTopic
          • (Custom) Embedding Models in BERTopic
          • Advanced Customization in BERTopic
          • (semi-)Supervised Topic Modeling with BERTopic
          • Dynamic Topic Modeling with Trump's Tweets
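
        As a minimal sketch of such a custom backend (the CustomEmbedder class and the all-MiniLM-L6-v2 model are illustrative choices rather than fixed names), a backend only needs to subclass BaseEmbedder and implement .embed():

        from bertopic import BERTopic
        from bertopic.backend import BaseEmbedder
        from sentence_transformers import SentenceTransformer

        class CustomEmbedder(BaseEmbedder):
            def __init__(self, embedding_model):
                super().__init__()
                self.embedding_model = embedding_model

            def embed(self, documents, verbose=False):
                # Return one embedding per document as an n x m array
                return self.embedding_model.encode(documents, show_progress_bar=verbose)

        # Pass the custom backend to BERTopic
        custom_embedder = CustomEmbedder(embedding_model=SentenceTransformer("all-MiniLM-L6-v2"))
        topic_model = BERTopic(embedding_model=custom_embedder)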

        Fixes:

        • Fixed issues with Torch req
        • Prevent saving term frequency matrix in CTFIDF class
        • Fixed DTM not working when reducing topics (#96)
        • Moved visualization dependencies to base BERTopic
          • pip install bertopic[visualization] becomes pip install bertopic
        • Allow precomputed embeddings in bertopic.find_topics() (#79):
        model = BERTopic(embedding_model=my_embedding_model)\nmodel.fit(docs, my_precomputed_embeddings)\nmodel.find_topics(search_term)\n
        "},{"location":"changelog.html#version-060","title":"Version 0.6.0","text":"

        Release date: 1 March, 2021

        Highlights:

        • DTM: Added a basic dynamic topic modeling technique based on the global c-TF-IDF representation
          • model.topics_over_time(docs, timestamps, global_tuning=True)
        • DTM: Option to evolve topics based on t-1 c-TF-IDF representation which results in evolving topics over time
          • Only uses topics at t-1 and skips evolution if there is a gap
          • model.topics_over_time(docs, timestamps, evolution_tuning=True)
        • DTM: Function to visualize topics over time
          • model.visualize_topics_over_time(topics_over_time)
        • DTM: Add binning of timestamps
          • model.topics_over_time(docs, timestamps, nr_bins=10)
        • Added a function to get general information about topics (id, frequency, name, etc.)
          • get_topic_info()
        • Improved stability of c-TF-IDF by taking the average number of words across all topics instead of the number of documents

        Fixes:

        • Fixed _map_probabilities() not taking into account that there is no probability for the outlier class, and the probabilities being mutated instead of copied (#63, #64)
        "},{"location":"changelog.html#version-050","title":"Version 0.5.0","text":"

        Release date: 8 February, 2021

        Highlights:

        • Add Flair to allow for more (custom) token/document embeddings, including 🤗 transformers
        • Option to use custom UMAP, HDBSCAN, and CountVectorizer
        • Added low_memory parameter to reduce memory during computation
        • Improved verbosity (shows progress bar)
        • Return the figure of visualize_topics()
        • Expose all parameters with a single function: get_params()

        Fixes:

        • To simplify the API, the parameters stop_words and n_neighbors were removed. These can still be used when a custom UMAP or CountVectorizer is used.
        • Set calculate_probabilities to False as a default. Calculating probabilities with HDBSCAN significantly increases computation time and memory usage. It is better to skip calculating probabilities by default and only allow it by manually turning it on.
        • Use the newest version of sentence-transformers as it speeds up encoding significantly
        "},{"location":"changelog.html#version-042","title":"Version 0.4.2","text":"

        Release date: 10 January, 2021

        Fixes:

        • Selecting embedding_model did not work when language was also used. This led to the user needing to set language to None before being able to use embedding_model. Fixed by using embedding_model when language is used (as a default parameter).
        "},{"location":"changelog.html#version-041","title":"Version 0.4.1","text":"

        Release date: 7 January, 2021

        Fixes:

        • Simple fix by lowercasing the languages variable to match the lowercased input language.
        "},{"location":"changelog.html#version-040","title":"Version 0.4.0","text":"

        Release date: 21 December, 2020

        Highlights:

        • Visualize Topics similar to LDAvis
        • Added option to reduce topics after training
        • Added option to update topic representation after training
        • Added option to search topics using a search term
        • Significantly improved the stability of generating clusters
        • Finetune the topic words by selecting the most coherent words with the highest c-TF-IDF values
        • More extensive tutorials in the documentation

        Notable Changes:

        • Option to select language instead of sentence-transformers models to minimize the complexity of using BERTopic
        • Improved logging (remove duplicates)
        • Check if BERTopic is fitted
        • Added TF-IDF as an embedder instead of transformer models (see tutorial)
        • Numpy for Python 3.6 will be dropped and was therefore removed from the workflow.
        • Preprocess text before passing it through c-TF-IDF
        • Merged get_topics_freq() with get_topic_freq()

        Fixes:

        • Fix error handling topic probabilities
        "},{"location":"changelog.html#version-032","title":"Version 0.3.2","text":"

        Release date: 16 November, 2020

        Highlights:

        • Fixed a bug with the topic reduction method that reduced the number of topics, but not to the nr_topics defined in the class. Since this was, to a certain extent, breaking the topic reduction method, a new release was necessary.
        "},{"location":"changelog.html#version-031","title":"Version 0.3.1","text":"

        Release date: 4 November, 2020

        Highlights:

        • Adding the option to use custom embeddings or embeddings that you generated beforehand with whatever package you'd like to use. This allows users to further customize BERTopic to their liking.
        "},{"location":"changelog.html#version-030","title":"Version 0.3.0","text":"

        Release date: 29 October, 2020

        Highlights:

        • transform() and fit_transform() now also return the topic probability distributions
        • Added visualize_distribution() which visualizes the topic probability distribution for a single document
        "},{"location":"changelog.html#version-022","title":"Version 0.2.2","text":"

        Release date: 17 October, 2020

        Highlights:

        • Fixed n_gram_range not being used
        • Added option for using stopwords
        "},{"location":"changelog.html#version-021","title":"Version 0.2.1","text":"

        Release date: 11 October, 2020

        Highlights:

        • Improved the calculation of the class-based TF-IDF procedure by limiting the calculation to sparse matrices. This prevents out-of-memory problems when faced with large datasets.
        "},{"location":"changelog.html#version-020","title":"Version 0.2.0","text":"

        Release date: 11 October, 2020

        Highlights:

        • Changed c-TF-IDF procedure such that it implements a version of scikit-learns procedure. This should also speed up the calculation of the sparse matrix and prevent memory errors.
        • Added automated unit tests
        "},{"location":"changelog.html#version-012","title":"Version 0.1.2","text":"

        Release date: 1 October, 2020

        Highlights:

        • When transforming new documents, self.mapped_topics seemed to be missing. Added to the init.
        "},{"location":"changelog.html#version-011","title":"Version 0.1.1","text":"

        Release date: 24 September, 2020

        Highlights:

        • Fixed requirements --> Issue with pytorch
        • Update documentation
        "},{"location":"changelog.html#version-010","title":"Version 0.1.0","text":"

        Release date: 24 September, 2020

        Highlights:

        • First release of BERTopic
        • Added parameters for UMAP and HDBSCAN
        • Option to choose sentence-transformer model
        • Method for transforming unseen documents
        • Save and load trained models (UMAP and HDBSCAN)
        • Extract topics and their sizes

        Notable Changes:

        • Optimized c-TF-IDF
        • Improved documentation
        • Improved topic reduction
        "},{"location":"faq.html","title":"Frequently Asked Questions","text":""},{"location":"faq.html#why-are-the-results-not-consistent-between-runs","title":"Why are the results not consistent between runs?","text":"

        Due to the stochastic nature of UMAP, the results from BERTopic might differ even if you run the same code multiple times. Using custom embeddings allows you to try out BERTopic several times until you find the topics that suit you best. You only need to generate the embeddings themselves once and run BERTopic several times with different parameters.
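
        A minimal sketch of that workflow, assuming docs contains your documents:

        from bertopic import BERTopic
        from sentence_transformers import SentenceTransformer

        # Compute the embeddings only once...
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = embedding_model.encode(docs, show_progress_bar=True)

        # ...and re-use them across several BERTopic runs with different parameters
        topic_model = BERTopic(min_topic_size=20)
        topics, probs = topic_model.fit_transform(docs, embeddings)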

        If you want to reproduce the results, at the expense of performance, you can set a random_state in UMAP to prevent any stochastic behavior:

        from bertopic import BERTopic\nfrom umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=5, \n                  min_dist=0.0, metric='cosine', random_state=42)\ntopic_model = BERTopic(umap_model=umap_model)\n
        "},{"location":"faq.html#which-embedding-model-should-i-choose","title":"Which embedding model should I choose?","text":"

        Unfortunately, there is no definitive list of the best models for each language; this highly depends on your data, the model, and your specific use case. However, the default model in BERTopic (\"all-MiniLM-L6-v2\") works great for English documents. For multi-lingual documents or any other language, \"paraphrase-multilingual-MiniLM-L12-v2\" has shown great performance.

        If you want to use a model that provides higher quality but takes more computing time, then I would advise using all-mpnet-base-v2 and paraphrase-multilingual-mpnet-base-v2 instead.

        MTEB Leaderboard: New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the MTEB leaderboard. It is an excellent place for selecting the embedding model that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.

        Many of these models can be used with SentenceTransformers in BERTopic, like so:

        from bertopic import BERTopic\nfrom sentence_transformers import SentenceTransformer\n\nembedding_model = SentenceTransformer(\"BAAI/bge-base-en-v1.5\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n

        SentenceTransformers: SentenceTransformers typically work quite well and are the preferred models to use. They are great at generating document embeddings and have several multi-lingual versions available.

        🤗 transformers: BERTopic allows you to use any 🤗 transformers model. These models typically create embeddings on a word/sentence level but can easily be pooled using Flair (see Guides/Embeddings). If you have a specific language for which you want to generate embeddings, you can choose the model here.

        "},{"location":"faq.html#how-do-i-reduce-topic-outliers","title":"How do I reduce topic outliers?","text":"

        There are several ways we can reduce outliers.

        First, the number of data points classified as outliers is controlled by the min_samples parameter in HDBSCAN. This value is automatically set to the same value as min_cluster_size. However, you can set it independently if you want to reduce the number of generated outliers. Lowering this value will result in less noise being generated.

        from bertopic import BERTopic\nfrom hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', \n                        cluster_selection_method='eom', prediction_data=True, min_samples=5)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Note

        Although this will lower the number of outliers found in the data, it might force outliers into topics where they do not belong. So make sure to strike a balance between keeping noise and reducing outliers.

        Second, after training our BERTopic model, we can assign outliers to topics by making use of the .reduce_outliers function in BERTopic. An advantage of this approach is that there are four built-in strategies one can choose from for reducing outliers. Moreover, this technique allows the user to experiment with reducing outliers across a number of strategies and parameters without actually having to re-train the topic model each time. You can learn more about the .reduce_outliers function here. The following is a minimal example of how to use this function:

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers\nnew_topics = topic_model.reduce_outliers(docs, topics)\n

        Third, we can replace HDBSCAN with any other clustering algorithm that we want. So we can choose a clustering algorithm, like k-Means, that does not produce any outliers at all. Using k-Means instead of HDBSCAN is straightforward:

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\n\ncluster_model = KMeans(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n
        "},{"location":"faq.html#how-do-i-remove-stop-words","title":"How do I remove stop words?","text":"

        At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context to create accurate embeddings.

        Instead, we can use the CountVectorizer to preprocess our documents after having generated embeddings and clustered our documents. I have found almost no disadvantages to using the CountVectorizer to remove stop words and it is something I would strongly advise trying out:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

        We can also use the ClassTfidfTransformer to reduce the impact of frequent words. The result is very similar to explicitly removing stop words, but it happens automatically:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n
        "},{"location":"faq.html#how-can-i-speed-up-bertopic","title":"How can I speed up BERTopic?","text":"

        You can speed up BERTopic by either generating your embeddings beforehand or by setting calculate_probabilities to False. Calculating the probabilities is quite expensive and can significantly increase the computation time. Thus, only use it if you do not mind waiting a bit before the model is done running or if you have less than a couple of hundred thousand documents.

        Also, make sure to use a GPU when extracting the sentence/document embeddings. Transformer models typically require a GPU and using only a CPU can slow down computation time quite a lot. However, if you do not have access to a GPU, looking into quantization might help.

        Lastly, it is also possible to speed up BERTopic with cuML's GPU acceleration of UMAP and HDBSCAN:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\nfrom cuml.manifold import UMAP\n\n# Create instances of GPU-accelerated UMAP and HDBSCAN\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\n\n# Pass the above models to be used in BERTopic\ntopic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)\n
        "},{"location":"faq.html#i-am-facing-memory-issues-help","title":"I am facing memory issues. Help!","text":"

        There are several ways to perform computation with large datasets:

        • First, you can set low_memory to True when instantiating BERTopic. This may prevent blowing up the memory in UMAP.

        • Second, setting calculate_probabilities to False when instantiating BERTopic prevents a huge document-topic probability matrix from being created. Moreover, HDBSCAN is quite slow when it tries to calculate probabilities on large datasets.
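
        A minimal sketch combining the first two options:

        from bertopic import BERTopic

        # Reduce memory usage in UMAP and skip the large document-topic probability matrix
        topic_model = BERTopic(low_memory=True, calculate_probabilities=False)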

        • Third, you can set the minimum frequency of words in the CountVectorizer class to reduce the size of the resulting sparse c-TF-IDF matrix. You can do this as follows:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\", min_df=10)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

        The min_df parameter is used to indicate the minimum frequency of words. Setting this value larger than 1 can significantly reduce memory.

        • Fourth, you can use online topic modeling to apply BERTopic to big data by training the model in chunks

        If the problem persists, then this could be an issue related to your available memory. The processing of millions of documents is quite computationally expensive and sufficient RAM is necessary.

        "},{"location":"faq.html#i-have-only-a-few-topics-how-do-i-increase-them","title":"I have only a few topics, how do I increase them?","text":"

        There are several reasons why your topic model may result in only a few topics:

        • First, you might only have a few documents (~1000). This makes it very difficult to properly extract topics due to the small amount of data available. Increasing the number of documents might solve your issues.

        • Second, min_topic_size might be simply too large for your number of documents. If you decrease the minimum size of topics, then you are much more likely to increase the number of topics generated. You could also decrease the n_neighbors parameter used in UMAP if this does not work.

        • Third, although this does not happen very often, there simply aren't that many topics to be found in your documents. You can often see this when many documents end up in topic -1, which is not an actual topic but a category of outliers.

        "},{"location":"faq.html#i-have-too-many-topics-how-do-i-decrease-them","title":"I have too many topics, how do I decrease them?","text":"

        If you have a large dataset, then it is possible to generate thousands of topics. Especially with large datasets, there is a good chance they contain many small topics. In practice, you might want a few hundred topics at most to interpret them nicely.

        There are a few ways of decreasing the number of generated topics:

        • First, we can set the min_topic_size in the BERTopic initialization much higher (e.g., 300) to make sure that those small clusters will not be generated. This is an HDBSCAN parameter that specifies the minimum number of documents needed in a cluster. More documents in a cluster mean fewer topics will be generated.

        • Second, you can create a custom UMAP model and set n_neighbors much higher than the default 15 (e.g., 200). This also prevents those micro-clusters from being generated, as many neighboring documents are needed to create a cluster.

        • Third, we can set nr_topics to a value that seems logical to the user. Do note that topics are forced to merge, which might result in lower-quality topics. In practice, I would advise using nr_topics=\"auto\" as that will only merge topics that are very similar. Dissimilar topics will therefore remain separated.
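
        A minimal sketch combining these three options (the values 200 and 300 are only illustrative):

        from bertopic import BERTopic
        from umap import UMAP

        # Larger clusters and more neighbors lead to fewer, larger topics;
        # nr_topics="auto" additionally merges topics that are very similar
        umap_model = UMAP(n_neighbors=200, n_components=5, min_dist=0.0, metric='cosine')
        topic_model = BERTopic(umap_model=umap_model, min_topic_size=300, nr_topics="auto")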

        "},{"location":"faq.html#how-do-i-calculate-the-probabilities-of-all-topics-in-a-document","title":"How do I calculate the probabilities of all topics in a document?","text":"

        Although it is possible to calculate all the probabilities, the process of doing so is quite computationally inefficient and might significantly increase the computation time. To prevent this, the probabilities are not calculated as a default. To calculate them, you will have to set calculate_probabilities to True:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs) \n

        Note

        The calculate_probabilities parameter is only used when using HDBSCAN or cuML's HDBSCAN model. In other words, this will not work when using a model other than HDBSCAN. Instead, we can approximate the topic distributions across all documents with .approximate_distribution.

        "},{"location":"faq.html#numpy-gives-me-an-error-when-running-bertopic","title":"Numpy gives me an error when running BERTopic","text":"

        With the release of Numpy 1.20.0, there have been significant issues with using that version (and previous ones) due to compilation issues and pypi.

        This is a known issue with the order of installation using pypi. You can find more details about this issue here and here.

        I would suggest doing one of the following:

        • Install the newest version from BERTopic (>= v0.5).
        • You can install hdbscan with pip install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation which might resolve the issue
        • Install BERTopic in a fresh environment using these steps.
        "},{"location":"faq.html#how-can-i-run-bertopic-without-an-internet-connection","title":"How can I run BERTopic without an internet connection?","text":"

        The great thing about using sentence-transformers is that it searches automatically for an embedding model locally. If it cannot find one, it will download the pre-trained model from its servers. Make sure that you set the correct path for sentence-transformers to work. You can find a bit more about that here.

        You can download the corresponding model here and unzip it. Then, simply use the following to create your embedding model:

        from sentence_transformers import SentenceTransformer\nembedding_model = SentenceTransformer('path/to/unzipped/model')\n

        Then, pass it to BERTopic:

        from bertopic import BERTopic\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"faq.html#can-i-use-the-gpu-to-speed-up-the-model","title":"Can I use the GPU to speed up the model?","text":"

        Yes. The GPU is automatically used when you use a SentenceTransformer or Flair embedding model. Using a CPU would then definitely slow things down. However, you can use other embeddings like TF-IDF or Doc2Vec embeddings in BERTopic which do not depend on GPU acceleration.

        You can use cuML to speed up both UMAP and HDBSCAN through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\nfrom cuml.manifold import UMAP\n\n# Create instances of GPU-accelerated UMAP and HDBSCAN\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\n\n# Pass the above models to be used in BERTopic\ntopic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Depending on the embeddings you are using, you might want to normalize them first to force a cosine-related distance metric in UMAP:

        from cuml.preprocessing import normalize\nembeddings = normalize(embeddings)\n
        "},{"location":"faq.html#how-can-i-use-bertopic-with-chinese-documents","title":"How can I use BERTopic with Chinese documents?","text":"

        Currently, CountVectorizer tokenizes text by splitting on whitespace, which does not work for Chinese. To get it to work, you will have to create a custom CountVectorizer with jieba:

        from sklearn.feature_extraction.text import CountVectorizer\nimport jieba\n\ndef tokenize_zh(text):\n    words = jieba.lcut(text)\n    return words\n\nvectorizer = CountVectorizer(tokenizer=tokenize_zh)\n

        Next, we pass our custom vectorizer to BERTopic and create our topic model:

        from bertopic import BERTopic\ntopic_model = BERTopic(embedding_model=model, verbose=True, vectorizer_model=vectorizer)\ntopics, _ = topic_model.fit_transform(docs, embeddings=embeddings)\n
        "},{"location":"faq.html#why-does-it-take-so-long-to-import-bertopic","title":"Why does it take so long to import BERTopic?","text":"

        The main culprit here seems to be UMAP. After running tests with Tuna, we can see that most of the import time of BERTopic can be attributed to UMAP.

        Unfortunately, there currently is no fix for this issue. The most recent ticket regarding this issue can be found here.

        "},{"location":"faq.html#should-i-preprocess-the-data","title":"Should I preprocess the data?","text":"

        No. By using document embeddings, there is typically no need to preprocess the data as all parts of a document are important in understanding its general topic. Although this holds in 99% of cases, if you have data that contains a lot of noise, for example HTML tags, then it would be best to remove them. HTML tags typically do not contribute to the meaning of a document and should therefore be removed. However, if you apply topic modeling to HTML code to extract topics from code, then it becomes important.
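
        A minimal regex-based sketch of such a cleaning step, assuming docs holds the raw documents; for heavily nested markup, a dedicated HTML parser may be a better fit:

        import re

        def strip_html(document: str) -> str:
            # Remove HTML tags while keeping the visible text
            return re.sub(r"<[^>]+>", " ", document)

        cleaned_docs = [strip_html(doc) for doc in docs]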

        "},{"location":"faq.html#i-run-into-issues-running-on-apple-silicon-what-should-i-do","title":"I run into issues running on Apple Silicon. What should I do?","text":"

        Apple Silicon chips (M1 & M2) are based on arm64 (aka AArch64, not to be confused with amd64/x86_64). There are known issues with upstream dependencies for this architecture, for example numba. You may not always run into this issue, depending on the extras that you need.

        One possible solution is to use VS Code Dev Containers, which allow you to set up a Linux-based environment. To run BERTopic effectively, you need to be aware of two things:

        1. Make sure to use a Docker image specifically built for arm64
        2. Make sure to use a volume instead of a bind-mount (ℹ️ the latter significantly reduces disk I/O performance)

        Using the pre-configured Data Science Dev Containers makes sure these settings are optimized. To start using them, do the following:

        • Install and run Docker
        • Clone repository data-science-devcontainers
        • Open VS Code, build the Python base or Python scipy container, and start working. ℹ️ Change PYTHON_VERSION to 3.11 in the respective devcontainer.json to work with the latest patch release of Python 3.11
        • Note that data is persisted in the container
        • When using an unmodified devcontainer.json: Work in /home/vscode 👉 This is the home directory of user vscode
        • Python packages are installed to the home directory by default 👉 This is due to the env variable PIP_USER=1
        • Note that the directory /workspaces is also persisted
        "},{"location":"faq.html#do-these-data-science-dev-containers-support-gpu-acceleration","title":"Do these Data Science Dev Containers support GPU acceleration?","text":"

        Yes, but only on Linux and Windows.

        The CUDA-enabled variants require the following in addition to Docker:

        • NVIDIA GPU
        • NVIDIA driver
        • Linux: NVIDIA Container Toolkit
        • Windows: GPU support in Docker Desktop

        ℹ️ The host running the GPU-accelerated Dev Containers only requires the NVIDIA driver; the CUDA toolkit does not have to be installed.

        See the CUDA Version Matrix regarding Ubuntu/CUDA/Python versions and recommended NVIDIA drivers.

        "},{"location":"usecases.html","title":"Use Cases","text":"

        Over the last few years, BERTopic has been used on a wide variety of use cases and domains, from cancer research and voice perception, to employee surveys and social media. This diversity allows for interesting use cases but it might quickly become overwhelming. This page is meant to demonstrate how, when, and why BERTopic is used in practice.

        "},{"location":"usecases.html#examples","title":"Examples","text":"

        Below are a number of use cases that have been applied in practice. These use cases are collected from and written by data-professionals.

        Note

        If you would like to add your use case, feel free to open up a PR! You only need to update this file and add your example. You can just copy-paste one of the existing examples and adjust it to contain a description of your use case.

        "},{"location":"usecases.html#app-user-feedback","title":"App User Feedback","text":"

        \"Analyzing user reviews from the App Store and Play Store helps us reveal valuable customer information, fix technical or usability issues, and help constantly improve customer experience. We utilize BERTopic for topic modeling and supervised classification of predefined categories.\" \u2022\u2022\u2022Tibor Fabian, Ph.D.Lead/Master Data ScientistTelef\u00f3nica Germany

        "},{"location":"usecases.html#employee-surveys","title":"Employee Surveys","text":"

        \"We are using BERTopic to support analysis of employee surveys. Here, we use BERTopic to compute the topics of discussion found in employee responses to open-ended survey questions. To further understand how employees feel about certain topics, we combined BERTopic with sentiment analysis to identify the sentiments associated with different topics and vice versa.\" \u2022\u2022\u2022Steve Quirolgico, Ph.D.Principal Engineer U.S. Department of Homeland Security

        "},{"location":"usecases.html#voice-perception","title":"Voice Perception","text":"

        \"A research project on voice perception to categorize what people describe when they make first impressions based on hearing people say, \"Hi\".\" preprint | code \u2022\u2022\u2022David FeinbergAssociate ProfessorMcMaster University

        "},{"location":"usecases.html#social-media","title":"Social Media","text":"

        \"We use BERTopic to detect trending topics in social media, Our product (AIM Insights) is a social media monitoring tool so detecting trending topics in social media helps our clients to capitalize on them for their campaigns. We use BERTopic to group social media posts into clusters, sort them by engagement to detect the ones that are trending, and then use OpenAI's GPT-3 to generate a label for each of the top clusters based on the most relevant documents in it. This is all done on Arabic posts using an in-house sentence embeddings model.\" \u2022\u2022\u2022Ahmed RashwanAI leadAIM Technologies

        "},{"location":"usecases.html#it-service-management","title":"IT Service Management","text":"

        \"In IT Service Management systems (e.g., Service Now) we receive Incidents, Problems, Change requests etc. We use BERTopic to categorize them into a group of topics/clusters to understand the distribution of the work requests over the period of time to plan and act accordingly for the future.\" \u2022\u2022\u2022Rajesh ThanaseelanData Science ConsultantDXC Technology

        "},{"location":"usecases.html#colon-cancer","title":"Colon Cancer","text":"

        \"We use BERTopic to evaluate P53 in Ovarian cancer for Computational backgrounds researchers, who find it easier to relate Artificial Intelligence with advancing the transformer model and unstructured medical data. The paper explores the heterogeneity of keyBERT, BERTopic, PyCaret, and LDAs as key phrase generators and topic model extractors, with P53 in ovarian cancer as a use case.\" \u2022\u2022\u2022 Mary AdewunmiPhD Student in Colon Cancer and AIUTAS

        "},{"location":"usecases.html#telephone-help-line","title":"Telephone Help Line","text":"

        \"We analyzed 100K+ phone call memos from a telephone help line. The Help Line is open to all people, regardless of religion, culture, and origin. It follows the principles of IFOTES (International Federation Of Telephone Emergency Services). The regional offices each offer independent counseling services via telephone or online. The phone call memos are written by hundreds of independent volunteers and come in various shapes, lengths, forms, and wordings - additionally to have them in multiple languages. While using BERTopic we ran a few tests to figure out if the topic modeling works. Selecting only one language with ~60K data points and a mixed language model we achieved good results. It helped identify topics within the calls and therefore show the organization what reasons there are for people calling them. We identified in a workshop a few interesting topics, which they were not aware of, for example, religious topics. The identification of existing and new, arising topics is crucial for the service quality of the organization. It furthermore helps detect trends over time, which can then be reported directly to Public Health institutions, which can then come up with campaigns to inform the public and help reduce certain psychological concerns. It acts as a representative psychological health barometer of the population.\" \u2022\u2022\u2022Kevin KuhnChief Executive Officergopf

        "},{"location":"usecases.html#regional-newspaper","title":"Regional Newspaper","text":"

        \"Recently, we wanted to evaluate our overall section structure, especially our local news section. As you can imagine, local news is quite a big part of what we do in a regional newspaper. We used BERTopic on a year's worth of local news data to explore the topics in local news and define a new section structure. The results from this analysis helped to define the new section structure, which was implemented this month. \" \u2022\u2022\u2022Thomas HuskenData ScientistBergens Tidende

        "},{"location":"usecases.html#intelligent-virtual-assistants","title":"Intelligent Virtual Assistants","text":"

        \"We have been using BERTopic as an early step in our exploratory analysis for intelligent virtual assistants. It helps us get a quick read on what some of the intents may be. The results help in the design discussions with customers.\" \u2022\u2022\u2022Stephen DrewVP, AI and Automation SolutionsFive9

        "},{"location":"usecases.html#electronic-health-records","title":"Electronic Health Records","text":"

        \"Given physician-created documents from hospitals, find themes in the text as well as differentiate between \"relevant\" and \"irrelevant\" text, and disambiguate homonyms. \" \u2022\u2022\u2022 Alexis RaykhelSenior NLP EngineerIodine Software

        "},{"location":"usecases.html#teaching","title":"Teaching","text":"

        \"BERTopic was used to determine a taxonomy of climate change risks discussed in financial news, and to compute firms' related exposure. It was used in a context a course offering on Climate Risks modelling with NLP.\" \u2022\u2022\u2022 Thomas LoransSenior Associate, Quantitative Analyst

        "},{"location":"usecases.html#zero-hunger-lab","title":"Zero Hunger Lab","text":"

        \"I am a PhD student at Tilburg University, at a lab called Zero Hunger Lab, where we try to use data science methods to improve food insecurity. One key issue is classifying and predicting food insecurity in food-insecure nations. The Integrated Food Security Phase Classification (IPC) system serves this purpose. The IPC categorizes food insecurity into five phases, ranging from minimal food insecurity to famine, and serves as a guide for directing humanitarian resources to the most affected regions. The IPC system strives to be based on evidence, however, obtaining accurate information about food insecurity in remote regions can prove challenging. Despite the availability of weather data, data in the socio-economic domain, such as food prices and conflict, can be scarce or unreliable due to limited infrastructure and bureaucratic obstacles. These complications often result in infrequent releases of IPC classifications and projections, making it difficult to effectively respond to food insecurity in these areas. One large source of daily-updated information is local news. Thus, one can build a model that classifies/predicts IPC by relying on news features obtained by NLP methods in addition to stuff like weather data. Previous research shows this is possible (see https://arxiv.org/pdf/2111.15602.pdf). The authors find words related to food insecurity using semantic frame parsing. After which, they count the occurrence of these words to create features. The features are put into a linear classifier. We wanted to apply more advanced methods and use local news sources (which we suppose contain more localized information). We used BERTopic on over a million articles scraped from Somali news websites. Because articles are both in English and Somali, we use a multilingual sentence encoder (LaBSE, which outperforms newer models in Somali). The results are quite nice. For example, topics most strongly correlated with known conflict casualty data are topics about terrorist attacks, car bombings, etc. And topics most negatively correlated with known conflict casualty data are about peace talks. We can also get an indication of food price development and forced migration. Most importantly, we can track the development of topics relating to food insecurity over time. While topic modelling cannot replace evidence-based food insecurity assessment, it can give a quick insight into a local situation when 'hard data' is lacking. I applaud you on your success with BERTopic. The package is incredibly clean and easy to use, and the method works well with little parameter tuning. To me, the fact that you were able to deliver such a useful tool on your own is incredible, especially in the field of NLP, which is dominated by large organizations such as Google and Meta. \" \u2022\u2022\u2022Cascha van WanrooijPhD StudentTilburg University

        "},{"location":"usecases.html#papers","title":"Papers","text":"

BERTopic has also been adopted more and more in the academic field. Here are a few papers from different research domains with interesting applications:

        • Adewunmi, M., Sharma, S. K., Sharma, N., Sushma, N. S., & Mounmo, B. (2022). Cancer Health Disparities drivers with BERTopic modelling and PyCaret Evaluation. Cancer Health Disparities, 6.
        • Ebeling, R., S\u00e1enz, C. A. C., Nobre, J. C., & Becker, K. (2022, May). Analysis of the influence of political polarization in the vaccination stance: the Brazilian COVID-19 scenario. In Proceedings of the International AAAI Conference on Web and Social Media (Vol. 16, pp. 159-170).
        • Hoseini, M., Melo, P., Benevenuto, F., Feldmann, A., & Zannettou, S. (2021). On the globalization of the QAnon conspiracy theory through Telegram. arXiv preprint arXiv:2105.13020.
        • Falkenberg, M., Galeazzi, A., Torricelli, M., Di Marco, N., Larosa, F., Sas, M., ... & Baronchelli, A. (2022). Growing polarization around climate change on social media. Nature Climate Change, 1-8.
        • S\u00e1nchez\u2010Franco, M. J., & Rey\u2010Moreno, M. (2022). Do travelers' reviews depend on the destination? An analysis in coastal and urban peer\u2010to\u2010peer lodgings. Psychology & Marketing, 39(2), 441-459.
        • Zhunis, A., Lima, G., Song, H., Han, J., & Cha, M. (2022, April). Emotion bubbles: Emotional composition of online discourse before and after the COVID-19 outbreak. In Proceedings of the ACM Web Conference 2022 (pp. 2603-2613).
        • Alhaj, F., Al-Haj, A., Sharieh, A., & Jabri, R. (2022). Improving Arabic cognitive distortion classification in Twitter using BERTopic. International Journal of Advanced Computer Science and Applications, 13(1), 854-860.

        Click here for a full overview of papers citing BERTopic.

        "},{"location":"algorithm/algorithm.html","title":"The Algorithm","text":"

        Below, you will find different types of overviews of each step in BERTopic's main algorithm. Each successive overview will be more in-depth than the previous overview. This approach aims to make the underlying algorithm as intuitive as possible for a wide range of users.

        "},{"location":"algorithm/algorithm.html#visual-overview","title":"Visual Overview","text":"

        BERTopic can be viewed as a sequence of steps to create its topic representations. There are five steps to this process:

Although these steps are the default, there is some modularity to BERTopic. Each step in this process was carefully selected such that they are all somewhat independent from one another. For example, the tokenization step is not directly influenced by the embedding model that was used to convert the documents, which allows us to be creative in how we perform the tokenization step.

        This effect is especially strong in the clustering step. Models like HDBSCAN assume that clusters can have different shapes and forms. As a result, using a centroid-based technique to model the topic representations would not be beneficial since the centroid is not always representative of these types of clusters. A bag-of-words representation, however, makes very few assumptions concerning the shape and form of a cluster.

        As a result, BERTopic is quite modular and can maintain its quality of topic generation throughout a variety of sub-models. In other words, BERTopic essentially allows you to build your own topic model:

        There is extensive documentation on how to use each step in this pipeline:

        1. Embeddings
        2. Dimensionality Reduction
        3. Clustering
        4. Tokenizer
        5. Weighting Scheme
        6. Representation Tuning
          • Large Language Models (LLM)
        "},{"location":"algorithm/algorithm.html#code-overview","title":"Code Overview","text":"

After going through the visual overview, this code overview demonstrates the algorithm using BERTopic. An advantage of using BERTopic is that each major step in its algorithm can be explicitly defined, thereby making the process not only transparent but also more intuitive.

        from umap import UMAP\nfrom hdbscan import HDBSCAN\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nfrom bertopic import BERTopic\nfrom bertopic.representation import KeyBERTInspired\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\n\n# Step 1 - Extract embeddings\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n\n# Step 2 - Reduce dimensionality\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')\n\n# Step 3 - Cluster reduced embeddings\nhdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)\n\n# Step 4 - Tokenize topics\nvectorizer_model = CountVectorizer(stop_words=\"english\")\n\n# Step 5 - Create topic representation\nctfidf_model = ClassTfidfTransformer()\n\n# Step 6 - (Optional) Fine-tune topic representations with \n# a `bertopic.representation` model\nrepresentation_model = KeyBERTInspired()\n\n# All steps together\ntopic_model = BERTopic(\n  embedding_model=embedding_model,          # Step 1 - Extract embeddings\n  umap_model=umap_model,                    # Step 2 - Reduce dimensionality\n  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings\n  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics\n  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words\n  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations\n)\n
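As a minimal usage sketch (assuming a hypothetical list of strings named docs), the assembled pipeline above can then be trained in a single call:

topics, probs = topic_model.fit_transform(docs)

# After fitting, an overview of the resulting topics can be retrieved
topic_model.get_topic_info()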
        "},{"location":"algorithm/algorithm.html#detailed-overview","title":"Detailed Overview","text":"

        This overview describes each step in more detail such that you can get an intuitive feeling as to what models might fit best at each step in your use case.

        "},{"location":"algorithm/algorithm.html#1-embed-documents","title":"1. Embed documents","text":"

We start by converting our documents to numerical representations. Although there are many methods for doing so, the default in BERTopic is sentence-transformers. These models are often optimized for semantic similarity, which helps tremendously in our clustering task. Moreover, they are great for creating either document- or sentence-embeddings. In BERTopic, you can choose any sentence-transformers model, but two models are set as defaults:

        • \"all-MiniLM-L6-v2\"
        • \"paraphrase-multilingual-MiniLM-L12-v2\"

The first is an English language model trained specifically for semantic similarity tasks, which works quite well for most use cases. The second model is very similar to the first, with the major difference that the multilingual model works for 50+ languages. This model is quite a bit larger than the first and is only selected if you choose any language other than English.

        Tip

        Although BERTopic uses sentence-transformers models as a default, you can choose any embedding model that fits your use case. Follow the guide here for selecting and customizing your model.
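As a minimal sketch of this step (assuming a hypothetical list of strings named docs), you can pre-compute the document embeddings once and pass them to BERTopic, which keeps the embedding step separate from the rest of the pipeline:

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Pre-compute document embeddings once so they can be reused across runs
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Pass both the model (for later .transform calls) and the pre-computed embeddings
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs, embeddings)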

        "},{"location":"algorithm/algorithm.html#2-dimensionality-reduction","title":"2. Dimensionality reduction","text":"

After having created our numerical representations of the documents, we have to reduce the dimensionality of these representations. Cluster models typically have difficulty handling high-dimensional data due to the curse of dimensionality. There are great approaches that can reduce dimensionality, such as PCA, but as a default, UMAP is selected in BERTopic. It is a technique that can keep some of a dataset's local and global structure when reducing its dimensionality. This structure is important to keep as it contains the information necessary to create clusters of semantically similar documents.

        Tip

        Although BERTopic uses UMAP as a default, you can choose any dimensionality reduction model that fits your use case. Follow the guide here for selecting and customizing your model.
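As a hedged sketch of customizing this step, any model with .fit and .transform functions can be passed via umap_model; here a UMAP instance with a fixed random_state (an assumption made purely for reproducibility) replaces the default:

from umap import UMAP
from bertopic import BERTopic

# Same settings as the defaults, but with a fixed seed so repeated runs give the same layout
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric="cosine", random_state=42)
topic_model = BERTopic(umap_model=umap_model)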

        "},{"location":"algorithm/algorithm.html#3-cluster-documents","title":"3. Cluster Documents","text":"

        After having reduced our embeddings, we can start clustering our data. For that, we leverage a density-based clustering technique, HDBSCAN. It can find clusters of different shapes and has the nice feature of identifying outliers where possible. As a result, we do not force documents into a cluster where they might not belong. This will improve the resulting topic representation as there is less noise to draw from.

        Tip

        Although BERTopic uses HDBSCAN as a default, you can choose any cluster model that fits your use case. Follow the guide here for selecting and customizing your model.
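As a sketch of this modularity, any clustering algorithm with .fit and .predict functions and a .labels_ attribute can replace HDBSCAN; for example, k-Means (with an arbitrarily chosen number of clusters) if you prefer a fixed number of topics and no outliers:

from sklearn.cluster import KMeans
from bertopic import BERTopic

# k-Means forces every document into a cluster, so no outlier topic (-1) is created
cluster_model = KMeans(n_clusters=50, random_state=42)
topic_model = BERTopic(hdbscan_model=cluster_model)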

        "},{"location":"algorithm/algorithm.html#4-bag-of-words","title":"4. Bag-of-words","text":"

Before we can start creating the topic representation, we first need to select a technique that allows for modularity in BERTopic's algorithm. When we use HDBSCAN as a cluster model, we may assume that our clusters have different degrees of density and different shapes. This means that a centroid-based topic representation technique might not be the best-fitting model. In other words, we want a topic representation technique that makes little to no assumptions about the expected structure of the clusters. To do this, we first combine all documents in a cluster into a single document. That very long document then represents the cluster. Then, we can count how often each word appears in each cluster. This generates something called a bag-of-words representation in which the frequency of each word in each cluster can be found. This bag-of-words representation is therefore on a cluster level and not on a document level. This distinction is important as we are interested in words on a topic level (i.e., cluster level). By using a bag-of-words representation, no assumption is made concerning the structure of the clusters. Moreover, the bag-of-words representation is L1-normalized to account for clusters that have different sizes.

        Tip

        There are many ways you can tune or change the bag-of-words step. This step allows for processing the documents however you want without affecting the first step, embedding the documents. You can follow the guide here for more information about tokenization options in BERTopic.
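For example, a small sketch of customizing this step with a CountVectorizer that removes English stop words and includes bigrams (the parameter values here are illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Tokenize the per-cluster documents with stop-word removal and uni-/bi-grams
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=2)
topic_model = BERTopic(vectorizer_model=vectorizer_model)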

        "},{"location":"algorithm/algorithm.html#5-topic-representation","title":"5. Topic representation","text":"

From the generated bag-of-words representation, we want to know what makes one cluster different from another. Which words are typical for cluster 1 and not so much for all other clusters? To solve this, we need to modify TF-IDF such that it considers topics (i.e., clusters) instead of documents. When you apply TF-IDF as usual on a set of documents, what you are doing is comparing the importance of words between documents. Now, what if we instead treat all documents in a single category (e.g., a cluster) as a single document and then apply TF-IDF? The result would be importance scores for words within a cluster. The more important a word is within a cluster, the more representative it is of that topic. In other words, if we extract the most important words per cluster, we get descriptions of topics! This model is called class-based TF-IDF:

Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word x in class c, where c refers to the cluster we created before. This results in our class-based tf representation. This representation is L1-normalized to account for the differences in topic sizes. Then, we take the logarithm of one plus the average number of words per class A divided by the frequency of word x across all classes. We add one within the logarithm to force values to be positive. This results in our class-based idf representation. As with classic TF-IDF, we then multiply tf by idf to get the importance score per word in each class. In other words, the classical TF-IDF procedure is not used here but a modified version of the algorithm that allows for a much better representation.
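Written out, the description above corresponds to the following weight for word x in class c, where tf_{x,c} is the L1-normalized frequency of word x in class c, f_x is the frequency of word x across all classes, and A is the average number of words per class:

W_{x,c} = \lVert \mathrm{tf}_{x,c} \rVert \cdot \log\left(1 + \frac{A}{f_x}\right)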

        Tip

        In the ClassTfidfTransformer, there are a few parameters that might be worth exploring, including an option to perform additional BM-25 weighting. You can find more information about that here.
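For instance, a brief sketch of the parameters mentioned in the tip above, enabling the optional BM-25-style weighting and the reduction of frequent words:

from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic

# BM-25-inspired weighting and down-weighting of very frequent words
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
topic_model = BERTopic(ctfidf_model=ctfidf_model)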

        "},{"location":"algorithm/algorithm.html#6-optional-fine-tune-topic-representation","title":"6. (Optional) Fine-tune Topic representation","text":"

After having generated the c-TF-IDF representations, we have a set of words that describe a collection of documents. c-TF-IDF is a method that can quickly generate accurate topic representations. However, with the fast developments in the NLP world, new and exciting methods are released weekly. To keep up with what is happening, there is the possibility to further fine-tune these c-TF-IDF topics using GPT, T5, KeyBERT, spaCy, and other techniques. Many are implemented in BERTopic for you to use and play around with.

More specifically, we can consider the c-TF-IDF generated topics to be candidate topics. They each contain a set of keywords and representative documents that we can use to further fine-tune the topic representations. Having a set of representative documents for each topic is a huge advantage, as it allows for fine-tuning on a reduced number of documents. This reduces computation for large models as they only need to operate on that small set of representative documents for each topic. As a result, large language models like GPT and T5 become feasible in production settings and typically take less wall time than the dimensionality reduction and clustering steps.

        The following models are implemented in bertopic.representation:

        • MaximalMarginalRelevance
        • PartOfSpeech
        • KeyBERTInspired
        • ZeroShotClassification
        • TextGeneration
        • Cohere
        • OpenAI
        • LangChain
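As a short sketch of how the models listed above are used, a single representation model can be passed to BERTopic, and a list of models can be passed to chain them (the diversity value below is an arbitrary example):

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic import BERTopic

# Fine-tune the c-TF-IDF keywords, then diversify them with MMR
representation_model = [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)]
topic_model = BERTopic(representation_model=representation_model)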
        "},{"location":"api/bertopic.html","title":"BERTopic","text":"

        BERTopic is a topic modeling technique that leverages BERT embeddings and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

        The default embedding model is all-MiniLM-L6-v2 when selecting language=\"english\" and paraphrase-multilingual-MiniLM-L12-v2 when selecting language=\"multilingual\".

        Attributes:

• topics_ (List[int]): The topics that are generated for each document after training or updating the topic model. The most recent topics are tracked.
• probabilities_ (List[float]): The probability of the assigned topic per document. These are only calculated if a HDBSCAN model is used for the clustering step. When calculate_probabilities=True, then it is the probabilities of all topics per document.
• topic_sizes_ (Mapping[int, int]): The size of each topic.
• topic_mapper_ (TopicMapper): A class for tracking topics and their mappings anytime they are merged, reduced, added, or removed.
• topic_representations_ (Mapping[int, Tuple[int, float]]): The top n terms per topic and their respective c-TF-IDF values.
• c_tf_idf_ (csr_matrix): The topic-term matrix as calculated through c-TF-IDF. To access its respective words, run .vectorizer_model.get_feature_names() or .vectorizer_model.get_feature_names_out().
• topic_labels_ (Mapping[int, str]): The default labels for each topic.
• custom_labels_ (List[str]): Custom labels for each topic.
• topic_embeddings_ (np.ndarray): The embeddings for each topic. They are calculated by taking the centroid embedding of each cluster.
• representative_docs_ (Mapping[int, str]): The representative documents for each topic.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic()\ntopics, probabilities = topic_model.fit_transform(docs)\n

        If you want to use your own embedding model, use it as follows:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\ntopic_model = BERTopic(embedding_model=sentence_model)\n

        Due to the stochastic nature of UMAP, the results from BERTopic might differ and the quality can degrade. Using your own embeddings allows you to try out BERTopic several times until you find the topics that suit you best.
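A minimal sketch of that workflow (assuming a hypothetical docs list): encode the documents once, then fit BERTopic several times on the same embeddings and keep the run whose topics suit you best.

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Embeddings are computed once; only the dimensionality reduction and clustering
# steps are repeated, which is cheaper than re-embedding the documents each run
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)

for run in range(3):
    topic_model = BERTopic(embedding_model=sentence_model)
    topics, probs = topic_model.fit_transform(docs, embeddings)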

        Source code in bertopic\\_bertopic.py
        class BERTopic:\n    \"\"\"BERTopic is a topic modeling technique that leverages BERT embeddings and\n    c-TF-IDF to create dense clusters allowing for easily interpretable topics\n    whilst keeping important words in the topic descriptions.\n\n    The default embedding model is `all-MiniLM-L6-v2` when selecting `language=\"english\"`\n    and `paraphrase-multilingual-MiniLM-L12-v2` when selecting `language=\"multilingual\"`.\n\n    Attributes:\n        topics_ (List[int]) : The topics that are generated for each document after training or updating\n                              the topic model. The most recent topics are tracked.\n        probabilities_ (List[float]): The probability of the assigned topic per document. These are\n                                      only calculated if a HDBSCAN model is used for the clustering step.\n                                      When `calculate_probabilities=True`, then it is the probabilities\n                                      of all topics per document.\n        topic_sizes_ (Mapping[int, int]) : The size of each topic.\n        topic_mapper_ (TopicMapper) : A class for tracking topics and their mappings anytime they are\n                                      merged, reduced, added, or removed.\n        topic_representations_ (Mapping[int, Tuple[int, float]]) : The top n terms per topic and their respective\n                                                                   c-TF-IDF values.\n        c_tf_idf_ (csr_matrix) : The topic-term matrix as calculated through c-TF-IDF. To access its respective\n                                 words, run `.vectorizer_model.get_feature_names()`  or\n                                 `.vectorizer_model.get_feature_names_out()`\n        topic_labels_ (Mapping[int, str]) : The default labels for each topic.\n        custom_labels_ (List[str]) : Custom labels for each topic.\n        topic_embeddings_ (np.ndarray) : The embeddings for each topic. They are calculated by taking the\n                                         centroid embedding of each cluster.\n        representative_docs_ (Mapping[int, str]) : The representative documents for each topic.\n\n    Examples:\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic()\n    topics, probabilities = topic_model.fit_transform(docs)\n    ```\n\n    If you want to use your own embedding model, use it as follows:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    topic_model = BERTopic(embedding_model=sentence_model)\n    ```\n\n    Due to the stochastic nature of UMAP, the results from BERTopic might differ\n    and the quality can degrade. 
Using your own embeddings allows you to\n    try out BERTopic several times until you find the topics that suit\n    you best.\n    \"\"\"\n\n    def __init__(\n        self,\n        language: str = \"english\",\n        top_n_words: int = 10,\n        n_gram_range: Tuple[int, int] = (1, 1),\n        min_topic_size: int = 10,\n        nr_topics: Union[int, str] = None,\n        low_memory: bool = False,\n        calculate_probabilities: bool = False,\n        seed_topic_list: List[List[str]] = None,\n        zeroshot_topic_list: List[str] = None,\n        zeroshot_min_similarity: float = 0.7,\n        embedding_model=None,\n        umap_model: UMAP = None,\n        hdbscan_model: hdbscan.HDBSCAN = None,\n        vectorizer_model: CountVectorizer = None,\n        ctfidf_model: TfidfTransformer = None,\n        representation_model: BaseRepresentation = None,\n        verbose: bool = False,\n    ):\n        \"\"\"BERTopic initialization.\n\n        Arguments:\n            language: The main language used in your documents. The default sentence-transformers\n                      model for \"english\" is `all-MiniLM-L6-v2`. For a full overview of\n                      supported languages see bertopic.backend.languages. Select\n                      \"multilingual\" to load in the `paraphrase-multilingual-MiniLM-L12-v2`\n                      sentence-transformers model that supports 50+ languages.\n                      NOTE: This is not used if `embedding_model` is used.\n            top_n_words: The number of words per topic to extract. Setting this\n                         too high can negatively impact topic embeddings as topics\n                         are typically best represented by at most 10 words.\n            n_gram_range: The n-gram range for the CountVectorizer.\n                          Advised to keep high values between 1 and 3.\n                          More would likely lead to memory issues.\n                          NOTE: This param will not be used if you pass in your own\n                          CountVectorizer.\n            min_topic_size: The minimum size of the topic. Increasing this value will lead\n                            to a lower number of clusters/topics and vice versa.\n                            It is the same parameter as `min_cluster_size` in HDBSCAN.\n                            NOTE: This param will not be used if you are using `hdbscan_model`.\n            nr_topics: Specifying the number of topics will reduce the initial\n                       number of topics to the value specified. This reduction can take\n                       a while as each reduction in topics (-1) activates a c-TF-IDF\n                       calculation. If this is set to None, no reduction is applied. Use\n                       \"auto\" to automatically reduce topics using HDBSCAN.\n                       NOTE: Controlling the number of topics is best done by adjusting\n                       `min_topic_size` first before adjusting this parameter.\n            low_memory: Sets UMAP low memory to True to make sure less memory is used.\n                        NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP\n                        this parameter will not be used.\n            calculate_probabilities: Calculate the probabilities of all topics\n                                     per document instead of the probability of the assigned\n                                     topic per document. 
This could slow down the extraction\n                                     of topics if you have many documents (> 100_000).\n                                     NOTE: If false you cannot use the corresponding\n                                     visualization method `visualize_probabilities`.\n                                     NOTE: This is an approximation of topic probabilities\n                                     as used in HDBSCAN and not an exact representation.\n            seed_topic_list: A list of seed words per topic to converge around\n            zeroshot_topic_list: A list of topic names to use for zero-shot classification\n            zeroshot_min_similarity: The minimum similarity between a zero-shot topic and\n                                     a document for assignment. The higher this value, the more\n                                     confident the model needs to be to assign a zero-shot topic to a document.\n            verbose: Changes the verbosity of the model, Set to True if you want\n                     to track the stages of the model.\n            embedding_model: Use a custom embedding model.\n                             The following backends are currently supported\n                               * SentenceTransformers\n                               * Flair\n                               * Spacy\n                               * Gensim\n                               * USE (TF-Hub)\n                             You can also pass in a string that points to one of the following\n                             sentence-transformers models:\n                               * https://www.sbert.net/docs/pretrained_models.html\n            umap_model: Pass in a UMAP model to be used instead of the default.\n                        NOTE: You can also pass in any dimensionality reduction algorithm as long\n                        as it has `.fit` and `.transform` functions.\n            hdbscan_model: Pass in a hdbscan.HDBSCAN model to be used instead of the default\n                           NOTE: You can also pass in any clustering algorithm as long as it has\n                           `.fit` and `.predict` functions along with the `.labels_` variable.\n            vectorizer_model: Pass in a custom `CountVectorizer` instead of the default model.\n            ctfidf_model: Pass in a custom ClassTfidfTransformer instead of the default model.\n            representation_model: Pass in a model that fine-tunes the topic representations\n                                  calculated through c-TF-IDF. 
Models from `bertopic.representation`\n                                  are supported.\n        \"\"\"\n        # Topic-based parameters\n        if top_n_words > 100:\n            logger.warning(\n                \"Note that extracting more than 100 words from a sparse can slow down computation quite a bit.\"\n            )\n\n        self.top_n_words = top_n_words\n        self.min_topic_size = min_topic_size\n        self.nr_topics = nr_topics\n        self.low_memory = low_memory\n        self.calculate_probabilities = calculate_probabilities\n        self.verbose = verbose\n        self.seed_topic_list = seed_topic_list\n        self.zeroshot_topic_list = zeroshot_topic_list\n        self.zeroshot_min_similarity = zeroshot_min_similarity\n\n        # Embedding model\n        self.language = language if not embedding_model else None\n        self.embedding_model = embedding_model\n\n        # Vectorizer\n        self.n_gram_range = n_gram_range\n        self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range)\n        self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n\n        # Representation model\n        self.representation_model = representation_model\n\n        # UMAP or another algorithm that has .fit and .transform functions\n        self.umap_model = umap_model or UMAP(\n            n_neighbors=15,\n            n_components=5,\n            min_dist=0.0,\n            metric=\"cosine\",\n            low_memory=self.low_memory,\n        )\n\n        # HDBSCAN or another clustering algorithm that has .fit and .predict functions and\n        # the .labels_ variable to extract the labels\n        self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(\n            min_cluster_size=self.min_topic_size,\n            metric=\"euclidean\",\n            cluster_selection_method=\"eom\",\n            prediction_data=True,\n        )\n\n        # Public attributes\n        self.topics_ = None\n        self.probabilities_ = None\n        self.topic_sizes_ = None\n        self.topic_mapper_ = None\n        self.topic_representations_ = None\n        self.topic_embeddings_ = None\n        self._topic_id_to_zeroshot_topic_idx = {}\n        self.custom_labels_ = None\n        self.c_tf_idf_ = None\n        self.representative_images_ = None\n        self.representative_docs_ = {}\n        self.topic_aspects_ = {}\n\n        # Private attributes for internal tracking purposes\n        self._merged_topics = None\n\n        if verbose:\n            logger.set_level(\"DEBUG\")\n        else:\n            logger.set_level(\"WARNING\")\n\n    @property\n    def _outliers(self):\n        \"\"\"Some algorithms have outlier labels (-1) that can be tricky to work\n        with if you are slicing data based on that labels. 
Therefore, we\n        track if there are outlier labels and act accordingly when slicing.\n\n        Returns:\n            An integer indicating whether outliers are present in the topic model\n        \"\"\"\n        return 1 if -1 in self.topic_sizes_ else 0\n\n    @property\n    def topic_labels_(self):\n        \"\"\"Map topic IDs to their labels.\n        A label is the topic ID, along with the first four words of the topic representation, joined using '_'.\n        Zeroshot topic labels come from self.zeroshot_topic_list rather than the calculated representation.\n\n        Returns:\n            topic_labels: a dict mapping a topic ID (int) to its label (str)\n        \"\"\"\n        topic_labels = {\n            key: f\"{key}_\" + \"_\".join([word[0] for word in values[:4]])\n            for key, values in self.topic_representations_.items()\n        }\n        if self._is_zeroshot():\n            # Need to correct labels from zero-shot topics\n            topic_id_to_zeroshot_label = {\n                topic_id: self.zeroshot_topic_list[zeroshot_topic_idx]\n                for topic_id, zeroshot_topic_idx in self._topic_id_to_zeroshot_topic_idx.items()\n            }\n            topic_labels.update(topic_id_to_zeroshot_label)\n        return topic_labels\n\n    def fit(\n        self,\n        documents: List[str],\n        embeddings: np.ndarray = None,\n        images: List[str] = None,\n        y: Union[List[int], np.ndarray] = None,\n    ):\n        \"\"\"Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics.\n\n        Arguments:\n            documents: A list of documents to fit on\n            embeddings: Pre-trained document embeddings. These can be used\n                        instead of the sentence-transformer model\n            images: A list of paths to the images to fit on or the images themselves\n            y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n               specific instance is specified.\n\n        Examples:\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all')['data']\n        topic_model = BERTopic().fit(docs)\n        ```\n\n        If you want to use your own embeddings, use it as follows:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n\n        # Create embeddings\n        docs = fetch_20newsgroups(subset='all')['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n        # Create topic model\n        topic_model = BERTopic().fit(docs, embeddings)\n        ```\n        \"\"\"\n        self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images)\n        return self\n\n    def fit_transform(\n        self,\n        documents: List[str],\n        embeddings: np.ndarray = None,\n        images: List[str] = None,\n        y: Union[List[int], np.ndarray] = None,\n    ) -> Tuple[List[int], Union[np.ndarray, None]]:\n        \"\"\"Fit the models on a collection of documents, generate topics,\n        and return the probabilities and topic per document.\n\n        Arguments:\n            documents: A list of documents to fit on\n            embeddings: Pre-trained document embeddings. 
These can be used\n                        instead of the sentence-transformer model\n            images: A list of paths to the images to fit on or the images themselves\n            y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n               specific instance is specified.\n\n        Returns:\n            predictions: Topic predictions for each documents\n            probabilities: The probability of the assigned topic per document.\n                           If `calculate_probabilities` in BERTopic is set to True, then\n                           it calculates the probabilities of all topics across all documents\n                           instead of only the assigned topic. This, however, slows down\n                           computation and may increase memory usage.\n\n        Examples:\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all')['data']\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        ```\n\n        If you want to use your own embeddings, use it as follows:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n\n        # Create embeddings\n        docs = fetch_20newsgroups(subset='all')['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n        # Create topic model\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs, embeddings)\n        ```\n        \"\"\"\n        if documents is not None:\n            check_documents_type(documents)\n            check_embeddings_shape(embeddings, documents)\n\n        doc_ids = range(len(documents)) if documents is not None else range(len(images))\n        documents = pd.DataFrame({\"Document\": documents, \"ID\": doc_ids, \"Topic\": None, \"Image\": images})\n\n        # Extract embeddings\n        if embeddings is None:\n            logger.info(\"Embedding - Transforming documents to embeddings.\")\n            self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)\n            embeddings = self._extract_embeddings(\n                documents.Document.values.tolist(),\n                images=images,\n                method=\"document\",\n                verbose=self.verbose,\n            )\n            logger.info(\"Embedding - Completed \\u2713\")\n        else:\n            if self.embedding_model is not None:\n                self.embedding_model = select_backend(\n                    self.embedding_model, language=self.language, verbose=self.verbose\n                )\n\n        # Guided Topic Modeling\n        if self.seed_topic_list is not None and self.embedding_model is not None:\n            y, embeddings = self._guided_topic_modeling(embeddings)\n\n        # Reduce dimensionality and fit UMAP model\n        umap_embeddings = self._reduce_dimensionality(embeddings, y)\n\n        # Zero-shot Topic Modeling\n        if self._is_zeroshot():\n            documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(\n                documents, embeddings\n            )\n            # Filter UMAP embeddings to only non-assigned embeddings to be used for clustering\n            
umap_embeddings = self.umap_model.transform(embeddings)\n\n        if len(documents) > 0:  # No zero-shot topics matched\n            # Cluster reduced embeddings\n            documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)\n            if self._is_zeroshot() and len(assigned_documents) > 0:\n                documents, embeddings = self._combine_zeroshot_topics(\n                    documents, embeddings, assigned_documents, assigned_embeddings\n                )\n        else:\n            # All documents matches zero-shot topics\n            documents = assigned_documents\n            embeddings = assigned_embeddings\n        topics_before_reduction = self.topics_\n\n        # Sort and Map Topic IDs by their frequency\n        if not self.nr_topics:\n            documents = self._sort_mappings_by_frequency(documents)\n\n        # Create documents from images if we have images only\n        if documents.Document.values[0] is None:\n            custom_documents = self._images_to_text(documents, embeddings)\n\n            # Extract topics by calculating c-TF-IDF\n            self._extract_topics(custom_documents, embeddings=embeddings)\n            self._create_topic_vectors(documents=documents, embeddings=embeddings)\n\n            # Reduce topics\n            if self.nr_topics:\n                custom_documents = self._reduce_topics(custom_documents)\n\n            # Save the top 3 most representative documents per topic\n            self._save_representative_docs(custom_documents)\n        else:\n            # Extract topics by calculating c-TF-IDF\n            self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)\n\n            # Reduce topics\n            if self.nr_topics:\n                documents = self._reduce_topics(documents)\n\n            # Save the top 3 most representative documents per topic\n            self._save_representative_docs(documents)\n\n        # In the case of zero-shot topics, probability will come from cosine similarity,\n        # and the HDBSCAN model will be removed\n        if self._is_zeroshot() and len(assigned_documents) > 0:\n            self.hdbscan_model = BaseCluster()\n            sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))\n\n            if self.calculate_probabilities:\n                probabilities = sim_matrix\n            else:\n                # Use `topics_before_reduction` because `self.topics_` may have already been updated from\n                # reducing topics, and the original probabilities are needed for `self._map_probabilities()`\n                probabilities = sim_matrix[\n                    np.arange(len(documents)),\n                    np.array(topics_before_reduction) + self._outliers,\n                ]\n\n        # Resulting output\n        self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)\n        predictions = documents.Topic.to_list()\n\n        return predictions, self.probabilities_\n\n    def transform(\n        self,\n        documents: Union[str, List[str]],\n        embeddings: np.ndarray = None,\n        images: List[str] = None,\n    ) -> Tuple[List[int], np.ndarray]:\n        \"\"\"After having fit a model, use transform to predict new instances.\n\n        Arguments:\n            documents: A single document or a list of documents to predict on\n            embeddings: Pre-trained document embeddings. 
These can be used\n                        instead of the sentence-transformer model.\n            images: A list of paths to the images to predict on or the images themselves\n\n        Returns:\n            predictions: Topic predictions for each documents\n            probabilities: The topic probability distribution which is returned by default.\n                           If `calculate_probabilities` in BERTopic is set to False, then the\n                           probabilities are not calculated to speed up computation and\n                           decrease memory usage.\n\n        Examples:\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all')['data']\n        topic_model = BERTopic().fit(docs)\n        topics, probs = topic_model.transform(docs)\n        ```\n\n        If you want to use your own embeddings:\n\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n\n        # Create embeddings\n        docs = fetch_20newsgroups(subset='all')['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n        # Create topic model\n        topic_model = BERTopic().fit(docs, embeddings)\n        topics, probs = topic_model.transform(docs, embeddings)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_embeddings_shape(embeddings, documents)\n\n        if isinstance(documents, str) or documents is None:\n            documents = [documents]\n\n        if embeddings is None:\n            embeddings = self._extract_embeddings(documents, images=images, method=\"document\", verbose=self.verbose)\n\n        # Check if an embedding model was found\n        if embeddings is None:\n            raise ValueError(\n                \"No embedding model was found to embed the documents.\"\n                \"Make sure when loading in the model using BERTopic.load()\"\n                \"to also specify the embedding model.\"\n            )\n\n        # Transform without hdbscan_model and umap_model using only cosine similarity\n        elif type(self.hdbscan_model) == BaseCluster:\n            logger.info(\"Predicting topic assignments through cosine similarity of topic and document embeddings.\")\n            sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))\n            predictions = np.argmax(sim_matrix, axis=1) - self._outliers\n\n            if self.calculate_probabilities:\n                probabilities = sim_matrix\n            else:\n                probabilities = np.max(sim_matrix, axis=1)\n\n        # Transform with full pipeline\n        else:\n            logger.info(\"Dimensionality - Reducing dimensionality of input embeddings.\")\n            umap_embeddings = self.umap_model.transform(embeddings)\n            logger.info(\"Dimensionality - Completed \\u2713\")\n\n            # Extract predictions and probabilities if it is a HDBSCAN-like model\n            logger.info(\"Clustering - Approximating new points with `hdbscan_model`\")\n            if is_supported_hdbscan(self.hdbscan_model):\n                predictions, probabilities = hdbscan_delegator(\n                    self.hdbscan_model, \"approximate_predict\", umap_embeddings\n                )\n\n                # Calculate probabilities\n               
 if self.calculate_probabilities:\n                    logger.info(\"Probabilities - Start calculation of probabilities with HDBSCAN\")\n                    probabilities = hdbscan_delegator(self.hdbscan_model, \"membership_vector\", umap_embeddings)\n                    logger.info(\"Probabilities - Completed \\u2713\")\n            else:\n                predictions = self.hdbscan_model.predict(umap_embeddings)\n                probabilities = None\n            logger.info(\"Cluster - Completed \\u2713\")\n\n            # Map probabilities and predictions\n            probabilities = self._map_probabilities(probabilities, original_topics=True)\n            predictions = self._map_predictions(predictions)\n        return predictions, probabilities\n\n    def partial_fit(\n        self,\n        documents: List[str],\n        embeddings: np.ndarray = None,\n        y: Union[List[int], np.ndarray] = None,\n    ):\n        \"\"\"Fit BERTopic on a subset of the data and perform online learning\n        with batch-like data.\n\n        Online topic modeling in BERTopic is performed by using dimensionality\n        reduction and cluster algorithms that support a `partial_fit` method\n        in order to incrementally train the topic model.\n\n        Likewise, the `bertopic.vectorizers.OnlineCountVectorizer` is used\n        to dynamically update its vocabulary when presented with new data.\n        It has several parameters for modeling decay and updating the\n        representations.\n\n        In other words, although the main algorithm stays the same, the training\n        procedure now works as follows:\n\n        For each subset of the data:\n\n        1. Generate embeddings with a pre-trained language model\n        2. Incrementally update the dimensionality reduction algorithm with `partial_fit`\n        3. Incrementally update the cluster algorithm with `partial_fit`\n        4. Incrementally update the OnlineCountVectorizer and apply some form of decay\n\n        Note that it is advised to use `partial_fit` with batches and\n        not single documents for the best performance.\n\n        Arguments:\n            documents: A list of documents to fit on\n            embeddings: Pre-trained document embeddings. These can be used\n                        instead of the sentence-transformer model\n            y: The target class for (semi)-supervised modeling. 
Use -1 if no class for a\n               specific instance is specified.\n\n        Examples:\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sklearn.cluster import MiniBatchKMeans\n        from sklearn.decomposition import IncrementalPCA\n        from bertopic.vectorizers import OnlineCountVectorizer\n        from bertopic import BERTopic\n\n        # Prepare documents\n        docs = fetch_20newsgroups(subset=subset,  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\n        # Prepare sub-models that support online learning\n        umap_model = IncrementalPCA(n_components=5)\n        cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\n        vectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\n        topic_model = BERTopic(umap_model=umap_model,\n                               hdbscan_model=cluster_model,\n                               vectorizer_model=vectorizer_model)\n\n        # Incrementally fit the topic model by training on 1000 documents at a time\n        for index in range(0, len(docs), 1000):\n            topic_model.partial_fit(docs[index: index+1000])\n        ```\n        \"\"\"\n        # Checks\n        check_embeddings_shape(embeddings, documents)\n        if not hasattr(self.hdbscan_model, \"partial_fit\"):\n            raise ValueError(\n                \"In order to use `.partial_fit`, the cluster model should have \" \"a `.partial_fit` function.\"\n            )\n\n        # Prepare documents\n        if isinstance(documents, str):\n            documents = [documents]\n        documents = pd.DataFrame({\"Document\": documents, \"ID\": range(len(documents)), \"Topic\": None})\n\n        # Extract embeddings\n        if embeddings is None:\n            if self.topic_representations_ is None:\n                self.embedding_model = select_backend(\n                    self.embedding_model, language=self.language, verbose=self.verbose\n                )\n            embeddings = self._extract_embeddings(\n                documents.Document.values.tolist(),\n                method=\"document\",\n                verbose=self.verbose,\n            )\n        else:\n            if self.embedding_model is not None and self.topic_representations_ is None:\n                self.embedding_model = select_backend(\n                    self.embedding_model, language=self.language, verbose=self.verbose\n                )\n\n        # Reduce dimensionality\n        if self.seed_topic_list is not None and self.embedding_model is not None:\n            y, embeddings = self._guided_topic_modeling(embeddings)\n        umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True)\n\n        # Cluster reduced embeddings\n        documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True)\n        topics = documents.Topic.to_list()\n\n        # Map and find new topics\n        if not self.topic_mapper_:\n            self.topic_mapper_ = TopicMapper(topics)\n        mappings = self.topic_mapper_.get_mappings()\n        new_topics = set(topics).difference(set(mappings.keys()))\n        new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)}\n        self.topic_mapper_.add_new_topics(new_topic_ids)\n        updated_mappings = self.topic_mapper_.get_mappings()\n        updated_topics = [updated_mappings[topic] for topic in topics]\n        documents[\"Topic\"] = updated_topics\n\n        # Add missing 
topics (topics that were originally created but are now missing)\n        if self.topic_representations_:\n            missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics))\n            for missing_topic in missing_topics:\n                documents.loc[len(documents), :] = [\" \", len(documents), missing_topic]\n        else:\n            missing_topics = {}\n\n        # Prepare documents\n        documents_per_topic = documents.sort_values(\"Topic\").groupby([\"Topic\"], as_index=False)\n        updated_topics = documents_per_topic.first().Topic.astype(int)\n        documents_per_topic = documents_per_topic.agg({\"Document\": \" \".join})\n\n        # Update topic representations\n        self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True)\n        self.topic_representations_ = self._extract_words_per_topic(\n            updated_words, documents, self.c_tf_idf_, calculate_aspects=False\n        )\n        self._create_topic_vectors()\n\n        # Update topic sizes\n        if len(missing_topics) > 0:\n            documents = documents.iloc[: -len(missing_topics)]\n\n        if self.topic_sizes_ is None:\n            self._update_topic_size(documents)\n        else:\n            sizes = documents.groupby([\"Topic\"], as_index=False).count()\n            for _, row in sizes.iterrows():\n                topic = int(row.Topic)\n                if self.topic_sizes_.get(topic) is not None and topic not in missing_topics:\n                    self.topic_sizes_[topic] += int(row.Document)\n                elif self.topic_sizes_.get(topic) is None:\n                    self.topic_sizes_[topic] = int(row.Document)\n            self.topics_ = documents.Topic.astype(int).tolist()\n\n        return self\n\n    def topics_over_time(\n        self,\n        docs: List[str],\n        timestamps: Union[List[str], List[int]],\n        topics: List[int] = None,\n        nr_bins: int = None,\n        datetime_format: str = None,\n        evolution_tuning: bool = True,\n        global_tuning: bool = True,\n    ) -> pd.DataFrame:\n        \"\"\"Create topics over time.\n\n        To create the topics over time, BERTopic needs to be already fitted once.\n        From the fitted models, the c-TF-IDF representations are calculate at\n        each timestamp t. Then, the c-TF-IDF representations at timestamp t are\n        averaged with the global c-TF-IDF representations in order to fine-tune the\n        local representations.\n\n        Note:\n            Make sure to use a limited number of unique timestamps (<100) as the\n            c-TF-IDF representation will be calculated at each single unique timestamp.\n            Having a large number of unique timestamps can take some time to be calculated.\n            Moreover, there aren't many use-cases where you would like to see the difference\n            in topic representations over more than 100 different timestamps.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            timestamps: The timestamp of each document. This can be either a list of strings or ints.\n                        If it is a list of strings, then the datetime format will be automatically\n                        inferred. 
If it is a list of ints, then the documents will be ordered in\n                        ascending order.\n            topics: A list of topics where each topic is related to a document in `docs` and\n                    a timestamp in `timestamps`. You can use this to apply topics_over_time on\n                    a subset of the data. Make sure that `docs`, `timestamps`, and `topics`\n                    all correspond to one another and have the same size.\n            nr_bins: The number of bins you want to create for the timestamps. The left interval will\n                     be chosen as the timestamp. An additional column will be created with the\n                     entire interval.\n            datetime_format: The datetime format of the timestamps if they are strings, eg \u201c%d/%m/%Y\u201d.\n                             Set this to None if you want to have it automatically detect the format.\n                             See strftime documentation for more information on choices:\n                             https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.\n            evolution_tuning: Fine-tune each topic representation at timestamp *t* by averaging its\n                              c-TF-IDF matrix with the c-TF-IDF matrix at timestamp *t-1*. This creates\n                              evolutionary topic representations.\n            global_tuning: Fine-tune each topic representation at timestamp *t* by averaging its c-TF-IDF matrix\n                       with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                       topic representations that could not be found in the documents at timestamp *t*.\n\n        Returns:\n            topics_over_time: A dataframe that contains the topic, words, and frequency of topic\n                              at timestamp *t*.\n\n        Examples:\n        The timestamps variable represents the timestamp of each document. 
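If the timestamps are strings with a known format, that format can be passed explicitly instead of relying on automatic inference. A hedged illustration; the format string below is only an example:

```python
# Match the format string to your own timestamps
topics_over_time = topic_model.topics_over_time(docs, timestamps,
                                                datetime_format="%d/%m/%Y",
                                                nr_bins=20)
```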
If you have over\n        100 unique timestamps, it is advised to bin the timestamps as shown below:\n\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        selected_topics = topics if topics else self.topics_\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": selected_topics, \"Timestamps\": timestamps})\n        global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm=\"l1\", copy=False)\n\n        all_topics = sorted(list(documents.Topic.unique()))\n        all_topics_indices = {topic: index for index, topic in enumerate(all_topics)}\n\n        if isinstance(timestamps[0], str):\n            infer_datetime_format = True if not datetime_format else False\n            documents[\"Timestamps\"] = pd.to_datetime(\n                documents[\"Timestamps\"],\n                infer_datetime_format=infer_datetime_format,\n                format=datetime_format,\n            )\n\n        if nr_bins:\n            documents[\"Bins\"] = pd.cut(documents.Timestamps, bins=nr_bins)\n            documents[\"Timestamps\"] = documents.apply(lambda row: row.Bins.left, 1)\n\n        # Sort documents in chronological order\n        documents = documents.sort_values(\"Timestamps\")\n        timestamps = documents.Timestamps.unique()\n        if len(timestamps) > 100:\n            logger.warning(\n                f\"There are more than 100 unique timestamps (i.e., {len(timestamps)}) \"\n                \"which significantly slows down the application. Consider setting `nr_bins` \"\n                \"to a value lower than 100 to speed up calculation. 
\"\n            )\n\n        # For each unique timestamp, create topic representations\n        topics_over_time = []\n        for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose):\n            # Calculate c-TF-IDF representation for a specific timestamp\n            selection = documents.loc[documents.Timestamps == timestamp, :]\n            documents_per_topic = selection.groupby([\"Topic\"], as_index=False).agg(\n                {\"Document\": \" \".join, \"Timestamps\": \"count\"}\n            )\n            c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n            if global_tuning or evolution_tuning:\n                c_tf_idf = normalize(c_tf_idf, axis=1, norm=\"l1\", copy=False)\n\n            # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF\n            # matrix at timestamp t-1\n            if evolution_tuning and index != 0:\n                current_topics = sorted(list(documents_per_topic.Topic.values))\n                overlapping_topics = sorted(\n                    list(set(previous_topics).intersection(set(current_topics)))  # noqa: F821\n                )\n\n                current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]\n                previous_overlap_idx = [\n                    previous_topics.index(topic)  # noqa: F821\n                    for topic in overlapping_topics\n                ]\n\n                c_tf_idf.tolil()[current_overlap_idx] = (\n                    (\n                        c_tf_idf[current_overlap_idx] + previous_c_tf_idf[previous_overlap_idx]  # noqa: F821\n                    )\n                    / 2.0\n                ).tolil()\n\n            # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation\n            # by simply taking the average of the two\n            if global_tuning:\n                selected_topics = [all_topics_indices[topic] for topic in documents_per_topic.Topic.values]\n                c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0\n\n            # Extract the words per topic\n            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n            topic_frequency = pd.Series(\n                documents_per_topic.Timestamps.values, index=documents_per_topic.Topic\n            ).to_dict()\n\n            # Fill dataframe with results\n            topics_at_timestamp = [\n                (\n                    topic,\n                    \", \".join([words[0] for words in values][:5]),\n                    topic_frequency[topic],\n                    timestamp,\n                )\n                for topic, values in words_per_topic.items()\n            ]\n            topics_over_time.extend(topics_at_timestamp)\n\n            if evolution_tuning:\n                previous_topics = sorted(list(documents_per_topic.Topic.values))  # noqa: F841\n                previous_c_tf_idf = c_tf_idf.copy()  # noqa: F841\n\n        return pd.DataFrame(topics_over_time, columns=[\"Topic\", \"Words\", \"Frequency\", \"Timestamp\"])\n\n    def topics_per_class(\n        self,\n        docs: List[str],\n        classes: Union[List[int], List[str]],\n        global_tuning: bool = True,\n    ) -> pd.DataFrame:\n        \"\"\"Create topics per class.\n\n        To create the topics per class, BERTopic needs to be already fitted once.\n        From the fitted models, the c-TF-IDF representations are calculated at\n        
each class c. Then, the c-TF-IDF representations at class c are\n        averaged with the global c-TF-IDF representations in order to fine-tune the\n        local representations. This can be turned off if the pure representation is\n        needed.\n\n        Note:\n            Make sure to use a limited number of unique classes (<100) as the\n            c-TF-IDF representation will be calculated at each single unique class.\n            Having a large number of unique classes can take some time to be calculated.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            classes: The class of each document. This can be either a list of strings or ints.\n            global_tuning: Fine-tune each topic representation for class c by averaging its c-TF-IDF matrix\n                           with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                           topic representations that could not be found in the documents for class c.\n\n        Returns:\n            topics_per_class: A dataframe that contains the topic, words, and frequency of topics\n                              for each class.\n\n        Examples:\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        topics_per_class = topic_model.topics_per_class(docs, classes)\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Class\": classes})\n        global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm=\"l1\", copy=False)\n\n        # For each unique timestamp, create topic representations\n        topics_per_class = []\n        for _, class_ in tqdm(enumerate(set(classes)), disable=not self.verbose):\n            # Calculate c-TF-IDF representation for a specific timestamp\n            selection = documents.loc[documents.Class == class_, :]\n            documents_per_topic = selection.groupby([\"Topic\"], as_index=False).agg(\n                {\"Document\": \" \".join, \"Class\": \"count\"}\n            )\n            c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n            # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation\n            # by simply taking the average of the two\n            if global_tuning:\n                c_tf_idf = normalize(c_tf_idf, axis=1, norm=\"l1\", copy=False)\n                c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0\n\n            # Extract the words per topic\n            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n            topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()\n\n            # Fill dataframe with results\n            topics_at_class = [\n                (\n                    topic,\n                    \", \".join([words[0] for words in values][:5]),\n                    topic_frequency[topic],\n                    class_,\n                )\n                for topic, values in words_per_topic.items()\n            ]\n            topics_per_class.extend(topics_at_class)\n\n        topics_per_class = pd.DataFrame(topics_per_class, columns=[\"Topic\", \"Words\", \"Frequency\", \"Class\"])\n\n        return topics_per_class\n\n    def hierarchical_topics(\n   
     self,\n        docs: List[str],\n        use_ctfidf: bool = True,\n        linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n        distance_function: Callable[[csr_matrix], csr_matrix] = None,\n    ) -> pd.DataFrame:\n        \"\"\"Create a hierarchy of topics.\n\n        To create this hierarchy, BERTopic needs to be already fitted once.\n        Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings\n        representation using `scipy.cluster.hierarchy.linkage`.\n\n        Based on that hierarchy, we calculate the topic representation at each\n        merged step. This is a local representation, as we only assume that the\n        chosen step is merged and not all others which typically improves the\n        topic representation.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                        embeddings from the embedding model are used.\n            linkage_function: The linkage function to use. Default is:\n                              `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n            distance_function: The distance function to use on the c-TF-IDF matrix. Default is:\n                               `lambda x: 1 - cosine_similarity(x)`.\n                               You can pass any function that returns either a square matrix of\n                               shape (n_samples, n_samples) with zeros on the diagonal and\n                               non-negative values or condensed distance matrix of shape\n                               (n_samples * (n_samples - 1) / 2,) containing the upper\n                               triangular of the distance matrix.\n\n        Returns:\n            hierarchical_topics: A dataframe that contains a hierarchy of topics\n                                 represented by their parents and their children\n\n        Examples:\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n        ```\n\n        A custom linkage function can be used as follows:\n\n        ```python\n        from scipy.cluster import hierarchy as sch\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n\n        # Hierarchical topics\n        linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\n        hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        if distance_function is None:\n            distance_function = lambda x: 1 - cosine_similarity(x)\n\n        if linkage_function is None:\n            linkage_function = lambda x: sch.linkage(x, \"ward\", optimal_ordering=True)\n\n        # Calculate distance\n        embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][\n            self._outliers :\n        ]\n        X = distance_function(embeddings)\n        X = validate_distance_matrix(X, embeddings.shape[0])\n\n        # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix\n        Z = linkage_function(X)\n\n        # Ensuring that the distances between clusters are 
unique otherwise the flatting of the hierarchy with\n        # `sch.fcluster(...)` would produce incorrect values for \"Topics\" for these clusters\n        if len(Z[:, 2]) != len(np.unique(Z[:, 2])):\n            Z[:, 2] = get_unique_distances(Z[:, 2])\n\n        # Calculate basic bag-of-words to be iteratively merged later\n        documents = pd.DataFrame({\"Document\": docs, \"ID\": range(len(docs)), \"Topic\": self.topics_})\n        documents_per_topic = documents.groupby([\"Topic\"], as_index=False).agg({\"Document\": \" \".join})\n        documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :]\n        clean_documents = self._preprocess_text(documents_per_topic.Document.values)\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = self.vectorizer_model.get_feature_names_out()\n        else:\n            words = self.vectorizer_model.get_feature_names()\n\n        bow = self.vectorizer_model.transform(clean_documents)\n\n        # Extract clusters\n        hier_topics = pd.DataFrame(\n            columns=[\n                \"Parent_ID\",\n                \"Parent_Name\",\n                \"Topics\",\n                \"Child_Left_ID\",\n                \"Child_Left_Name\",\n                \"Child_Right_ID\",\n                \"Child_Right_Name\",\n            ]\n        )\n        for index in tqdm(range(len(Z))):\n            # Find clustered documents\n            clusters = sch.fcluster(Z, t=Z[index][2], criterion=\"distance\") - self._outliers\n            nr_clusters = len(clusters)\n\n            # Extract first topic we find to get the set of topics in a merged topic\n            topic = None\n            val = Z[index][0]\n            while topic is None:\n                if val - len(clusters) < 0:\n                    topic = int(val)\n                else:\n                    val = Z[int(val - len(clusters))][0]\n            clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]]\n\n            # Group bow per cluster, calculate c-TF-IDF and extract words\n            grouped = csr_matrix(bow[clustered_topics].sum(axis=0))\n            c_tf_idf = self.ctfidf_model.transform(grouped)\n            selection = documents.loc[documents.Topic.isin(clustered_topics), :]\n            selection.Topic = 0\n            words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n\n            # Extract parent's name and ID\n            parent_id = index + len(clusters)\n            parent_name = \"_\".join([x[0] for x in words_per_topic[0]][:5])\n\n            # Extract child's name and ID\n            Z_id = Z[index][0]\n            child_left_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n            if Z_id - nr_clusters < 0:\n                child_left_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n            else:\n                child_left_name = hier_topics.iloc[int(child_left_id)].Parent_Name\n\n            # Extract child's name and ID\n            Z_id = Z[index][1]\n            child_right_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n            if Z_id - nr_clusters < 0:\n                child_right_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n            else:\n                child_right_name = 
hier_topics.iloc[int(child_right_id)].Parent_Name\n\n            # Save results\n            hier_topics.loc[len(hier_topics), :] = [\n                parent_id,\n                parent_name,\n                clustered_topics,\n                int(Z[index][0]),\n                child_left_name,\n                int(Z[index][1]),\n                child_right_name,\n            ]\n\n        hier_topics[\"Distance\"] = Z[:, 2]\n        hier_topics = hier_topics.sort_values(\"Parent_ID\", ascending=False)\n        hier_topics[[\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]] = hier_topics[\n            [\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]\n        ].astype(str)\n\n        return hier_topics\n\n    def approximate_distribution(\n        self,\n        documents: Union[str, List[str]],\n        window: int = 4,\n        stride: int = 1,\n        min_similarity: float = 0.1,\n        batch_size: int = 1000,\n        padding: bool = False,\n        use_embedding_model: bool = False,\n        calculate_tokens: bool = False,\n        separator: str = \" \",\n    ) -> Tuple[np.ndarray, Union[List[np.ndarray], None]]:\n        \"\"\"A post-hoc approximation of topic distributions across documents.\n\n        In order to perform this approximation, each document is split into tokens\n        according to the provided tokenizer in the `CountVectorizer`. Then, a\n        sliding window is applied on each document creating subsets of the document.\n        For example, with a window size of 3 and stride of 1, the sentence:\n\n        `Solving the right problem is difficult.`\n\n        can be split up into `solving the right`, `the right problem`, `right problem is`,\n        and `problem is difficult`. These are called tokensets. For each of these\n        tokensets, we calculate their c-TF-IDF representation and find out\n        how similar they are to the previously generated topics. Then, the\n        similarities to the topics for each tokenset are summed up in order to\n        create a topic distribution for the entire document.\n\n        We can also dive into this a bit deeper by then splitting these tokensets\n        up into individual tokens and calculate how much a word, in a specific sentence,\n        contributes to the topics found in that document. This can be enabled by\n        setting `calculate_tokens=True` which can be used for visualization purposes\n        in `topic_model.visualize_approximate_distribution`.\n\n        The main output, `topic_distributions`, can also be used directly in\n        `.visualize_distribution(topic_distributions[index])` by simply selecting\n        a single distribution.\n\n        Arguments:\n            documents: A single document or a list of documents for which we\n                       approximate their topic distributions\n            window: Size of the moving window which indicates the number of\n                    tokens being considered.\n            stride: How far the window should move at each step.\n            min_similarity: The minimum similarity of a document's tokenset\n                            with respect to the topics.\n            batch_size: The number of documents to process at a time. 
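To make the sliding-window behaviour concrete, here is a small standalone sketch (not the internal implementation) of how the tokensets above are produced with `window=3` and `stride=1`:

```python
tokens = "solving the right problem is difficult".split()
window, stride = 3, 1

# Keep only windows of the full size, mirroring the default (no padding)
token_sets = [
    tokens[i : i + window]
    for i in range(0, len(tokens), stride)
    if len(tokens[i : i + window]) == window
]
# [['solving', 'the', 'right'], ['the', 'right', 'problem'],
#  ['right', 'problem', 'is'], ['problem', 'is', 'difficult']]
```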
If None,\n                        then all documents are processed at once.\n                        NOTE: With a large number of documents, it is not\n                        advised to process all documents at once.\n            padding: Whether to pad the beginning and ending of a document with\n                     empty tokens.\n            use_embedding_model: Whether to use the topic model's embedding\n                                 model to calculate the similarity between\n                                 tokensets and topics instead of using c-TF-IDF.\n            calculate_tokens: Calculate the similarity of tokens with all topics.\n                              NOTE: This is computation-wise more expensive and\n                              can require more memory. Using this over batches of\n                              documents might be preferred.\n            separator: The separator used to merge tokens into tokensets.\n\n        Returns:\n            topic_distributions: A `n` x `m` matrix containing the topic distributions\n                                 for all input documents with `n` being the documents\n                                 and `m` the topics.\n            topic_token_distributions: A list of `t` x `m` arrays with `t` being the\n                                       number of tokens for the respective document\n                                       and `m` the topics.\n\n        Examples:\n        After fitting the model, the topic distributions can be calculated regardless\n        of the clustering model and regardless of whether the documents were previously\n        seen or not:\n\n        ```python\n        topic_distr, _ = topic_model.approximate_distribution(docs)\n        ```\n\n        As a result, the topic distributions are calculated in `topic_distr` for the\n        entire document based on a token set with a specific window size and stride.\n\n        If you want to calculate the topic distributions on a token-level:\n\n        ```python\n        topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n        ```\n\n        The `topic_token_distr` then contains, for each token, the best fitting topics.\n        As with `topic_distr`, it can contain multiple topics for a single token.\n        \"\"\"\n        if isinstance(documents, str):\n            documents = [documents]\n\n        if batch_size is None:\n            batch_size = len(documents)\n            batches = 1\n        else:\n            batches = math.ceil(len(documents) / batch_size)\n\n        topic_distributions = []\n        topic_token_distributions = []\n\n        for i in tqdm(range(batches), disable=not self.verbose):\n            doc_set = documents[i * batch_size : (i + 1) * batch_size]\n\n            # Extract tokens\n            analyzer = self.vectorizer_model.build_tokenizer()\n            tokens = [analyzer(document) for document in doc_set]\n\n            # Extract token sets\n            all_sentences = []\n            all_indices = [0]\n            all_token_sets_ids = []\n\n            for tokenset in tokens:\n                if len(tokenset) < window:\n                    token_sets = [tokenset]\n                    token_sets_ids = [list(range(len(tokenset)))]\n                else:\n                    # Extract tokensets using window and stride parameters\n                    stride_indices = list(range(len(tokenset)))[::stride]\n                    token_sets = []\n                    token_sets_ids = 
[]\n                    for stride_index in stride_indices:\n                        selected_tokens = tokenset[stride_index : stride_index + window]\n\n                        if padding or len(selected_tokens) == window:\n                            token_sets.append(selected_tokens)\n                            token_sets_ids.append(\n                                list(\n                                    range(\n                                        stride_index,\n                                        stride_index + len(selected_tokens),\n                                    )\n                                )\n                            )\n\n                    # Add empty tokens at the beginning and end of a document\n                    if padding:\n                        padded = []\n                        padded_ids = []\n                        t = math.ceil(window / stride) - 1\n                        for i in range(math.ceil(window / stride) - 1):\n                            padded.append(tokenset[: window - ((t - i) * stride)])\n                            padded_ids.append(list(range(0, window - ((t - i) * stride))))\n\n                        token_sets = padded + token_sets\n                        token_sets_ids = padded_ids + token_sets_ids\n\n                # Join the tokens\n                sentences = [separator.join(token) for token in token_sets]\n                all_sentences.extend(sentences)\n                all_token_sets_ids.extend(token_sets_ids)\n                all_indices.append(all_indices[-1] + len(sentences))\n\n            # Calculate similarity between embeddings of token sets and the topics\n            if use_embedding_model:\n                embeddings = self._extract_embeddings(all_sentences, method=\"document\", verbose=True)\n                similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers :])\n\n            # Calculate similarity between c-TF-IDF of token sets and the topics\n            else:\n                bow_doc = self.vectorizer_model.transform(all_sentences)\n                c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n                similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :])\n\n            # Only keep similarities that exceed the minimum\n            similarity[similarity < min_similarity] = 0\n\n            # Aggregate results on an individual token level\n            if calculate_tokens:\n                topic_distribution = []\n                topic_token_distribution = []\n                for index, token in enumerate(tokens):\n                    start = all_indices[index]\n                    end = all_indices[index + 1]\n\n                    if start == end:\n                        end = end + 1\n\n                    # Assign topics to individual tokens\n                    token_id = [i for i in range(len(token))]\n                    token_val = {index: [] for index in token_id}\n                    for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]):\n                        for token in token_set:\n                            if token in token_val:\n                                token_val[token].append(sim)\n\n                    matrix = []\n                    for _, value in token_val.items():\n                        matrix.append(np.add.reduce(value))\n\n                    # Take empty documents into account\n                    matrix = np.array(matrix)\n                    if len(matrix.shape) == 
1:\n                        matrix = np.zeros((1, len(self.topic_labels_) - self._outliers))\n\n                    topic_token_distribution.append(np.array(matrix))\n                    topic_distribution.append(np.add.reduce(matrix))\n\n                topic_distribution = normalize(topic_distribution, norm=\"l1\", axis=1)\n\n            # Aggregate on a tokenset level indicated by the window and stride\n            else:\n                topic_distribution = []\n                for index in range(len(all_indices) - 1):\n                    start = all_indices[index]\n                    end = all_indices[index + 1]\n\n                    if start == end:\n                        end = end + 1\n                    group = similarity[start:end].sum(axis=0)\n                    topic_distribution.append(group)\n                topic_distribution = normalize(np.array(topic_distribution), norm=\"l1\", axis=1)\n                topic_token_distribution = None\n\n            # Combine results\n            topic_distributions.append(topic_distribution)\n            if topic_token_distribution is None:\n                topic_token_distributions = None\n            else:\n                topic_token_distributions.extend(topic_token_distribution)\n\n        topic_distributions = np.vstack(topic_distributions)\n\n        return topic_distributions, topic_token_distributions\n\n    def find_topics(self, search_term: str = None, image: str = None, top_n: int = 5) -> Tuple[List[int], List[float]]:\n        \"\"\"Find topics most similar to a search_term.\n\n        Creates an embedding for a search query and compares that with\n        the topic embeddings. The most similar topics are returned\n        along with their similarity values.\n\n        The query is specified using search_term for text queries or image for image queries.\n\n        The search_term can be of any size but since it is compared\n        with the topic representation it is advised to keep it\n        below 5 words.\n\n        Arguments:\n            search_term: the term you want to use to search for topics.\n            image: path to the image you want to use to search for topics.\n            top_n: the number of topics to return\n\n        Returns:\n            similar_topics: the most similar topics from high to low\n            similarity: the similarity scores from high to low\n\n        Examples:\n        You can use the underlying embedding model to find topics that\n        best represent the search term:\n\n        ```python\n        topics, similarity = topic_model.find_topics(\"sports\", top_n=5)\n        ```\n\n        Note that the search query is typically more accurate if the\n        search_term consists of a phrase or multiple words.\n        \"\"\"\n        if self.embedding_model is None:\n            raise Exception(\"This method can only be used if you did not use custom embeddings.\")\n\n        topic_list = list(self.topic_representations_.keys())\n        topic_list.sort()\n\n        # Extract search_term embeddings and compare with topic embeddings\n        if search_term is not None:\n            search_embedding = self._extract_embeddings([search_term], method=\"word\", verbose=False).flatten()\n        elif image is not None:\n            search_embedding = self._extract_embeddings(\n                [None], images=[image], method=\"document\", verbose=False\n            ).flatten()\n        sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten()\n\n        # 
Extract topics most similar to search_term\n        ids = np.argsort(sims)[-top_n:]\n        similarity = [sims[i] for i in ids][::-1]\n        similar_topics = [topic_list[index] for index in ids][::-1]\n\n        return similar_topics, similarity\n\n    def update_topics(\n        self,\n        docs: List[str],\n        images: List[str] = None,\n        topics: List[int] = None,\n        top_n_words: int = 10,\n        n_gram_range: Tuple[int, int] = None,\n        vectorizer_model: CountVectorizer = None,\n        ctfidf_model: ClassTfidfTransformer = None,\n        representation_model: BaseRepresentation = None,\n    ):\n        \"\"\"Updates the topic representation by recalculating c-TF-IDF with the new\n        parameters as defined in this function.\n\n        When you have trained a model and viewed the topics and the words that represent them,\n        you might not be satisfied with the representation. Perhaps you forgot to remove\n        stop_words or you want to try out a different n_gram_range. This function allows you\n        to update the topic representation after they have been formed.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            images: The images you used when calling either `fit` or `fit_transform`\n            topics: A list of topics where each topic is related to a document in `docs`.\n                    Use this variable to change or map the topics.\n                    NOTE: Using a custom list of topic assignments may lead to errors if\n                          topic reduction techniques are used afterwards. Make sure that\n                          manually assigning topics is the last step in the pipeline\n            top_n_words: The number of words per topic to extract. Setting this\n                         too high can negatively impact topic embeddings as topics\n                         are typically best represented by at most 10 words.\n            n_gram_range: The n-gram range for the CountVectorizer.\n            vectorizer_model: Pass in your own CountVectorizer from scikit-learn\n            ctfidf_model: Pass in your own c-TF-IDF model to update the representations\n            representation_model: Pass in a model that fine-tunes the topic representations\n                                  calculated through c-TF-IDF. Models from `bertopic.representation`\n                                  are supported.\n\n        Examples:\n        In order to update the topic representation, you will need to first fit the topic\n        model and extract topics from them. 
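Note that besides a different n-gram range or vectorizer (shown below), a representation model from `bertopic.representation` can also be passed to fine-tune the recalculated c-TF-IDF representations. A brief sketch, assuming a fitted `topic_model` and the original `docs`:

```python
from bertopic.representation import KeyBERTInspired

# Re-rank the c-TF-IDF keywords of each topic with a KeyBERT-inspired model
representation_model = KeyBERTInspired()
topic_model.update_topics(docs, representation_model=representation_model)
```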
Based on these, you can update the representation:\n\n        ```python\n        topic_model.update_topics(docs, n_gram_range=(2, 3))\n        ```\n\n        You can also use a custom vectorizer to update the representation:\n\n        ```python\n        from sklearn.feature_extraction.text import CountVectorizer\n        vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\")\n        topic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n        ```\n\n        You can also use this function to change or map the topics to something else.\n        You can update them as follows:\n\n        ```python\n        topic_model.update_topics(docs, my_updated_topics)\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        check_is_fitted(self)\n        if not n_gram_range:\n            n_gram_range = self.n_gram_range\n\n        if top_n_words > 100:\n            logger.warning(\n                \"Note that extracting more than 100 words from a sparse \" \"can slow down computation quite a bit.\"\n            )\n        self.top_n_words = top_n_words\n        self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)\n        self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n        self.representation_model = representation_model\n\n        if topics is None:\n            topics = self.topics_\n        else:\n            logger.warning(\n                \"Using a custom list of topic assignments may lead to errors if \"\n                \"topic reduction techniques are used afterwards. Make sure that \"\n                \"manually assigning topics is the last step in the pipeline.\"\n                \"Note that topic embeddings will also be created through weighted\"\n                \"c-TF-IDF embeddings instead of centroid embeddings.\"\n            )\n\n        documents = pd.DataFrame({\"Document\": docs, \"Topic\": topics, \"ID\": range(len(docs)), \"Image\": images})\n        documents_per_topic = documents.groupby([\"Topic\"], as_index=False).agg({\"Document\": \" \".join})\n\n        # Update topic sizes and assignments\n        self._update_topic_size(documents)\n\n        # Extract words and update topic labels\n        self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)\n        self.topic_representations_ = self._extract_words_per_topic(words, documents)\n\n        # Update topic vectors\n        if set(topics) != self.topics_:\n            # Remove outlier topic embedding if all that has changed is the outlier class\n            same_position = all(\n                [\n                    True if old_topic == new_topic else False\n                    for old_topic, new_topic in zip(self.topics_, topics)\n                    if old_topic != -1\n                ]\n            )\n            if same_position and -1 not in topics and -1 in self.topics_:\n                self.topic_embeddings_ = self.topic_embeddings_[1:]\n            else:\n                self._create_topic_vectors()\n\n    def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:\n        \"\"\"Return topics with top n words and their c-TF-IDF score.\n\n        Arguments:\n            full: If True, returns all different forms of topic representations\n                  for each topic, including aspects\n\n        Returns:\n            self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score\n\n        Examples:\n        ```python\n        all_topics = 
topic_model.get_topics()\n        ```\n        \"\"\"\n        check_is_fitted(self)\n\n        if full:\n            topic_representations = {\"Main\": self.topic_representations_}\n            topic_representations.update(self.topic_aspects_)\n            return topic_representations\n        else:\n            return self.topic_representations_\n\n    def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]:\n        \"\"\"Return top n words for a specific topic and their c-TF-IDF scores.\n\n        Arguments:\n            topic: A specific topic for which you want its representation\n            full: If True, returns all different forms of topic representations\n                  for a topic, including aspects\n\n        Returns:\n            The top n words for a specific word and its respective c-TF-IDF scores\n\n        Examples:\n        ```python\n        topic = topic_model.get_topic(12)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        if topic in self.topic_representations_:\n            if full:\n                representations = {\"Main\": self.topic_representations_[topic]}\n                aspects = {aspect: representations[topic] for aspect, representations in self.topic_aspects_.items()}\n                representations.update(aspects)\n                return representations\n            else:\n                return self.topic_representations_[topic]\n        else:\n            return False\n\n    def get_topic_info(self, topic: int = None) -> pd.DataFrame:\n        \"\"\"Get information about each topic including its ID, frequency, and name.\n\n        Arguments:\n            topic: A specific topic for which you want the frequency\n\n        Returns:\n            info: The information relating to either a single topic or all topics\n\n        Examples:\n        ```python\n        info_df = topic_model.get_topic_info()\n        ```\n        \"\"\"\n        check_is_fitted(self)\n\n        info = pd.DataFrame(self.topic_sizes_.items(), columns=[\"Topic\", \"Count\"]).sort_values(\"Topic\")\n        info[\"Name\"] = info.Topic.map(self.topic_labels_)\n\n        # Custom label\n        if self.custom_labels_ is not None:\n            if len(self.custom_labels_) == len(info):\n                labels = {topic - self._outliers: label for topic, label in enumerate(self.custom_labels_)}\n                info[\"CustomName\"] = info[\"Topic\"].map(labels)\n\n        # Main Keywords\n        values = {topic: list(list(zip(*values))[0]) for topic, values in self.topic_representations_.items()}\n        info[\"Representation\"] = info[\"Topic\"].map(values)\n\n        # Extract all topic aspects\n        if self.topic_aspects_:\n            for aspect, values in self.topic_aspects_.items():\n                if isinstance(list(values.values())[-1], list):\n                    if isinstance(list(values.values())[-1][0], tuple) or isinstance(\n                        list(values.values())[-1][0], list\n                    ):\n                        values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()}\n                    elif isinstance(list(values.values())[-1][0], str):\n                        values = {topic: \" \".join(value).strip() for topic, value in values.items()}\n                info[aspect] = info[\"Topic\"].map(values)\n\n        # Representative Docs / Images\n        if self.representative_docs_ is not None:\n            info[\"Representative_Docs\"] = 
info[\"Topic\"].map(self.representative_docs_)\n        if self.representative_images_ is not None:\n            info[\"Representative_Images\"] = info[\"Topic\"].map(self.representative_images_)\n\n        # Select specific topic to return\n        if topic is not None:\n            info = info.loc[info.Topic == topic, :]\n\n        return info.reset_index(drop=True)\n\n    def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:\n        \"\"\"Return the size of topics (descending order).\n\n        Arguments:\n            topic: A specific topic for which you want the frequency\n\n        Returns:\n            Either the frequency of a single topic or dataframe with\n            the frequencies of all topics\n\n        Examples:\n        To extract the frequency of all topics:\n\n        ```python\n        frequency = topic_model.get_topic_freq()\n        ```\n\n        To get the frequency of a single topic:\n\n        ```python\n        frequency = topic_model.get_topic_freq(12)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        if isinstance(topic, int):\n            return self.topic_sizes_[topic]\n        else:\n            return pd.DataFrame(self.topic_sizes_.items(), columns=[\"Topic\", \"Count\"]).sort_values(\n                \"Count\", ascending=False\n            )\n\n    def get_document_info(\n        self,\n        docs: List[str],\n        df: pd.DataFrame = None,\n        metadata: Mapping[str, Any] = None,\n    ) -> pd.DataFrame:\n        \"\"\"Get information about the documents on which the topic was trained\n        including the documents themselves, their respective topics, the name\n        of each topic, the top n words of each topic, whether it is a\n        representative document, and probability of the clustering if the cluster\n        model supports it.\n\n        There are also options to include other meta data, such as the topic\n        distributions or the x and y coordinates of the reduced embeddings.\n\n        Arguments:\n            docs: The documents on which the topic model was trained.\n            df: A dataframe containing the metadata and the documents on which\n                the topic model was originally trained on.\n            metadata: A dictionary with meta data for each document in the form\n                      of column name (key) and the respective values (value).\n\n        Returns:\n            document_info: A dataframe with several statistics regarding\n                           the documents on which the topic model was trained.\n\n        Usage:\n\n        To get the document info, you will only need to pass the documents on which\n        the topic model was trained:\n\n        ```python\n        document_info = topic_model.get_document_info(docs)\n        ```\n\n        There are additionally options to include meta data, such as the topic\n        distributions. 
Moreover, we can pass the original dataframe that contains\n        the documents and extend it with the information retrieved from BERTopic:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n\n        # The original data in a dataframe format to include the target variable\n        data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\n        df = pd.DataFrame({\"Document\": data['data'], \"Class\": data['target']})\n\n        # Add information about the percentage of the document that relates to the topic\n        topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)\n        distributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)]\n\n        # Create our documents dataframe using the original dataframe and meta data about\n        # the topic distributions\n        document_info = topic_model.get_document_info(docs, df=df,\n                                                      metadata={\"Topic_distribution\": distributions})\n        ```\n        \"\"\"\n        check_documents_type(docs)\n        if df is not None:\n            document_info = df.copy()\n            document_info[\"Document\"] = docs\n            document_info[\"Topic\"] = self.topics_\n        else:\n            document_info = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_})\n\n        # Add topic info through `.get_topic_info()`\n        topic_info = self.get_topic_info().drop(\"Count\", axis=1)\n        document_info = pd.merge(document_info, topic_info, on=\"Topic\", how=\"left\")\n\n        # Add top n words\n        top_n_words = {topic: \" - \".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)}\n        document_info[\"Top_n_words\"] = document_info.Topic.map(top_n_words)\n\n        # Add flat probabilities\n        if self.probabilities_ is not None:\n            if len(self.probabilities_.shape) == 1:\n                document_info[\"Probability\"] = self.probabilities_\n            else:\n                document_info[\"Probability\"] = [\n                    max(probs) if topic != -1 else 1 - sum(probs)\n                    for topic, probs in zip(self.topics_, self.probabilities_)\n                ]\n\n        # Add representative document labels\n        repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs]\n        document_info[\"Representative_document\"] = False\n        document_info.loc[document_info.Document.isin(repr_docs), \"Representative_document\"] = True\n\n        # Add custom meta data provided by the user\n        if metadata is not None:\n            for column, values in metadata.items():\n                document_info[column] = values\n        return document_info\n\n    def get_representative_docs(self, topic: int = None) -> List[str]:\n        \"\"\"Extract the best representing documents per topic.\n\n        Note:\n            This does not extract all documents per topic as all documents\n            are not saved within BERTopic. 
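As a related aside, representative documents can also be identified from the output of `.get_document_info`, which flags them per document. A hedged sketch, where topic `5` is just an example ID:

```python
document_info = topic_model.get_document_info(docs)

# Keep only the documents marked as representative for topic 5
reps_topic_5 = document_info[
    (document_info.Topic == 5) & (document_info.Representative_document)
]
```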
To get all documents, please\n            run the following:\n\n            ```python\n            # When you used `.fit_transform`:\n            df = pd.DataFrame({\"Document\": docs, \"Topic\": topic})\n\n            # When you used `.fit`:\n            df = pd.DataFrame({\"Document\": docs, \"Topic\": topic_model.topics_})\n            ```\n\n        Arguments:\n            topic: A specific topic for which you want\n                   the representative documents\n\n        Returns:\n            Representative documents of the chosen topic\n\n        Examples:\n        To extract the representative docs of all topics:\n\n        ```python\n        representative_docs = topic_model.get_representative_docs()\n        ```\n\n        To get the representative docs of a single topic:\n\n        ```python\n        representative_docs = topic_model.get_representative_docs(12)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        if isinstance(topic, int):\n            if self.representative_docs_.get(topic):\n                return self.representative_docs_[topic]\n            else:\n                return None\n        else:\n            return self.representative_docs_\n\n    @staticmethod\n    def get_topic_tree(\n        hier_topics: pd.DataFrame,\n        max_distance: float = None,\n        tight_layout: bool = False,\n    ) -> str:\n        \"\"\"Extract the topic tree such that it can be printed.\n\n        Arguments:\n            hier_topics: A dataframe containing the structure of the topic tree.\n                         This is the output of `topic_model.hierarchical_topics()`\n            max_distance: The maximum distance between two topics. This value is\n                          based on the Distance column in `hier_topics`.\n            tight_layout: Whether to use a tight layout (narrow width) for\n                          easier readability if you have hundreds of topics.\n\n        Returns:\n            A tree that has the following structure when printed:\n                .\n                .\n                \u2514\u2500health_medical_disease_patients_hiv\n                    \u251c\u2500patients_medical_disease_candida_health\n                    \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n                    \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n                    \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n                    \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n                    \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n\n            The blocks (\u25a0) indicate that the topic is one you can directly access\n            from `topic_model.get_topic`. 
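The printed tree can also be pruned and compacted through `max_distance` and `tight_layout`. A sketch with hypothetical values:

```python
# Both values are only examples; max_distance is compared against the
# Distance column of `hierarchical_topics`, tight_layout narrows the tree
tree = topic_model.get_topic_tree(hierarchical_topics,
                                  max_distance=1.0,
                                  tight_layout=True)
print(tree)
```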
In other words, they are the original un-grouped topics.\n\n        Examples:\n        ```python\n        # Train model\n        from bertopic import BERTopic\n        topic_model = BERTopic()\n        topics, probs = topic_model.fit_transform(docs)\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n        # Print topic tree\n        tree = topic_model.get_topic_tree(hierarchical_topics)\n        print(tree)\n        ```\n        \"\"\"\n        width = 1 if tight_layout else 4\n        if max_distance is None:\n            max_distance = hier_topics.Distance.max() + 1\n\n        max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1\n\n        # Extract mapping from ID to name\n        topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name))\n        topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name)))\n        topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()}\n\n        # Create tree\n        tree = {\n            str(row[1].Parent_ID): [\n                str(row[1].Child_Left_ID),\n                str(row[1].Child_Right_ID),\n            ]\n            for row in hier_topics.iterrows()\n        }\n\n        def get_tree(start, tree):\n            \"\"\"Based on: https://stackoverflow.com/a/51920869/10532563.\"\"\"\n\n            def _tree(to_print, start, parent, tree, grandpa=None, indent=\"\"):\n                # Get distance between merged topics\n                distance = hier_topics.loc[\n                    (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),\n                    \"Distance\",\n                ]\n                distance = distance.values[0] if len(distance) > 0 else 10\n\n                if parent != start:\n                    if grandpa is None:\n                        to_print += topic_to_name[parent]\n                    else:\n                        if int(parent) <= max_original_topic:\n                            # Do not append topic ID if they are not merged\n                            if distance < max_distance:\n                                to_print += \"\u25a0\u2500\u2500\" + topic_to_name[parent] + f\" \u2500\u2500 Topic: {parent}\" + \"\\n\"\n                            else:\n                                to_print += \"O \\n\"\n                        else:\n                            to_print += topic_to_name[parent] + \"\\n\"\n\n                if parent not in tree:\n                    return to_print\n\n                for child in tree[parent][:-1]:\n                    to_print += indent + \"\u251c\" + \"\u2500\"\n                    to_print = _tree(to_print, start, child, tree, parent, indent + \"\u2502\" + \" \" * width)\n\n                child = tree[parent][-1]\n                to_print += indent + \"\u2514\" + \"\u2500\"\n                to_print = _tree(to_print, start, child, tree, parent, indent + \" \" * (width + 1))\n\n                return to_print\n\n            to_print = \".\" + \"\\n\"\n            to_print = _tree(to_print, start, start, tree)\n            return to_print\n\n        start = str(hier_topics.Parent_ID.astype(int).max())\n        return get_tree(start, tree)\n\n    def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None:\n        \"\"\"Set custom topic labels in your fitted BERTopic model.\n\n        Arguments:\n            topic_labels: If a list of topic labels, it should contain the same number\n                          
of labels as there are topics. This must be ordered\n                          from the topic with the lowest ID to the highest ID,\n                          including topic -1 if it exists.\n                          If a dictionary of `topic ID`: `topic_label`, it can have\n                          any number of topics as it will only map the topics found\n                          in the dictionary.\n\n        Examples:\n        First, we define our topic labels with `.generate_topic_labels` in which\n        we can customize our topic labels:\n\n        ```python\n        topic_labels = topic_model.generate_topic_labels(nr_words=2,\n                                                    topic_prefix=True,\n                                                    word_length=10,\n                                                    separator=\", \")\n        ```\n\n        Then, we pass these `topic_labels` to our topic model which\n        can be accessed at any time with `.custom_labels_`:\n\n        ```python\n        topic_model.set_topic_labels(topic_labels)\n        topic_model.custom_labels_\n        ```\n\n        You might want to change only a few topic labels instead of all of them.\n        To do so, you can pass a dictionary where the keys are the topic IDs and\n        its keys the topic labels:\n\n        ```python\n        topic_model.set_topic_labels({0: \"Space\", 1: \"Sports\", 2: \"Medicine\"})\n        topic_model.custom_labels_\n        ```\n        \"\"\"\n        unique_topics = sorted(set(self.topics_))\n\n        if isinstance(topic_labels, dict):\n            if self.custom_labels_ is not None:\n                original_labels = {topic: label for topic, label in zip(unique_topics, self.custom_labels_)}\n            else:\n                info = self.get_topic_info()\n                original_labels = dict(zip(info.Topic, info.Name))\n            custom_labels = [\n                topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic]\n                for topic in unique_topics\n            ]\n\n        elif isinstance(topic_labels, list):\n            if len(topic_labels) == len(unique_topics):\n                custom_labels = topic_labels\n            else:\n                raise ValueError(\n                    \"Make sure that `topic_labels` contains the same number \" \"of labels as there are topics.\"\n                )\n\n        self.custom_labels_ = custom_labels\n\n    def generate_topic_labels(\n        self,\n        nr_words: int = 3,\n        topic_prefix: bool = True,\n        word_length: int = None,\n        separator: str = \"_\",\n        aspect: str = None,\n    ) -> List[str]:\n        \"\"\"Get labels for each topic in a user-defined format.\n\n        Arguments:\n            nr_words: Top `n` words per topic to use\n            topic_prefix: Whether to use the topic ID as a prefix.\n                          If set to True, the topic ID will be separated\n                          using the `separator`\n            word_length: The maximum length of each word in the topic label.\n                         Some words might be relatively long and setting this\n                         value helps to make sure that all labels have relatively\n                         similar lengths.\n            separator: The string with which the words and topic prefix will be\n                       separated. 
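Beyond the basic example below, the topic-ID prefix and maximum word length can also be adjusted. A sketch with hypothetical settings:

```python
# Drop the topic ID prefix and truncate each word to at most 10 characters
topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                                 topic_prefix=False,
                                                 word_length=10,
                                                 separator=", ")
```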
Underscores are the default but a nice alternative\n                       is `\", \"`.\n            aspect: The aspect from which to generate topic labels\n\n        Returns:\n            topic_labels: A list of topic labels sorted from the lowest topic ID to the highest.\n                          If the topic model was trained using HDBSCAN, the lowest topic ID is -1,\n                          otherwise it is 0.\n\n        Examples:\n        To create our custom topic labels, usage is rather straightforward:\n\n        ```python\n        topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=\", \")\n        ```\n        \"\"\"\n        unique_topics = sorted(set(self.topics_))\n\n        topic_labels = []\n        for topic in unique_topics:\n            if aspect:\n                words, _ = zip(*self.topic_aspects_[aspect][topic])\n            else:\n                words, _ = zip(*self.get_topic(topic))\n\n            if word_length:\n                words = [word[:word_length] for word in words][:nr_words]\n            else:\n                words = list(words)[:nr_words]\n\n            if topic_prefix:\n                topic_label = f\"{topic}{separator}\" + separator.join(words)\n            else:\n                topic_label = separator.join(words)\n\n            topic_labels.append(topic_label)\n\n        return topic_labels\n\n    def merge_topics(\n        self,\n        docs: List[str],\n        topics_to_merge: List[Union[Iterable[int], int]],\n        images: List[str] = None,\n    ) -> None:\n        \"\"\"Merge two or more topics into a single topic.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            topics_to_merge: Either a list of topics or a list of lists of topics\n                             to merge. 
For example:\n                                [1, 2, 3] will merge topics 1, 2 and 3\n                                [[1, 2], [3, 4]] will merge topics 1 and 2, and\n                                separately merge topics 3 and 4.\n            images: A list of paths to the images used when calling either\n                    `fit` or `fit_transform`.\n\n        Examples:\n        If you want to merge topics 1, 2, and 3:\n\n        ```python\n        topics_to_merge = [1, 2, 3]\n        topic_model.merge_topics(docs, topics_to_merge)\n        ```\n\n        or if you want to merge topics 1 and 2, and separately\n        merge topics 3 and 4:\n\n        ```python\n        topics_to_merge = [[1, 2],\n                            [3, 4]]\n        topic_model.merge_topics(docs, topics_to_merge)\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        documents = pd.DataFrame(\n            {\n                \"Document\": docs,\n                \"Topic\": self.topics_,\n                \"Image\": images,\n                \"ID\": range(len(docs)),\n            }\n        )\n\n        mapping = {topic: topic for topic in set(self.topics_)}\n        if isinstance(topics_to_merge[0], int):\n            for topic in sorted(topics_to_merge):\n                mapping[topic] = topics_to_merge[0]\n        elif isinstance(topics_to_merge[0], Iterable):\n            for topic_group in sorted(topics_to_merge):\n                for topic in topic_group:\n                    mapping[topic] = topic_group[0]\n        else:\n            raise ValueError(\n                \"Make sure that `topics_to_merge` is either \" \"a list of topics or a list of lists of topics.\"\n            )\n\n        # Track mappings and sizes of topics for merging topic embeddings\n        mappings = defaultdict(list)\n        for key, val in sorted(mapping.items()):\n            mappings[val].append(key)\n        mappings = {\n            topic_to: {\n                \"topics_from\": topics_from,\n                \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_from],\n            }\n            for topic_to, topics_from in mappings.items()\n        }\n\n        # Update topics\n        documents.Topic = documents.Topic.map(mapping)\n        self.topic_mapper_.add_mappings(mapping)\n        documents = self._sort_mappings_by_frequency(documents)\n        self._extract_topics(documents, mappings=mappings)\n        self._update_topic_size(documents)\n        self._save_representative_docs(documents)\n        self.probabilities_ = self._map_probabilities(self.probabilities_)\n\n    def reduce_topics(\n        self,\n        docs: List[str],\n        nr_topics: Union[int, str] = 20,\n        images: List[str] = None,\n        use_ctfidf: bool = False,\n    ) -> None:\n        \"\"\"Reduce the number of topics to a fixed number of topics\n        or automatically.\n\n        If nr_topics is an integer, then the number of topics is reduced\n        to nr_topics using `AgglomerativeClustering` on the cosine distance matrix\n        of the topic c-TF-IDF or semantic embeddings.\n\n        If nr_topics is `\"auto\"`, then HDBSCAN is used to automatically\n        reduce the number of topics by running it on the topic embeddings.\n\n        The topics, their sizes, and representations are updated.\n\n        Arguments:\n            docs: The docs you used when calling either `fit` or `fit_transform`\n            nr_topics: The number of topics you want reduced to\n            images: 
A list of paths to the images used when calling either\n                    `fit` or `fit_transform`\n            use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                        embeddings from the embedding model are used.\n\n        Updates:\n            topics_ : Assigns topics to their merged representations.\n            probabilities_ : Assigns probabilities to their merged representations.\n\n        Examples:\n        You can further reduce the topics by passing the documents with their\n        topics and probabilities (if they were calculated):\n\n        ```python\n        topic_model.reduce_topics(docs, nr_topics=30)\n        ```\n\n        You can then access the updated topics and probabilities with:\n\n        ```python\n        topics = topic_model.topics_\n        probabilities = topic_model.probabilities_\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n\n        self.nr_topics = nr_topics\n        documents = pd.DataFrame(\n            {\n                \"Document\": docs,\n                \"Topic\": self.topics_,\n                \"Image\": images,\n                \"ID\": range(len(docs)),\n            }\n        )\n\n        # Reduce number of topics\n        documents = self._reduce_topics(documents, use_ctfidf)\n        self._merged_topics = None\n        self._save_representative_docs(documents)\n        self.probabilities_ = self._map_probabilities(self.probabilities_)\n\n        return self\n\n    def reduce_outliers(\n        self,\n        documents: List[str],\n        topics: List[int],\n        images: List[str] = None,\n        strategy: str = \"distributions\",\n        probabilities: np.ndarray = None,\n        threshold: float = 0,\n        embeddings: np.ndarray = None,\n        distributions_params: Mapping[str, Any] = {},\n    ) -> List[int]:\n        \"\"\"Reduce outliers by merging them with their nearest topic according\n        to one of several strategies.\n\n        When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created\n        that do not fall within any of the created topics. These are labeled as -1.\n        This function allows the user to match outlier documents with their nearest topic\n        using one of the following strategies using the `strategy` parameter:\n            * \"probabilities\"\n                This uses the soft-clustering as performed by HDBSCAN to find the\n                best matching topic for each outlier document. To use this, make\n                sure to calculate the `probabilities` beforehand by instantiating\n                BERTopic with `calculate_probabilities=True`.\n            * \"distributions\"\n                Use the topic distributions, as calculated with `.approximate_distribution`\n                to find the most frequent topic in each outlier document. 
You can use the\n                `distributions_params` variable to tweak the parameters of\n                `.approximate_distribution`.\n            * \"c-tf-idf\"\n                Calculate the c-TF-IDF representation for each outlier document and\n                find the best matching c-TF-IDF topic representation using\n                cosine similarity.\n            * \"embeddings\"\n                Using the embeddings of each outlier documents, find the best\n                matching topic embedding using cosine similarity.\n\n        Arguments:\n            documents: A list of documents for which we reduce or remove the outliers.\n            topics: The topics that correspond to the documents\n            images: A list of paths to the images used when calling either\n                    `fit` or `fit_transform`\n            strategy: The strategy used for reducing outliers.\n                    Options:\n                        * \"probabilities\"\n                            This uses the soft-clustering as performed by HDBSCAN\n                            to find the best matching topic for each outlier document.\n\n                        * \"distributions\"\n                            Use the topic distributions, as calculated with `.approximate_distribution`\n                            to find the most frequent topic in each outlier document.\n\n                        * \"c-tf-idf\"\n                            Calculate the c-TF-IDF representation for outlier documents and\n                            find the best matching c-TF-IDF topic representation.\n\n                        * \"embeddings\"\n                            Calculate the embeddings for outlier documents and\n                            find the best matching topic embedding.\n            probabilities: Probabilities generated by HDBSCAN for each document when using the strategy `\"probabilities\"`.\n            threshold: The threshold for assigning topics to outlier documents. 
This value\n                       represents the minimum probability when `strategy=\"probabilities\"`.\n                       For all other strategies, it represents the minimum similarity.\n            embeddings: The pre-computed embeddings to be used when `strategy=\"embeddings\"`.\n                        If this is None, then it will compute the embeddings for the outlier documents.\n            distributions_params: The parameters used in `.approximate_distribution` when using\n                                  the strategy `\"distributions\"`.\n\n        Returns:\n            new_topics: The updated topics\n\n        Usage:\n\n        The default settings uses the `\"distributions\"` strategy:\n\n        ```python\n        new_topics = topic_model.reduce_outliers(docs, topics)\n        ```\n\n        When you use the `\"probabilities\"` strategy, make sure to also pass the probabilities\n        as generated through HDBSCAN:\n\n        ```python\n        from bertopic import BERTopic\n        topic_model = BERTopic(calculate_probabilities=True)\n        topics, probs = topic_model.fit_transform(docs)\n\n        new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n        ```\n        \"\"\"\n        if not self._outliers:\n            raise ValueError(\"No outliers to reduce.\")\n\n        if images is not None:\n            strategy = \"embeddings\"\n\n        # Check correct use of parameters\n        if strategy.lower() == \"probabilities\" and probabilities is None:\n            raise ValueError(\"Make sure to pass in `probabilities` in order to use the probabilities strategy\")\n\n        # Reduce outliers by extracting most likely topics through the topic-term probability matrix\n        if strategy.lower() == \"probabilities\":\n            new_topics = [\n                np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic\n                for topic, prob in zip(topics, probabilities)\n            ]\n\n        # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions\n        elif strategy.lower() == \"distributions\":\n            outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n            outlier_docs = [documents[index] for index in outlier_ids]\n            topic_distr, _ = self.approximate_distribution(\n                outlier_docs, min_similarity=threshold, **distributions_params\n            )\n            outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr])\n            new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n        # Reduce outliers by finding the most similar c-TF-IDF representations\n        elif strategy.lower() == \"c-tf-idf\":\n            outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n            outlier_docs = [documents[index] for index in outlier_ids]\n\n            # Calculate c-TF-IDF of outlier documents with all topics\n            bow_doc = self.vectorizer_model.transform(outlier_docs)\n            c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n            similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :])\n\n            # Update topics\n            similarity[similarity < threshold] = 0\n            outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n            new_topics = [topic if topic != -1 else next(outlier_topics) for topic in 
topics]\n\n        # Reduce outliers by finding the most similar topic embeddings\n        elif strategy.lower() == \"embeddings\":\n            if self.embedding_model is None and embeddings is None:\n                raise ValueError(\n                    \"To use this strategy, you will need to pass a model to `embedding_model`\"\n                    \"when instantiating BERTopic.\"\n                )\n            outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n            if images is not None:\n                outlier_docs = [images[index] for index in outlier_ids]\n            else:\n                outlier_docs = [documents[index] for index in outlier_ids]\n\n            # Extract or calculate embeddings for outlier documents\n            if embeddings is not None:\n                outlier_embeddings = np.array([embeddings[index] for index in outlier_ids])\n            elif images is not None:\n                outlier_images = [images[index] for index in outlier_ids]\n                outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose)\n            else:\n                outlier_embeddings = self.embedding_model.embed_documents(outlier_docs)\n            similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers :])\n\n            # Update topics\n            similarity[similarity < threshold] = 0\n            outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n            new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n        return new_topics\n\n    def visualize_topics(\n        self,\n        topics: List[int] = None,\n        top_n_topics: int = None,\n        use_ctfidf: bool = False,\n        custom_labels: bool = False,\n        title: str = \"<b>Intertopic Distance Map</b>\",\n        width: int = 650,\n        height: int = 650,\n    ) -> go.Figure:\n        \"\"\"Visualize topics, their sizes, and their corresponding words.\n\n        This visualization is highly inspired by LDAvis, a great visualization\n        technique typically reserved for LDA.\n\n        Arguments:\n            topics: A selection of topics to visualize\n                    Not to be confused with the topics that you get from `.fit_transform`.\n                    For example, if you want to visualize only topics 1 through 5:\n                    `topics = [1, 2, 3, 4, 5]`.\n            top_n_topics: Only select the top n most frequent topics\n            use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_topics()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_topics()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_topics(\n            self,\n            topics=topics,\n            top_n_topics=top_n_topics,\n            use_ctfidf=use_ctfidf,\n            custom_labels=custom_labels,\n            title=title,\n         
   width=width,\n            height=height,\n        )\n\n    def visualize_documents(\n        self,\n        docs: List[str],\n        topics: List[int] = None,\n        embeddings: np.ndarray = None,\n        reduced_embeddings: np.ndarray = None,\n        sample: float = None,\n        hide_annotations: bool = False,\n        hide_document_hover: bool = False,\n        custom_labels: bool = False,\n        title: str = \"<b>Documents and Topics</b>\",\n        width: int = 1200,\n        height: int = 750,\n    ) -> go.Figure:\n        \"\"\"Visualize documents and their topics in 2D.\n\n        Arguments:\n            topic_model: A fitted BERTopic instance.\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            topics: A selection of topics to visualize.\n                    Not to be confused with the topics that you get from `.fit_transform`.\n                    For example, if you want to visualize only topics 1 through 5:\n                    `topics = [1, 2, 3, 4, 5]`.\n            embeddings: The embeddings of all documents in `docs`.\n            reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n            sample: The percentage of documents in each topic that you would like to keep.\n                    Value can be between 0 and 1. Setting this value to, for example,\n                    0.1 (10% of documents in each topic) makes it easier to visualize\n                    millions of documents as a subset is chosen.\n            hide_annotations: Hide the names of the traces on top of each cluster.\n            hide_document_hover: Hide the content of the documents when hovering over\n                                specific points. Helps to speed up generation of visualization.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_documents(docs)\n        ```\n\n        Do note that this re-calculates the embeddings and reduces them to 2D.\n        The advised and preferred pipeline for using this function is as follows:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n        from bertopic import BERTopic\n        from umap import UMAP\n\n        # Prepare embeddings\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n        # Train BERTopic\n        topic_model = BERTopic().fit(docs, embeddings)\n\n        # Reduce dimensionality of embeddings, this step is optional\n        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n        # Run the visualization with the original embeddings\n        topic_model.visualize_documents(docs, embeddings=embeddings)\n\n        # Or, if you have reduced the original embeddings already:\n        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = 
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n        fig.write_html(\"path/to/file.html\")\n        ```\n\n        <iframe src=\"../getting_started/visualization/documents.html\"\n        style=\"width:1000px; height: 800px; border: 0px;\"\"></iframe>\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        return plotting.visualize_documents(\n            self,\n            docs=docs,\n            topics=topics,\n            embeddings=embeddings,\n            reduced_embeddings=reduced_embeddings,\n            sample=sample,\n            hide_annotations=hide_annotations,\n            hide_document_hover=hide_document_hover,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n        )\n\n    def visualize_document_datamap(\n        self,\n        docs: List[str],\n        topics: List[int] = None,\n        embeddings: np.ndarray = None,\n        reduced_embeddings: np.ndarray = None,\n        custom_labels: Union[bool, str] = False,\n        title: str = \"Documents and Topics\",\n        sub_title: Union[str, None] = None,\n        width: int = 1200,\n        height: int = 1200,\n        **datamap_kwds,\n    ):\n        \"\"\"Visualize documents and their topics in 2D as a static plot for publication using\n        DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best\n        to use a sufficiently large `min_topic_size` or set `nr_topics` when building the model.\n\n        Arguments:\n            topic_model:  A fitted BERTopic instance.\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            topics: A selection of topics to visualize.\n            Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. Documents not in these topics will be shown as noise points.\n            embeddings:  The embeddings of all documents in `docs`.\n            reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.\n            custom_labels:  If bool, whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n                           If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n            title: Title of the plot.\n            sub_title: Sub-title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n            **datamap_kwds:  All further keyword args will be passed on to DataMapPlot's\n                             `create_plot` function. 
See the DataMapPlot documentation\n                             for more details.\n\n        Returns:\n            figure: A Matplotlib Figure object.\n\n        Examples:\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_document_datamap(docs)\n        ```\n\n        Do note that this re-calculates the embeddings and reduces them to 2D.\n        The advised and preferred pipeline for using this function is as follows:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n        from bertopic import BERTopic\n        from umap import UMAP\n\n        # Prepare embeddings\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n        # Train BERTopic\n        topic_model = BERTopic(min_topic_size=36).fit(docs, embeddings)\n\n        # Reduce dimensionality of embeddings, this step is optional\n        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n        # Run the visualization with the original embeddings\n        topic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n        # Or, if you have reduced the original embeddings already:\n        topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n        fig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        return plotting.visualize_document_datamap(\n            self,\n            docs,\n            topics,\n            embeddings,\n            reduced_embeddings,\n            custom_labels,\n            title,\n            sub_title,\n            width,\n            height,\n            **datamap_kwds,\n        )\n\n    def visualize_hierarchical_documents(\n        self,\n        docs: List[str],\n        hierarchical_topics: pd.DataFrame,\n        topics: List[int] = None,\n        embeddings: np.ndarray = None,\n        reduced_embeddings: np.ndarray = None,\n        sample: Union[float, int] = None,\n        hide_annotations: bool = False,\n        hide_document_hover: bool = True,\n        nr_levels: int = 10,\n        level_scale: str = \"linear\",\n        custom_labels: bool = False,\n        title: str = \"<b>Hierarchical Documents and Topics</b>\",\n        width: int = 1200,\n        height: int = 750,\n    ) -> go.Figure:\n        \"\"\"Visualize documents and their topics in 2D at different levels of hierarchy.\n\n        Arguments:\n            docs: The documents you used when calling either `fit` or `fit_transform`\n            hierarchical_topics: A dataframe that contains a hierarchy of topics\n                                represented by their parents and their children\n            topics: A selection of topics to visualize.\n                    Not to be confused with the topics that you get from `.fit_transform`.\n                    For example, if you want to visualize only topics 1 through 5:\n                    `topics = [1, 2, 3, 4, 5]`.\n            embeddings: The embeddings of 
all documents in `docs`.\n            reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n            sample: The percentage of documents in each topic that you would like to keep.\n                    Value can be between 0 and 1. Setting this value to, for example,\n                    0.1 (10% of documents in each topic) makes it easier to visualize\n                    millions of documents as a subset is chosen.\n            hide_annotations: Hide the names of the traces on top of each cluster.\n            hide_document_hover: Hide the content of the documents when hovering over\n                                 specific points. Helps to speed up generation of visualizations.\n            nr_levels: The number of levels to be visualized in the hierarchy. First, the distances\n                       in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with\n                       equal length. Then, for each list of distances, the merged topics, that have\n                       a distance less or equal to the maximum distance of the selected list of distances, are selected.\n                       NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to\n                       the length of `hierarchical_topics`.\n            level_scale: Whether to apply a linear or logarithmic ('log') scale levels of the distance\n                         vector. Linear scaling will perform an equal number of merges at each level\n                         while logarithmic scaling will perform more mergers in earlier levels to\n                         provide more resolution at higher levels (this can be used for when the number\n                         of topics is large).\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n                           NOTE: Custom labels are only generated for the original\n                           un-merged topics.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n        To visualize the topics simply run:\n\n        ```python\n        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)\n        ```\n\n        Do note that this re-calculates the embeddings and reduces them to 2D.\n        The advised and preferred pipeline for using this function is as follows:\n\n        ```python\n        from sklearn.datasets import fetch_20newsgroups\n        from sentence_transformers import SentenceTransformer\n        from bertopic import BERTopic\n        from umap import UMAP\n\n        # Prepare embeddings\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n        sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n        embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n        # Train BERTopic and extract hierarchical topics\n        topic_model = BERTopic().fit(docs, embeddings)\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n        # Reduce dimensionality of embeddings, this step is optional\n        # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n        # Run the visualization with the original embeddings\n        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, 
embeddings=embeddings)\n\n        # Or, if you have reduced the original embeddings already:\n        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n        fig.write_html(\"path/to/file.html\")\n        ```\n\n        <iframe src=\"../getting_started/visualization/hierarchical_documents.html\"\n        style=\"width:1000px; height: 770px; border: 0px;\"\"></iframe>\n        \"\"\"\n        check_is_fitted(self)\n        check_documents_type(docs)\n        return plotting.visualize_hierarchical_documents(\n            self,\n            docs=docs,\n            hierarchical_topics=hierarchical_topics,\n            topics=topics,\n            embeddings=embeddings,\n            reduced_embeddings=reduced_embeddings,\n            sample=sample,\n            hide_annotations=hide_annotations,\n            hide_document_hover=hide_document_hover,\n            nr_levels=nr_levels,\n            level_scale=level_scale,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n        )\n\n    def visualize_term_rank(\n        self,\n        topics: List[int] = None,\n        log_scale: bool = False,\n        custom_labels: bool = False,\n        title: str = \"<b>Term score decline per Topic</b>\",\n        width: int = 800,\n        height: int = 500,\n    ) -> go.Figure:\n        \"\"\"Visualize the ranks of all terms across all topics.\n\n        Each topic is represented by a set of words. These words, however,\n        do not all equally represent the topic. This visualization shows\n        how many words are needed to represent a topic and at which point\n        the beneficial effect of adding words starts to decline.\n\n        Arguments:\n            topics: A selection of topics to visualize. 
These will be colored\n                    red where all others will be colored black.\n            log_scale: Whether to represent the ranking on a log scale\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n        To visualize the ranks of all words across\n        all topics simply run:\n\n        ```python\n        topic_model.visualize_term_rank()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_term_rank()\n        fig.write_html(\"path/to/file.html\")\n        ```\n\n        Reference:\n\n        This visualization was heavily inspired by the\n        \"Term Probability Decline\" visualization found in an\n        analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).\n        Reference to that specific analysis can be found\n        [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_term_rank(\n            self,\n            topics=topics,\n            log_scale=log_scale,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n        )\n\n    def visualize_topics_over_time(\n        self,\n        topics_over_time: pd.DataFrame,\n        top_n_topics: int = None,\n        topics: List[int] = None,\n        normalize_frequency: bool = False,\n        custom_labels: bool = False,\n        title: str = \"<b>Topics over Time</b>\",\n        width: int = 1250,\n        height: int = 450,\n    ) -> go.Figure:\n        \"\"\"Visualize topics over time.\n\n        Arguments:\n            topics_over_time: The topics you would like to be visualized with the\n                              corresponding topic representation\n            top_n_topics: To visualize the most frequent topics instead of all\n            topics: Select which topics you would like to be visualized\n            normalize_frequency: Whether to normalize each topic's frequency individually\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            A plotly.graph_objects.Figure including all traces\n\n        Examples:\n        To visualize the topics over time, simply run:\n\n        ```python\n        topics_over_time = topic_model.topics_over_time(docs, timestamps)\n        topic_model.visualize_topics_over_time(topics_over_time)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_topics_over_time(topics_over_time)\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_topics_over_time(\n            self,\n            topics_over_time=topics_over_time,\n            top_n_topics=top_n_topics,\n            topics=topics,\n            normalize_frequency=normalize_frequency,\n            custom_labels=custom_labels,\n            title=title,\n      
      width=width,\n            height=height,\n        )\n\n    def visualize_topics_per_class(\n        self,\n        topics_per_class: pd.DataFrame,\n        top_n_topics: int = 10,\n        topics: List[int] = None,\n        normalize_frequency: bool = False,\n        custom_labels: bool = False,\n        title: str = \"<b>Topics per Class</b>\",\n        width: int = 1250,\n        height: int = 900,\n    ) -> go.Figure:\n        \"\"\"Visualize topics per class.\n\n        Arguments:\n            topics_per_class: The topics you would like to be visualized with the\n                              corresponding topic representation\n            top_n_topics: To visualize the most frequent topics instead of all\n            topics: Select which topics you would like to be visualized\n            normalize_frequency: Whether to normalize each topic's frequency individually\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            A plotly.graph_objects.Figure including all traces\n\n        Examples:\n        To visualize the topics per class, simply run:\n\n        ```python\n        topics_per_class = topic_model.topics_per_class(docs, classes)\n        topic_model.visualize_topics_per_class(topics_per_class)\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_topics_per_class(topics_per_class)\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_topics_per_class(\n            self,\n            topics_per_class=topics_per_class,\n            top_n_topics=top_n_topics,\n            topics=topics,\n            normalize_frequency=normalize_frequency,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n        )\n\n    def visualize_distribution(\n        self,\n        probabilities: np.ndarray,\n        min_probability: float = 0.015,\n        custom_labels: bool = False,\n        title: str = \"<b>Topic Probability Distribution</b>\",\n        width: int = 800,\n        height: int = 600,\n    ) -> go.Figure:\n        \"\"\"Visualize the distribution of topic probabilities.\n\n        Arguments:\n            probabilities: An array of probability scores\n            min_probability: The minimum probability score to visualize.\n                             All others are ignored.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Examples:\n        Make sure to fit the model before and only input the\n        probabilities of a single document:\n\n        ```python\n        topic_model.visualize_distribution(topic_model.probabilities_[0])\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_distribution(topic_model.probabilities_[0])\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_distribution(\n            
self,\n            probabilities=probabilities,\n            min_probability=min_probability,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n        )\n\n    def visualize_approximate_distribution(\n        self,\n        document: str,\n        topic_token_distribution: np.ndarray,\n        normalize: bool = False,\n    ):\n        \"\"\"Visualize the topic distribution calculated by `.approximate_topic_distribution`\n        on a token level. Thereby indicating the extent to which a certain word or phrase belongs\n        to a specific topic. The assumption here is that a single word can belong to multiple\n        similar topics and as such can give information about the broader set of topics within\n        a single document.\n\n        Arguments:\n            topic_model: A fitted BERTopic instance.\n            document: The document for which you want to visualize\n                      the approximated topic distribution.\n            topic_token_distribution: The topic-token distribution of the document as\n                                      extracted by `.approximate_topic_distribution`\n            normalize: Whether to normalize, between 0 and 1 (summing up to 1), the\n                       topic distribution values.\n\n        Returns:\n            df: A stylized dataframe indicating the best fitting topics\n                for each token.\n\n        Examples:\n        ```python\n        # Calculate the topic distributions on a token level\n        # Note that we need to have `calculate_token_level=True`\n        topic_distr, topic_token_distr = topic_model.approximate_distribution(\n                docs, calculate_token_level=True\n        )\n\n        # Visualize the approximated topic distributions\n        df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])\n        df\n        ```\n\n        To revert this stylized dataframe back to a regular dataframe,\n        you can run the following:\n\n        ```python\n        df.data.columns = [column.strip() for column in df.data.columns]\n        df = df.data\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_approximate_distribution(\n            self,\n            document=document,\n            topic_token_distribution=topic_token_distribution,\n            normalize=normalize,\n        )\n\n    def visualize_hierarchy(\n        self,\n        orientation: str = \"left\",\n        topics: List[int] = None,\n        top_n_topics: int = None,\n        use_ctfidf: bool = True,\n        custom_labels: bool = False,\n        title: str = \"<b>Hierarchical Clustering</b>\",\n        width: int = 1000,\n        height: int = 600,\n        hierarchical_topics: pd.DataFrame = None,\n        linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n        distance_function: Callable[[csr_matrix], csr_matrix] = None,\n        color_threshold: int = 1,\n    ) -> go.Figure:\n        \"\"\"Visualize a hierarchical structure of the topics.\n\n        A ward linkage function is used to perform the\n        hierarchical clustering based on the cosine distance\n        matrix between c-TF-IDF or semantic embeddings of the topics.\n\n        Arguments:\n            topic_model: A fitted BERTopic instance.\n            orientation: The orientation of the figure.\n                         Either 'left' or 'bottom'\n            topics: A selection of topics to visualize\n            
top_n_topics: Only select the top n most frequent topics\n            use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                        embeddings from the embedding model are used.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n                           NOTE: Custom labels are only generated for the original\n                           un-merged topics.\n            title: Title of the plot.\n            width: The width of the figure. Only works if orientation is set to 'left'\n            height: The height of the figure. Only works if orientation is set to 'bottom'\n            hierarchical_topics: A dataframe that contains a hierarchy of topics\n                                 represented by their parents and their children.\n                                 NOTE: The hierarchical topic names are only visualized\n                                 if both `topics` and `top_n_topics` are not set.\n            linkage_function: The linkage function to use. Default is:\n                              `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n                              NOTE: Make sure to use the same `linkage_function` as used\n                              in `topic_model.hierarchical_topics`.\n            distance_function: The distance function to use on the c-TF-IDF matrix. Default is:\n                               `lambda x: 1 - cosine_similarity(x)`\n                               NOTE: Make sure to use the same `distance_function` as used\n                               in `topic_model.hierarchical_topics`.\n            color_threshold: Value at which the separation of clusters will be made which\n                             will result in different colors for different clusters.\n                             A higher value will typically lead to less colored clusters.\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n        To visualize the hierarchical structure of\n        topics simply run:\n\n        ```python\n        topic_model.visualize_hierarchy()\n        ```\n\n        If you also want the labels of hierarchical topics visualized,\n        run the following:\n\n        ```python\n        # Extract hierarchical topics and their representations\n        hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n        # Visualize these representations\n        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n        ```\n\n        If you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_hierarchy()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        <iframe src=\"../getting_started/visualization/hierarchy.html\"\n        style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_hierarchy(\n            self,\n            orientation=orientation,\n            topics=topics,\n            top_n_topics=top_n_topics,\n            use_ctfidf=use_ctfidf,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n            hierarchical_topics=hierarchical_topics,\n            linkage_function=linkage_function,\n            distance_function=distance_function,\n            color_threshold=color_threshold,\n        )\n\n    def 
visualize_heatmap(\n        self,\n        topics: List[int] = None,\n        top_n_topics: int = None,\n        n_clusters: int = None,\n        use_ctfidf: bool = False,\n        custom_labels: bool = False,\n        title: str = \"<b>Similarity Matrix</b>\",\n        width: int = 800,\n        height: int = 800,\n    ) -> go.Figure:\n        \"\"\"Visualize a heatmap of the topic's similarity matrix.\n\n        Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics,\n        a heatmap is created showing the similarity between topics.\n\n        Arguments:\n            topics: A selection of topics to visualize.\n            top_n_topics: Only select the top n most frequent topics.\n            n_clusters: Create n clusters and order the similarity\n                        matrix by those clusters.\n            use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                        embeddings from the embedding model are used.\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of the figure.\n            height: The height of the figure.\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n        To visualize the similarity matrix of\n        topics simply run:\n\n        ```python\n        topic_model.visualize_heatmap()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_heatmap()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n        return plotting.visualize_heatmap(\n            self,\n            topics=topics,\n            top_n_topics=top_n_topics,\n            n_clusters=n_clusters,\n            use_ctfidf=use_ctfidf,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n        )\n\n    def visualize_barchart(\n        self,\n        topics: List[int] = None,\n        top_n_topics: int = 8,\n        n_words: int = 5,\n        custom_labels: bool = False,\n        title: str = \"Topic Word Scores\",\n        width: int = 250,\n        height: int = 250,\n        autoscale: bool = False,\n    ) -> go.Figure:\n        \"\"\"Visualize a barchart of selected topics.\n\n        Arguments:\n            topics: A selection of topics to visualize.\n            top_n_topics: Only select the top n most frequent topics.\n            n_words: Number of words to show in a topic\n            custom_labels: Whether to use custom topic labels that were defined using\n                           `topic_model.set_topic_labels`.\n            title: Title of the plot.\n            width: The width of each figure.\n            height: The height of each figure.\n            autoscale: Whether to automatically calculate the height of the figures to fit the whole bar text\n\n        Returns:\n            fig: A plotly figure\n\n        Examples:\n        To visualize the barchart of selected topics\n        simply run:\n\n        ```python\n        topic_model.visualize_barchart()\n        ```\n\n        Or if you want to save the resulting figure:\n\n        ```python\n        fig = topic_model.visualize_barchart()\n        fig.write_html(\"path/to/file.html\")\n        ```\n        \"\"\"\n        check_is_fitted(self)\n   
     return plotting.visualize_barchart(\n            self,\n            topics=topics,\n            top_n_topics=top_n_topics,\n            n_words=n_words,\n            custom_labels=custom_labels,\n            title=title,\n            width=width,\n            height=height,\n            autoscale=autoscale,\n        )\n\n    def save(\n        self,\n        path,\n        serialization: Literal[\"safetensors\", \"pickle\", \"pytorch\"] = \"pickle\",\n        save_embedding_model: Union[bool, str] = True,\n        save_ctfidf: bool = False,\n    ):\n        \"\"\"Saves the model to the specified path or folder.\n\n        When saving the model, make sure to also keep track of the versions\n        of dependencies and Python used. Loading and saving the model should\n        be done using the same dependencies and Python. Moreover, models\n        saved in one version of BERTopic should not be loaded in other versions.\n\n        Arguments:\n            path: If `serialization` is 'safetensors' or `pytorch`, this is a directory.\n                  If `serialization` is `pickle`, then this is a file.\n            serialization: If `pickle`, the entire model will be pickled. If `safetensors`\n                           or `pytorch` the model will be saved without the embedding,\n                           dimensionality reduction, and clustering algorithms.\n                           This is a very efficient format and typically advised.\n            save_embedding_model: If serialization is `pickle`, then you can choose to skip\n                                  saving the embedding model. If serialization is `safetensors`\n                                  or `pytorch`, this variable can be used as a string pointing\n                                  towards a huggingface model.\n            save_ctfidf: Whether to save c-TF-IDF information if serialization is `safetensors`\n                         or `pytorch`\n\n        Examples:\n        To save the model in an efficient and safe format (safetensors) with c-TF-IDF information:\n\n        ```python\n        topic_model.save(\"model_dir\", serialization=\"safetensors\", save_ctfidf=True)\n        ```\n\n        If you wish to also add a pointer to the embedding model, which will be downloaded from\n        HuggingFace upon loading:\n\n        ```python\n        embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\n        topic_model.save(\"model_dir\", serialization=\"safetensors\", save_embedding_model=embedding_model)\n        ```\n\n        or if you want save the full model with pickle:\n\n        ```python\n        topic_model.save(\"my_model\")\n        ```\n\n        NOTE: Pickle can run arbitrary code and is generally considered to be less safe than\n        safetensors.\n        \"\"\"\n        if serialization == \"pickle\":\n            logger.warning(\n                \"When you use `pickle` to save/load a BERTopic model,\"\n                \"please make sure that the environments in which you save\"\n                \"and load the model are **exactly** the same. 
The version of BERTopic,\"\n                \"its dependencies, and python need to remain the same.\"\n            )\n\n            with open(path, \"wb\") as file:\n                # This prevents the vectorizer from being too large in size if `min_df` was\n                # set to a value higher than 1\n                self.vectorizer_model.stop_words_ = None\n\n                if not save_embedding_model:\n                    embedding_model = self.embedding_model\n                    self.embedding_model = None\n                    joblib.dump(self, file)\n                    self.embedding_model = embedding_model\n                else:\n                    joblib.dump(self, file)\n        elif serialization == \"safetensors\" or serialization == \"pytorch\":\n            # Directory\n            save_directory = Path(path)\n            save_directory.mkdir(exist_ok=True, parents=True)\n\n            # Check embedding model\n            if (\n                save_embedding_model\n                and hasattr(self.embedding_model, \"_hf_model\")\n                and not isinstance(save_embedding_model, str)\n            ):\n                save_embedding_model = self.embedding_model._hf_model\n            elif not save_embedding_model:\n                logger.warning(\n                    \"You are saving a BERTopic model without explicitly defining an embedding model.\"\n                    \"If you are using a sentence-transformers model or a HuggingFace model supported\"\n                    \"by sentence-transformers, please save the model by using a pointer towards that model.\"\n                    \"For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`\"\n                )\n\n            # Minimal\n            save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization)\n            save_utils.save_topics(model=self, path=save_directory / \"topics.json\")\n            save_utils.save_images(model=self, path=save_directory / \"images\")\n            save_utils.save_config(\n                model=self,\n                path=save_directory / \"config.json\",\n                embedding_model=save_embedding_model,\n            )\n\n            # Additional\n            if save_ctfidf:\n                save_utils.save_ctfidf(\n                    model=self,\n                    save_directory=save_directory,\n                    serialization=serialization,\n                )\n                save_utils.save_ctfidf_config(model=self, path=save_directory / \"ctfidf_config.json\")\n\n    @classmethod\n    def load(cls, path: str, embedding_model=None):\n        \"\"\"Loads the model from the specified path or directory.\n\n        Arguments:\n            path: Either load a BERTopic model from a file (`.pickle`) or a folder containing\n                  `.safetensors` or `.bin` files.\n            embedding_model: Additionally load in an embedding model if it was not saved\n                             in the BERTopic model file or directory.\n\n        Examples:\n        ```python\n        BERTopic.load(\"model_dir\")\n        ```\n\n        or if you did not save the embedding model:\n\n        ```python\n        BERTopic.load(\"model_dir\", embedding_model=\"all-MiniLM-L6-v2\")\n        ```\n        \"\"\"\n        file_or_dir = Path(path)\n\n        # Load from Pickle\n        if file_or_dir.is_file():\n            with open(file_or_dir, \"rb\") as file:\n                if embedding_model:\n                    topic_model = 
joblib.load(file)\n                    topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose)\n                else:\n                    topic_model = joblib.load(file)\n                return topic_model\n\n        # Load from directory or HF\n        if file_or_dir.is_dir():\n            topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_local_files(file_or_dir)\n        elif \"/\" in str(path):\n            topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path)\n        else:\n            raise ValueError(\"Make sure to either pass a valid directory or HF model.\")\n        topic_model = _create_model_from_files(\n            topics,\n            params,\n            tensors,\n            ctfidf_tensors,\n            ctfidf_config,\n            images,\n            warn_no_backend=(embedding_model is None),\n        )\n\n        # Replace embedding model if one is specifically chosen\n        if embedding_model is not None:\n            topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose)\n\n        return topic_model\n\n    @classmethod\n    def merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None):\n        \"\"\"Merge multiple pre-trained BERTopic models into a single model.\n\n        The models are merged as if they were all saved using pytorch or\n        safetensors, so a minimal version without c-TF-IDF.\n\n        To do this, we choose the first model in the list of\n        models as a baseline. Then, we check each model whether\n        they contain topics that are not in the baseline.\n        This check is based on the cosine similarity between\n        topics embeddings. If topic embeddings between two models\n        are similar, then the topic of the second model is re-assigned\n        to the first. 
If they are dissimilar, the topic of the second\n        model is assigned to the first.\n\n        In essence, we simply check whether sufficiently \"new\"\n        topics emerge and add them.\n\n        Arguments:\n            models: A list of fitted BERTopic models\n            min_similarity: The minimum similarity for when topics are merged.\n            embedding_model: Additionally load in an embedding model if necessary.\n\n        Returns:\n            A new BERTopic model that was created as if you were\n            loading a model from the HuggingFace Hub without c-TF-IDF\n\n        Examples:\n        ```python\n        from bertopic import BERTopic\n        from sklearn.datasets import fetch_20newsgroups\n\n        docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n        # Create three separate models\n        topic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000])\n        topic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000])\n        topic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:])\n\n        # Combine all models into one\n        merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n        ```\n        \"\"\"\n        import torch\n\n        # Temporarily save model and push to HF\n        with TemporaryDirectory() as tmpdir:\n            # Save model weights and config.\n            all_topics, all_params, all_tensors = [], [], []\n            for index, model in enumerate(models):\n                model.save(tmpdir, serialization=\"pytorch\")\n                topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir))\n                all_topics.append(topics)\n                all_params.append(params)\n                all_tensors.append(np.array(tensors[\"topic_embeddings\"]))\n\n                # Create a base set of parameters\n                if index == 0:\n                    merged_topics = topics\n                    merged_params = params\n                    merged_tensors = np.array(tensors[\"topic_embeddings\"])\n                    merged_topics[\"custom_labels\"] = None\n\n        for tensors, selected_topics in zip(all_tensors[1:], all_topics[1:]):\n            # Calculate similarity matrix\n            sim_matrix = cosine_similarity(tensors, merged_tensors)\n            sims = np.max(sim_matrix, axis=1)\n\n            # Extract new topics\n            new_topics = sorted(\n                [index - selected_topics[\"_outliers\"] for index, sim in enumerate(sims) if sim < min_similarity]\n            )\n            max_topic = max(set(merged_topics[\"topics\"]))\n\n            # Merge Topic Representations\n            new_topics_dict = {}\n            for new_topic in new_topics:\n                if new_topic != -1:\n                    max_topic += 1\n                    new_topics_dict[new_topic] = max_topic\n                    merged_topics[\"topic_representations\"][str(max_topic)] = selected_topics[\"topic_representations\"][\n                        str(new_topic)\n                    ]\n                    merged_topics[\"topic_labels\"][str(max_topic)] = selected_topics[\"topic_labels\"][str(new_topic)]\n\n                    # Add new aspects\n                    if selected_topics[\"topic_aspects\"]:\n                        aspects_1 = set(merged_topics[\"topic_aspects\"].keys())\n                        aspects_2 = set(selected_topics[\"topic_aspects\"].keys())\n                        aspects_diff = 
aspects_2.difference(aspects_1)\n                        if aspects_diff:\n                            for aspect in aspects_diff:\n                                merged_topics[\"topic_aspects\"][aspect] = {}\n\n                        # If the original model does not have topic aspects but the to be added model does\n                        if not merged_topics.get(\"topic_aspects\"):\n                            merged_topics[\"topic_aspects\"] = selected_topics[\"topic_aspects\"]\n\n                        # If they both contain topic aspects, add to the existing set of aspects\n                        else:\n                            for aspect, values in selected_topics[\"topic_aspects\"].items():\n                                merged_topics[\"topic_aspects\"][aspect][str(max_topic)] = values[str(new_topic)]\n\n                    # Add new embeddings\n                    new_tensors = tensors[new_topic + selected_topics[\"_outliers\"]]\n                    merged_tensors = np.vstack([merged_tensors, new_tensors])\n\n            # Topic Mapper\n            merged_topics[\"topic_mapper\"] = TopicMapper(list(range(-1, max_topic + 1, 1))).mappings_\n\n            # Find similar topics and re-assign those from the new models\n            sims_idx = np.argmax(sim_matrix, axis=1)\n            sims = np.max(sim_matrix, axis=1)\n            to_merge = {\n                a - selected_topics[\"_outliers\"]: b - merged_topics[\"_outliers\"]\n                for a, (b, val) in enumerate(zip(sims_idx, sims))\n                if val >= min_similarity\n            }\n            to_merge.update(new_topics_dict)\n            to_merge[-1] = -1\n            topics = [to_merge[topic] for topic in selected_topics[\"topics\"]]\n            merged_topics[\"topics\"].extend(topics)\n            merged_topics[\"topic_sizes\"] = dict(Counter(merged_topics[\"topics\"]))\n\n        # Create a new model from the merged parameters\n        merged_tensors = {\"topic_embeddings\": torch.from_numpy(merged_tensors)}\n        merged_model = _create_model_from_files(\n            merged_topics,\n            merged_params,\n            merged_tensors,\n            None,\n            None,\n            None,\n            warn_no_backend=False,\n        )\n        merged_model.embedding_model = models[0].embedding_model\n\n        # Replace embedding model if one is specifically chosen\n        verbose = any([model.verbose for model in models])\n        if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder:\n            merged_model.embedding_model = select_backend(embedding_model, verbose=verbose)\n        return merged_model\n\n    def push_to_hf_hub(\n        self,\n        repo_id: str,\n        commit_message: str = \"Add BERTopic model\",\n        token: str = None,\n        revision: str = None,\n        private: bool = False,\n        create_pr: bool = False,\n        model_card: bool = True,\n        serialization: str = \"safetensors\",\n        save_embedding_model: Union[str, bool] = True,\n        save_ctfidf: bool = False,\n    ):\n        \"\"\"Push your BERTopic model to a HuggingFace Hub.\n\n        Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:\n\n        * Log in to your HuggingFace account with the following command:\n            ```bash\n            huggingface-cli login\n\n            # or using an environment variable\n            huggingface-cli login --token $HUGGINGFACE_TOKEN\n            ```\n        * 
Alternatively, you can programmatically login using login() in a notebook or a script:\n            ```python\n            from huggingface_hub import login\n            login()\n            ```\n        * Or you can give a token with the `token` variable\n\n        Arguments:\n            repo_id: The name of your HuggingFace repository\n            commit_message: A commit message\n            token: Token to add if not already logged in\n            revision: Repository revision\n            private: Whether to create a private repository\n            create_pr: Whether to upload the model as a Pull Request\n            model_card: Whether to automatically create a modelcard\n            serialization: The type of serialization.\n                           Either `safetensors` or `pytorch`\n            save_embedding_model: A pointer towards a HuggingFace model to be loaded in with\n                                  SentenceTransformers. E.g.,\n                                  `sentence-transformers/all-MiniLM-L6-v2`\n            save_ctfidf: Whether to save c-TF-IDF information\n\n\n        Examples:\n        ```python\n        topic_model.push_to_hf_hub(\n            repo_id=\"ArXiv\",\n            save_ctfidf=True,\n            save_embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\"\n        )\n        ```\n        \"\"\"\n        return save_utils.push_to_hf_hub(\n            model=self,\n            repo_id=repo_id,\n            commit_message=commit_message,\n            token=token,\n            revision=revision,\n            private=private,\n            create_pr=create_pr,\n            model_card=model_card,\n            serialization=serialization,\n            save_embedding_model=save_embedding_model,\n            save_ctfidf=save_ctfidf,\n        )\n\n    def get_params(self, deep: bool = False) -> Mapping[str, Any]:\n        \"\"\"Get parameters for this estimator.\n\n        Adapted from:\n            https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178\n\n        Arguments:\n            deep: bool, default=True\n                  If True, will return the parameters for this estimator and\n                  contained subobjects that are estimators.\n\n        Returns:\n            out: Parameter names mapped to their values.\n        \"\"\"\n        out = dict()\n        for key in self._get_param_names():\n            value = getattr(self, key)\n            if deep and hasattr(value, \"get_params\"):\n                deep_items = value.get_params().items()\n                out.update((key + \"__\" + k, val) for k, val in deep_items)\n            out[key] = value\n        return out\n\n    def _extract_embeddings(\n        self,\n        documents: Union[List[str], str],\n        images: List[str] = None,\n        method: str = \"document\",\n        verbose: bool = None,\n    ) -> np.ndarray:\n        \"\"\"Extract sentence/document embeddings through pre-trained embeddings\n        For an overview of pre-trained models: https://www.sbert.net/docs/pretrained_models.html.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n            images: A list of paths to the images to fit on or the images themselves\n            method: Whether to extract document or word-embeddings, options are \"document\" and \"word\"\n            verbose: Whether to show a progressbar demonstrating the time to extract embeddings\n\n        Returns:\n            embeddings: The extracted embeddings.\n        
\"\"\"\n        if isinstance(documents, str):\n            documents = [documents]\n\n        if images is not None and hasattr(self.embedding_model, \"embed_images\"):\n            embeddings = self.embedding_model.embed(documents=documents, images=images, verbose=verbose)\n        elif method == \"word\":\n            embeddings = self.embedding_model.embed_words(words=documents, verbose=verbose)\n        elif method == \"document\":\n            embeddings = self.embedding_model.embed_documents(documents, verbose=verbose)\n        elif documents[0] is None and images is None:\n            raise ValueError(\n                \"Make sure to use an embedding model that can either embed documents\"\n                \"or images depending on which you want to embed.\"\n            )\n        else:\n            raise ValueError(\n                \"Wrong method for extracting document/word embeddings. \"\n                \"Either choose 'word' or 'document' as the method. \"\n            )\n        return embeddings\n\n    def _images_to_text(self, documents: pd.DataFrame, embeddings: np.ndarray) -> pd.DataFrame:\n        \"\"\"Convert images to text.\"\"\"\n        logger.info(\"Images - Converting images to text. This might take a while.\")\n        if isinstance(self.representation_model, dict):\n            for tuner in self.representation_model.values():\n                if getattr(tuner, \"image_to_text_model\", False):\n                    documents = tuner.image_to_text(documents, embeddings)\n        elif isinstance(self.representation_model, list):\n            for tuner in self.representation_model:\n                if getattr(tuner, \"image_to_text_model\", False):\n                    documents = tuner.image_to_text(documents, embeddings)\n        elif isinstance(self.representation_model, BaseRepresentation):\n            if getattr(self.representation_model, \"image_to_text_model\", False):\n                documents = self.representation_model.image_to_text(documents, embeddings)\n        logger.info(\"Images - Completed \\u2713\")\n        return documents\n\n    def _map_predictions(self, predictions: List[int]) -> List[int]:\n        \"\"\"Map predictions to the correct topics if topics were reduced.\"\"\"\n        mappings = self.topic_mapper_.get_mappings(original_topics=True)\n        mapped_predictions = [mappings[prediction] if prediction in mappings else -1 for prediction in predictions]\n        return mapped_predictions\n\n    def _reduce_dimensionality(\n        self,\n        embeddings: Union[np.ndarray, csr_matrix],\n        y: Union[List[int], np.ndarray] = None,\n        partial_fit: bool = False,\n    ) -> np.ndarray:\n        \"\"\"Reduce dimensionality of embeddings using UMAP and train a UMAP model.\n\n        Arguments:\n            embeddings: The extracted embeddings using the sentence transformer module.\n            y: The target class for (semi)-supervised dimensionality reduction\n            partial_fit: Whether to run `partial_fit` for online learning\n\n        Returns:\n            umap_embeddings: The reduced embeddings\n        \"\"\"\n        logger.info(\"Dimensionality - Fitting the dimensionality reduction algorithm\")\n        # Partial fit\n        if partial_fit:\n            if hasattr(self.umap_model, \"partial_fit\"):\n                self.umap_model = self.umap_model.partial_fit(embeddings)\n            elif self.topic_representations_ is None:\n                self.umap_model.fit(embeddings)\n\n        # Regular fit\n        else:\n  
          try:\n                # cuml umap needs y to be an numpy array\n                y = np.array(y) if y is not None else None\n                self.umap_model.fit(embeddings, y=y)\n            except TypeError:\n                self.umap_model.fit(embeddings)\n\n        umap_embeddings = self.umap_model.transform(embeddings)\n        logger.info(\"Dimensionality - Completed \\u2713\")\n        return np.nan_to_num(umap_embeddings)\n\n    def _cluster_embeddings(\n        self,\n        umap_embeddings: np.ndarray,\n        documents: pd.DataFrame,\n        partial_fit: bool = False,\n        y: np.ndarray = None,\n    ) -> Tuple[pd.DataFrame, np.ndarray]:\n        \"\"\"Cluster UMAP embeddings with HDBSCAN.\n\n        Arguments:\n            umap_embeddings: The reduced sentence embeddings with UMAP\n            documents: Dataframe with documents and their corresponding IDs\n            partial_fit: Whether to run `partial_fit` for online learning\n            y: Array of topics to use\n\n        Returns:\n            documents: Updated dataframe with documents and their corresponding IDs\n                       and newly added Topics\n            probabilities: The distribution of probabilities\n        \"\"\"\n        logger.info(\"Cluster - Start clustering the reduced embeddings\")\n        if partial_fit:\n            self.hdbscan_model = self.hdbscan_model.partial_fit(umap_embeddings)\n            labels = self.hdbscan_model.labels_\n            documents[\"Topic\"] = labels\n            self.topics_ = labels\n        else:\n            try:\n                self.hdbscan_model.fit(umap_embeddings, y=y)\n            except TypeError:\n                self.hdbscan_model.fit(umap_embeddings)\n\n            try:\n                labels = self.hdbscan_model.labels_\n            except AttributeError:\n                labels = y\n            documents[\"Topic\"] = labels\n            self._update_topic_size(documents)\n\n        # Extract probabilities\n        probabilities = None\n        if hasattr(self.hdbscan_model, \"probabilities_\"):\n            probabilities = self.hdbscan_model.probabilities_\n\n            if self.calculate_probabilities and is_supported_hdbscan(self.hdbscan_model):\n                probabilities = hdbscan_delegator(self.hdbscan_model, \"all_points_membership_vectors\")\n\n        if not partial_fit:\n            self.topic_mapper_ = TopicMapper(self.topics_)\n        logger.info(\"Cluster - Completed \\u2713\")\n        return documents, probabilities\n\n    def _zeroshot_topic_modeling(\n        self, documents: pd.DataFrame, embeddings: np.ndarray\n    ) -> Tuple[pd.DataFrame, np.array, pd.DataFrame, np.array]:\n        \"\"\"Find documents that could be assigned to either one of the topics in self.zeroshot_topic_list.\n\n        We transform the topics in `self.zeroshot_topic_list` to embeddings and\n        compare them through cosine similarity with the document embeddings.\n        If they pass the `self.zeroshot_min_similarity` threshold, they are assigned.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n            embeddings: The document embeddings\n\n        Returns:\n            documents: The leftover documents that were not assigned to any topic\n            embeddings: The leftover embeddings that were not assigned to any topic\n        \"\"\"\n        logger.info(\"Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics\")\n        # Similarity 
between document and zero-shot topic embeddings\n        zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list)\n        cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings)\n        assignment = np.argmax(cosine_similarities, 1)\n        assignment_vals = np.max(cosine_similarities, 1)\n        assigned_ids = [index for index, value in enumerate(assignment_vals) if value >= self.zeroshot_min_similarity]\n        non_assigned_ids = [\n            index for index, value in enumerate(assignment_vals) if value < self.zeroshot_min_similarity\n        ]\n\n        # Assign topics\n        assigned_documents = documents.iloc[assigned_ids]\n        assigned_documents[\"Topic\"] = [topic for topic in assignment[assigned_ids]]\n        assigned_documents[\"Old_ID\"] = assigned_documents[\"ID\"].copy()\n        assigned_documents[\"ID\"] = range(len(assigned_documents))\n        assigned_embeddings = embeddings[assigned_ids]\n\n        # Check that if a number of topics was specified, it exceeds the number of zeroshot topics matched\n        num_zeroshot_topics = len(assigned_documents[\"Topic\"].unique())\n        if self.nr_topics and not self.nr_topics > num_zeroshot_topics:\n            raise ValueError(\n                f\"The set nr_topics ({self.nr_topics}) must exceed the number of matched zero-shot topics \"\n                f\"({num_zeroshot_topics}). Consider raising nr_topics or raising the \"\n                f\"zeroshot_min_similarity ({self.zeroshot_min_similarity}).\"\n            )\n\n        # Select non-assigned topics to be clustered\n        documents = documents.iloc[non_assigned_ids]\n        documents[\"Old_ID\"] = documents[\"ID\"].copy()\n        documents[\"ID\"] = range(len(documents))\n        embeddings = embeddings[non_assigned_ids]\n\n        logger.info(\"Zeroshot Step 1 - Completed \\u2713\")\n        return documents, embeddings, assigned_documents, assigned_embeddings\n\n    def _is_zeroshot(self):\n        \"\"\"Check whether zero-shot topic modeling is possible.\n\n        * Embedding model is necessary to convert zero-shot topics to embeddings\n        * Zero-shot topics should be defined\n        \"\"\"\n        if self.zeroshot_topic_list is not None and self.embedding_model is not None:\n            return True\n        return False\n\n    def _combine_zeroshot_topics(\n        self,\n        documents: pd.DataFrame,\n        embeddings: np.ndarray,\n        assigned_documents: pd.DataFrame,\n        assigned_embeddings: np.ndarray,\n    ) -> Tuple[pd.DataFrame, np.ndarray]:\n        \"\"\"Combine the zero-shot topics with the clustered topics.\n\n        The zero-shot topics will be inserted between the outlier topic (that may or may not exist) and the rest of the\n        topics from clustering. 
The rest of the topics from clustering will be given new IDs to correspond to topics\n        after zero-shot topics.\n\n        Documents and embeddings used in zero-shot topic modeling and clustering and re-merged.\n\n        Arguments:\n            documents: DataFrame with clustered documents and their corresponding IDs\n            embeddings: The document embeddings for clustered documents\n            assigned_documents: DataFrame with documents and their corresponding IDs\n                                that were assigned to a zero-shot topic\n            assigned_embeddings: The document embeddings for documents that were assigned to a zero-shot topic\n\n        Returns:\n            documents: DataFrame with all the original documents with their topic assignments\n            embeddings: np.ndarray of embeddings aligned with the documents\n        \"\"\"\n        logger.info(\"Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...\")\n        # Combine Zero-shot topics with topics from clustering\n        zeroshot_topic_idx_to_topic_id = {\n            zeroshot_topic_id: new_topic_id\n            for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic))\n        }\n        self._topic_id_to_zeroshot_topic_idx = {\n            new_topic_id: zeroshot_topic_id\n            for new_topic_id, zeroshot_topic_id in enumerate(set(assigned_documents.Topic))\n        }\n        assigned_documents.Topic = assigned_documents.Topic.map(zeroshot_topic_idx_to_topic_id)\n        num_zeroshot_topics = len(zeroshot_topic_idx_to_topic_id)\n\n        # Insert zeroshot topics between outlier cluster and other clusters\n        documents.Topic = documents.Topic.apply(\n            lambda topic_id: topic_id + num_zeroshot_topics if topic_id != -1 else topic_id\n        )\n\n        # Combine the clustered documents/embeddings with assigned documents/embeddings in the original order\n        documents = pd.concat([documents, assigned_documents])\n        embeddings = np.vstack([embeddings, assigned_embeddings])\n        sorted_indices = documents.Old_ID.argsort()\n        documents = documents.iloc[sorted_indices]\n        embeddings = embeddings[sorted_indices]\n\n        # Update topic sizes and topic mapper\n        self._update_topic_size(documents)\n        self.topic_mapper_ = TopicMapper(self.topics_)\n\n        logger.info(\"Zeroshot Step 2 - Completed \\u2713\")\n        return documents, embeddings\n\n    def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.array]:\n        \"\"\"Apply Guided Topic Modeling.\n\n        We transform the seeded topics to embeddings using the\n        same embedder as used for generating document embeddings.\n\n        Then, we apply cosine similarity between the embeddings\n        and set labels for documents that are more similar to\n        one of the topics than the average document.\n\n        If a document is more similar to the average document\n        than any of the topics, it gets the -1 label and is\n        thereby not included in UMAP.\n\n        Arguments:\n            embeddings: The document embeddings\n\n        Returns:\n            y: The labels for each seeded topic\n            embeddings: Updated embeddings\n        \"\"\"\n        logger.info(\"Guided - Find embeddings highly related to seeded topics.\")\n        # Create embeddings from the seeded topics\n        seed_topic_list = [\" \".join(seed_topic) for seed_topic in self.seed_topic_list]\n    
    seed_topic_embeddings = self._extract_embeddings(seed_topic_list, verbose=self.verbose)\n        seed_topic_embeddings = np.vstack([seed_topic_embeddings, embeddings.mean(axis=0)])\n\n        # Label documents that are most similar to one of the seeded topics\n        sim_matrix = cosine_similarity(embeddings, seed_topic_embeddings)\n        y = [np.argmax(sim_matrix[index]) for index in range(sim_matrix.shape[0])]\n        y = [val if val != len(seed_topic_list) else -1 for val in y]\n\n        # Average the document embeddings related to the seeded topics with the\n        # embedding of the seeded topic to force the documents in a cluster\n        for seed_topic in range(len(seed_topic_list)):\n            indices = [index for index, topic in enumerate(y) if topic == seed_topic]\n            embeddings[indices] = np.average([embeddings[indices], seed_topic_embeddings[seed_topic]], weights=[3, 1])\n        logger.info(\"Guided - Completed \\u2713\")\n        return y, embeddings\n\n    def _extract_topics(\n        self,\n        documents: pd.DataFrame,\n        embeddings: np.ndarray = None,\n        mappings=None,\n        verbose: bool = False,\n    ):\n        \"\"\"Extract topics from the clusters using a class-based TF-IDF.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n            embeddings: The document embeddings\n            mappings: The mappings from topic to word\n            verbose: Whether to log the process of extracting topics\n\n        Returns:\n            c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic\n        \"\"\"\n        if verbose:\n            logger.info(\"Representation - Extracting topics from clusters using representation models.\")\n        documents_per_topic = documents.groupby([\"Topic\"], as_index=False).agg({\"Document\": \" \".join})\n        self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)\n        self.topic_representations_ = self._extract_words_per_topic(words, documents)\n        self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)\n        if verbose:\n            logger.info(\"Representation - Completed \\u2713\")\n\n    def _save_representative_docs(self, documents: pd.DataFrame):\n        \"\"\"Save the 3 most representative docs per topic.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs\n\n        Updates:\n            self.representative_docs_: Populate each topic with 3 representative docs\n        \"\"\"\n        repr_docs, _, _, _ = self._extract_representative_docs(\n            self.c_tf_idf_,\n            documents,\n            self.topic_representations_,\n            nr_samples=500,\n            nr_repr_docs=3,\n        )\n        self.representative_docs_ = repr_docs\n\n    def _extract_representative_docs(\n        self,\n        c_tf_idf: csr_matrix,\n        documents: pd.DataFrame,\n        topics: Mapping[str, List[Tuple[str, float]]],\n        nr_samples: int = 500,\n        nr_repr_docs: int = 5,\n        diversity: float = None,\n    ) -> Union[List[str], List[List[int]]]:\n        \"\"\"Approximate most representative documents per topic by sampling\n        a subset of the documents in each topic and calculating which are\n        most representative to their topic based on the cosine similarity between\n        c-TF-IDF representations.\n\n        Arguments:\n            c_tf_idf: The topic c-TF-IDF representation\n      
      documents: All input documents\n            topics: The candidate topics as calculated with c-TF-IDF\n            nr_samples: The number of candidate documents to extract per topic\n            nr_repr_docs: The number of representative documents to extract per topic\n            diversity: The diversity between the most representative documents.\n                       If None, no MMR is used. Otherwise, accepts values between 0 and 1.\n\n        Returns:\n            repr_docs_mappings: A dictionary from topic to representative documents\n            representative_docs: A flat list of representative documents\n            repr_doc_indices: Ordered indices of representative documents\n                              that belong to each topic\n            repr_doc_ids: The indices of representative documents\n                          that belong to each topic\n        \"\"\"\n        # Sample documents per topic\n        documents_per_topic = (\n            documents.drop(\"Image\", axis=1, errors=\"ignore\")\n            .groupby(\"Topic\")\n            .sample(n=nr_samples, replace=True, random_state=42)\n            .drop_duplicates()\n        )\n\n        # Find and extract documents that are most similar to the topic\n        repr_docs = []\n        repr_docs_indices = []\n        repr_docs_mappings = {}\n        repr_docs_ids = []\n        labels = sorted(list(topics.keys()))\n        for index, topic in enumerate(labels):\n            # Slice data\n            selection = documents_per_topic.loc[documents_per_topic.Topic == topic, :]\n            selected_docs = selection[\"Document\"].values\n            selected_docs_ids = selection.index.tolist()\n\n            # Calculate similarity\n            nr_docs = nr_repr_docs if len(selected_docs) > nr_repr_docs else len(selected_docs)\n            bow = self.vectorizer_model.transform(selected_docs)\n            ctfidf = self.ctfidf_model.transform(bow)\n            sim_matrix = cosine_similarity(ctfidf, c_tf_idf[index])\n\n            # Use MMR to find representative but diverse documents\n            if diversity:\n                docs = mmr(\n                    c_tf_idf[index],\n                    ctfidf,\n                    selected_docs,\n                    top_n=nr_docs,\n                    diversity=diversity,\n                )\n\n            # Extract top n most representative documents\n            else:\n                indices = np.argpartition(sim_matrix.reshape(1, -1)[0], -nr_docs)[-nr_docs:]\n                docs = [selected_docs[index] for index in indices]\n\n            doc_ids = [selected_docs_ids[index] for index, doc in enumerate(selected_docs) if doc in docs]\n            repr_docs_ids.append(doc_ids)\n            repr_docs.extend(docs)\n            repr_docs_indices.append([repr_docs_indices[-1][-1] + i + 1 if index != 0 else i for i in range(nr_docs)])\n        repr_docs_mappings = {topic: repr_docs[i[0] : i[-1] + 1] for topic, i in zip(topics.keys(), repr_docs_indices)}\n\n        return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids\n\n    def _create_topic_vectors(\n        self,\n        documents: pd.DataFrame = None,\n        embeddings: np.ndarray = None,\n        mappings=None,\n    ):\n        \"\"\"Creates embeddings per topics based on their topic representation.\n\n        As a default, topic vectors (topic embeddings) are created by taking\n        the average of all document embeddings within a topic. 
If topics are\n        merged, then a weighted average of topic embeddings is taken based on\n        the initial topic sizes.\n\n        For the `.partial_fit` and `.update_topics` method, the average\n        of all document embeddings is not taken since those are not known.\n        Instead, the weighted average of the embeddings of the top n words\n        is taken for each topic. The weighting is done based on the c-TF-IDF\n        score. This will put more emphasis to words that represent a topic best.\n        \"\"\"\n        # Topic embeddings based on input embeddings\n        if embeddings is not None and documents is not None:\n            topic_embeddings = []\n            topics = documents.sort_values(\"Topic\").Topic.unique()\n            for topic in topics:\n                indices = documents.loc[documents.Topic == topic, \"ID\"].values\n                indices = [int(index) for index in indices]\n                topic_embedding = np.mean(embeddings[indices], axis=0)\n                topic_embeddings.append(topic_embedding)\n            self.topic_embeddings_ = np.array(topic_embeddings)\n\n        # Topic embeddings when merging topics\n        elif self.topic_embeddings_ is not None and mappings is not None:\n            topic_embeddings_dict = {}\n            for topic_to, topics_from in mappings.items():\n                topic_ids = topics_from[\"topics_from\"]\n                topic_sizes = topics_from[\"topic_sizes\"]\n                if topic_ids:\n                    embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers]\n                    topic_embedding = np.average(embds, axis=0, weights=topic_sizes)\n                    topic_embeddings_dict[topic_to] = topic_embedding\n\n            # Re-order topic embeddings\n            topics_to_map = {\n                topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]\n            }\n            topic_embeddings = {}\n            for topic, embds in topic_embeddings_dict.items():\n                topic_embeddings[topics_to_map[topic]] = embds\n            unique_topics = sorted(list(topic_embeddings.keys()))\n            self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics])\n\n        # Topic embeddings based on keyword representations\n        elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder:\n            topic_list = list(self.topic_representations_.keys())\n            topic_list.sort()\n\n            # Only extract top n words\n            n = len(self.topic_representations_[topic_list[0]])\n            if self.top_n_words < n:\n                n = self.top_n_words\n\n            # Extract embeddings for all words in all topics\n            topic_words = [self.get_topic(topic) for topic in topic_list]\n            topic_words = [word[0] for topic in topic_words for word in topic]\n            word_embeddings = self._extract_embeddings(topic_words, method=\"word\", verbose=False)\n\n            # Take the weighted average of word embeddings in a topic based on their c-TF-IDF value\n            # The embeddings var is a single numpy matrix and therefore slicing is necessary to\n            # access the words per topic\n            topic_embeddings = []\n            for i, topic in enumerate(topic_list):\n                word_importance = [val[1] for val in self.get_topic(topic)]\n                if sum(word_importance) == 0:\n                    word_importance = [1 
for _ in range(len(self.get_topic(topic)))]\n                topic_embedding = np.average(\n                    word_embeddings[i * n : n + (i * n)],\n                    weights=word_importance,\n                    axis=0,\n                )\n                topic_embeddings.append(topic_embedding)\n\n            self.topic_embeddings_ = np.array(topic_embeddings)\n\n    def _c_tf_idf(\n        self,\n        documents_per_topic: pd.DataFrame,\n        fit: bool = True,\n        partial_fit: bool = False,\n    ) -> Tuple[csr_matrix, List[str]]:\n        \"\"\"Calculate a class-based TF-IDF where m is the number of total documents.\n\n        Arguments:\n            documents_per_topic: The joined documents per topic such that each topic has a single\n                                 string made out of multiple documents\n            m: The total number of documents (unjoined)\n            fit: Whether to fit a new vectorizer or use the fitted self.vectorizer_model\n            partial_fit: Whether to run `partial_fit` for online learning\n\n        Returns:\n            tf_idf: The resulting matrix giving a value (importance score) for each word per topic\n            words: The names of the words to which values were given\n        \"\"\"\n        documents = self._preprocess_text(documents_per_topic.Document.values)\n\n        if partial_fit:\n            X = self.vectorizer_model.partial_fit(documents).update_bow(documents)\n        elif fit:\n            X = self.vectorizer_model.fit_transform(documents)\n        else:\n            X = self.vectorizer_model.transform(documents)\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = self.vectorizer_model.get_feature_names_out()\n        else:\n            words = self.vectorizer_model.get_feature_names()\n\n        multiplier = None\n        if self.ctfidf_model.seed_words and self.seed_topic_list:\n            seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]\n            multiplier = np.array(\n                [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words]\n            )\n            multiplier = np.array([1.2 if word in seed_topic_list else value for value, word in zip(multiplier, words)])\n        elif self.ctfidf_model.seed_words:\n            multiplier = np.array(\n                [self.ctfidf_model.seed_multiplier if word in self.ctfidf_model.seed_words else 1 for word in words]\n            )\n        elif self.seed_topic_list:\n            seed_topic_list = [seed for seeds in self.seed_topic_list for seed in seeds]\n            multiplier = np.array([1.2 if word in seed_topic_list else 1 for word in words])\n\n        if fit:\n            self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)\n\n        c_tf_idf = self.ctfidf_model.transform(X)\n\n        return c_tf_idf, words\n\n    def _update_topic_size(self, documents: pd.DataFrame):\n        \"\"\"Calculate the topic sizes.\n\n        Arguments:\n            documents: Updated dataframe with documents and their corresponding IDs and newly added Topics\n        \"\"\"\n        self.topic_sizes_ = collections.Counter(documents.Topic.values.tolist())\n        self.topics_ = documents.Topic.astype(int).tolist()\n\n    def _extract_words_per_topic(\n        self,\n        words: 
List[str],\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix = None,\n        calculate_aspects: bool = True,\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Based on tf_idf scores per topic, extract the top n words per topic.\n\n        If the top words per topic need to be extracted, then only the `words` parameter\n        needs to be passed. If the top words per topic in a specific timestamp, then it\n        is important to pass the timestamp-based c-TF-IDF matrix and its corresponding\n        labels.\n\n        Arguments:\n            words: List of all words (sorted according to tf_idf matrix position)\n            documents: DataFrame with documents and their topic IDs\n            c_tf_idf: A c-TF-IDF matrix from which to calculate the top words\n            calculate_aspects: Whether to calculate additional topic aspects\n\n        Returns:\n            topics: The top words per topic\n        \"\"\"\n        if c_tf_idf is None:\n            c_tf_idf = self.c_tf_idf_\n\n        labels = sorted(list(documents.Topic.unique()))\n        labels = [int(label) for label in labels]\n\n        # Get at least the top 30 indices and values per row in a sparse c-TF-IDF matrix\n        top_n_words = max(self.top_n_words, 30)\n        indices = self._top_n_idx_sparse(c_tf_idf, top_n_words)\n        scores = self._top_n_values_sparse(c_tf_idf, indices)\n        sorted_indices = np.argsort(scores, 1)\n        indices = np.take_along_axis(indices, sorted_indices, axis=1)\n        scores = np.take_along_axis(scores, sorted_indices, axis=1)\n\n        # Get top 30 words per topic based on c-TF-IDF score\n        base_topics = {\n            label: [\n                (words[word_index], score) if word_index is not None and score > 0 else (\"\", 0.00001)\n                for word_index, score in zip(indices[index][::-1], scores[index][::-1])\n            ]\n            for index, label in enumerate(labels)\n        }\n\n        # Fine-tune the topic representations\n        topics = base_topics.copy()\n        if not self.representation_model:\n            # Default representation: c_tf_idf + top_n_words\n            topics = {label: values[: self.top_n_words] for label, values in topics.items()}\n        elif isinstance(self.representation_model, list):\n            for tuner in self.representation_model:\n                topics = tuner.extract_topics(self, documents, c_tf_idf, topics)\n        elif isinstance(self.representation_model, BaseRepresentation):\n            topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)\n        elif isinstance(self.representation_model, dict):\n            if self.representation_model.get(\"Main\"):\n                main_model = self.representation_model[\"Main\"]\n                if isinstance(main_model, BaseRepresentation):\n                    topics = main_model.extract_topics(self, documents, c_tf_idf, topics)\n                elif isinstance(main_model, list):\n                    for tuner in main_model:\n                        topics = tuner.extract_topics(self, documents, c_tf_idf, topics)\n                else:\n                    raise TypeError(f\"unsupported type {type(main_model).__name__} for representation_model['Main']\")\n            else:\n                # Default representation: c_tf_idf + top_n_words\n                topics = {label: values[: self.top_n_words] for label, values in topics.items()}\n        else:\n            raise TypeError(f\"unsupported type 
{type(self.representation_model).__name__} for representation_model\")\n\n        # Extract additional topic aspects\n        if calculate_aspects and isinstance(self.representation_model, dict):\n            for aspect, aspect_model in self.representation_model.items():\n                if aspect != \"Main\":\n                    aspects = base_topics.copy()\n                    if not aspect_model:\n                        # Default representation: c_tf_idf + top_n_words\n                        aspects = {label: values[: self.top_n_words] for label, values in aspects.items()}\n                    if isinstance(aspect_model, list):\n                        for tuner in aspect_model:\n                            aspects = tuner.extract_topics(self, documents, c_tf_idf, aspects)\n                    elif isinstance(aspect_model, BaseRepresentation):\n                        aspects = aspect_model.extract_topics(self, documents, c_tf_idf, aspects)\n                    else:\n                        raise TypeError(\n                            f\"unsupported type {type(aspect_model).__name__} for representation_model[{repr(aspect)}]\"\n                        )\n                    self.topic_aspects_[aspect] = aspects\n\n        return topics\n\n    def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:\n        \"\"\"Reduce topics to self.nr_topics.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n            use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic\n                        embeddings are used.\n\n        Returns:\n            documents: Updated dataframe with documents and the reduced number of Topics\n        \"\"\"\n        logger.info(\"Topic reduction - Reducing number of topics\")\n        initial_nr_topics = len(self.get_topics())\n\n        if isinstance(self.nr_topics, int):\n            if self.nr_topics < initial_nr_topics:\n                documents = self._reduce_to_n_topics(documents, use_ctfidf)\n        elif isinstance(self.nr_topics, str):\n            documents = self._auto_reduce_topics(documents, use_ctfidf)\n        else:\n            raise ValueError(\"nr_topics needs to be an int or 'auto'! \")\n\n        logger.info(\n            f\"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}\"\n        )\n        return documents\n\n    def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:\n        \"\"\"Reduce topics to self.nr_topics.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n            use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. 
If False, semantic\n                        embedding are used.\n\n        Returns:\n            documents: Updated dataframe with documents and the reduced number of Topics\n        \"\"\"\n        topics = documents.Topic.tolist().copy()\n\n        # Create topic distance matrix\n        topic_embeddings = select_topic_representation(\n            self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True\n        )[0][self._outliers :]\n        distance_matrix = 1 - cosine_similarity(topic_embeddings)\n        np.fill_diagonal(distance_matrix, 0)\n\n        # Cluster the topic embeddings using AgglomerativeClustering\n        if version.parse(sklearn_version) >= version.parse(\"1.4.0\"):\n            cluster = AgglomerativeClustering(self.nr_topics - self._outliers, metric=\"precomputed\", linkage=\"average\")\n        else:\n            cluster = AgglomerativeClustering(\n                self.nr_topics - self._outliers,\n                affinity=\"precomputed\",\n                linkage=\"average\",\n            )\n        cluster.fit(distance_matrix)\n        new_topics = [cluster.labels_[topic] if topic != -1 else -1 for topic in topics]\n\n        # Track mappings and sizes of topics for merging topic embeddings\n        mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, new_topics)}\n        basic_mappings = defaultdict(list)\n        for key, val in sorted(mapped_topics.items()):\n            basic_mappings[val].append(key)\n        mappings = {\n            topic_to: {\n                \"topics_from\": topics_from,\n                \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_from],\n            }\n            for topic_to, topics_from in basic_mappings.items()\n        }\n\n        # Map topics\n        documents.Topic = new_topics\n        self._update_topic_size(documents)\n        self.topic_mapper_.add_mappings(mapped_topics)\n\n        # Update representations\n        documents = self._sort_mappings_by_frequency(documents)\n        self._extract_topics(documents, mappings=mappings)\n\n        # When zero-shot topic(s) are present in the topics to merge,\n        # determine whether to take one of the zero-shot topic labels\n        # or use a calculated representation.\n        if self._is_zeroshot():\n            new_topic_id_to_zeroshot_topic_idx = {}\n            topics_to_map = {\n                topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]\n            }\n\n            for topic_to, topics_from in basic_mappings.items():\n                # When extracting topics, the reduced topics were reordered.\n                # Must get the updated topic_to.\n                topic_to = topics_to_map[topic_to]\n\n                # which of the original topics are zero-shot\n                zeroshot_topic_ids = [\n                    topic_id for topic_id in topics_from if topic_id in self._topic_id_to_zeroshot_topic_idx\n                ]\n                if len(zeroshot_topic_ids) == 0:\n                    continue\n\n                # If any of the original topics are zero-shot, take the best fitting zero-shot label\n                # if the cosine similarity with the new topic exceeds the zero-shot threshold\n                zeroshot_labels = [\n                    self.zeroshot_topic_list[self._topic_id_to_zeroshot_topic_idx[topic_id]]\n                    for topic_id in zeroshot_topic_ids\n                ]\n                zeroshot_embeddings = 
self._extract_embeddings(zeroshot_labels)\n                cosine_similarities = cosine_similarity(\n                    zeroshot_embeddings, [self.topic_embeddings_[topic_to]]\n                ).flatten()\n                best_zeroshot_topic_idx = np.argmax(cosine_similarities)\n                best_cosine_similarity = cosine_similarities[best_zeroshot_topic_idx]\n                if best_cosine_similarity >= self.zeroshot_min_similarity:\n                    new_topic_id_to_zeroshot_topic_idx[topic_to] = zeroshot_topic_ids[best_zeroshot_topic_idx]\n\n            self._topic_id_to_zeroshot_topic_idx = new_topic_id_to_zeroshot_topic_idx\n\n        self._update_topic_size(documents)\n        return documents\n\n    def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:\n        \"\"\"Reduce the number of topics automatically using HDBSCAN.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n            use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                        embeddings from the embedding model are used.\n\n        Returns:\n            documents: Updated dataframe with documents and the reduced number of Topics\n        \"\"\"\n        topics = documents.Topic.tolist().copy()\n        unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :]\n        max_topic = unique_topics[-1]\n\n        # Find similar topics\n        embeddings = select_topic_representation(\n            self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True\n        )[0]\n        norm_data = normalize(embeddings, norm=\"l2\")\n        predictions = hdbscan.HDBSCAN(\n            min_cluster_size=2,\n            metric=\"euclidean\",\n            cluster_selection_method=\"eom\",\n            prediction_data=True,\n        ).fit_predict(norm_data[self._outliers :])\n\n        # Map similar topics\n        mapped_topics = {\n            unique_topics[index]: prediction + max_topic\n            for index, prediction in enumerate(predictions)\n            if prediction != -1\n        }\n        documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)\n        mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}\n\n        # Track mappings and sizes of topics for merging topic embeddings\n        mappings = defaultdict(list)\n        for key, val in sorted(mapped_topics.items()):\n            mappings[val].append(key)\n        mappings = {\n            topic_from: {\n                \"topics_to\": topics_to,\n                \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_to],\n            }\n            for topic_from, topics_to in mappings.items()\n        }\n\n        # Update documents and topics\n        self.topic_mapper_.add_mappings(mapped_topics)\n        documents = self._sort_mappings_by_frequency(documents)\n        self._extract_topics(documents, mappings=mappings)\n        self._update_topic_size(documents)\n        return documents\n\n    def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame:\n        \"\"\"Reorder mappings by their frequency.\n\n        For example, if topic 88 was mapped to topic\n        5 and topic 5 turns out to be the largest topic,\n        then topic 5 will be topic 0. 
The second largest\n        will be topic 1, etc.\n\n        If there are no mappings since no reduction of topics\n        took place, then the topics will simply be ordered\n        by their frequency and will get the topic ids based\n        on that order.\n\n        This means that -1 will remain the outlier class, and\n        that the rest of the topics will be in descending order\n        of ids and frequency.\n\n        Arguments:\n            documents: Dataframe with documents and their corresponding IDs and Topics\n\n        Returns:\n            documents: Updated dataframe with documents and the mapped\n                       and re-ordered topic ids\n        \"\"\"\n        self._update_topic_size(documents)\n\n        # Map topics based on frequency\n        df = pd.DataFrame(self.topic_sizes_.items(), columns=[\"Old_Topic\", \"Size\"]).sort_values(\"Size\", ascending=False)\n        df = df[df.Old_Topic != -1]\n        sorted_topics = {**{-1: -1}, **dict(zip(df.Old_Topic, range(len(df))))}\n        self.topic_mapper_.add_mappings(sorted_topics)\n\n        # Map documents\n        documents.Topic = documents.Topic.map(sorted_topics).fillna(documents.Topic).astype(int)\n        self._update_topic_size(documents)\n        return documents\n\n    def _map_probabilities(\n        self, probabilities: Union[np.ndarray, None], original_topics: bool = False\n    ) -> Union[np.ndarray, None]:\n        \"\"\"Map the probabilities to the reduced topics.\n        This is achieved by adding together the probabilities\n        of all topics that are mapped to the same topic. Then,\n        the topics that were mapped from are set to 0 as they\n        were reduced.\n\n        Arguments:\n            probabilities: An array containing probabilities\n            original_topics: Whether we want to map from the\n                             original topics to the most recent topics\n                             or from the second-most recent topics.\n\n        Returns:\n            mapped_probabilities: Updated probabilities\n        \"\"\"\n        mappings = self.topic_mapper_.get_mappings(original_topics)\n\n        # Map array of probabilities (probability for assigned topic per document)\n        if probabilities is not None:\n            if len(probabilities.shape) == 2:\n                mapped_probabilities = np.zeros(\n                    (\n                        probabilities.shape[0],\n                        len(set(mappings.values())) - self._outliers,\n                    )\n                )\n                for from_topic, to_topic in mappings.items():\n                    if to_topic != -1 and from_topic != -1:\n                        mapped_probabilities[:, to_topic] += probabilities[:, from_topic]\n\n                return mapped_probabilities\n\n        return probabilities\n\n    def _preprocess_text(self, documents: np.ndarray) -> List[str]:\n        r\"\"\"Basic preprocessing of text.\n\n        Steps:\n            * Replace \\n and \\t with whitespace\n            * Only keep alpha-numerical characters\n        \"\"\"\n        cleaned_documents = [doc.replace(\"\\n\", \" \") for doc in documents]\n        cleaned_documents = [doc.replace(\"\\t\", \" \") for doc in cleaned_documents]\n        if self.language == \"english\":\n            cleaned_documents = [re.sub(r\"[^A-Za-z0-9 ]+\", \"\", doc) for doc in cleaned_documents]\n        cleaned_documents = [doc if doc != \"\" else \"emptydoc\" for doc in cleaned_documents]\n        return cleaned_documents\n\n    
@staticmethod\n    def _top_n_idx_sparse(matrix: csr_matrix, n: int) -> np.ndarray:\n        \"\"\"Return indices of top n values in each row of a sparse matrix.\n\n        Retrieved from:\n            https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix\n\n        Arguments:\n            matrix: The sparse matrix from which to get the top n indices per row\n            n: The number of highest values to extract from each row\n\n        Returns:\n            indices: The top n indices per row\n        \"\"\"\n        indices = []\n        for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):\n            n_row_pick = min(n, ri - le)\n            values = matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]]\n            values = [values[index] if len(values) >= index + 1 else None for index in range(n)]\n            indices.append(values)\n        return np.array(indices)\n\n    @staticmethod\n    def _top_n_values_sparse(matrix: csr_matrix, indices: np.ndarray) -> np.ndarray:\n        \"\"\"Return the top n values for each row in a sparse matrix.\n\n        Arguments:\n            matrix: The sparse matrix from which to get the top n indices per row\n            indices: The top n indices per row\n\n        Returns:\n            top_values: The top n scores per row\n        \"\"\"\n        top_values = []\n        for row, values in enumerate(indices):\n            scores = np.array([matrix[row, value] if value is not None else 0 for value in values])\n            top_values.append(scores)\n        return np.array(top_values)\n\n    @classmethod\n    def _get_param_names(cls):\n        \"\"\"Get parameter names for the estimator.\n\n        Adapted from:\n            https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178\n        \"\"\"\n        init_signature = inspect.signature(cls.__init__)\n        parameters = sorted(\n            [p.name for p in init_signature.parameters.values() if p.name != \"self\" and p.kind != p.VAR_KEYWORD]\n        )\n        return parameters\n\n    def __str__(self):\n        \"\"\"Get a string representation of the current object.\n\n        Returns:\n            str: Human readable representation of the most important model parameters.\n                 The parameters that represent models are ignored due to their length.\n        \"\"\"\n        parameters = \"\"\n        for parameter, value in self.get_params().items():\n            value = str(value)\n            if \"(\" in value and value[0] != \"(\":\n                value = value.split(\"(\")[0] + \"(...)\"\n            parameters += f\"{parameter}={value}, \"\n\n        return f\"BERTopic({parameters[:-2]})\"\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.topic_labels_","title":"topic_labels_ property readonly","text":"

        Map topic IDs to their labels. A label is the topic ID, along with the first four words of the topic representation, joined using '_'. Zero-shot topic labels come from self.zeroshot_topic_list rather than the calculated representation.

        Returns:

        topic_labels: A dict mapping a topic ID (int) to its label (str).
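
        As a quick illustration, the attribute can be inspected directly on a fitted model (the labels shown in the comment are illustrative, not actual output):

        topic_model.topic_labels_
        # e.g. {-1: '-1_the_of_and_to', 0: '0_space_nasa_orbit_shuttle', ...}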

        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.__init__","title":"__init__(self, language='english', top_n_words=10, n_gram_range=(1, 1), min_topic_size=10, nr_topics=None, low_memory=False, calculate_probabilities=False, seed_topic_list=None, zeroshot_topic_list=None, zeroshot_min_similarity=0.7, embedding_model=None, umap_model=None, hdbscan_model=None, vectorizer_model=None, ctfidf_model=None, representation_model=None, verbose=False) special","text":"

        BERTopic initialization.

        Parameters:

        language (str, default 'english'): The main language used in your documents. The default sentence-transformers model for "english" is all-MiniLM-L6-v2. For a full overview of supported languages see bertopic.backend.languages. Select "multilingual" to load in the paraphrase-multilingual-MiniLM-L12-v2 sentence-transformers model that supports 50+ languages. NOTE: This is not used if embedding_model is used.

        top_n_words (int, default 10): The number of words per topic to extract. Setting this too high can negatively impact topic embeddings as topics are typically best represented by at most 10 words.

        n_gram_range (Tuple[int, int], default (1, 1)): The n-gram range for the CountVectorizer. Advised to keep high values between 1 and 3. More would likely lead to memory issues. NOTE: This param will not be used if you pass in your own CountVectorizer.

        min_topic_size (int, default 10): The minimum size of the topic. Increasing this value will lead to a lower number of clusters/topics and vice versa. It is the same parameter as min_cluster_size in HDBSCAN. NOTE: This param will not be used if you are using hdbscan_model.

        nr_topics (Union[int, str], default None): Specifying the number of topics will reduce the initial number of topics to the value specified. This reduction can take a while as each reduction in topics (-1) activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use "auto" to automatically reduce topics using HDBSCAN. NOTE: Controlling the number of topics is best done by adjusting min_topic_size first before adjusting this parameter.

        low_memory (bool, default False): Sets UMAP low memory to True to make sure less memory is used. NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP this parameter will not be used.

        calculate_probabilities (bool, default False): Calculate the probabilities of all topics per document instead of the probability of the assigned topic per document. This could slow down the extraction of topics if you have many documents (> 100_000). NOTE: If False, you cannot use the corresponding visualization method visualize_probabilities. NOTE: This is an approximation of topic probabilities as used in HDBSCAN and not an exact representation.

        seed_topic_list (List[List[str]], default None): A list of seed words per topic to converge around.

        zeroshot_topic_list (List[str], default None): A list of topic names to use for zero-shot classification.

        zeroshot_min_similarity (float, default 0.7): The minimum similarity between a zero-shot topic and a document for assignment. The higher this value, the more confident the model needs to be to assign a zero-shot topic to a document.

        verbose (bool, default False): Changes the verbosity of the model. Set to True if you want to track the stages of the model.

        embedding_model (default None): Use a custom embedding model. The following backends are currently supported: SentenceTransformers, Flair, Spacy, Gensim, and USE (TF-Hub). You can also pass in a string that points to one of the sentence-transformers models listed at https://www.sbert.net/docs/pretrained_models.html.

        umap_model (UMAP, default None): Pass in a UMAP model to be used instead of the default. NOTE: You can also pass in any dimensionality reduction algorithm as long as it has .fit and .transform functions.

        hdbscan_model (HDBSCAN, default None): Pass in a hdbscan.HDBSCAN model to be used instead of the default. NOTE: You can also pass in any clustering algorithm as long as it has .fit and .predict functions along with the .labels_ variable.

        vectorizer_model (CountVectorizer, default None): Pass in a custom CountVectorizer instead of the default model.

        ctfidf_model (TfidfTransformer, default None): Pass in a custom ClassTfidfTransformer instead of the default model.

        representation_model (BaseRepresentation, default None): Pass in a model that fine-tunes the topic representations calculated through c-TF-IDF. Models from bertopic.representation are supported.

        Source code in bertopic\_bertopic.py
        def __init__(\n    self,\n    language: str = \"english\",\n    top_n_words: int = 10,\n    n_gram_range: Tuple[int, int] = (1, 1),\n    min_topic_size: int = 10,\n    nr_topics: Union[int, str] = None,\n    low_memory: bool = False,\n    calculate_probabilities: bool = False,\n    seed_topic_list: List[List[str]] = None,\n    zeroshot_topic_list: List[str] = None,\n    zeroshot_min_similarity: float = 0.7,\n    embedding_model=None,\n    umap_model: UMAP = None,\n    hdbscan_model: hdbscan.HDBSCAN = None,\n    vectorizer_model: CountVectorizer = None,\n    ctfidf_model: TfidfTransformer = None,\n    representation_model: BaseRepresentation = None,\n    verbose: bool = False,\n):\n    \"\"\"BERTopic initialization.\n\n    Arguments:\n        language: The main language used in your documents. The default sentence-transformers\n                  model for \"english\" is `all-MiniLM-L6-v2`. For a full overview of\n                  supported languages see bertopic.backend.languages. Select\n                  \"multilingual\" to load in the `paraphrase-multilingual-MiniLM-L12-v2`\n                  sentence-transformers model that supports 50+ languages.\n                  NOTE: This is not used if `embedding_model` is used.\n        top_n_words: The number of words per topic to extract. Setting this\n                     too high can negatively impact topic embeddings as topics\n                     are typically best represented by at most 10 words.\n        n_gram_range: The n-gram range for the CountVectorizer.\n                      Advised to keep high values between 1 and 3.\n                      More would likely lead to memory issues.\n                      NOTE: This param will not be used if you pass in your own\n                      CountVectorizer.\n        min_topic_size: The minimum size of the topic. Increasing this value will lead\n                        to a lower number of clusters/topics and vice versa.\n                        It is the same parameter as `min_cluster_size` in HDBSCAN.\n                        NOTE: This param will not be used if you are using `hdbscan_model`.\n        nr_topics: Specifying the number of topics will reduce the initial\n                   number of topics to the value specified. This reduction can take\n                   a while as each reduction in topics (-1) activates a c-TF-IDF\n                   calculation. If this is set to None, no reduction is applied. Use\n                   \"auto\" to automatically reduce topics using HDBSCAN.\n                   NOTE: Controlling the number of topics is best done by adjusting\n                   `min_topic_size` first before adjusting this parameter.\n        low_memory: Sets UMAP low memory to True to make sure less memory is used.\n                    NOTE: This is only used in UMAP. For example, if you use PCA instead of UMAP\n                    this parameter will not be used.\n        calculate_probabilities: Calculate the probabilities of all topics\n                                 per document instead of the probability of the assigned\n                                 topic per document. 
This could slow down the extraction\n                                 of topics if you have many documents (> 100_000).\n                                 NOTE: If false you cannot use the corresponding\n                                 visualization method `visualize_probabilities`.\n                                 NOTE: This is an approximation of topic probabilities\n                                 as used in HDBSCAN and not an exact representation.\n        seed_topic_list: A list of seed words per topic to converge around\n        zeroshot_topic_list: A list of topic names to use for zero-shot classification\n        zeroshot_min_similarity: The minimum similarity between a zero-shot topic and\n                                 a document for assignment. The higher this value, the more\n                                 confident the model needs to be to assign a zero-shot topic to a document.\n        verbose: Changes the verbosity of the model, Set to True if you want\n                 to track the stages of the model.\n        embedding_model: Use a custom embedding model.\n                         The following backends are currently supported\n                           * SentenceTransformers\n                           * Flair\n                           * Spacy\n                           * Gensim\n                           * USE (TF-Hub)\n                         You can also pass in a string that points to one of the following\n                         sentence-transformers models:\n                           * https://www.sbert.net/docs/pretrained_models.html\n        umap_model: Pass in a UMAP model to be used instead of the default.\n                    NOTE: You can also pass in any dimensionality reduction algorithm as long\n                    as it has `.fit` and `.transform` functions.\n        hdbscan_model: Pass in a hdbscan.HDBSCAN model to be used instead of the default\n                       NOTE: You can also pass in any clustering algorithm as long as it has\n                       `.fit` and `.predict` functions along with the `.labels_` variable.\n        vectorizer_model: Pass in a custom `CountVectorizer` instead of the default model.\n        ctfidf_model: Pass in a custom ClassTfidfTransformer instead of the default model.\n        representation_model: Pass in a model that fine-tunes the topic representations\n                              calculated through c-TF-IDF. 
Models from `bertopic.representation`\n                              are supported.\n    \"\"\"\n    # Topic-based parameters\n    if top_n_words > 100:\n        logger.warning(\n            \"Note that extracting more than 100 words from a sparse can slow down computation quite a bit.\"\n        )\n\n    self.top_n_words = top_n_words\n    self.min_topic_size = min_topic_size\n    self.nr_topics = nr_topics\n    self.low_memory = low_memory\n    self.calculate_probabilities = calculate_probabilities\n    self.verbose = verbose\n    self.seed_topic_list = seed_topic_list\n    self.zeroshot_topic_list = zeroshot_topic_list\n    self.zeroshot_min_similarity = zeroshot_min_similarity\n\n    # Embedding model\n    self.language = language if not embedding_model else None\n    self.embedding_model = embedding_model\n\n    # Vectorizer\n    self.n_gram_range = n_gram_range\n    self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=self.n_gram_range)\n    self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n\n    # Representation model\n    self.representation_model = representation_model\n\n    # UMAP or another algorithm that has .fit and .transform functions\n    self.umap_model = umap_model or UMAP(\n        n_neighbors=15,\n        n_components=5,\n        min_dist=0.0,\n        metric=\"cosine\",\n        low_memory=self.low_memory,\n    )\n\n    # HDBSCAN or another clustering algorithm that has .fit and .predict functions and\n    # the .labels_ variable to extract the labels\n    self.hdbscan_model = hdbscan_model or hdbscan.HDBSCAN(\n        min_cluster_size=self.min_topic_size,\n        metric=\"euclidean\",\n        cluster_selection_method=\"eom\",\n        prediction_data=True,\n    )\n\n    # Public attributes\n    self.topics_ = None\n    self.probabilities_ = None\n    self.topic_sizes_ = None\n    self.topic_mapper_ = None\n    self.topic_representations_ = None\n    self.topic_embeddings_ = None\n    self._topic_id_to_zeroshot_topic_idx = {}\n    self.custom_labels_ = None\n    self.c_tf_idf_ = None\n    self.representative_images_ = None\n    self.representative_docs_ = {}\n    self.topic_aspects_ = {}\n\n    # Private attributes for internal tracking purposes\n    self._merged_topics = None\n\n    if verbose:\n        logger.set_level(\"DEBUG\")\n    else:\n        logger.set_level(\"WARNING\")\n
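
        As a minimal sketch of how these arguments fit together (the values below are illustrative, not recommendations):

        from bertopic import BERTopic

        # Adjust a few of the arguments described above; all other components keep their defaults
        topic_model = BERTopic(
            language="english",            # ignored if a custom embedding_model is passed
            top_n_words=10,                # words kept per topic representation
            min_topic_size=15,             # same role as HDBSCAN's min_cluster_size
            nr_topics="auto",              # automatically reduce topics after fitting
            calculate_probabilities=True,  # probabilities of all topics per document
            verbose=True,
        )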
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.__str__","title":"__str__(self) special","text":"

        Get a string representation of the current object.

        Returns:

        str: Human readable representation of the most important model parameters. The parameters that represent models are ignored due to their length.
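
        For instance, printing a model yields a compact summary (the exact output depends on the configured parameters):

        print(topic_model)
        # BERTopic(calculate_probabilities=False, ..., verbose=True)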

        Source code in bertopic\\_bertopic.py
        def __str__(self):\n    \"\"\"Get a string representation of the current object.\n\n    Returns:\n        str: Human readable representation of the most important model parameters.\n             The parameters that represent models are ignored due to their length.\n    \"\"\"\n    parameters = \"\"\n    for parameter, value in self.get_params().items():\n        value = str(value)\n        if \"(\" in value and value[0] != \"(\":\n            value = value.split(\"(\")[0] + \"(...)\"\n        parameters += f\"{parameter}={value}, \"\n\n    return f\"BERTopic({parameters[:-2]})\"\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.approximate_distribution","title":"approximate_distribution(self, documents, window=4, stride=1, min_similarity=0.1, batch_size=1000, padding=False, use_embedding_model=False, calculate_tokens=False, separator=' ')","text":"

        A post-hoc approximation of topic distributions across documents.

        In order to perform this approximation, each document is split into tokens according to the provided tokenizer in the CountVectorizer. Then, a sliding window is applied on each document creating subsets of the document. For example, with a window size of 3 and stride of 1, the sentence:

        Solving the right problem is difficult.

        can be split up into solving the right, the right problem, right problem is, and problem is difficult. These are called tokensets. For each of these tokensets, we calculate their c-TF-IDF representation and find out how similar they are to the previously generated topics. Then, the similarities to the topics for each tokenset are summed up in order to create a topic distribution for the entire document.

        We can also dive into this a bit deeper by then splitting these tokensets up into individual tokens and calculating how much a word, in a specific sentence, contributes to the topics found in that document. This can be enabled by setting calculate_tokens=True, which can be used for visualization purposes in topic_model.visualize_approximate_distribution.

        The main output, topic_distributions, can also be used directly in .visualize_distribution(topic_distributions[index]) by simply selecting a single distribution.
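
        For example, the window and stride can be widened to produce smoother distributions (a sketch; it assumes a fitted topic_model and the values are arbitrary):

        topic_distr, _ = topic_model.approximate_distribution(docs, window=8, stride=4)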

        Parameters:

        documents (Union[str, List[str]], required): A single document or a list of documents for which we approximate their topic distributions.

        window (int, default 4): Size of the moving window which indicates the number of tokens being considered.

        stride (int, default 1): How far the window should move at each step.

        min_similarity (float, default 0.1): The minimum similarity of a document's tokenset with respect to the topics.

        batch_size (int, default 1000): The number of documents to process at a time. If None, then all documents are processed at once. NOTE: With a large number of documents, it is not advised to process all documents at once.

        padding (bool, default False): Whether to pad the beginning and ending of a document with empty tokens.

        use_embedding_model (bool, default False): Whether to use the topic model's embedding model to calculate the similarity between tokensets and topics instead of using c-TF-IDF.

        calculate_tokens (bool, default False): Calculate the similarity of tokens with all topics. NOTE: This is computationally more expensive and can require more memory. Using this over batches of documents might be preferred.

        separator (str, default ' '): The separator used to merge tokens into tokensets.

        Returns:

        topic_distributions: An n x m matrix containing the topic distributions for all input documents, with n being the documents and m the topics.

        topic_token_distributions: A list of t x m arrays, with t being the number of tokens for the respective document and m the topics.

        Examples:

        After fitting the model, the topic distributions can be calculated regardless of the clustering model and regardless of whether the documents were previously seen or not:

        topic_distr, _ = topic_model.approximate_distribution(docs)\n

        As a result, the topic distributions are calculated in topic_distr for the entire document based on a token set with a specific window size and stride.

        If you want to calculate the topic distributions on a token-level:

        topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n

        The topic_token_distr then contains, for each token, the best fitting topics. As with topic_distr, it can contain multiple topics for a single token.

        Source code in bertopic\\_bertopic.py
        def approximate_distribution(\n    self,\n    documents: Union[str, List[str]],\n    window: int = 4,\n    stride: int = 1,\n    min_similarity: float = 0.1,\n    batch_size: int = 1000,\n    padding: bool = False,\n    use_embedding_model: bool = False,\n    calculate_tokens: bool = False,\n    separator: str = \" \",\n) -> Tuple[np.ndarray, Union[List[np.ndarray], None]]:\n    \"\"\"A post-hoc approximation of topic distributions across documents.\n\n    In order to perform this approximation, each document is split into tokens\n    according to the provided tokenizer in the `CountVectorizer`. Then, a\n    sliding window is applied on each document creating subsets of the document.\n    For example, with a window size of 3 and stride of 1, the sentence:\n\n    `Solving the right problem is difficult.`\n\n    can be split up into `solving the right`, `the right problem`, `right problem is`,\n    and `problem is difficult`. These are called tokensets. For each of these\n    tokensets, we calculate their c-TF-IDF representation and find out\n    how similar they are to the previously generated topics. Then, the\n    similarities to the topics for each tokenset are summed up in order to\n    create a topic distribution for the entire document.\n\n    We can also dive into this a bit deeper by then splitting these tokensets\n    up into individual tokens and calculate how much a word, in a specific sentence,\n    contributes to the topics found in that document. This can be enabled by\n    setting `calculate_tokens=True` which can be used for visualization purposes\n    in `topic_model.visualize_approximate_distribution`.\n\n    The main output, `topic_distributions`, can also be used directly in\n    `.visualize_distribution(topic_distributions[index])` by simply selecting\n    a single distribution.\n\n    Arguments:\n        documents: A single document or a list of documents for which we\n                   approximate their topic distributions\n        window: Size of the moving window which indicates the number of\n                tokens being considered.\n        stride: How far the window should move at each step.\n        min_similarity: The minimum similarity of a document's tokenset\n                        with respect to the topics.\n        batch_size: The number of documents to process at a time. If None,\n                    then all documents are processed at once.\n                    NOTE: With a large number of documents, it is not\n                    advised to process all documents at once.\n        padding: Whether to pad the beginning and ending of a document with\n                 empty tokens.\n        use_embedding_model: Whether to use the topic model's embedding\n                             model to calculate the similarity between\n                             tokensets and topics instead of using c-TF-IDF.\n        calculate_tokens: Calculate the similarity of tokens with all topics.\n                          NOTE: This is computation-wise more expensive and\n                          can require more memory. 
Using this over batches of\n                          documents might be preferred.\n        separator: The separator used to merge tokens into tokensets.\n\n    Returns:\n        topic_distributions: A `n` x `m` matrix containing the topic distributions\n                             for all input documents with `n` being the documents\n                             and `m` the topics.\n        topic_token_distributions: A list of `t` x `m` arrays with `t` being the\n                                   number of tokens for the respective document\n                                   and `m` the topics.\n\n    Examples:\n    After fitting the model, the topic distributions can be calculated regardless\n    of the clustering model and regardless of whether the documents were previously\n    seen or not:\n\n    ```python\n    topic_distr, _ = topic_model.approximate_distribution(docs)\n    ```\n\n    As a result, the topic distributions are calculated in `topic_distr` for the\n    entire document based on a token set with a specific window size and stride.\n\n    If you want to calculate the topic distributions on a token-level:\n\n    ```python\n    topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n    ```\n\n    The `topic_token_distr` then contains, for each token, the best fitting topics.\n    As with `topic_distr`, it can contain multiple topics for a single token.\n    \"\"\"\n    if isinstance(documents, str):\n        documents = [documents]\n\n    if batch_size is None:\n        batch_size = len(documents)\n        batches = 1\n    else:\n        batches = math.ceil(len(documents) / batch_size)\n\n    topic_distributions = []\n    topic_token_distributions = []\n\n    for i in tqdm(range(batches), disable=not self.verbose):\n        doc_set = documents[i * batch_size : (i + 1) * batch_size]\n\n        # Extract tokens\n        analyzer = self.vectorizer_model.build_tokenizer()\n        tokens = [analyzer(document) for document in doc_set]\n\n        # Extract token sets\n        all_sentences = []\n        all_indices = [0]\n        all_token_sets_ids = []\n\n        for tokenset in tokens:\n            if len(tokenset) < window:\n                token_sets = [tokenset]\n                token_sets_ids = [list(range(len(tokenset)))]\n            else:\n                # Extract tokensets using window and stride parameters\n                stride_indices = list(range(len(tokenset)))[::stride]\n                token_sets = []\n                token_sets_ids = []\n                for stride_index in stride_indices:\n                    selected_tokens = tokenset[stride_index : stride_index + window]\n\n                    if padding or len(selected_tokens) == window:\n                        token_sets.append(selected_tokens)\n                        token_sets_ids.append(\n                            list(\n                                range(\n                                    stride_index,\n                                    stride_index + len(selected_tokens),\n                                )\n                            )\n                        )\n\n                # Add empty tokens at the beginning and end of a document\n                if padding:\n                    padded = []\n                    padded_ids = []\n                    t = math.ceil(window / stride) - 1\n                    for i in range(math.ceil(window / stride) - 1):\n                        padded.append(tokenset[: window - ((t - i) * stride)])\n       
                 padded_ids.append(list(range(0, window - ((t - i) * stride))))\n\n                    token_sets = padded + token_sets\n                    token_sets_ids = padded_ids + token_sets_ids\n\n            # Join the tokens\n            sentences = [separator.join(token) for token in token_sets]\n            all_sentences.extend(sentences)\n            all_token_sets_ids.extend(token_sets_ids)\n            all_indices.append(all_indices[-1] + len(sentences))\n\n        # Calculate similarity between embeddings of token sets and the topics\n        if use_embedding_model:\n            embeddings = self._extract_embeddings(all_sentences, method=\"document\", verbose=True)\n            similarity = cosine_similarity(embeddings, self.topic_embeddings_[self._outliers :])\n\n        # Calculate similarity between c-TF-IDF of token sets and the topics\n        else:\n            bow_doc = self.vectorizer_model.transform(all_sentences)\n            c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n            similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :])\n\n        # Only keep similarities that exceed the minimum\n        similarity[similarity < min_similarity] = 0\n\n        # Aggregate results on an individual token level\n        if calculate_tokens:\n            topic_distribution = []\n            topic_token_distribution = []\n            for index, token in enumerate(tokens):\n                start = all_indices[index]\n                end = all_indices[index + 1]\n\n                if start == end:\n                    end = end + 1\n\n                # Assign topics to individual tokens\n                token_id = [i for i in range(len(token))]\n                token_val = {index: [] for index in token_id}\n                for sim, token_set in zip(similarity[start:end], all_token_sets_ids[start:end]):\n                    for token in token_set:\n                        if token in token_val:\n                            token_val[token].append(sim)\n\n                matrix = []\n                for _, value in token_val.items():\n                    matrix.append(np.add.reduce(value))\n\n                # Take empty documents into account\n                matrix = np.array(matrix)\n                if len(matrix.shape) == 1:\n                    matrix = np.zeros((1, len(self.topic_labels_) - self._outliers))\n\n                topic_token_distribution.append(np.array(matrix))\n                topic_distribution.append(np.add.reduce(matrix))\n\n            topic_distribution = normalize(topic_distribution, norm=\"l1\", axis=1)\n\n        # Aggregate on a tokenset level indicated by the window and stride\n        else:\n            topic_distribution = []\n            for index in range(len(all_indices) - 1):\n                start = all_indices[index]\n                end = all_indices[index + 1]\n\n                if start == end:\n                    end = end + 1\n                group = similarity[start:end].sum(axis=0)\n                topic_distribution.append(group)\n            topic_distribution = normalize(np.array(topic_distribution), norm=\"l1\", axis=1)\n            topic_token_distribution = None\n\n        # Combine results\n        topic_distributions.append(topic_distribution)\n        if topic_token_distribution is None:\n            topic_token_distributions = None\n        else:\n            topic_token_distributions.extend(topic_token_distribution)\n\n    topic_distributions = np.vstack(topic_distributions)\n\n    
return topic_distributions, topic_token_distributions\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.find_topics","title":"find_topics(self, search_term=None, image=None, top_n=5)","text":"

        Find topics most similar to a search_term.

        Creates an embedding for a search query and compares that with the topic embeddings. The most similar topics are returned along with their similarity values.

        The query is specified using search_term for text queries or image for image queries.

        The search_term can be of any size but since it is compared with the topic representation it is advised to keep it below 5 words.

        Parameters:

        search_term (str, default None): The term you want to use to search for topics.

        image (str, default None): Path to the image you want to use to search for topics.

        top_n (int, default 5): The number of topics to return.

        Returns:

        similar_topics: The most similar topics, from high to low.

        similarity: The similarity scores, from high to low.

        Examples:

        You can use the underlying embedding model to find topics that best represent the search term:

        topics, similarity = topic_model.find_topics(\"sports\", top_n=5)\n

        Note that the search query is typically more accurate if the search_term consists of a phrase or multiple words.
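
        Image queries are also supported; assuming an embedding model that can embed images, a similar call can be made with a path to an image (the path below is a placeholder):

        topics, similarity = topic_model.find_topics(image="path/to/image.jpg", top_n=5)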

        Source code in bertopic\\_bertopic.py
        def find_topics(self, search_term: str = None, image: str = None, top_n: int = 5) -> Tuple[List[int], List[float]]:\n    \"\"\"Find topics most similar to a search_term.\n\n    Creates an embedding for a search query and compares that with\n    the topic embeddings. The most similar topics are returned\n    along with their similarity values.\n\n    The query is specified using search_term for text queries or image for image queries.\n\n    The search_term can be of any size but since it is compared\n    with the topic representation it is advised to keep it\n    below 5 words.\n\n    Arguments:\n        search_term: the term you want to use to search for topics.\n        image: path to the image you want to use to search for topics.\n        top_n: the number of topics to return\n\n    Returns:\n        similar_topics: the most similar topics from high to low\n        similarity: the similarity scores from high to low\n\n    Examples:\n    You can use the underlying embedding model to find topics that\n    best represent the search term:\n\n    ```python\n    topics, similarity = topic_model.find_topics(\"sports\", top_n=5)\n    ```\n\n    Note that the search query is typically more accurate if the\n    search_term consists of a phrase or multiple words.\n    \"\"\"\n    if self.embedding_model is None:\n        raise Exception(\"This method can only be used if you did not use custom embeddings.\")\n\n    topic_list = list(self.topic_representations_.keys())\n    topic_list.sort()\n\n    # Extract search_term embeddings and compare with topic embeddings\n    if search_term is not None:\n        search_embedding = self._extract_embeddings([search_term], method=\"word\", verbose=False).flatten()\n    elif image is not None:\n        search_embedding = self._extract_embeddings(\n            [None], images=[image], method=\"document\", verbose=False\n        ).flatten()\n    sims = cosine_similarity(search_embedding.reshape(1, -1), self.topic_embeddings_).flatten()\n\n    # Extract topics most similar to search_term\n    ids = np.argsort(sims)[-top_n:]\n    similarity = [sims[i] for i in ids][::-1]\n    similar_topics = [topic_list[index] for index in ids][::-1]\n\n    return similar_topics, similarity\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.fit","title":"fit(self, documents, embeddings=None, images=None, y=None)","text":"

        Fit the models (BERT, UMAP, and HDBSCAN) on a collection of documents and generate topics.

        Parameters:

        documents (List[str], required): A list of documents to fit on.

        embeddings (ndarray, default None): Pre-trained document embeddings. These can be used instead of the sentence-transformer model.

        images (List[str], default None): A list of paths to the images to fit on, or the images themselves.

        y (Union[List[int], numpy.ndarray], default None): The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic().fit(docs)\n

        If you want to use your own embeddings, use them as follows:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Create embeddings\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# Create topic model\ntopic_model = BERTopic().fit(docs, embeddings)\n
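
        For (semi)-supervised topic modeling, the documented y argument can be passed in the same way (a sketch using the 20 newsgroups targets as labels):

        from bertopic import BERTopic
        from sklearn.datasets import fetch_20newsgroups

        data = fetch_20newsgroups(subset='all')
        docs, y = data['data'], data['target']

        # Use -1 in y for documents without a known class
        topic_model = BERTopic().fit(docs, y=y)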
        Source code in bertopic\\_bertopic.py
        def fit(\n    self,\n    documents: List[str],\n    embeddings: np.ndarray = None,\n    images: List[str] = None,\n    y: Union[List[int], np.ndarray] = None,\n):\n    \"\"\"Fit the models (Bert, UMAP, and, HDBSCAN) on a collection of documents and generate topics.\n\n    Arguments:\n        documents: A list of documents to fit on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model\n        images: A list of paths to the images to fit on or the images themselves\n        y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n           specific instance is specified.\n\n    Examples:\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic().fit(docs)\n    ```\n\n    If you want to use your own embeddings, use it as follows:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    # Create embeddings\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n    # Create topic model\n    topic_model = BERTopic().fit(docs, embeddings)\n    ```\n    \"\"\"\n    self.fit_transform(documents=documents, embeddings=embeddings, y=y, images=images)\n    return self\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.fit_transform","title":"fit_transform(self, documents, embeddings=None, images=None, y=None)","text":"

        Fit the models on a collection of documents, generate topics, and return the probabilities and topic per document.

        Parameters:

        documents (List[str], required): A list of documents to fit on.

        embeddings (ndarray, default None): Pre-trained document embeddings. These can be used instead of the sentence-transformer model.

        images (List[str], default None): A list of paths to the images to fit on, or the images themselves.

        y (Union[List[int], numpy.ndarray], default None): The target class for (semi)-supervised modeling. Use -1 if no class for a specific instance is specified.

        Returns:

        predictions: Topic predictions for each document.

        probabilities: The probability of the assigned topic per document. If calculate_probabilities in BERTopic is set to True, then it calculates the probabilities of all topics across all documents instead of only the assigned topic. This, however, slows down computation and may increase memory usage.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        If you want to use your own embeddings, use them as follows:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Create embeddings\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# Create topic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n
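
        If calculate_probabilities was enabled at initialization, probs becomes a document-topic matrix rather than a single value per document (a sketch):

        topic_model = BERTopic(calculate_probabilities=True)
        topics, probs = topic_model.fit_transform(docs)

        # probs now has one row per document and one column per topic
        print(probs.shape)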
        Source code in bertopic\\_bertopic.py
        def fit_transform(\n    self,\n    documents: List[str],\n    embeddings: np.ndarray = None,\n    images: List[str] = None,\n    y: Union[List[int], np.ndarray] = None,\n) -> Tuple[List[int], Union[np.ndarray, None]]:\n    \"\"\"Fit the models on a collection of documents, generate topics,\n    and return the probabilities and topic per document.\n\n    Arguments:\n        documents: A list of documents to fit on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model\n        images: A list of paths to the images to fit on or the images themselves\n        y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n           specific instance is specified.\n\n    Returns:\n        predictions: Topic predictions for each documents\n        probabilities: The probability of the assigned topic per document.\n                       If `calculate_probabilities` in BERTopic is set to True, then\n                       it calculates the probabilities of all topics across all documents\n                       instead of only the assigned topic. This, however, slows down\n                       computation and may increase memory usage.\n\n    Examples:\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    ```\n\n    If you want to use your own embeddings, use it as follows:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    # Create embeddings\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n    # Create topic model\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs, embeddings)\n    ```\n    \"\"\"\n    if documents is not None:\n        check_documents_type(documents)\n        check_embeddings_shape(embeddings, documents)\n\n    doc_ids = range(len(documents)) if documents is not None else range(len(images))\n    documents = pd.DataFrame({\"Document\": documents, \"ID\": doc_ids, \"Topic\": None, \"Image\": images})\n\n    # Extract embeddings\n    if embeddings is None:\n        logger.info(\"Embedding - Transforming documents to embeddings.\")\n        self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)\n        embeddings = self._extract_embeddings(\n            documents.Document.values.tolist(),\n            images=images,\n            method=\"document\",\n            verbose=self.verbose,\n        )\n        logger.info(\"Embedding - Completed \\u2713\")\n    else:\n        if self.embedding_model is not None:\n            self.embedding_model = select_backend(\n                self.embedding_model, language=self.language, verbose=self.verbose\n            )\n\n    # Guided Topic Modeling\n    if self.seed_topic_list is not None and self.embedding_model is not None:\n        y, embeddings = self._guided_topic_modeling(embeddings)\n\n    # Reduce dimensionality and fit UMAP model\n    umap_embeddings = self._reduce_dimensionality(embeddings, y)\n\n    # Zero-shot Topic Modeling\n    if self._is_zeroshot():\n        documents, embeddings, assigned_documents, 
assigned_embeddings = self._zeroshot_topic_modeling(\n            documents, embeddings\n        )\n        # Filter UMAP embeddings to only non-assigned embeddings to be used for clustering\n        umap_embeddings = self.umap_model.transform(embeddings)\n\n    if len(documents) > 0:  # No zero-shot topics matched\n        # Cluster reduced embeddings\n        documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)\n        if self._is_zeroshot() and len(assigned_documents) > 0:\n            documents, embeddings = self._combine_zeroshot_topics(\n                documents, embeddings, assigned_documents, assigned_embeddings\n            )\n    else:\n        # All documents matches zero-shot topics\n        documents = assigned_documents\n        embeddings = assigned_embeddings\n    topics_before_reduction = self.topics_\n\n    # Sort and Map Topic IDs by their frequency\n    if not self.nr_topics:\n        documents = self._sort_mappings_by_frequency(documents)\n\n    # Create documents from images if we have images only\n    if documents.Document.values[0] is None:\n        custom_documents = self._images_to_text(documents, embeddings)\n\n        # Extract topics by calculating c-TF-IDF\n        self._extract_topics(custom_documents, embeddings=embeddings)\n        self._create_topic_vectors(documents=documents, embeddings=embeddings)\n\n        # Reduce topics\n        if self.nr_topics:\n            custom_documents = self._reduce_topics(custom_documents)\n\n        # Save the top 3 most representative documents per topic\n        self._save_representative_docs(custom_documents)\n    else:\n        # Extract topics by calculating c-TF-IDF\n        self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)\n\n        # Reduce topics\n        if self.nr_topics:\n            documents = self._reduce_topics(documents)\n\n        # Save the top 3 most representative documents per topic\n        self._save_representative_docs(documents)\n\n    # In the case of zero-shot topics, probability will come from cosine similarity,\n    # and the HDBSCAN model will be removed\n    if self._is_zeroshot() and len(assigned_documents) > 0:\n        self.hdbscan_model = BaseCluster()\n        sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))\n\n        if self.calculate_probabilities:\n            probabilities = sim_matrix\n        else:\n            # Use `topics_before_reduction` because `self.topics_` may have already been updated from\n            # reducing topics, and the original probabilities are needed for `self._map_probabilities()`\n            probabilities = sim_matrix[\n                np.arange(len(documents)),\n                np.array(topics_before_reduction) + self._outliers,\n            ]\n\n    # Resulting output\n    self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)\n    predictions = documents.Topic.to_list()\n\n    return predictions, self.probabilities_\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.generate_topic_labels","title":"generate_topic_labels(self, nr_words=3, topic_prefix=True, word_length=None, separator='_', aspect=None)","text":"

        Get labels for each topic in a user-defined format.

        Parameters:

        nr_words (int, default 3): Top n words per topic to use.

        topic_prefix (bool, default True): Whether to use the topic ID as a prefix. If set to True, the topic ID will be separated using the separator.

        word_length (int, default None): The maximum length of each word in the topic label. Some words might be relatively long and setting this value helps to make sure that all labels have relatively similar lengths.

        separator (str, default '_'): The string with which the words and topic prefix will be separated. Underscores are the default but a nice alternative is ", ".

        aspect (str, default None): The aspect from which to generate topic labels.

        Returns:

        topic_labels: A list of topic labels sorted from the lowest topic ID to the highest. If the topic model was trained using HDBSCAN, the lowest topic ID is -1, otherwise it is 0.

        Examples:

        To create our custom topic labels, usage is rather straightforward:

        topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=\", \")\n
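
        These labels can then be registered on the model, assuming the set_topic_labels method available in recent BERTopic versions:

        topic_model.set_topic_labels(topic_labels)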
        Source code in bertopic\\_bertopic.py
        def generate_topic_labels(\n    self,\n    nr_words: int = 3,\n    topic_prefix: bool = True,\n    word_length: int = None,\n    separator: str = \"_\",\n    aspect: str = None,\n) -> List[str]:\n    \"\"\"Get labels for each topic in a user-defined format.\n\n    Arguments:\n        nr_words: Top `n` words per topic to use\n        topic_prefix: Whether to use the topic ID as a prefix.\n                      If set to True, the topic ID will be separated\n                      using the `separator`\n        word_length: The maximum length of each word in the topic label.\n                     Some words might be relatively long and setting this\n                     value helps to make sure that all labels have relatively\n                     similar lengths.\n        separator: The string with which the words and topic prefix will be\n                   separated. Underscores are the default but a nice alternative\n                   is `\", \"`.\n        aspect: The aspect from which to generate topic labels\n\n    Returns:\n        topic_labels: A list of topic labels sorted from the lowest topic ID to the highest.\n                      If the topic model was trained using HDBSCAN, the lowest topic ID is -1,\n                      otherwise it is 0.\n\n    Examples:\n    To create our custom topic labels, usage is rather straightforward:\n\n    ```python\n    topic_labels = topic_model.generate_topic_labels(nr_words=2, separator=\", \")\n    ```\n    \"\"\"\n    unique_topics = sorted(set(self.topics_))\n\n    topic_labels = []\n    for topic in unique_topics:\n        if aspect:\n            words, _ = zip(*self.topic_aspects_[aspect][topic])\n        else:\n            words, _ = zip(*self.get_topic(topic))\n\n        if word_length:\n            words = [word[:word_length] for word in words][:nr_words]\n        else:\n            words = list(words)[:nr_words]\n\n        if topic_prefix:\n            topic_label = f\"{topic}{separator}\" + separator.join(words)\n        else:\n            topic_label = separator.join(words)\n\n        topic_labels.append(topic_label)\n\n    return topic_labels\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_document_info","title":"get_document_info(self, docs, df=None, metadata=None)","text":"

        Get information about the documents on which the topic model was trained, including the documents themselves, their respective topics, the name of each topic, the top n words of each topic, whether it is a representative document, and the probability of the clustering if the cluster model supports it.

        There are also options to include other meta data, such as the topic distributions or the x and y coordinates of the reduced embeddings.

        Parameters:

        docs (List[str], required): The documents on which the topic model was trained.

        df (DataFrame, default None): A dataframe containing the metadata and the documents on which the topic model was originally trained.

        metadata (Mapping[str, Any], default None): A dictionary with metadata for each document in the form of column name (key) and the respective values (value).

        Returns:

        document_info: A dataframe with several statistics regarding the documents on which the topic model was trained.

        Usage:

        To get the document info, you will only need to pass the documents on which the topic model was trained:

        document_info = topic_model.get_document_info(docs)\n

        There are additionally options to include meta data, such as the topic distributions. Moreover, we can pass the original dataframe that contains the documents and extend it with the information retrieved from BERTopic:

        from sklearn.datasets import fetch_20newsgroups\n\n# The original data in a dataframe format to include the target variable\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndf = pd.DataFrame({\"Document\": data['data'], \"Class\": data['target']})\n\n# Add information about the percentage of the document that relates to the topic\ntopic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)\ndistributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)]\n\n# Create our documents dataframe using the original dataframe and meta data about\n# the topic distributions\ndocument_info = topic_model.get_document_info(docs, df=df,\n                                              metadata={\"Topic_distribution\": distributions})\n
        Source code in bertopic\\_bertopic.py
        def get_document_info(\n    self,\n    docs: List[str],\n    df: pd.DataFrame = None,\n    metadata: Mapping[str, Any] = None,\n) -> pd.DataFrame:\n    \"\"\"Get information about the documents on which the topic was trained\n    including the documents themselves, their respective topics, the name\n    of each topic, the top n words of each topic, whether it is a\n    representative document, and probability of the clustering if the cluster\n    model supports it.\n\n    There are also options to include other meta data, such as the topic\n    distributions or the x and y coordinates of the reduced embeddings.\n\n    Arguments:\n        docs: The documents on which the topic model was trained.\n        df: A dataframe containing the metadata and the documents on which\n            the topic model was originally trained on.\n        metadata: A dictionary with meta data for each document in the form\n                  of column name (key) and the respective values (value).\n\n    Returns:\n        document_info: A dataframe with several statistics regarding\n                       the documents on which the topic model was trained.\n\n    Usage:\n\n    To get the document info, you will only need to pass the documents on which\n    the topic model was trained:\n\n    ```python\n    document_info = topic_model.get_document_info(docs)\n    ```\n\n    There are additionally options to include meta data, such as the topic\n    distributions. Moreover, we can pass the original dataframe that contains\n    the documents and extend it with the information retrieved from BERTopic:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n\n    # The original data in a dataframe format to include the target variable\n    data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\n    df = pd.DataFrame({\"Document\": data['data'], \"Class\": data['target']})\n\n    # Add information about the percentage of the document that relates to the topic\n    topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)\n    distributions = [distr[topic] if topic != -1 else 0 for topic, distr in zip(topics, topic_distr)]\n\n    # Create our documents dataframe using the original dataframe and meta data about\n    # the topic distributions\n    document_info = topic_model.get_document_info(docs, df=df,\n                                                  metadata={\"Topic_distribution\": distributions})\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    if df is not None:\n        document_info = df.copy()\n        document_info[\"Document\"] = docs\n        document_info[\"Topic\"] = self.topics_\n    else:\n        document_info = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_})\n\n    # Add topic info through `.get_topic_info()`\n    topic_info = self.get_topic_info().drop(\"Count\", axis=1)\n    document_info = pd.merge(document_info, topic_info, on=\"Topic\", how=\"left\")\n\n    # Add top n words\n    top_n_words = {topic: \" - \".join(list(zip(*self.get_topic(topic)))[0]) for topic in set(self.topics_)}\n    document_info[\"Top_n_words\"] = document_info.Topic.map(top_n_words)\n\n    # Add flat probabilities\n    if self.probabilities_ is not None:\n        if len(self.probabilities_.shape) == 1:\n            document_info[\"Probability\"] = self.probabilities_\n        else:\n            document_info[\"Probability\"] = [\n                max(probs) if topic != -1 else 1 - sum(probs)\n                for topic, probs in 
zip(self.topics_, self.probabilities_)\n            ]\n\n    # Add representative document labels\n    repr_docs = [repr_doc for repr_docs in self.representative_docs_.values() for repr_doc in repr_docs]\n    document_info[\"Representative_document\"] = False\n    document_info.loc[document_info.Document.isin(repr_docs), \"Representative_document\"] = True\n\n    # Add custom meta data provided by the user\n    if metadata is not None:\n        for column, values in metadata.items():\n            document_info[column] = values\n    return document_info\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_params","title":"get_params(self, deep=False)","text":"

        Get parameters for this estimator.

        Adapted from: https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178

Parameters:

deep (bool, default False): If True, will return the parameters for this estimator and contained subobjects that are estimators.

Returns:

out: Parameter names mapped to their values.
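For illustration, a minimal sketch of calling this method on a freshly instantiated model; which sub-model parameters show up in the deep output depends on how the model was configured:

```python
from bertopic import BERTopic

topic_model = BERTopic()

# Only BERTopic's own parameters
params = topic_model.get_params()

# Also include parameters of contained sub-models, prefixed as "<name>__<param>"
deep_params = topic_model.get_params(deep=True)
```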

        Source code in bertopic\\_bertopic.py
        def get_params(self, deep: bool = False) -> Mapping[str, Any]:\n    \"\"\"Get parameters for this estimator.\n\n    Adapted from:\n        https://github.com/scikit-learn/scikit-learn/blob/b3ea3ed6a/sklearn/base.py#L178\n\n    Arguments:\n        deep: bool, default=True\n              If True, will return the parameters for this estimator and\n              contained subobjects that are estimators.\n\n    Returns:\n        out: Parameter names mapped to their values.\n    \"\"\"\n    out = dict()\n    for key in self._get_param_names():\n        value = getattr(self, key)\n        if deep and hasattr(value, \"get_params\"):\n            deep_items = value.get_params().items()\n            out.update((key + \"__\" + k, val) for k, val in deep_items)\n        out[key] = value\n    return out\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_representative_docs","title":"get_representative_docs(self, topic=None)","text":"

        Extract the best representing documents per topic.

        Note

This does not extract all documents per topic, because not all documents are stored within BERTopic. To get all documents, please run the following:

        # When you used `.fit_transform`:\ndf = pd.DataFrame({\"Document\": docs, \"Topic\": topic})\n\n# When you used `.fit`:\ndf = pd.DataFrame({\"Document\": docs, \"Topic\": topic_model.topics_})\n

Parameters:

topic (int, default None): A specific topic for which you want the representative documents.

Returns:

List[str]: Representative documents of the chosen topic.

        Examples:

        To extract the representative docs of all topics:

        representative_docs = topic_model.get_representative_docs()\n

        To get the representative docs of a single topic:

        representative_docs = topic_model.get_representative_docs(12)\n
        Source code in bertopic\\_bertopic.py
        def get_representative_docs(self, topic: int = None) -> List[str]:\n    \"\"\"Extract the best representing documents per topic.\n\n    Note:\n        This does not extract all documents per topic as all documents\n        are not saved within BERTopic. To get all documents, please\n        run the following:\n\n        ```python\n        # When you used `.fit_transform`:\n        df = pd.DataFrame({\"Document\": docs, \"Topic\": topic})\n\n        # When you used `.fit`:\n        df = pd.DataFrame({\"Document\": docs, \"Topic\": topic_model.topics_})\n        ```\n\n    Arguments:\n        topic: A specific topic for which you want\n               the representative documents\n\n    Returns:\n        Representative documents of the chosen topic\n\n    Examples:\n    To extract the representative docs of all topics:\n\n    ```python\n    representative_docs = topic_model.get_representative_docs()\n    ```\n\n    To get the representative docs of a single topic:\n\n    ```python\n    representative_docs = topic_model.get_representative_docs(12)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    if isinstance(topic, int):\n        if self.representative_docs_.get(topic):\n            return self.representative_docs_[topic]\n        else:\n            return None\n    else:\n        return self.representative_docs_\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic","title":"get_topic(self, topic, full=False)","text":"

        Return top n words for a specific topic and their c-TF-IDF scores.

Parameters:

topic (int, required): A specific topic for which you want its representation.

full (bool, default False): If True, returns all different forms of topic representations for a topic, including aspects.

Returns:

Union[Mapping[str, Tuple[str, float]], bool]: The top n words for a specific topic and their respective c-TF-IDF scores.

        Examples:

        topic = topic_model.get_topic(12)\n
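If aspect-based representations were configured, full=True returns them alongside the main representation. A minimal sketch, assuming topic 12 exists in the fitted model:

```python
# Dictionary with the "Main" representation plus any configured aspects
representations = topic_model.get_topic(12, full=True)
main_words = representations['Main']
```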
        Source code in bertopic\\_bertopic.py
        def get_topic(self, topic: int, full: bool = False) -> Union[Mapping[str, Tuple[str, float]], bool]:\n    \"\"\"Return top n words for a specific topic and their c-TF-IDF scores.\n\n    Arguments:\n        topic: A specific topic for which you want its representation\n        full: If True, returns all different forms of topic representations\n              for a topic, including aspects\n\n    Returns:\n        The top n words for a specific word and its respective c-TF-IDF scores\n\n    Examples:\n    ```python\n    topic = topic_model.get_topic(12)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    if topic in self.topic_representations_:\n        if full:\n            representations = {\"Main\": self.topic_representations_[topic]}\n            aspects = {aspect: representations[topic] for aspect, representations in self.topic_aspects_.items()}\n            representations.update(aspects)\n            return representations\n        else:\n            return self.topic_representations_[topic]\n    else:\n        return False\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic_freq","title":"get_topic_freq(self, topic=None)","text":"

        Return the size of topics (descending order).

Parameters:

topic (int, default None): A specific topic for which you want the frequency.

Returns:

Union[pandas.core.frame.DataFrame, int]: Either the frequency of a single topic or a dataframe with the frequencies of all topics.

        Examples:

        To extract the frequency of all topics:

        frequency = topic_model.get_topic_freq()\n

        To get the frequency of a single topic:

        frequency = topic_model.get_topic_freq(12)\n
        Source code in bertopic\\_bertopic.py
        def get_topic_freq(self, topic: int = None) -> Union[pd.DataFrame, int]:\n    \"\"\"Return the size of topics (descending order).\n\n    Arguments:\n        topic: A specific topic for which you want the frequency\n\n    Returns:\n        Either the frequency of a single topic or dataframe with\n        the frequencies of all topics\n\n    Examples:\n    To extract the frequency of all topics:\n\n    ```python\n    frequency = topic_model.get_topic_freq()\n    ```\n\n    To get the frequency of a single topic:\n\n    ```python\n    frequency = topic_model.get_topic_freq(12)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    if isinstance(topic, int):\n        return self.topic_sizes_[topic]\n    else:\n        return pd.DataFrame(self.topic_sizes_.items(), columns=[\"Topic\", \"Count\"]).sort_values(\n            \"Count\", ascending=False\n        )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic_info","title":"get_topic_info(self, topic=None)","text":"

        Get information about each topic including its ID, frequency, and name.

Parameters:

topic (int, default None): A specific topic for which you want the information.

Returns:

info: The information relating to either a single topic or all topics.

        Examples:

        info_df = topic_model.get_topic_info()\n
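To restrict the output to a single topic, pass its ID (a sketch, assuming topic 5 exists in the fitted model):

```python
# Information for one topic instead of all topics
topic_5_info = topic_model.get_topic_info(5)
```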
        Source code in bertopic\\_bertopic.py
        def get_topic_info(self, topic: int = None) -> pd.DataFrame:\n    \"\"\"Get information about each topic including its ID, frequency, and name.\n\n    Arguments:\n        topic: A specific topic for which you want the frequency\n\n    Returns:\n        info: The information relating to either a single topic or all topics\n\n    Examples:\n    ```python\n    info_df = topic_model.get_topic_info()\n    ```\n    \"\"\"\n    check_is_fitted(self)\n\n    info = pd.DataFrame(self.topic_sizes_.items(), columns=[\"Topic\", \"Count\"]).sort_values(\"Topic\")\n    info[\"Name\"] = info.Topic.map(self.topic_labels_)\n\n    # Custom label\n    if self.custom_labels_ is not None:\n        if len(self.custom_labels_) == len(info):\n            labels = {topic - self._outliers: label for topic, label in enumerate(self.custom_labels_)}\n            info[\"CustomName\"] = info[\"Topic\"].map(labels)\n\n    # Main Keywords\n    values = {topic: list(list(zip(*values))[0]) for topic, values in self.topic_representations_.items()}\n    info[\"Representation\"] = info[\"Topic\"].map(values)\n\n    # Extract all topic aspects\n    if self.topic_aspects_:\n        for aspect, values in self.topic_aspects_.items():\n            if isinstance(list(values.values())[-1], list):\n                if isinstance(list(values.values())[-1][0], tuple) or isinstance(\n                    list(values.values())[-1][0], list\n                ):\n                    values = {topic: list(list(zip(*value))[0]) for topic, value in values.items()}\n                elif isinstance(list(values.values())[-1][0], str):\n                    values = {topic: \" \".join(value).strip() for topic, value in values.items()}\n            info[aspect] = info[\"Topic\"].map(values)\n\n    # Representative Docs / Images\n    if self.representative_docs_ is not None:\n        info[\"Representative_Docs\"] = info[\"Topic\"].map(self.representative_docs_)\n    if self.representative_images_ is not None:\n        info[\"Representative_Images\"] = info[\"Topic\"].map(self.representative_images_)\n\n    # Select specific topic to return\n    if topic is not None:\n        info = info.loc[info.Topic == topic, :]\n\n    return info.reset_index(drop=True)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topic_tree","title":"get_topic_tree(hier_topics, max_distance=None, tight_layout=False) staticmethod","text":"

        Extract the topic tree such that it can be printed.

Parameters:

hier_topics (DataFrame, required): A dataframe containing the structure of the topic tree. This is the output of topic_model.hierarchical_topics().

max_distance (float, default None): The maximum distance between two topics. This value is based on the Distance column in hier_topics.

tight_layout (bool, default False): Whether to use a tight layout (narrow width) for easier readability if you have hundreds of topics.

Returns:

A tree that has the following structure when printed:

.
.
└─health_medical_disease_patients_hiv
    ├─patients_medical_disease_candida_health
    │    ├─■──candida_yeast_infection_gonorrhea_infections ── Topic: 48
    │    └─patients_disease_cancer_medical_doctor
    │         ├─■──hiv_medical_cancer_patients_doctor ── Topic: 34
    │         └─■──pain_drug_patients_disease_diet ── Topic: 26
    └─■──health_newsgroup_tobacco_vote_votes ── Topic: 9

The blocks (■) indicate that the topic is one you can directly access from topic_model.get_topic. In other words, they are the original un-grouped topics.

        Examples:

        # Train model\nfrom bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n\n# Print topic tree\ntree = topic_model.get_topic_tree(hierarchical_topics)\nprint(tree)\n
        Source code in bertopic\\_bertopic.py
        @staticmethod\ndef get_topic_tree(\n    hier_topics: pd.DataFrame,\n    max_distance: float = None,\n    tight_layout: bool = False,\n) -> str:\n    \"\"\"Extract the topic tree such that it can be printed.\n\n    Arguments:\n        hier_topics: A dataframe containing the structure of the topic tree.\n                     This is the output of `topic_model.hierarchical_topics()`\n        max_distance: The maximum distance between two topics. This value is\n                      based on the Distance column in `hier_topics`.\n        tight_layout: Whether to use a tight layout (narrow width) for\n                      easier readability if you have hundreds of topics.\n\n    Returns:\n        A tree that has the following structure when printed:\n            .\n            .\n            \u2514\u2500health_medical_disease_patients_hiv\n                \u251c\u2500patients_medical_disease_candida_health\n                \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n                \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n                \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n                \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n                \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n\n        The blocks (\u25a0) indicate that the topic is one you can directly access\n        from `topic_model.get_topic`. In other words, they are the original un-grouped topics.\n\n    Examples:\n    ```python\n    # Train model\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Print topic tree\n    tree = topic_model.get_topic_tree(hierarchical_topics)\n    print(tree)\n    ```\n    \"\"\"\n    width = 1 if tight_layout else 4\n    if max_distance is None:\n        max_distance = hier_topics.Distance.max() + 1\n\n    max_original_topic = hier_topics.Parent_ID.astype(int).min() - 1\n\n    # Extract mapping from ID to name\n    topic_to_name = dict(zip(hier_topics.Child_Left_ID, hier_topics.Child_Left_Name))\n    topic_to_name.update(dict(zip(hier_topics.Child_Right_ID, hier_topics.Child_Right_Name)))\n    topic_to_name = {topic: name[:100] for topic, name in topic_to_name.items()}\n\n    # Create tree\n    tree = {\n        str(row[1].Parent_ID): [\n            str(row[1].Child_Left_ID),\n            str(row[1].Child_Right_ID),\n        ]\n        for row in hier_topics.iterrows()\n    }\n\n    def get_tree(start, tree):\n        \"\"\"Based on: https://stackoverflow.com/a/51920869/10532563.\"\"\"\n\n        def _tree(to_print, start, parent, tree, grandpa=None, indent=\"\"):\n            # Get distance between merged topics\n            distance = hier_topics.loc[\n                (hier_topics.Child_Left_ID == parent) | (hier_topics.Child_Right_ID == parent),\n                \"Distance\",\n            ]\n            distance = distance.values[0] if len(distance) > 0 else 10\n\n            if parent != start:\n                if grandpa is None:\n                    to_print += topic_to_name[parent]\n                else:\n                    if int(parent) <= max_original_topic:\n                        # Do not append topic ID if they are not merged\n                        if distance < max_distance:\n 
                           to_print += \"\u25a0\u2500\u2500\" + topic_to_name[parent] + f\" \u2500\u2500 Topic: {parent}\" + \"\\n\"\n                        else:\n                            to_print += \"O \\n\"\n                    else:\n                        to_print += topic_to_name[parent] + \"\\n\"\n\n            if parent not in tree:\n                return to_print\n\n            for child in tree[parent][:-1]:\n                to_print += indent + \"\u251c\" + \"\u2500\"\n                to_print = _tree(to_print, start, child, tree, parent, indent + \"\u2502\" + \" \" * width)\n\n            child = tree[parent][-1]\n            to_print += indent + \"\u2514\" + \"\u2500\"\n            to_print = _tree(to_print, start, child, tree, parent, indent + \" \" * (width + 1))\n\n            return to_print\n\n        to_print = \".\" + \"\\n\"\n        to_print = _tree(to_print, start, start, tree)\n        return to_print\n\n    start = str(hier_topics.Parent_ID.astype(int).max())\n    return get_tree(start, tree)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.get_topics","title":"get_topics(self, full=False)","text":"

        Return topics with top n words and their c-TF-IDF score.

Parameters:

full (bool, default False): If True, returns all different forms of topic representations for each topic, including aspects.

Returns:

self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score.

        Examples:

        all_topics = topic_model.get_topics()\n
        Source code in bertopic\\_bertopic.py
        def get_topics(self, full: bool = False) -> Mapping[str, Tuple[str, float]]:\n    \"\"\"Return topics with top n words and their c-TF-IDF score.\n\n    Arguments:\n        full: If True, returns all different forms of topic representations\n              for each topic, including aspects\n\n    Returns:\n        self.topic_representations_: The top n words per topic and the corresponding c-TF-IDF score\n\n    Examples:\n    ```python\n    all_topics = topic_model.get_topics()\n    ```\n    \"\"\"\n    check_is_fitted(self)\n\n    if full:\n        topic_representations = {\"Main\": self.topic_representations_}\n        topic_representations.update(self.topic_aspects_)\n        return topic_representations\n    else:\n        return self.topic_representations_\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.hierarchical_topics","title":"hierarchical_topics(self, docs, use_ctfidf=True, linkage_function=None, distance_function=None)","text":"

        Create a hierarchy of topics.

        To create this hierarchy, BERTopic needs to be already fitted once. Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings representation using scipy.cluster.hierarchy.linkage.

Based on that hierarchy, we calculate the topic representation at each merged step. This is a local representation, as we only assume that the chosen step is merged and not all others, which typically improves the topic representation.

Parameters:

docs (List[str], required): The documents you used when calling either fit or fit_transform.

use_ctfidf (bool, default True): Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used.

linkage_function (Callable[[scipy.sparse._csr.csr_matrix], numpy.ndarray], default None): The linkage function to use. Default is: lambda x: sch.linkage(x, 'ward', optimal_ordering=True).

distance_function (Callable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix], default None): The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x). You can pass any function that returns either a square matrix of shape (n_samples, n_samples) with zeros on the diagonal and non-negative values, or a condensed distance matrix of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular of the distance matrix.

Returns:

hierarchical_topics: A dataframe that contains a hierarchy of topics represented by their parents and their children.

        Examples:

        from bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n

        A custom linkage function can be used as follows:

        from scipy.cluster import hierarchy as sch\nfrom bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Hierarchical topics\nlinkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\nhierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n
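Similarly, a custom distance function can be supplied. The sketch below simply restates the documented default (1 minus cosine similarity on the c-TF-IDF matrix) as an explicit argument; docs is assumed to be the training documents, as in the examples above:

```python
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

# Distance between topics on the c-TF-IDF matrix; this mirrors the documented default
distance_function = lambda x: 1 - cosine_similarity(x)
hierarchical_topics = topic_model.hierarchical_topics(docs, distance_function=distance_function)
```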
        Source code in bertopic\\_bertopic.py
        def hierarchical_topics(\n    self,\n    docs: List[str],\n    use_ctfidf: bool = True,\n    linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n    distance_function: Callable[[csr_matrix], csr_matrix] = None,\n) -> pd.DataFrame:\n    \"\"\"Create a hierarchy of topics.\n\n    To create this hierarchy, BERTopic needs to be already fitted once.\n    Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings\n    representation using `scipy.cluster.hierarchy.linkage`.\n\n    Based on that hierarchy, we calculate the topic representation at each\n    merged step. This is a local representation, as we only assume that the\n    chosen step is merged and not all others which typically improves the\n    topic representation.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                    embeddings from the embedding model are used.\n        linkage_function: The linkage function to use. Default is:\n                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n        distance_function: The distance function to use on the c-TF-IDF matrix. Default is:\n                           `lambda x: 1 - cosine_similarity(x)`.\n                           You can pass any function that returns either a square matrix of\n                           shape (n_samples, n_samples) with zeros on the diagonal and\n                           non-negative values or condensed distance matrix of shape\n                           (n_samples * (n_samples - 1) / 2,) containing the upper\n                           triangular of the distance matrix.\n\n    Returns:\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children\n\n    Examples:\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n    ```\n\n    A custom linkage function can be used as follows:\n\n    ```python\n    from scipy.cluster import hierarchy as sch\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n\n    # Hierarchical topics\n    linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)\n    hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    if distance_function is None:\n        distance_function = lambda x: 1 - cosine_similarity(x)\n\n    if linkage_function is None:\n        linkage_function = lambda x: sch.linkage(x, \"ward\", optimal_ordering=True)\n\n    # Calculate distance\n    embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][\n        self._outliers :\n    ]\n    X = distance_function(embeddings)\n    X = validate_distance_matrix(X, embeddings.shape[0])\n\n    # Use the 1-D condensed distance matrix as an input instead of the raw distance matrix\n    Z = linkage_function(X)\n\n    # Ensuring that the distances between clusters are unique otherwise the flatting of the hierarchy with\n    # `sch.fcluster(...)` would produce incorrect values for \"Topics\" for these clusters\n    if len(Z[:, 2]) != len(np.unique(Z[:, 2])):\n        Z[:, 2] 
= get_unique_distances(Z[:, 2])\n\n    # Calculate basic bag-of-words to be iteratively merged later\n    documents = pd.DataFrame({\"Document\": docs, \"ID\": range(len(docs)), \"Topic\": self.topics_})\n    documents_per_topic = documents.groupby([\"Topic\"], as_index=False).agg({\"Document\": \" \".join})\n    documents_per_topic = documents_per_topic.loc[documents_per_topic.Topic != -1, :]\n    clean_documents = self._preprocess_text(documents_per_topic.Document.values)\n\n    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n    # and will be removed in 1.2. Please use get_feature_names_out instead.\n    if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n        words = self.vectorizer_model.get_feature_names_out()\n    else:\n        words = self.vectorizer_model.get_feature_names()\n\n    bow = self.vectorizer_model.transform(clean_documents)\n\n    # Extract clusters\n    hier_topics = pd.DataFrame(\n        columns=[\n            \"Parent_ID\",\n            \"Parent_Name\",\n            \"Topics\",\n            \"Child_Left_ID\",\n            \"Child_Left_Name\",\n            \"Child_Right_ID\",\n            \"Child_Right_Name\",\n        ]\n    )\n    for index in tqdm(range(len(Z))):\n        # Find clustered documents\n        clusters = sch.fcluster(Z, t=Z[index][2], criterion=\"distance\") - self._outliers\n        nr_clusters = len(clusters)\n\n        # Extract first topic we find to get the set of topics in a merged topic\n        topic = None\n        val = Z[index][0]\n        while topic is None:\n            if val - len(clusters) < 0:\n                topic = int(val)\n            else:\n                val = Z[int(val - len(clusters))][0]\n        clustered_topics = [i for i, x in enumerate(clusters) if x == clusters[topic]]\n\n        # Group bow per cluster, calculate c-TF-IDF and extract words\n        grouped = csr_matrix(bow[clustered_topics].sum(axis=0))\n        c_tf_idf = self.ctfidf_model.transform(grouped)\n        selection = documents.loc[documents.Topic.isin(clustered_topics), :]\n        selection.Topic = 0\n        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n\n        # Extract parent's name and ID\n        parent_id = index + len(clusters)\n        parent_name = \"_\".join([x[0] for x in words_per_topic[0]][:5])\n\n        # Extract child's name and ID\n        Z_id = Z[index][0]\n        child_left_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n        if Z_id - nr_clusters < 0:\n            child_left_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n        else:\n            child_left_name = hier_topics.iloc[int(child_left_id)].Parent_Name\n\n        # Extract child's name and ID\n        Z_id = Z[index][1]\n        child_right_id = Z_id if Z_id - nr_clusters < 0 else Z_id - nr_clusters\n\n        if Z_id - nr_clusters < 0:\n            child_right_name = \"_\".join([x[0] for x in self.get_topic(Z_id)][:5])\n        else:\n            child_right_name = hier_topics.iloc[int(child_right_id)].Parent_Name\n\n        # Save results\n        hier_topics.loc[len(hier_topics), :] = [\n            parent_id,\n            parent_name,\n            clustered_topics,\n            int(Z[index][0]),\n            child_left_name,\n            int(Z[index][1]),\n            child_right_name,\n        ]\n\n    hier_topics[\"Distance\"] = Z[:, 2]\n    hier_topics = hier_topics.sort_values(\"Parent_ID\", ascending=False)\n    
hier_topics[[\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]] = hier_topics[\n        [\"Parent_ID\", \"Child_Left_ID\", \"Child_Right_ID\"]\n    ].astype(str)\n\n    return hier_topics\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.load","title":"load(path, embedding_model=None) classmethod","text":"

        Loads the model from the specified path or directory.

Parameters:

path (str, required): Either load a BERTopic model from a file (.pickle) or a folder containing .safetensors or .bin files.

embedding_model (default None): Additionally load in an embedding model if it was not saved in the BERTopic model file or directory.

        Examples:

        BERTopic.load(\"model_dir\")\n

        or if you did not save the embedding model:

        BERTopic.load(\"model_dir\", embedding_model=\"all-MiniLM-L6-v2\")\n
        Source code in bertopic\\_bertopic.py
        @classmethod\ndef load(cls, path: str, embedding_model=None):\n    \"\"\"Loads the model from the specified path or directory.\n\n    Arguments:\n        path: Either load a BERTopic model from a file (`.pickle`) or a folder containing\n              `.safetensors` or `.bin` files.\n        embedding_model: Additionally load in an embedding model if it was not saved\n                         in the BERTopic model file or directory.\n\n    Examples:\n    ```python\n    BERTopic.load(\"model_dir\")\n    ```\n\n    or if you did not save the embedding model:\n\n    ```python\n    BERTopic.load(\"model_dir\", embedding_model=\"all-MiniLM-L6-v2\")\n    ```\n    \"\"\"\n    file_or_dir = Path(path)\n\n    # Load from Pickle\n    if file_or_dir.is_file():\n        with open(file_or_dir, \"rb\") as file:\n            if embedding_model:\n                topic_model = joblib.load(file)\n                topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose)\n            else:\n                topic_model = joblib.load(file)\n            return topic_model\n\n    # Load from directory or HF\n    if file_or_dir.is_dir():\n        topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_local_files(file_or_dir)\n    elif \"/\" in str(path):\n        topics, params, tensors, ctfidf_tensors, ctfidf_config, images = save_utils.load_files_from_hf(path)\n    else:\n        raise ValueError(\"Make sure to either pass a valid directory or HF model.\")\n    topic_model = _create_model_from_files(\n        topics,\n        params,\n        tensors,\n        ctfidf_tensors,\n        ctfidf_config,\n        images,\n        warn_no_backend=(embedding_model is None),\n    )\n\n    # Replace embedding model if one is specifically chosen\n    if embedding_model is not None:\n        topic_model.embedding_model = select_backend(embedding_model, verbose=topic_model.verbose)\n\n    return topic_model\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.merge_models","title":"merge_models(models, min_similarity=0.7, embedding_model=None) classmethod","text":"

        Merge multiple pre-trained BERTopic models into a single model.

The models are merged as if they were all saved using pytorch or safetensors, resulting in a minimal version without c-TF-IDF.

To do this, we choose the first model in the list of models as a baseline. Then, we check whether each of the other models contains topics that are not in the baseline. This check is based on the cosine similarity between topic embeddings. If topic embeddings between two models are similar, the topic of the second model is re-assigned to the matching topic of the first. If they are dissimilar, the topic of the second model is added to the first as a new topic.

        In essence, we simply check whether sufficiently \"new\" topics emerge and add them.

Parameters:

models (required): A list of fitted BERTopic models.

min_similarity (float, default 0.7): The minimum similarity for when topics are merged.

embedding_model (default None): Additionally load in an embedding model if necessary.

Returns:

A new BERTopic model that was created as if you were loading a model from the HuggingFace Hub without c-TF-IDF.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# Create three separate models\ntopic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000])\ntopic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000])\ntopic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:])\n\n# Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n
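The min_similarity threshold controls how readily topics from later models are matched to existing ones. A sketch reusing the three models from the example above:

```python
# A higher threshold merges fewer topics, so more topics from the
# added models are kept as new topics in the merged model
merged_model = BERTopic.merge_models(
    [topic_model_1, topic_model_2, topic_model_3],
    min_similarity=0.9,
)
```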
        Source code in bertopic\\_bertopic.py
        @classmethod\ndef merge_models(cls, models, min_similarity: float = 0.7, embedding_model=None):\n    \"\"\"Merge multiple pre-trained BERTopic models into a single model.\n\n    The models are merged as if they were all saved using pytorch or\n    safetensors, so a minimal version without c-TF-IDF.\n\n    To do this, we choose the first model in the list of\n    models as a baseline. Then, we check each model whether\n    they contain topics that are not in the baseline.\n    This check is based on the cosine similarity between\n    topics embeddings. If topic embeddings between two models\n    are similar, then the topic of the second model is re-assigned\n    to the first. If they are dissimilar, the topic of the second\n    model is assigned to the first.\n\n    In essence, we simply check whether sufficiently \"new\"\n    topics emerge and add them.\n\n    Arguments:\n        models: A list of fitted BERTopic models\n        min_similarity: The minimum similarity for when topics are merged.\n        embedding_model: Additionally load in an embedding model if necessary.\n\n    Returns:\n        A new BERTopic model that was created as if you were\n        loading a model from the HuggingFace Hub without c-TF-IDF\n\n    Examples:\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n    # Create three separate models\n    topic_model_1 = BERTopic(min_topic_size=5).fit(docs[:4000])\n    topic_model_2 = BERTopic(min_topic_size=5).fit(docs[4000:8000])\n    topic_model_3 = BERTopic(min_topic_size=5).fit(docs[8000:])\n\n    # Combine all models into one\n    merged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n    ```\n    \"\"\"\n    import torch\n\n    # Temporarily save model and push to HF\n    with TemporaryDirectory() as tmpdir:\n        # Save model weights and config.\n        all_topics, all_params, all_tensors = [], [], []\n        for index, model in enumerate(models):\n            model.save(tmpdir, serialization=\"pytorch\")\n            topics, params, tensors, _, _, _ = save_utils.load_local_files(Path(tmpdir))\n            all_topics.append(topics)\n            all_params.append(params)\n            all_tensors.append(np.array(tensors[\"topic_embeddings\"]))\n\n            # Create a base set of parameters\n            if index == 0:\n                merged_topics = topics\n                merged_params = params\n                merged_tensors = np.array(tensors[\"topic_embeddings\"])\n                merged_topics[\"custom_labels\"] = None\n\n    for tensors, selected_topics in zip(all_tensors[1:], all_topics[1:]):\n        # Calculate similarity matrix\n        sim_matrix = cosine_similarity(tensors, merged_tensors)\n        sims = np.max(sim_matrix, axis=1)\n\n        # Extract new topics\n        new_topics = sorted(\n            [index - selected_topics[\"_outliers\"] for index, sim in enumerate(sims) if sim < min_similarity]\n        )\n        max_topic = max(set(merged_topics[\"topics\"]))\n\n        # Merge Topic Representations\n        new_topics_dict = {}\n        for new_topic in new_topics:\n            if new_topic != -1:\n                max_topic += 1\n                new_topics_dict[new_topic] = max_topic\n                merged_topics[\"topic_representations\"][str(max_topic)] = selected_topics[\"topic_representations\"][\n                    str(new_topic)\n               
 ]\n                merged_topics[\"topic_labels\"][str(max_topic)] = selected_topics[\"topic_labels\"][str(new_topic)]\n\n                # Add new aspects\n                if selected_topics[\"topic_aspects\"]:\n                    aspects_1 = set(merged_topics[\"topic_aspects\"].keys())\n                    aspects_2 = set(selected_topics[\"topic_aspects\"].keys())\n                    aspects_diff = aspects_2.difference(aspects_1)\n                    if aspects_diff:\n                        for aspect in aspects_diff:\n                            merged_topics[\"topic_aspects\"][aspect] = {}\n\n                    # If the original model does not have topic aspects but the to be added model does\n                    if not merged_topics.get(\"topic_aspects\"):\n                        merged_topics[\"topic_aspects\"] = selected_topics[\"topic_aspects\"]\n\n                    # If they both contain topic aspects, add to the existing set of aspects\n                    else:\n                        for aspect, values in selected_topics[\"topic_aspects\"].items():\n                            merged_topics[\"topic_aspects\"][aspect][str(max_topic)] = values[str(new_topic)]\n\n                # Add new embeddings\n                new_tensors = tensors[new_topic + selected_topics[\"_outliers\"]]\n                merged_tensors = np.vstack([merged_tensors, new_tensors])\n\n        # Topic Mapper\n        merged_topics[\"topic_mapper\"] = TopicMapper(list(range(-1, max_topic + 1, 1))).mappings_\n\n        # Find similar topics and re-assign those from the new models\n        sims_idx = np.argmax(sim_matrix, axis=1)\n        sims = np.max(sim_matrix, axis=1)\n        to_merge = {\n            a - selected_topics[\"_outliers\"]: b - merged_topics[\"_outliers\"]\n            for a, (b, val) in enumerate(zip(sims_idx, sims))\n            if val >= min_similarity\n        }\n        to_merge.update(new_topics_dict)\n        to_merge[-1] = -1\n        topics = [to_merge[topic] for topic in selected_topics[\"topics\"]]\n        merged_topics[\"topics\"].extend(topics)\n        merged_topics[\"topic_sizes\"] = dict(Counter(merged_topics[\"topics\"]))\n\n    # Create a new model from the merged parameters\n    merged_tensors = {\"topic_embeddings\": torch.from_numpy(merged_tensors)}\n    merged_model = _create_model_from_files(\n        merged_topics,\n        merged_params,\n        merged_tensors,\n        None,\n        None,\n        None,\n        warn_no_backend=False,\n    )\n    merged_model.embedding_model = models[0].embedding_model\n\n    # Replace embedding model if one is specifically chosen\n    verbose = any([model.verbose for model in models])\n    if embedding_model is not None and type(merged_model.embedding_model) == BaseEmbedder:\n        merged_model.embedding_model = select_backend(embedding_model, verbose=verbose)\n    return merged_model\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.merge_topics","title":"merge_topics(self, docs, topics_to_merge, images=None)","text":"

Merge two or more topics into a single topic.

Parameters:

docs (List[str], required): The documents you used when calling either fit or fit_transform.

topics_to_merge (List[Union[Iterable[int], int]], required): Either a list of topics or a list of lists of topics to merge. For example: [1, 2, 3] will merge topics 1, 2 and 3, while [[1, 2], [3, 4]] will merge topics 1 and 2, and separately merge topics 3 and 4.

images (List[str], default None): A list of paths to the images used when calling either fit or fit_transform.

        Examples:

        If you want to merge topics 1, 2, and 3:

        topics_to_merge = [1, 2, 3]\ntopic_model.merge_topics(docs, topics_to_merge)\n

        or if you want to merge topics 1 and 2, and separately merge topics 3 and 4:

        topics_to_merge = [[1, 2],\n                    [3, 4]]\ntopic_model.merge_topics(docs, topics_to_merge)\n
        Source code in bertopic\\_bertopic.py
        def merge_topics(\n    self,\n    docs: List[str],\n    topics_to_merge: List[Union[Iterable[int], int]],\n    images: List[str] = None,\n) -> None:\n    \"\"\"Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics_to_merge: Either a list of topics or a list of list of topics\n                         to merge. For example:\n                            [1, 2, 3] will merge topics 1, 2 and 3\n                            [[1, 2], [3, 4]] will merge topics 1 and 2, and\n                            separately merge topics 3 and 4.\n        images: A list of paths to the images used when calling either\n                `fit` or `fit_transform`.\n\n    Examples:\n    If you want to merge topics 1, 2, and 3:\n\n    ```python\n    topics_to_merge = [1, 2, 3]\n    topic_model.merge_topics(docs, topics_to_merge)\n    ```\n\n    or if you want to merge topics 1 and 2, and separately\n    merge topics 3 and 4:\n\n    ```python\n    topics_to_merge = [[1, 2],\n                        [3, 4]]\n    topic_model.merge_topics(docs, topics_to_merge)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    documents = pd.DataFrame(\n        {\n            \"Document\": docs,\n            \"Topic\": self.topics_,\n            \"Image\": images,\n            \"ID\": range(len(docs)),\n        }\n    )\n\n    mapping = {topic: topic for topic in set(self.topics_)}\n    if isinstance(topics_to_merge[0], int):\n        for topic in sorted(topics_to_merge):\n            mapping[topic] = topics_to_merge[0]\n    elif isinstance(topics_to_merge[0], Iterable):\n        for topic_group in sorted(topics_to_merge):\n            for topic in topic_group:\n                mapping[topic] = topic_group[0]\n    else:\n        raise ValueError(\n            \"Make sure that `topics_to_merge` is either\" \"a list of topics or a list of list of topics.\"\n        )\n\n    # Track mappings and sizes of topics for merging topic embeddings\n    mappings = defaultdict(list)\n    for key, val in sorted(mapping.items()):\n        mappings[val].append(key)\n    mappings = {\n        topic_to: {\n            \"topics_from\": topics_from,\n            \"topic_sizes\": [self.topic_sizes_[topic] for topic in topics_from],\n        }\n        for topic_to, topics_from in mappings.items()\n    }\n\n    # Update topics\n    documents.Topic = documents.Topic.map(mapping)\n    self.topic_mapper_.add_mappings(mapping)\n    documents = self._sort_mappings_by_frequency(documents)\n    self._extract_topics(documents, mappings=mappings)\n    self._update_topic_size(documents)\n    self._save_representative_docs(documents)\n    self.probabilities_ = self._map_probabilities(self.probabilities_)\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.partial_fit","title":"partial_fit(self, documents, embeddings=None, y=None)","text":"

        Fit BERTopic on a subset of the data and perform online learning with batch-like data.

        Online topic modeling in BERTopic is performed by using dimensionality reduction and cluster algorithms that support a partial_fit method in order to incrementally train the topic model.

        Likewise, the bertopic.vectorizers.OnlineCountVectorizer is used to dynamically update its vocabulary when presented with new data. It has several parameters for modeling decay and updating the representations.

        In other words, although the main algorithm stays the same, the training procedure now works as follows:

        For each subset of the data:

        1. Generate embeddings with a pre-trained language model
        2. Incrementally update the dimensionality reduction algorithm with partial_fit
        3. Incrementally update the cluster algorithm with partial_fit
        4. Incrementally update the OnlineCountVectorizer and apply some form of decay

        Note that it is advised to use partial_fit with batches and not single documents for the best performance.

Parameters:

documents (List[str], required): A list of documents to fit on.

embeddings (ndarray, default None): Pre-trained document embeddings. These can be used instead of the sentence-transformer model.

y (Union[List[int], numpy.ndarray], default None): The target class for (semi-)supervised modeling. Use -1 if no class for a specific instance is specified.

        Examples:

        from sklearn.datasets import fetch_20newsgroups\nfrom sklearn.cluster import MiniBatchKMeans\nfrom sklearn.decomposition import IncrementalPCA\nfrom bertopic.vectorizers import OnlineCountVectorizer\nfrom bertopic import BERTopic\n\n# Prepare documents\ndocs = fetch_20newsgroups(subset=subset,  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\n# Prepare sub-models that support online learning\numap_model = IncrementalPCA(n_components=5)\ncluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\ntopic_model = BERTopic(umap_model=umap_model,\n                       hdbscan_model=cluster_model,\n                       vectorizer_model=vectorizer_model)\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor index in range(0, len(docs), 1000):\n    topic_model.partial_fit(docs[index: index+1000])\n
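Because each partial_fit call keeps only the topic assignments of the most recent batch in topics_, a common pattern (a sketch, under that assumption, reusing docs and topic_model from the example above) is to track the assignments yourself:

```python
# Collect per-document topics across all batches
topics = []
for index in range(0, len(docs), 1000):
    topic_model.partial_fit(docs[index: index + 1000])
    topics.extend(topic_model.topics_)

# Make the full list of per-document topics available on the model afterwards
topic_model.topics_ = topics
```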
        Source code in bertopic\\_bertopic.py
        def partial_fit(\n    self,\n    documents: List[str],\n    embeddings: np.ndarray = None,\n    y: Union[List[int], np.ndarray] = None,\n):\n    \"\"\"Fit BERTopic on a subset of the data and perform online learning\n    with batch-like data.\n\n    Online topic modeling in BERTopic is performed by using dimensionality\n    reduction and cluster algorithms that support a `partial_fit` method\n    in order to incrementally train the topic model.\n\n    Likewise, the `bertopic.vectorizers.OnlineCountVectorizer` is used\n    to dynamically update its vocabulary when presented with new data.\n    It has several parameters for modeling decay and updating the\n    representations.\n\n    In other words, although the main algorithm stays the same, the training\n    procedure now works as follows:\n\n    For each subset of the data:\n\n    1. Generate embeddings with a pre-trained language model\n    2. Incrementally update the dimensionality reduction algorithm with `partial_fit`\n    3. Incrementally update the cluster algorithm with `partial_fit`\n    4. Incrementally update the OnlineCountVectorizer and apply some form of decay\n\n    Note that it is advised to use `partial_fit` with batches and\n    not single documents for the best performance.\n\n    Arguments:\n        documents: A list of documents to fit on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model\n        y: The target class for (semi)-supervised modeling. Use -1 if no class for a\n           specific instance is specified.\n\n    Examples:\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sklearn.cluster import MiniBatchKMeans\n    from sklearn.decomposition import IncrementalPCA\n    from bertopic.vectorizers import OnlineCountVectorizer\n    from bertopic import BERTopic\n\n    # Prepare documents\n    docs = fetch_20newsgroups(subset=subset,  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\n    # Prepare sub-models that support online learning\n    umap_model = IncrementalPCA(n_components=5)\n    cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\n    vectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n\n    topic_model = BERTopic(umap_model=umap_model,\n                           hdbscan_model=cluster_model,\n                           vectorizer_model=vectorizer_model)\n\n    # Incrementally fit the topic model by training on 1000 documents at a time\n    for index in range(0, len(docs), 1000):\n        topic_model.partial_fit(docs[index: index+1000])\n    ```\n    \"\"\"\n    # Checks\n    check_embeddings_shape(embeddings, documents)\n    if not hasattr(self.hdbscan_model, \"partial_fit\"):\n        raise ValueError(\n            \"In order to use `.partial_fit`, the cluster model should have \" \"a `.partial_fit` function.\"\n        )\n\n    # Prepare documents\n    if isinstance(documents, str):\n        documents = [documents]\n    documents = pd.DataFrame({\"Document\": documents, \"ID\": range(len(documents)), \"Topic\": None})\n\n    # Extract embeddings\n    if embeddings is None:\n        if self.topic_representations_ is None:\n            self.embedding_model = select_backend(\n                self.embedding_model, language=self.language, verbose=self.verbose\n            )\n        embeddings = self._extract_embeddings(\n            documents.Document.values.tolist(),\n            method=\"document\",\n            verbose=self.verbose,\n        )\n 
   else:\n        if self.embedding_model is not None and self.topic_representations_ is None:\n            self.embedding_model = select_backend(\n                self.embedding_model, language=self.language, verbose=self.verbose\n            )\n\n    # Reduce dimensionality\n    if self.seed_topic_list is not None and self.embedding_model is not None:\n        y, embeddings = self._guided_topic_modeling(embeddings)\n    umap_embeddings = self._reduce_dimensionality(embeddings, y, partial_fit=True)\n\n    # Cluster reduced embeddings\n    documents, self.probabilities_ = self._cluster_embeddings(umap_embeddings, documents, partial_fit=True)\n    topics = documents.Topic.to_list()\n\n    # Map and find new topics\n    if not self.topic_mapper_:\n        self.topic_mapper_ = TopicMapper(topics)\n    mappings = self.topic_mapper_.get_mappings()\n    new_topics = set(topics).difference(set(mappings.keys()))\n    new_topic_ids = {topic: max(mappings.values()) + index + 1 for index, topic in enumerate(new_topics)}\n    self.topic_mapper_.add_new_topics(new_topic_ids)\n    updated_mappings = self.topic_mapper_.get_mappings()\n    updated_topics = [updated_mappings[topic] for topic in topics]\n    documents[\"Topic\"] = updated_topics\n\n    # Add missing topics (topics that were originally created but are now missing)\n    if self.topic_representations_:\n        missing_topics = set(self.topic_representations_.keys()).difference(set(updated_topics))\n        for missing_topic in missing_topics:\n            documents.loc[len(documents), :] = [\" \", len(documents), missing_topic]\n    else:\n        missing_topics = {}\n\n    # Prepare documents\n    documents_per_topic = documents.sort_values(\"Topic\").groupby([\"Topic\"], as_index=False)\n    updated_topics = documents_per_topic.first().Topic.astype(int)\n    documents_per_topic = documents_per_topic.agg({\"Document\": \" \".join})\n\n    # Update topic representations\n    self.c_tf_idf_, updated_words = self._c_tf_idf(documents_per_topic, partial_fit=True)\n    self.topic_representations_ = self._extract_words_per_topic(\n        updated_words, documents, self.c_tf_idf_, calculate_aspects=False\n    )\n    self._create_topic_vectors()\n\n    # Update topic sizes\n    if len(missing_topics) > 0:\n        documents = documents.iloc[: -len(missing_topics)]\n\n    if self.topic_sizes_ is None:\n        self._update_topic_size(documents)\n    else:\n        sizes = documents.groupby([\"Topic\"], as_index=False).count()\n        for _, row in sizes.iterrows():\n            topic = int(row.Topic)\n            if self.topic_sizes_.get(topic) is not None and topic not in missing_topics:\n                self.topic_sizes_[topic] += int(row.Document)\n            elif self.topic_sizes_.get(topic) is None:\n                self.topic_sizes_[topic] = int(row.Document)\n        self.topics_ = documents.Topic.astype(int).tolist()\n\n    return self\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.push_to_hf_hub","title":"push_to_hf_hub(self, repo_id, commit_message='Add BERTopic model', token=None, revision=None, private=False, create_pr=False, model_card=True, serialization='safetensors', save_embedding_model=True, save_ctfidf=False)","text":"

        Push your BERTopic model to a HuggingFace Hub.

        Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:

        • Log in to your HuggingFace account with the following command:
          huggingface-cli login\n\n# or using an environment variable\nhuggingface-cli login --token $HUGGINGFACE_TOKEN\n
• Alternatively, you can log in programmatically using login() in a notebook or a script:
          from huggingface_hub import login\nlogin()\n
• Or you can pass a token directly with the token argument, as shown in the sketch below this list.
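A minimal sketch of the token route; the repository name and token value are placeholders:

```python
topic_model.push_to_hf_hub(
    repo_id='my-username/my-bertopic-model',  # placeholder repository name
    token='hf_...',                           # placeholder HuggingFace token
)
```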

Parameters:

repo_id (str, required): The name of your HuggingFace repository.

commit_message (str, default 'Add BERTopic model'): A commit message.

token (str, default None): Token to add if not already logged in.

revision (str, default None): Repository revision.

private (bool, default False): Whether to create a private repository.

create_pr (bool, default False): Whether to upload the model as a Pull Request.

model_card (bool, default True): Whether to automatically create a modelcard.

serialization (str, default 'safetensors'): The type of serialization. Either safetensors or pytorch.

save_embedding_model (Union[str, bool], default True): A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. E.g., sentence-transformers/all-MiniLM-L6-v2.

save_ctfidf (bool, default False): Whether to save c-TF-IDF information.

        Examples:

        topic_model.push_to_hf_hub(\n    repo_id=\"ArXiv\",\n    save_ctfidf=True,\n    save_embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\"\n)\n
        Source code in bertopic\\_bertopic.py
        def push_to_hf_hub(\n    self,\n    repo_id: str,\n    commit_message: str = \"Add BERTopic model\",\n    token: str = None,\n    revision: str = None,\n    private: bool = False,\n    create_pr: bool = False,\n    model_card: bool = True,\n    serialization: str = \"safetensors\",\n    save_embedding_model: Union[str, bool] = True,\n    save_ctfidf: bool = False,\n):\n    \"\"\"Push your BERTopic model to a HuggingFace Hub.\n\n    Whenever you want to upload files to the Hub, you need to log in to your HuggingFace account:\n\n    * Log in to your HuggingFace account with the following command:\n        ```bash\n        huggingface-cli login\n\n        # or using an environment variable\n        huggingface-cli login --token $HUGGINGFACE_TOKEN\n        ```\n    * Alternatively, you can programmatically login using login() in a notebook or a script:\n        ```python\n        from huggingface_hub import login\n        login()\n        ```\n    * Or you can give a token with the `token` variable\n\n    Arguments:\n        repo_id: The name of your HuggingFace repository\n        commit_message: A commit message\n        token: Token to add if not already logged in\n        revision: Repository revision\n        private: Whether to create a private repository\n        create_pr: Whether to upload the model as a Pull Request\n        model_card: Whether to automatically create a modelcard\n        serialization: The type of serialization.\n                       Either `safetensors` or `pytorch`\n        save_embedding_model: A pointer towards a HuggingFace model to be loaded in with\n                              SentenceTransformers. E.g.,\n                              `sentence-transformers/all-MiniLM-L6-v2`\n        save_ctfidf: Whether to save c-TF-IDF information\n\n\n    Examples:\n    ```python\n    topic_model.push_to_hf_hub(\n        repo_id=\"ArXiv\",\n        save_ctfidf=True,\n        save_embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\"\n    )\n    ```\n    \"\"\"\n    return save_utils.push_to_hf_hub(\n        model=self,\n        repo_id=repo_id,\n        commit_message=commit_message,\n        token=token,\n        revision=revision,\n        private=private,\n        create_pr=create_pr,\n        model_card=model_card,\n        serialization=serialization,\n        save_embedding_model=save_embedding_model,\n        save_ctfidf=save_ctfidf,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.reduce_outliers","title":"reduce_outliers(self, documents, topics, images=None, strategy='distributions', probabilities=None, threshold=0, embeddings=None, distributions_params={})","text":"

        Reduce outliers by merging them with their nearest topic according to one of several strategies.

        When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created that do not fall within any of the created topics. These are labeled as -1. This function allows the user to match outlier documents with their nearest topic using one of the following strategies, selected through the strategy parameter: * \"probabilities\" This uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with calculate_probabilities=True. * \"distributions\" Use the topic distributions, as calculated with .approximate_distribution, to find the most frequent topic in each outlier document. You can use the distributions_params variable to tweak the parameters of .approximate_distribution. * \"c-tf-idf\" Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity. * \"embeddings\" Using the embeddings of each outlier document, find the best matching topic embedding using cosine similarity.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents for which we reduce or remove the outliers.

        required topics List[int]

        The topics that correspond to the documents

        required images List[str]

        A list of paths to the images used when calling either fit or fit_transform

        None strategy str

        The strategy used for reducing outliers. Options: * \"probabilities\" This uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. * \"distributions\" Use the topic distributions, as calculated with .approximate_distribution, to find the most frequent topic in each outlier document. * \"c-tf-idf\" Calculate the c-TF-IDF representation for outlier documents and find the best matching c-TF-IDF topic representation. * \"embeddings\" Calculate the embeddings for outlier documents and find the best matching topic embedding.
        'distributions' probabilities ndarray

        Probabilities generated by HDBSCAN for each document when using the strategy \"probabilities\".

        None threshold float

        The threshold for assigning topics to outlier documents. This value represents the minimum probability when strategy=\"probabilities\". For all other strategies, it represents the minimum similarity.

        0 embeddings ndarray

        The pre-computed embeddings to be used when strategy=\"embeddings\". If this is None, then it will compute the embeddings for the outlier documents.

        None distributions_params Mapping[str, Any]

        The parameters used in .approximate_distribution when using the strategy \"distributions\".

        {}

        Returns:

        Type Description new_topics

        The updated topics

        Usage:

        The default setting uses the \"distributions\" strategy:

        new_topics = topic_model.reduce_outliers(docs, topics)\n

        When you use the \"probabilities\" strategy, make sure to also pass the probabilities as generated through HDBSCAN:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs)\n\nnew_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n
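        If you also want the model itself to reflect the merged assignments, one possible follow-up (a sketch, assuming you are fine with overwriting the stored topic assignments) is to pass the new topics back through .update_topics:

        # Merge outliers into their nearest topics ...
        new_topics = topic_model.reduce_outliers(docs, topics)

        # ... and update the topic sizes and representations accordingly
        topic_model.update_topics(docs, topics=new_topics)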
        Source code in bertopic\\_bertopic.py
        def reduce_outliers(\n    self,\n    documents: List[str],\n    topics: List[int],\n    images: List[str] = None,\n    strategy: str = \"distributions\",\n    probabilities: np.ndarray = None,\n    threshold: float = 0,\n    embeddings: np.ndarray = None,\n    distributions_params: Mapping[str, Any] = {},\n) -> List[int]:\n    \"\"\"Reduce outliers by merging them with their nearest topic according\n    to one of several strategies.\n\n    When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created\n    that do not fall within any of the created topics. These are labeled as -1.\n    This function allows the user to match outlier documents with their nearest topic\n    using one of the following strategies using the `strategy` parameter:\n        * \"probabilities\"\n            This uses the soft-clustering as performed by HDBSCAN to find the\n            best matching topic for each outlier document. To use this, make\n            sure to calculate the `probabilities` beforehand by instantiating\n            BERTopic with `calculate_probabilities=True`.\n        * \"distributions\"\n            Use the topic distributions, as calculated with `.approximate_distribution`\n            to find the most frequent topic in each outlier document. You can use the\n            `distributions_params` variable to tweak the parameters of\n            `.approximate_distribution`.\n        * \"c-tf-idf\"\n            Calculate the c-TF-IDF representation for each outlier document and\n            find the best matching c-TF-IDF topic representation using\n            cosine similarity.\n        * \"embeddings\"\n            Using the embeddings of each outlier documents, find the best\n            matching topic embedding using cosine similarity.\n\n    Arguments:\n        documents: A list of documents for which we reduce or remove the outliers.\n        topics: The topics that correspond to the documents\n        images: A list of paths to the images used when calling either\n                `fit` or `fit_transform`\n        strategy: The strategy used for reducing outliers.\n                Options:\n                    * \"probabilities\"\n                        This uses the soft-clustering as performed by HDBSCAN\n                        to find the best matching topic for each outlier document.\n\n                    * \"distributions\"\n                        Use the topic distributions, as calculated with `.approximate_distribution`\n                        to find the most frequent topic in each outlier document.\n\n                    * \"c-tf-idf\"\n                        Calculate the c-TF-IDF representation for outlier documents and\n                        find the best matching c-TF-IDF topic representation.\n\n                    * \"embeddings\"\n                        Calculate the embeddings for outlier documents and\n                        find the best matching topic embedding.\n        probabilities: Probabilities generated by HDBSCAN for each document when using the strategy `\"probabilities\"`.\n        threshold: The threshold for assigning topics to outlier documents. 
This value\n                   represents the minimum probability when `strategy=\"probabilities\"`.\n                   For all other strategies, it represents the minimum similarity.\n        embeddings: The pre-computed embeddings to be used when `strategy=\"embeddings\"`.\n                    If this is None, then it will compute the embeddings for the outlier documents.\n        distributions_params: The parameters used in `.approximate_distribution` when using\n                              the strategy `\"distributions\"`.\n\n    Returns:\n        new_topics: The updated topics\n\n    Usage:\n\n    The default settings uses the `\"distributions\"` strategy:\n\n    ```python\n    new_topics = topic_model.reduce_outliers(docs, topics)\n    ```\n\n    When you use the `\"probabilities\"` strategy, make sure to also pass the probabilities\n    as generated through HDBSCAN:\n\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic(calculate_probabilities=True)\n    topics, probs = topic_model.fit_transform(docs)\n\n    new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n    ```\n    \"\"\"\n    if not self._outliers:\n        raise ValueError(\"No outliers to reduce.\")\n\n    if images is not None:\n        strategy = \"embeddings\"\n\n    # Check correct use of parameters\n    if strategy.lower() == \"probabilities\" and probabilities is None:\n        raise ValueError(\"Make sure to pass in `probabilities` in order to use the probabilities strategy\")\n\n    # Reduce outliers by extracting most likely topics through the topic-term probability matrix\n    if strategy.lower() == \"probabilities\":\n        new_topics = [\n            np.argmax(prob) if np.max(prob) >= threshold and topic == -1 else topic\n            for topic, prob in zip(topics, probabilities)\n        ]\n\n    # Reduce outliers by extracting most frequent topics through calculating of Topic Distributions\n    elif strategy.lower() == \"distributions\":\n        outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n        outlier_docs = [documents[index] for index in outlier_ids]\n        topic_distr, _ = self.approximate_distribution(\n            outlier_docs, min_similarity=threshold, **distributions_params\n        )\n        outlier_topics = iter([np.argmax(prob) if sum(prob) > 0 else -1 for prob in topic_distr])\n        new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n    # Reduce outliers by finding the most similar c-TF-IDF representations\n    elif strategy.lower() == \"c-tf-idf\":\n        outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n        outlier_docs = [documents[index] for index in outlier_ids]\n\n        # Calculate c-TF-IDF of outlier documents with all topics\n        bow_doc = self.vectorizer_model.transform(outlier_docs)\n        c_tf_idf_doc = self.ctfidf_model.transform(bow_doc)\n        similarity = cosine_similarity(c_tf_idf_doc, self.c_tf_idf_[self._outliers :])\n\n        # Update topics\n        similarity[similarity < threshold] = 0\n        outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n        new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n    # Reduce outliers by finding the most similar topic embeddings\n    elif strategy.lower() == \"embeddings\":\n        if self.embedding_model is None and embeddings is None:\n            raise ValueError(\n    
            \"To use this strategy, you will need to pass a model to `embedding_model`\"\n                \"when instantiating BERTopic.\"\n            )\n        outlier_ids = [index for index, topic in enumerate(topics) if topic == -1]\n        if images is not None:\n            outlier_docs = [images[index] for index in outlier_ids]\n        else:\n            outlier_docs = [documents[index] for index in outlier_ids]\n\n        # Extract or calculate embeddings for outlier documents\n        if embeddings is not None:\n            outlier_embeddings = np.array([embeddings[index] for index in outlier_ids])\n        elif images is not None:\n            outlier_images = [images[index] for index in outlier_ids]\n            outlier_embeddings = self.embedding_model.embed_images(outlier_images, verbose=self.verbose)\n        else:\n            outlier_embeddings = self.embedding_model.embed_documents(outlier_docs)\n        similarity = cosine_similarity(outlier_embeddings, self.topic_embeddings_[self._outliers :])\n\n        # Update topics\n        similarity[similarity < threshold] = 0\n        outlier_topics = iter([np.argmax(sim) if sum(sim) > 0 else -1 for sim in similarity])\n        new_topics = [topic if topic != -1 else next(outlier_topics) for topic in topics]\n\n    return new_topics\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.reduce_topics","title":"reduce_topics(self, docs, nr_topics=20, images=None, use_ctfidf=False)","text":"

        Reduce the number of topics to a fixed number of topics or automatically.

        If nr_topics is an integer, then the number of topics is reduced to nr_topics using AgglomerativeClustering on the cosine distance matrix of the topic c-TF-IDF or semantic embeddings.

        If nr_topics is \"auto\", then HDBSCAN is used to automatically reduce the number of topics by running it on the topic embeddings.

        The topics, their sizes, and representations are updated.

        Parameters:

        Name Type Description Default docs List[str]

        The docs you used when calling either fit or fit_transform

        required nr_topics Union[int, str]

        The number of topics you want reduced to

        20 images List[str]

        A list of paths to the images used when calling either fit or fit_transform

        None use_ctfidf bool

        Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used.

        False

        Updates

        topics_ : Assigns topics to their merged representations. probabilities_ : Assigns probabilities to their merged representations.

        Examples:

        You can further reduce the topics by passing the documents with their topics and probabilities (if they were calculated):

        topic_model.reduce_topics(docs, nr_topics=30)\n

        You can then access the updated topics and probabilities with:

        topics = topic_model.topics_\nprobabilities = topic_model.probabilities_\n
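        Alternatively, a minimal sketch of automatic reduction, assuming you would rather let HDBSCAN decide how many topics remain:

        # Let HDBSCAN merge topics based on their topic embeddings
        topic_model.reduce_topics(docs, nr_topics="auto")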
        Source code in bertopic\\_bertopic.py
        def reduce_topics(\n    self,\n    docs: List[str],\n    nr_topics: Union[int, str] = 20,\n    images: List[str] = None,\n    use_ctfidf: bool = False,\n) -> None:\n    \"\"\"Reduce the number of topics to a fixed number of topics\n    or automatically.\n\n    If nr_topics is an integer, then the number of topics is reduced\n    to nr_topics using `AgglomerativeClustering` on the cosine distance matrix\n    of the topic c-TF-IDF or semantic embeddings.\n\n    If nr_topics is `\"auto\"`, then HDBSCAN is used to automatically\n    reduce the number of topics by running it on the topic embeddings.\n\n    The topics, their sizes, and representations are updated.\n\n    Arguments:\n        docs: The docs you used when calling either `fit` or `fit_transform`\n        nr_topics: The number of topics you want reduced to\n        images: A list of paths to the images used when calling either\n                `fit` or `fit_transform`\n        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                    embeddings from the embedding model are used.\n\n    Updates:\n        topics_ : Assigns topics to their merged representations.\n        probabilities_ : Assigns probabilities to their merged representations.\n\n    Examples:\n    You can further reduce the topics by passing the documents with their\n    topics and probabilities (if they were calculated):\n\n    ```python\n    topic_model.reduce_topics(docs, nr_topics=30)\n    ```\n\n    You can then access the updated topics and probabilities with:\n\n    ```python\n    topics = topic_model.topics_\n    probabilities = topic_model.probabilities_\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n\n    self.nr_topics = nr_topics\n    documents = pd.DataFrame(\n        {\n            \"Document\": docs,\n            \"Topic\": self.topics_,\n            \"Image\": images,\n            \"ID\": range(len(docs)),\n        }\n    )\n\n    # Reduce number of topics\n    documents = self._reduce_topics(documents, use_ctfidf)\n    self._merged_topics = None\n    self._save_representative_docs(documents)\n    self.probabilities_ = self._map_probabilities(self.probabilities_)\n\n    return self\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.save","title":"save(self, path, serialization='pickle', save_embedding_model=True, save_ctfidf=False)","text":"

        Saves the model to the specified path or folder.

        When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic should not be loaded in other versions.

        Parameters:

        Name Type Description Default path

        If serialization is 'safetensors' or 'pytorch', this is a directory. If serialization is 'pickle', then this is a file.

        required serialization Literal['safetensors', 'pickle', 'pytorch']

        If 'pickle', the entire model will be pickled. If 'safetensors' or 'pytorch', the model will be saved without the embedding, dimensionality reduction, and clustering algorithms. This is a very efficient format and typically advised.

        'pickle' save_embedding_model Union[bool, str]

        If serialization is pickle, then you can choose to skip saving the embedding model. If serialization is safetensors or pytorch, this variable can be used as a string pointing towards a huggingface model.

        True save_ctfidf bool

        Whether to save c-TF-IDF information if serialization is safetensors or pytorch

        False

        Examples:

        To save the model in an efficient and safe format (safetensors) with c-TF-IDF information:

        topic_model.save(\"model_dir\", serialization=\"safetensors\", save_ctfidf=True)\n

        If you wish to also add a pointer to the embedding model, which will be downloaded from HuggingFace upon loading:

        embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"model_dir\", serialization=\"safetensors\", save_embedding_model=embedding_model)\n

        Or, if you want to save the full model with pickle:

        topic_model.save(\"my_model\")\n

        NOTE: Pickle can run arbitrary code and is generally considered to be less safe than safetensors.
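        For completeness, a minimal sketch of loading a saved model back in with .load, assuming the paths used in the examples above:

        from bertopic import BERTopic

        # A safetensors/pytorch model is saved as a directory ...
        loaded_model = BERTopic.load("model_dir")

        # ... whereas a pickled model is a single file
        loaded_model = BERTopic.load("my_model")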

        Source code in bertopic\\_bertopic.py
        def save(\n    self,\n    path,\n    serialization: Literal[\"safetensors\", \"pickle\", \"pytorch\"] = \"pickle\",\n    save_embedding_model: Union[bool, str] = True,\n    save_ctfidf: bool = False,\n):\n    \"\"\"Saves the model to the specified path or folder.\n\n    When saving the model, make sure to also keep track of the versions\n    of dependencies and Python used. Loading and saving the model should\n    be done using the same dependencies and Python. Moreover, models\n    saved in one version of BERTopic should not be loaded in other versions.\n\n    Arguments:\n        path: If `serialization` is 'safetensors' or `pytorch`, this is a directory.\n              If `serialization` is `pickle`, then this is a file.\n        serialization: If `pickle`, the entire model will be pickled. If `safetensors`\n                       or `pytorch` the model will be saved without the embedding,\n                       dimensionality reduction, and clustering algorithms.\n                       This is a very efficient format and typically advised.\n        save_embedding_model: If serialization is `pickle`, then you can choose to skip\n                              saving the embedding model. If serialization is `safetensors`\n                              or `pytorch`, this variable can be used as a string pointing\n                              towards a huggingface model.\n        save_ctfidf: Whether to save c-TF-IDF information if serialization is `safetensors`\n                     or `pytorch`\n\n    Examples:\n    To save the model in an efficient and safe format (safetensors) with c-TF-IDF information:\n\n    ```python\n    topic_model.save(\"model_dir\", serialization=\"safetensors\", save_ctfidf=True)\n    ```\n\n    If you wish to also add a pointer to the embedding model, which will be downloaded from\n    HuggingFace upon loading:\n\n    ```python\n    embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\n    topic_model.save(\"model_dir\", serialization=\"safetensors\", save_embedding_model=embedding_model)\n    ```\n\n    or if you want save the full model with pickle:\n\n    ```python\n    topic_model.save(\"my_model\")\n    ```\n\n    NOTE: Pickle can run arbitrary code and is generally considered to be less safe than\n    safetensors.\n    \"\"\"\n    if serialization == \"pickle\":\n        logger.warning(\n            \"When you use `pickle` to save/load a BERTopic model,\"\n            \"please make sure that the environments in which you save\"\n            \"and load the model are **exactly** the same. 
The version of BERTopic,\"\n            \"its dependencies, and python need to remain the same.\"\n        )\n\n        with open(path, \"wb\") as file:\n            # This prevents the vectorizer from being too large in size if `min_df` was\n            # set to a value higher than 1\n            self.vectorizer_model.stop_words_ = None\n\n            if not save_embedding_model:\n                embedding_model = self.embedding_model\n                self.embedding_model = None\n                joblib.dump(self, file)\n                self.embedding_model = embedding_model\n            else:\n                joblib.dump(self, file)\n    elif serialization == \"safetensors\" or serialization == \"pytorch\":\n        # Directory\n        save_directory = Path(path)\n        save_directory.mkdir(exist_ok=True, parents=True)\n\n        # Check embedding model\n        if (\n            save_embedding_model\n            and hasattr(self.embedding_model, \"_hf_model\")\n            and not isinstance(save_embedding_model, str)\n        ):\n            save_embedding_model = self.embedding_model._hf_model\n        elif not save_embedding_model:\n            logger.warning(\n                \"You are saving a BERTopic model without explicitly defining an embedding model.\"\n                \"If you are using a sentence-transformers model or a HuggingFace model supported\"\n                \"by sentence-transformers, please save the model by using a pointer towards that model.\"\n                \"For example, `save_embedding_model='sentence-transformers/all-mpnet-base-v2'`\"\n            )\n\n        # Minimal\n        save_utils.save_hf(model=self, save_directory=save_directory, serialization=serialization)\n        save_utils.save_topics(model=self, path=save_directory / \"topics.json\")\n        save_utils.save_images(model=self, path=save_directory / \"images\")\n        save_utils.save_config(\n            model=self,\n            path=save_directory / \"config.json\",\n            embedding_model=save_embedding_model,\n        )\n\n        # Additional\n        if save_ctfidf:\n            save_utils.save_ctfidf(\n                model=self,\n                save_directory=save_directory,\n                serialization=serialization,\n            )\n            save_utils.save_ctfidf_config(model=self, path=save_directory / \"ctfidf_config.json\")\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.set_topic_labels","title":"set_topic_labels(self, topic_labels)","text":"

        Set custom topic labels in your fitted BERTopic model.

        Parameters:

        Name Type Description Default topic_labels Union[List[str], Mapping[int, str]]

        If a list of topic labels, it should contain the same number of labels as there are topics. This must be ordered from the topic with the lowest ID to the highest ID, including topic -1 if it exists. If a dictionary of topic ID: topic_label, it can have any number of topics as it will only map the topics found in the dictionary.

        required

        Examples:

        First, we define our topic labels with .generate_topic_labels in which we can customize our topic labels:

        topic_labels = topic_model.generate_topic_labels(nr_words=2,\n                                            topic_prefix=True,\n                                            word_length=10,\n                                            separator=\", \")\n

        Then, we pass these topic_labels to our topic model which can be accessed at any time with .custom_labels_:

        topic_model.set_topic_labels(topic_labels)\ntopic_model.custom_labels_\n

        You might want to change only a few topic labels instead of all of them. To do so, you can pass a dictionary where the keys are the topic IDs and the values are the topic labels:

        topic_model.set_topic_labels({0: \"Space\", 1: \"Sports\", 2: \"Medicine\"})\ntopic_model.custom_labels_\n
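        Once set, the custom labels can be shown in many of the visualization methods through their custom_labels flag; a brief sketch, assuming you want to display them in the bar chart visualization:

        # Use the custom labels instead of the default topic names
        topic_model.visualize_barchart(custom_labels=True)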
        Source code in bertopic\\_bertopic.py
        def set_topic_labels(self, topic_labels: Union[List[str], Mapping[int, str]]) -> None:\n    \"\"\"Set custom topic labels in your fitted BERTopic model.\n\n    Arguments:\n        topic_labels: If a list of topic labels, it should contain the same number\n                      of labels as there are topics. This must be ordered\n                      from the topic with the lowest ID to the highest ID,\n                      including topic -1 if it exists.\n                      If a dictionary of `topic ID`: `topic_label`, it can have\n                      any number of topics as it will only map the topics found\n                      in the dictionary.\n\n    Examples:\n    First, we define our topic labels with `.generate_topic_labels` in which\n    we can customize our topic labels:\n\n    ```python\n    topic_labels = topic_model.generate_topic_labels(nr_words=2,\n                                                topic_prefix=True,\n                                                word_length=10,\n                                                separator=\", \")\n    ```\n\n    Then, we pass these `topic_labels` to our topic model which\n    can be accessed at any time with `.custom_labels_`:\n\n    ```python\n    topic_model.set_topic_labels(topic_labels)\n    topic_model.custom_labels_\n    ```\n\n    You might want to change only a few topic labels instead of all of them.\n    To do so, you can pass a dictionary where the keys are the topic IDs and\n    its keys the topic labels:\n\n    ```python\n    topic_model.set_topic_labels({0: \"Space\", 1: \"Sports\", 2: \"Medicine\"})\n    topic_model.custom_labels_\n    ```\n    \"\"\"\n    unique_topics = sorted(set(self.topics_))\n\n    if isinstance(topic_labels, dict):\n        if self.custom_labels_ is not None:\n            original_labels = {topic: label for topic, label in zip(unique_topics, self.custom_labels_)}\n        else:\n            info = self.get_topic_info()\n            original_labels = dict(zip(info.Topic, info.Name))\n        custom_labels = [\n            topic_labels.get(topic) if topic_labels.get(topic) else original_labels[topic]\n            for topic in unique_topics\n        ]\n\n    elif isinstance(topic_labels, list):\n        if len(topic_labels) == len(unique_topics):\n            custom_labels = topic_labels\n        else:\n            raise ValueError(\n                \"Make sure that `topic_labels` contains the same number \" \"of labels as there are topics.\"\n            )\n\n    self.custom_labels_ = custom_labels\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.topics_over_time","title":"topics_over_time(self, docs, timestamps, topics=None, nr_bins=None, datetime_format=None, evolution_tuning=True, global_tuning=True)","text":"

        Create topics over time.

        To create the topics over time, BERTopic needs to already be fitted once. From the fitted model, the c-TF-IDF representations are calculated at each timestamp t. Then, the c-TF-IDF representations at timestamp t are averaged with the global c-TF-IDF representations in order to fine-tune the local representations.

        Note

        Make sure to use a limited number of unique timestamps (<100) as the c-TF-IDF representation will be calculated for every single unique timestamp. Calculating the representations for a large number of unique timestamps can take some time. Moreover, there aren't many use cases where you would want to see the difference in topic representations over more than 100 different timestamps.

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required timestamps Union[List[str], List[int]]

        The timestamp of each document. This can be either a list of strings or ints. If it is a list of strings, then the datetime format will be automatically inferred. If it is a list of ints, then the documents will be ordered in ascending order.

        required topics List[int]

        A list of topics where each topic is related to a document in docs and a timestamp in timestamps. You can use this to apply topics_over_time on a subset of the data. Make sure that docs, timestamps, and topics all correspond to one another and have the same size.

        None nr_bins int

        The number of bins you want to create for the timestamps. The left interval will be chosen as the timestamp. An additional column will be created with the entire interval.

        None datetime_format str

        The datetime format of the timestamps if they are strings, e.g. \"%d/%m/%Y\". Set this to None if you want the format to be detected automatically. See the strftime documentation for more information on choices: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.

        None evolution_tuning bool

        Fine-tune each topic representation at timestamp t by averaging its c-TF-IDF matrix with the c-TF-IDF matrix at timestamp t-1. This creates evolutionary topic representations.

        True global_tuning bool

        Fine-tune each topic representation at timestamp t by averaging its c-TF-IDF matrix with the global c-TF-IDF matrix. Turn this off if you want to prevent words in topic representations that could not be found in the documents at timestamp t.

        True

        Returns:

        Type Description topics_over_time

        A dataframe that contains the topic, words, and frequency of topic at timestamp t.

        Examples:

        The timestamps variable represents the timestamp of each document. If you have over 100 unique timestamps, it is advised to bin the timestamps as shown below:

        from bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)\n
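        The resulting dataframe is usually inspected visually; a short sketch, assuming the topics_over_time dataframe from the example above:

        # Plot the evolution of the largest topics over time
        topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)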
        Source code in bertopic\\_bertopic.py
        def topics_over_time(\n    self,\n    docs: List[str],\n    timestamps: Union[List[str], List[int]],\n    topics: List[int] = None,\n    nr_bins: int = None,\n    datetime_format: str = None,\n    evolution_tuning: bool = True,\n    global_tuning: bool = True,\n) -> pd.DataFrame:\n    \"\"\"Create topics over time.\n\n    To create the topics over time, BERTopic needs to be already fitted once.\n    From the fitted models, the c-TF-IDF representations are calculate at\n    each timestamp t. Then, the c-TF-IDF representations at timestamp t are\n    averaged with the global c-TF-IDF representations in order to fine-tune the\n    local representations.\n\n    Note:\n        Make sure to use a limited number of unique timestamps (<100) as the\n        c-TF-IDF representation will be calculated at each single unique timestamp.\n        Having a large number of unique timestamps can take some time to be calculated.\n        Moreover, there aren't many use-cases where you would like to see the difference\n        in topic representations over more than 100 different timestamps.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        timestamps: The timestamp of each document. This can be either a list of strings or ints.\n                    If it is a list of strings, then the datetime format will be automatically\n                    inferred. If it is a list of ints, then the documents will be ordered in\n                    ascending order.\n        topics: A list of topics where each topic is related to a document in `docs` and\n                a timestamp in `timestamps`. You can use this to apply topics_over_time on\n                a subset of the data. Make sure that `docs`, `timestamps`, and `topics`\n                all correspond to one another and have the same size.\n        nr_bins: The number of bins you want to create for the timestamps. The left interval will\n                 be chosen as the timestamp. An additional column will be created with the\n                 entire interval.\n        datetime_format: The datetime format of the timestamps if they are strings, eg \u201c%d/%m/%Y\u201d.\n                         Set this to None if you want to have it automatically detect the format.\n                         See strftime documentation for more information on choices:\n                         https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.\n        evolution_tuning: Fine-tune each topic representation at timestamp *t* by averaging its\n                          c-TF-IDF matrix with the c-TF-IDF matrix at timestamp *t-1*. This creates\n                          evolutionary topic representations.\n        global_tuning: Fine-tune each topic representation at timestamp *t* by averaging its c-TF-IDF matrix\n                   with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                   topic representations that could not be found in the documents at timestamp *t*.\n\n    Returns:\n        topics_over_time: A dataframe that contains the topic, words, and frequency of topic\n                          at timestamp *t*.\n\n    Examples:\n    The timestamps variable represents the timestamp of each document. 
If you have over\n    100 unique timestamps, it is advised to bin the timestamps as shown below:\n\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    selected_topics = topics if topics else self.topics_\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": selected_topics, \"Timestamps\": timestamps})\n    global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm=\"l1\", copy=False)\n\n    all_topics = sorted(list(documents.Topic.unique()))\n    all_topics_indices = {topic: index for index, topic in enumerate(all_topics)}\n\n    if isinstance(timestamps[0], str):\n        infer_datetime_format = True if not datetime_format else False\n        documents[\"Timestamps\"] = pd.to_datetime(\n            documents[\"Timestamps\"],\n            infer_datetime_format=infer_datetime_format,\n            format=datetime_format,\n        )\n\n    if nr_bins:\n        documents[\"Bins\"] = pd.cut(documents.Timestamps, bins=nr_bins)\n        documents[\"Timestamps\"] = documents.apply(lambda row: row.Bins.left, 1)\n\n    # Sort documents in chronological order\n    documents = documents.sort_values(\"Timestamps\")\n    timestamps = documents.Timestamps.unique()\n    if len(timestamps) > 100:\n        logger.warning(\n            f\"There are more than 100 unique timestamps (i.e., {len(timestamps)}) \"\n            \"which significantly slows down the application. Consider setting `nr_bins` \"\n            \"to a value lower than 100 to speed up calculation. \"\n        )\n\n    # For each unique timestamp, create topic representations\n    topics_over_time = []\n    for index, timestamp in tqdm(enumerate(timestamps), disable=not self.verbose):\n        # Calculate c-TF-IDF representation for a specific timestamp\n        selection = documents.loc[documents.Timestamps == timestamp, :]\n        documents_per_topic = selection.groupby([\"Topic\"], as_index=False).agg(\n            {\"Document\": \" \".join, \"Timestamps\": \"count\"}\n        )\n        c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n        if global_tuning or evolution_tuning:\n            c_tf_idf = normalize(c_tf_idf, axis=1, norm=\"l1\", copy=False)\n\n        # Fine-tune the c-TF-IDF matrix at timestamp t by averaging it with the c-TF-IDF\n        # matrix at timestamp t-1\n        if evolution_tuning and index != 0:\n            current_topics = sorted(list(documents_per_topic.Topic.values))\n            overlapping_topics = sorted(\n                list(set(previous_topics).intersection(set(current_topics)))  # noqa: F821\n            )\n\n            current_overlap_idx = [current_topics.index(topic) for topic in overlapping_topics]\n            previous_overlap_idx = [\n                previous_topics.index(topic)  # noqa: F821\n                for topic in overlapping_topics\n            ]\n\n            c_tf_idf.tolil()[current_overlap_idx] = (\n                (\n                    c_tf_idf[current_overlap_idx] + previous_c_tf_idf[previous_overlap_idx]  # noqa: F821\n                )\n                / 2.0\n            ).tolil()\n\n        # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation\n        # by simply taking the average of the two\n        if global_tuning:\n            selected_topics = 
[all_topics_indices[topic] for topic in documents_per_topic.Topic.values]\n            c_tf_idf = (global_c_tf_idf[selected_topics] + c_tf_idf) / 2.0\n\n        # Extract the words per topic\n        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n        topic_frequency = pd.Series(\n            documents_per_topic.Timestamps.values, index=documents_per_topic.Topic\n        ).to_dict()\n\n        # Fill dataframe with results\n        topics_at_timestamp = [\n            (\n                topic,\n                \", \".join([words[0] for words in values][:5]),\n                topic_frequency[topic],\n                timestamp,\n            )\n            for topic, values in words_per_topic.items()\n        ]\n        topics_over_time.extend(topics_at_timestamp)\n\n        if evolution_tuning:\n            previous_topics = sorted(list(documents_per_topic.Topic.values))  # noqa: F841\n            previous_c_tf_idf = c_tf_idf.copy()  # noqa: F841\n\n    return pd.DataFrame(topics_over_time, columns=[\"Topic\", \"Words\", \"Frequency\", \"Timestamp\"])\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.topics_per_class","title":"topics_per_class(self, docs, classes, global_tuning=True)","text":"

        Create topics per class.

        To create the topics per class, BERTopic needs to be already fitted once. From the fitted models, the c-TF-IDF representations are calculated at each class c. Then, the c-TF-IDF representations at class c are averaged with the global c-TF-IDF representations in order to fine-tune the local representations. This can be turned off if the pure representation is needed.

        Note

        Make sure to use a limited number of unique classes (<100) as the c-TF-IDF representation will be calculated for every single unique class. Calculating the representations for a large number of unique classes can take some time.

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required classes Union[List[int], List[str]]

        The class of each document. This can be either a list of strings or ints.

        required global_tuning bool

        Fine-tune each topic representation for class c by averaging its c-TF-IDF matrix with the global c-TF-IDF matrix. Turn this off if you want to prevent words in topic representations that could not be found in the documents for class c.

        True

        Returns:

        Type Description topics_per_class

        A dataframe that contains the topic, words, and frequency of topics for each class.

        Examples:

        from bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_per_class = topic_model.topics_per_class(docs, classes)\n
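        As with topics over time, the resulting dataframe can be plotted directly; a short sketch, assuming the topics_per_class dataframe from the example above:

        # Compare topic frequencies across the provided classes
        topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)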
        Source code in bertopic\\_bertopic.py
        def topics_per_class(\n    self,\n    docs: List[str],\n    classes: Union[List[int], List[str]],\n    global_tuning: bool = True,\n) -> pd.DataFrame:\n    \"\"\"Create topics per class.\n\n    To create the topics per class, BERTopic needs to be already fitted once.\n    From the fitted models, the c-TF-IDF representations are calculated at\n    each class c. Then, the c-TF-IDF representations at class c are\n    averaged with the global c-TF-IDF representations in order to fine-tune the\n    local representations. This can be turned off if the pure representation is\n    needed.\n\n    Note:\n        Make sure to use a limited number of unique classes (<100) as the\n        c-TF-IDF representation will be calculated at each single unique class.\n        Having a large number of unique classes can take some time to be calculated.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        classes: The class of each document. This can be either a list of strings or ints.\n        global_tuning: Fine-tune each topic representation for class c by averaging its c-TF-IDF matrix\n                       with the global c-TF-IDF matrix. Turn this off if you want to prevent words in\n                       topic representations that could not be found in the documents for class c.\n\n    Returns:\n        topics_per_class: A dataframe that contains the topic, words, and frequency of topics\n                          for each class.\n\n    Examples:\n    ```python\n    from bertopic import BERTopic\n    topic_model = BERTopic()\n    topics, probs = topic_model.fit_transform(docs)\n    topics_per_class = topic_model.topics_per_class(docs, classes)\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": self.topics_, \"Class\": classes})\n    global_c_tf_idf = normalize(self.c_tf_idf_, axis=1, norm=\"l1\", copy=False)\n\n    # For each unique timestamp, create topic representations\n    topics_per_class = []\n    for _, class_ in tqdm(enumerate(set(classes)), disable=not self.verbose):\n        # Calculate c-TF-IDF representation for a specific timestamp\n        selection = documents.loc[documents.Class == class_, :]\n        documents_per_topic = selection.groupby([\"Topic\"], as_index=False).agg(\n            {\"Document\": \" \".join, \"Class\": \"count\"}\n        )\n        c_tf_idf, words = self._c_tf_idf(documents_per_topic, fit=False)\n\n        # Fine-tune the timestamp c-TF-IDF representation based on the global c-TF-IDF representation\n        # by simply taking the average of the two\n        if global_tuning:\n            c_tf_idf = normalize(c_tf_idf, axis=1, norm=\"l1\", copy=False)\n            c_tf_idf = (global_c_tf_idf[documents_per_topic.Topic.values + self._outliers] + c_tf_idf) / 2.0\n\n        # Extract the words per topic\n        words_per_topic = self._extract_words_per_topic(words, selection, c_tf_idf, calculate_aspects=False)\n        topic_frequency = pd.Series(documents_per_topic.Class.values, index=documents_per_topic.Topic).to_dict()\n\n        # Fill dataframe with results\n        topics_at_class = [\n            (\n                topic,\n                \", \".join([words[0] for words in values][:5]),\n                topic_frequency[topic],\n                class_,\n            )\n            for topic, values in words_per_topic.items()\n        ]\n        topics_per_class.extend(topics_at_class)\n\n    topics_per_class = 
pd.DataFrame(topics_per_class, columns=[\"Topic\", \"Words\", \"Frequency\", \"Class\"])\n\n    return topics_per_class\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.transform","title":"transform(self, documents, embeddings=None, images=None)","text":"

        After having fit a model, use transform to predict new instances.

        Parameters:

        Name Type Description Default documents Union[str, List[str]]

        A single document or a list of documents to predict on

        required embeddings ndarray

        Pre-trained document embeddings. These can be used instead of the sentence-transformer model.

        None images List[str]

        A list of paths to the images to predict on or the images themselves

        None

        Returns:

        Type Description predictions

        Topic predictions for each document. probabilities: The topic probability distribution, which is returned by default. If calculate_probabilities in BERTopic is set to False, then the probabilities are not calculated to speed up computation and decrease memory usage.

        Examples:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all')['data']\ntopic_model = BERTopic().fit(docs)\ntopics, probs = topic_model.transform(docs)\n

        If you want to use your own embeddings:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Create embeddings\ndocs = fetch_20newsgroups(subset='all')['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# Create topic model\ntopic_model = BERTopic().fit(docs, embeddings)\ntopics, probs = topic_model.transform(docs, embeddings)\n
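        Since documents may also be a single document, a minimal sketch of scoring one previously unseen document (the example text is made up):

        # Predict the topic of a single new document
        new_doc = "The latest graphics cards were benchmarked against last year's models"
        topics, probs = topic_model.transform([new_doc])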
        Source code in bertopic\\_bertopic.py
        def transform(\n    self,\n    documents: Union[str, List[str]],\n    embeddings: np.ndarray = None,\n    images: List[str] = None,\n) -> Tuple[List[int], np.ndarray]:\n    \"\"\"After having fit a model, use transform to predict new instances.\n\n    Arguments:\n        documents: A single document or a list of documents to predict on\n        embeddings: Pre-trained document embeddings. These can be used\n                    instead of the sentence-transformer model.\n        images: A list of paths to the images to predict on or the images themselves\n\n    Returns:\n        predictions: Topic predictions for each documents\n        probabilities: The topic probability distribution which is returned by default.\n                       If `calculate_probabilities` in BERTopic is set to False, then the\n                       probabilities are not calculated to speed up computation and\n                       decrease memory usage.\n\n    Examples:\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n\n    docs = fetch_20newsgroups(subset='all')['data']\n    topic_model = BERTopic().fit(docs)\n    topics, probs = topic_model.transform(docs)\n    ```\n\n    If you want to use your own embeddings:\n\n    ```python\n    from bertopic import BERTopic\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n\n    # Create embeddings\n    docs = fetch_20newsgroups(subset='all')['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n    # Create topic model\n    topic_model = BERTopic().fit(docs, embeddings)\n    topics, probs = topic_model.transform(docs, embeddings)\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_embeddings_shape(embeddings, documents)\n\n    if isinstance(documents, str) or documents is None:\n        documents = [documents]\n\n    if embeddings is None:\n        embeddings = self._extract_embeddings(documents, images=images, method=\"document\", verbose=self.verbose)\n\n    # Check if an embedding model was found\n    if embeddings is None:\n        raise ValueError(\n            \"No embedding model was found to embed the documents.\"\n            \"Make sure when loading in the model using BERTopic.load()\"\n            \"to also specify the embedding model.\"\n        )\n\n    # Transform without hdbscan_model and umap_model using only cosine similarity\n    elif type(self.hdbscan_model) == BaseCluster:\n        logger.info(\"Predicting topic assignments through cosine similarity of topic and document embeddings.\")\n        sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))\n        predictions = np.argmax(sim_matrix, axis=1) - self._outliers\n\n        if self.calculate_probabilities:\n            probabilities = sim_matrix\n        else:\n            probabilities = np.max(sim_matrix, axis=1)\n\n    # Transform with full pipeline\n    else:\n        logger.info(\"Dimensionality - Reducing dimensionality of input embeddings.\")\n        umap_embeddings = self.umap_model.transform(embeddings)\n        logger.info(\"Dimensionality - Completed \\u2713\")\n\n        # Extract predictions and probabilities if it is a HDBSCAN-like model\n        logger.info(\"Clustering - Approximating new points with `hdbscan_model`\")\n        if is_supported_hdbscan(self.hdbscan_model):\n            predictions, probabilities = hdbscan_delegator(\n     
           self.hdbscan_model, \"approximate_predict\", umap_embeddings\n            )\n\n            # Calculate probabilities\n            if self.calculate_probabilities:\n                logger.info(\"Probabilities - Start calculation of probabilities with HDBSCAN\")\n                probabilities = hdbscan_delegator(self.hdbscan_model, \"membership_vector\", umap_embeddings)\n                logger.info(\"Probabilities - Completed \\u2713\")\n        else:\n            predictions = self.hdbscan_model.predict(umap_embeddings)\n            probabilities = None\n        logger.info(\"Cluster - Completed \\u2713\")\n\n        # Map probabilities and predictions\n        probabilities = self._map_probabilities(probabilities, original_topics=True)\n        predictions = self._map_predictions(predictions)\n    return predictions, probabilities\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.update_topics","title":"update_topics(self, docs, images=None, topics=None, top_n_words=10, n_gram_range=None, vectorizer_model=None, ctfidf_model=None, representation_model=None)","text":"

        Updates the topic representation by recalculating c-TF-IDF with the new parameters as defined in this function.

        When you have trained a model and viewed the topics and the words that represent them, you might not be satisfied with the representation. Perhaps you forgot to remove stop_words or you want to try out a different n_gram_range. This function allows you to update the topic representation after they have been formed.

        Parameters:

        Name Type Description Default docs List[str]

        The documents you used when calling either fit or fit_transform

        required images List[str]

        The images you used when calling either fit or fit_transform

        None topics List[int]

        A list of topics where each topic is related to a document in docs. Use this variable to change or map the topics. NOTE: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.

        None top_n_words int

        The number of words per topic to extract. Setting this too high can negatively impact topic embeddings as topics are typically best represented by at most 10 words.

        10 n_gram_range Tuple[int, int]

        The n-gram range for the CountVectorizer.

        None vectorizer_model CountVectorizer

        Pass in your own CountVectorizer from scikit-learn

        None ctfidf_model ClassTfidfTransformer

        Pass in your own c-TF-IDF model to update the representations

        None representation_model BaseRepresentation

        Pass in a model that fine-tunes the topic representations calculated through c-TF-IDF. Models from bertopic.representation are supported.

        None

        Examples:

        In order to update the topic representation, you first need to fit the topic model and extract topics from it. Based on these, you can update the representation:

        topic_model.update_topics(docs, n_gram_range=(2, 3))\n

        You can also use a custom vectorizer to update the representation:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\")\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        You can also use this function to change or map the topics to something else. You can update them as follows:

        topic_model.update_topics(docs, topics=my_updated_topics)\n
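        You can also plug in a representation model while updating; a brief sketch, assuming KeyBERTInspired fits your use case:

        from bertopic.representation import KeyBERTInspired

        # Fine-tune the c-TF-IDF topic representations with a KeyBERT-inspired model
        representation_model = KeyBERTInspired()
        topic_model.update_topics(docs, representation_model=representation_model)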
        Source code in bertopic\\_bertopic.py
        def update_topics(\n    self,\n    docs: List[str],\n    images: List[str] = None,\n    topics: List[int] = None,\n    top_n_words: int = 10,\n    n_gram_range: Tuple[int, int] = None,\n    vectorizer_model: CountVectorizer = None,\n    ctfidf_model: ClassTfidfTransformer = None,\n    representation_model: BaseRepresentation = None,\n):\n    \"\"\"Updates the topic representation by recalculating c-TF-IDF with the new\n    parameters as defined in this function.\n\n    When you have trained a model and viewed the topics and the words that represent them,\n    you might not be satisfied with the representation. Perhaps you forgot to remove\n    stop_words or you want to try out a different n_gram_range. This function allows you\n    to update the topic representation after they have been formed.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        images: The images you used when calling either `fit` or `fit_transform`\n        topics: A list of topics where each topic is related to a document in `docs`.\n                Use this variable to change or map the topics.\n                NOTE: Using a custom list of topic assignments may lead to errors if\n                      topic reduction techniques are used afterwards. Make sure that\n                      manually assigning topics is the last step in the pipeline\n        top_n_words: The number of words per topic to extract. Setting this\n                     too high can negatively impact topic embeddings as topics\n                     are typically best represented by at most 10 words.\n        n_gram_range: The n-gram range for the CountVectorizer.\n        vectorizer_model: Pass in your own CountVectorizer from scikit-learn\n        ctfidf_model: Pass in your own c-TF-IDF model to update the representations\n        representation_model: Pass in a model that fine-tunes the topic representations\n                              calculated through c-TF-IDF. Models from `bertopic.representation`\n                              are supported.\n\n    Examples:\n    In order to update the topic representation, you will need to first fit the topic\n    model and extract topics from them. 
Based on these, you can update the representation:\n\n    ```python\n    topic_model.update_topics(docs, n_gram_range=(2, 3))\n    ```\n\n    You can also use a custom vectorizer to update the representation:\n\n    ```python\n    from sklearn.feature_extraction.text import CountVectorizer\n    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=\"english\")\n    topic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n    ```\n\n    You can also use this function to change or map the topics to something else.\n    You can update them as follows:\n\n    ```python\n    topic_model.update_topics(docs, my_updated_topics)\n    ```\n    \"\"\"\n    check_documents_type(docs)\n    check_is_fitted(self)\n    if not n_gram_range:\n        n_gram_range = self.n_gram_range\n\n    if top_n_words > 100:\n        logger.warning(\n            \"Note that extracting more than 100 words from a sparse \" \"can slow down computation quite a bit.\"\n        )\n    self.top_n_words = top_n_words\n    self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)\n    self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()\n    self.representation_model = representation_model\n\n    if topics is None:\n        topics = self.topics_\n    else:\n        logger.warning(\n            \"Using a custom list of topic assignments may lead to errors if \"\n            \"topic reduction techniques are used afterwards. Make sure that \"\n            \"manually assigning topics is the last step in the pipeline.\"\n            \"Note that topic embeddings will also be created through weighted\"\n            \"c-TF-IDF embeddings instead of centroid embeddings.\"\n        )\n\n    documents = pd.DataFrame({\"Document\": docs, \"Topic\": topics, \"ID\": range(len(docs)), \"Image\": images})\n    documents_per_topic = documents.groupby([\"Topic\"], as_index=False).agg({\"Document\": \" \".join})\n\n    # Update topic sizes and assignments\n    self._update_topic_size(documents)\n\n    # Extract words and update topic labels\n    self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)\n    self.topic_representations_ = self._extract_words_per_topic(words, documents)\n\n    # Update topic vectors\n    if set(topics) != self.topics_:\n        # Remove outlier topic embedding if all that has changed is the outlier class\n        same_position = all(\n            [\n                True if old_topic == new_topic else False\n                for old_topic, new_topic in zip(self.topics_, topics)\n                if old_topic != -1\n            ]\n        )\n        if same_position and -1 not in topics and -1 in self.topics_:\n            self.topic_embeddings_ = self.topic_embeddings_[1:]\n        else:\n            self._create_topic_vectors()\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_approximate_distribution","title":"visualize_approximate_distribution(self, document, topic_token_distribution, normalize=False)","text":"

Visualize the topic distribution calculated by .approximate_distribution on a token level, thereby indicating the extent to which a certain word or phrase belongs to a specific topic. The assumption here is that a single word can belong to multiple similar topics and, as such, can give information about the broader set of topics within a single document.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topic_model | | A fitted BERTopic instance. | required |
| document | str | The document for which you want to visualize the approximated topic distribution. | required |
| topic_token_distribution | ndarray | The topic-token distribution of the document as extracted by .approximate_distribution | required |
| normalize | bool | Whether to normalize, between 0 and 1 (summing up to 1), the topic distribution values. | False |

Returns:

| Type | Description |
| --- | --- |
| df | A stylized dataframe indicating the best fitting topics for each token. |

        Examples:

        # Calculate the topic distributions on a token level\n# Note that we need to have `calculate_token_level=True`\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(\n        docs, calculate_token_level=True\n)\n\n# Visualize the approximated topic distributions\ndf = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])\ndf\n

        To revert this stylized dataframe back to a regular dataframe, you can run the following:

        df.data.columns = [column.strip() for column in df.data.columns]\ndf = df.data\n
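If the raw values are hard to compare across tokens, the `normalize` parameter described above rescales each token's distribution; a small illustrative sketch:

```python
# Normalize the token-level values (per token, summing up to 1) before styling
df = topic_model.visualize_approximate_distribution(
    docs[0], topic_token_distr[0], normalize=True
)
```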
        Source code in bertopic\\_bertopic.py
        def visualize_approximate_distribution(\n    self,\n    document: str,\n    topic_token_distribution: np.ndarray,\n    normalize: bool = False,\n):\n    \"\"\"Visualize the topic distribution calculated by `.approximate_topic_distribution`\n    on a token level. Thereby indicating the extent to which a certain word or phrase belongs\n    to a specific topic. The assumption here is that a single word can belong to multiple\n    similar topics and as such can give information about the broader set of topics within\n    a single document.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        document: The document for which you want to visualize\n                  the approximated topic distribution.\n        topic_token_distribution: The topic-token distribution of the document as\n                                  extracted by `.approximate_topic_distribution`\n        normalize: Whether to normalize, between 0 and 1 (summing up to 1), the\n                   topic distribution values.\n\n    Returns:\n        df: A stylized dataframe indicating the best fitting topics\n            for each token.\n\n    Examples:\n    ```python\n    # Calculate the topic distributions on a token level\n    # Note that we need to have `calculate_token_level=True`\n    topic_distr, topic_token_distr = topic_model.approximate_distribution(\n            docs, calculate_token_level=True\n    )\n\n    # Visualize the approximated topic distributions\n    df = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])\n    df\n    ```\n\n    To revert this stylized dataframe back to a regular dataframe,\n    you can run the following:\n\n    ```python\n    df.data.columns = [column.strip() for column in df.data.columns]\n    df = df.data\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_approximate_distribution(\n        self,\n        document=document,\n        topic_token_distribution=topic_token_distribution,\n        normalize=normalize,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_barchart","title":"visualize_barchart(self, topics=None, top_n_topics=8, n_words=5, custom_labels=False, title='Topic Word Scores', width=250, height=250, autoscale=False)","text":"

        Visualize a barchart of selected topics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topics | List[int] | A selection of topics to visualize. | None |
| top_n_topics | int | Only select the top n most frequent topics. | 8 |
| n_words | int | Number of words to show in a topic | 5 |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. | False |
| title | str | Title of the plot. | 'Topic Word Scores' |
| width | int | The width of each figure. | 250 |
| height | int | The height of each figure. | 250 |
| autoscale | bool | Whether to automatically calculate the height of the figures to fit the whole bar text | False |

Returns:

| Type | Description |
| --- | --- |
| fig | A plotly figure |

        Examples:

        To visualize the barchart of selected topics simply run:

        topic_model.visualize_barchart()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_barchart()\nfig.write_html(\"path/to/file.html\")\n
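The parameters documented above can be combined; for example, a sketch that shows more topics and more words per topic while letting the bars autoscale:

```python
# Show the 12 most frequent topics with 8 words each, autoscaling bar heights
fig = topic_model.visualize_barchart(top_n_topics=12, n_words=8, autoscale=True)
fig.write_html("path/to/file.html")
```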
        Source code in bertopic\\_bertopic.py
        def visualize_barchart(\n    self,\n    topics: List[int] = None,\n    top_n_topics: int = 8,\n    n_words: int = 5,\n    custom_labels: bool = False,\n    title: str = \"Topic Word Scores\",\n    width: int = 250,\n    height: int = 250,\n    autoscale: bool = False,\n) -> go.Figure:\n    \"\"\"Visualize a barchart of selected topics.\n\n    Arguments:\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_words: Number of words to show in a topic\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of each figure.\n        height: The height of each figure.\n        autoscale: Whether to automatically calculate the height of the figures to fit the whole bar text\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the barchart of selected topics\n    simply run:\n\n    ```python\n    topic_model.visualize_barchart()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_barchart()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_barchart(\n        self,\n        topics=topics,\n        top_n_topics=top_n_topics,\n        n_words=n_words,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n        autoscale=autoscale,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_distribution","title":"visualize_distribution(self, probabilities, min_probability=0.015, custom_labels=False, title='<b>Topic Probability Distribution</b>', width=800, height=600)","text":"

        Visualize the distribution of topic probabilities.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| probabilities | ndarray | An array of probability scores | required |
| min_probability | float | The minimum probability score to visualize. All others are ignored. | 0.015 |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. | False |
| title | str | Title of the plot. | '<b>Topic Probability Distribution</b>' |
| width | int | The width of the figure. | 800 |
| height | int | The height of the figure. | 600 |

        Examples:

Make sure to fit the model beforehand and only input the probabilities of a single document:

        topic_model.visualize_distribution(topic_model.probabilities_[0])\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_distribution(topic_model.probabilities_[0])\nfig.write_html(\"path/to/file.html\")\n
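Note that `topic_model.probabilities_` is only populated when probabilities are calculated during training. A minimal sketch of that setup (assuming the default HDBSCAN-based pipeline):

```python
from bertopic import BERTopic

# calculate_probabilities=True stores a topic-probability vector per document,
# which is what visualize_distribution expects (one document at a time)
topic_model = BERTopic(calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs)

topic_model.visualize_distribution(probs[0])
```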
        Source code in bertopic\\_bertopic.py
        def visualize_distribution(\n    self,\n    probabilities: np.ndarray,\n    min_probability: float = 0.015,\n    custom_labels: bool = False,\n    title: str = \"<b>Topic Probability Distribution</b>\",\n    width: int = 800,\n    height: int = 600,\n) -> go.Figure:\n    \"\"\"Visualize the distribution of topic probabilities.\n\n    Arguments:\n        probabilities: An array of probability scores\n        min_probability: The minimum probability score to visualize.\n                         All others are ignored.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    Make sure to fit the model before and only input the\n    probabilities of a single document:\n\n    ```python\n    topic_model.visualize_distribution(topic_model.probabilities_[0])\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_distribution(topic_model.probabilities_[0])\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_distribution(\n        self,\n        probabilities=probabilities,\n        min_probability=min_probability,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_document_datamap","title":"visualize_document_datamap(self, docs, topics=None, embeddings=None, reduced_embeddings=None, custom_labels=False, title='Documents and Topics', sub_title=None, width=1200, height=1200, **datamap_kwds)","text":"

        Visualize documents and their topics in 2D as a static plot for publication using DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best to use a sufficiently large min_topic_size or set nr_topics when building the model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topic_model | | A fitted BERTopic instance. | required |
| docs | List[str] | The documents you used when calling either fit or fit_transform | required |
| topics | List[int] | A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. Documents not in these topics will be shown as noise points. | None |
| embeddings | ndarray | The embeddings of all documents in docs. | None |
| reduced_embeddings | ndarray | The 2D reduced embeddings of all documents in docs. | None |
| custom_labels | Union[bool, str] | If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., "Aspect1". | False |
| title | str | Title of the plot. | 'Documents and Topics' |
| sub_title | Optional[str] | Sub-title of the plot. | None |
| width | int | The width of the figure. | 1200 |
| height | int | The height of the figure. | 1200 |
| **datamap_kwds | | All further keyword args will be passed on to DataMapPlot's create_plot function. See the DataMapPlot documentation for more details. | {} |

Returns:

| Type | Description |
| --- | --- |
| figure | A Matplotlib Figure object. |

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_document_datamap(docs)\n

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic\ntopic_model = BERTopic(min_topic_size=36).fit(docs, embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional\n# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n# Run the visualization with the original embeddings\ntopic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n# Or, if you have reduced the original embeddings already:\ntopic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\nfig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n
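Since `custom_labels` also accepts a string, labels from another representation aspect can be used directly; a sketch (the aspect name "Aspect1" and the sub-title are illustrative placeholders):

```python
# Label clusters with a named representation aspect instead of the default labels
fig = topic_model.visualize_document_datamap(
    docs,
    reduced_embeddings=reduced_embeddings,
    custom_labels="Aspect1",
    sub_title="20 Newsgroups",  # illustrative sub-title
)
fig.savefig("path/to/file.png", bbox_inches="tight")
```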
        Source code in bertopic\\_bertopic.py
        def visualize_document_datamap(\n    self,\n    docs: List[str],\n    topics: List[int] = None,\n    embeddings: np.ndarray = None,\n    reduced_embeddings: np.ndarray = None,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"Documents and Topics\",\n    sub_title: Union[str, None] = None,\n    width: int = 1200,\n    height: int = 1200,\n    **datamap_kwds,\n):\n    \"\"\"Visualize documents and their topics in 2D as a static plot for publication using\n    DataMapPlot. This works best if there are between 5 and 60 topics. It is therefore best\n    to use a sufficiently large `min_topic_size` or set `nr_topics` when building the model.\n\n    Arguments:\n        topic_model:  A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics: A selection of topics to visualize.\n        Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. Documents not in these topics will be shown as noise points.\n        embeddings:  The embeddings of all documents in `docs`.\n        reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.\n        custom_labels:  If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        sub_title: Sub-title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n        **datamap_kwds:  All further keyword args will be passed on to DataMapPlot's\n                         `create_plot` function. 
See the DataMapPlot documentation\n                         for more details.\n\n    Returns:\n        figure: A Matplotlib Figure object.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_document_datamap(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic(min_topic_size=36).fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    fig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    return plotting.visualize_document_datamap(\n        self,\n        docs,\n        topics,\n        embeddings,\n        reduced_embeddings,\n        custom_labels,\n        title,\n        sub_title,\n        width,\n        height,\n        **datamap_kwds,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_documents","title":"visualize_documents(self, docs, topics=None, embeddings=None, reduced_embeddings=None, sample=None, hide_annotations=False, hide_document_hover=False, custom_labels=False, title='<b>Documents and Topics</b>', width=1200, height=750)","text":"

        Visualize documents and their topics in 2D.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topic_model | | A fitted BERTopic instance. | required |
| docs | List[str] | The documents you used when calling either fit or fit_transform | required |
| topics | List[int] | A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. | None |
| embeddings | ndarray | The embeddings of all documents in docs. | None |
| reduced_embeddings | ndarray | The 2D reduced embeddings of all documents in docs. | None |
| sample | float | The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen. | None |
| hide_annotations | bool | Hide the names of the traces on top of each cluster. | False |
| hide_document_hover | bool | Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualization. | False |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. | False |
| title | str | Title of the plot. | '<b>Documents and Topics</b>' |
| width | int | The width of the figure. | 1200 |
| height | int | The height of the figure. | 750 |

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_documents(docs)\n

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic\ntopic_model = BERTopic().fit(docs, embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional\n# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n# Run the visualization with the original embeddings\ntopic_model.visualize_documents(docs, embeddings=embeddings)\n\n# Or, if you have reduced the original embeddings already:\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\nfig.write_html(\"path/to/file.html\")\n
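For very large corpora, the `sample` and `hide_document_hover` parameters described above keep the figure responsive; a sketch:

```python
# Plot roughly 5% of the documents per topic and skip the per-point hover text
topic_model.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings,
    sample=0.05,
    hide_document_hover=True,
)
```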
        Source code in bertopic\\_bertopic.py
        def visualize_documents(\n    self,\n    docs: List[str],\n    topics: List[int] = None,\n    embeddings: np.ndarray = None,\n    reduced_embeddings: np.ndarray = None,\n    sample: float = None,\n    hide_annotations: bool = False,\n    hide_document_hover: bool = False,\n    custom_labels: bool = False,\n    title: str = \"<b>Documents and Topics</b>\",\n    width: int = 1200,\n    height: int = 750,\n) -> go.Figure:\n    \"\"\"Visualize documents and their topics in 2D.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                            specific points. Helps to speed up generation of visualization.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_documents(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic().fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_documents(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../getting_started/visualization/documents.html\"\n    style=\"width:1000px; height: 800px; border: 0px;\"\"></iframe>\n    \"\"\"\n    check_is_fitted(self)\n    
check_documents_type(docs)\n    return plotting.visualize_documents(\n        self,\n        docs=docs,\n        topics=topics,\n        embeddings=embeddings,\n        reduced_embeddings=reduced_embeddings,\n        sample=sample,\n        hide_annotations=hide_annotations,\n        hide_document_hover=hide_document_hover,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_heatmap","title":"visualize_heatmap(self, topics=None, top_n_topics=None, n_clusters=None, use_ctfidf=False, custom_labels=False, title='<b>Similarity Matrix</b>', width=800, height=800)","text":"

        Visualize a heatmap of the topic's similarity matrix.

        Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics, a heatmap is created showing the similarity between topics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topics | List[int] | A selection of topics to visualize. | None |
| top_n_topics | int | Only select the top n most frequent topics. | None |
| n_clusters | int | Create n clusters and order the similarity matrix by those clusters. | None |
| use_ctfidf | bool | Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used. | False |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. | False |
| title | str | Title of the plot. | '<b>Similarity Matrix</b>' |
| width | int | The width of the figure. | 800 |
| height | int | The height of the figure. | 800 |

Returns:

| Type | Description |
| --- | --- |
| fig | A plotly figure |

        Examples:

        To visualize the similarity matrix of topics simply run:

        topic_model.visualize_heatmap()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_heatmap()\nfig.write_html(\"path/to/file.html\")\n
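To make the matrix easier to read, topics can be grouped with `n_clusters`, and `use_ctfidf` switches the underlying similarity to the c-TF-IDF representations; a sketch:

```python
# Order the similarity matrix by 10 clusters of similar topics,
# using c-TF-IDF representations instead of the embedding model
fig = topic_model.visualize_heatmap(n_clusters=10, use_ctfidf=True)
fig.write_html("path/to/file.html")
```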
        Source code in bertopic\\_bertopic.py
        def visualize_heatmap(\n    self,\n    topics: List[int] = None,\n    top_n_topics: int = None,\n    n_clusters: int = None,\n    use_ctfidf: bool = False,\n    custom_labels: bool = False,\n    title: str = \"<b>Similarity Matrix</b>\",\n    width: int = 800,\n    height: int = 800,\n) -> go.Figure:\n    \"\"\"Visualize a heatmap of the topic's similarity matrix.\n\n    Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics,\n    a heatmap is created showing the similarity between topics.\n\n    Arguments:\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_clusters: Create n clusters and order the similarity\n                    matrix by those clusters.\n        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                    embeddings from the embedding model are used.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the similarity matrix of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_heatmap()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_heatmap()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_heatmap(\n        self,\n        topics=topics,\n        top_n_topics=top_n_topics,\n        n_clusters=n_clusters,\n        use_ctfidf=use_ctfidf,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_hierarchical_documents","title":"visualize_hierarchical_documents(self, docs, hierarchical_topics, topics=None, embeddings=None, reduced_embeddings=None, sample=None, hide_annotations=False, hide_document_hover=True, nr_levels=10, level_scale='linear', custom_labels=False, title='<b>Hierarchical Documents and Topics</b>', width=1200, height=750)","text":"

        Visualize documents and their topics in 2D at different levels of hierarchy.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| docs | List[str] | The documents you used when calling either fit or fit_transform | required |
| hierarchical_topics | DataFrame | A dataframe that contains a hierarchy of topics represented by their parents and their children | required |
| topics | List[int] | A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. | None |
| embeddings | ndarray | The embeddings of all documents in docs. | None |
| reduced_embeddings | ndarray | The 2D reduced embeddings of all documents in docs. | None |
| sample | Union[float, int] | The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen. | None |
| hide_annotations | bool | Hide the names of the traces on top of each cluster. | False |
| hide_document_hover | bool | Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualizations. | True |
| nr_levels | int | The number of levels to be visualized in the hierarchy. First, the distances in hierarchical_topics.Distance are split in nr_levels lists of distances with equal length. Then, for each list of distances, the merged topics that have a distance less than or equal to the maximum distance of the selected list are selected. NOTE: To get all possible merge steps, make sure that nr_levels is equal to the length of hierarchical_topics. | 10 |
| level_scale | str | Whether to apply a linear or logarithmic ('log') scale to the levels of the distance vector. Linear scaling will perform an equal number of merges at each level, while logarithmic scaling will perform more merges in earlier levels to provide more resolution at higher levels (useful when the number of topics is large). | 'linear' |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. NOTE: Custom labels are only generated for the original un-merged topics. | False |
| title | str | Title of the plot. | '<b>Hierarchical Documents and Topics</b>' |
| width | int | The width of the figure. | 1200 |
| height | int | The height of the figure. | 750 |

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)\n

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic and extract hierarchical topics\ntopic_model = BERTopic().fit(docs, embeddings)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n\n# Reduce dimensionality of embeddings, this step is optional\n# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n# Run the visualization with the original embeddings\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n# Or, if you have reduced the original embeddings already:\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\nfig.write_html(\"path/to/file.html\")\n
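As noted above, setting `nr_levels` to the length of `hierarchical_topics` shows every merge step, and `level_scale="log"` gives more resolution when there are many topics; a sketch:

```python
# Visualize all merge steps with logarithmic level spacing
topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    reduced_embeddings=reduced_embeddings,
    nr_levels=len(hierarchical_topics),
    level_scale="log",
)
```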
        Source code in bertopic\\_bertopic.py
        def visualize_hierarchical_documents(\n    self,\n    docs: List[str],\n    hierarchical_topics: pd.DataFrame,\n    topics: List[int] = None,\n    embeddings: np.ndarray = None,\n    reduced_embeddings: np.ndarray = None,\n    sample: Union[float, int] = None,\n    hide_annotations: bool = False,\n    hide_document_hover: bool = True,\n    nr_levels: int = 10,\n    level_scale: str = \"linear\",\n    custom_labels: bool = False,\n    title: str = \"<b>Hierarchical Documents and Topics</b>\",\n    width: int = 1200,\n    height: int = 750,\n) -> go.Figure:\n    \"\"\"Visualize documents and their topics in 2D at different levels of hierarchy.\n\n    Arguments:\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                            represented by their parents and their children\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                             specific points. Helps to speed up generation of visualizations.\n        nr_levels: The number of levels to be visualized in the hierarchy. First, the distances\n                   in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances with\n                   equal length. Then, for each list of distances, the merged topics, that have\n                   a distance less or equal to the maximum distance of the selected list of distances, are selected.\n                   NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to\n                   the length of `hierarchical_topics`.\n        level_scale: Whether to apply a linear or logarithmic ('log') scale levels of the distance\n                     vector. 
Linear scaling will perform an equal number of merges at each level\n                     while logarithmic scaling will perform more mergers in earlier levels to\n                     provide more resolution at higher levels (this can be used for when the number\n                     of topics is large).\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       NOTE: Custom labels are only generated for the original\n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic and extract hierarchical topics\n    topic_model = BERTopic().fit(docs, embeddings)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../getting_started/visualization/hierarchical_documents.html\"\n    style=\"width:1000px; height: 770px; border: 0px;\"\"></iframe>\n    \"\"\"\n    check_is_fitted(self)\n    check_documents_type(docs)\n    return plotting.visualize_hierarchical_documents(\n        self,\n        docs=docs,\n        hierarchical_topics=hierarchical_topics,\n        topics=topics,\n        embeddings=embeddings,\n        reduced_embeddings=reduced_embeddings,\n        sample=sample,\n        hide_annotations=hide_annotations,\n        hide_document_hover=hide_document_hover,\n        nr_levels=nr_levels,\n        level_scale=level_scale,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_hierarchy","title":"visualize_hierarchy(self, orientation='left', topics=None, top_n_topics=None, use_ctfidf=True, custom_labels=False, title='<b>Hierarchical Clustering</b>', width=1000, height=600, hierarchical_topics=None, linkage_function=None, distance_function=None, color_threshold=1)","text":"

        Visualize a hierarchical structure of the topics.

        A ward linkage function is used to perform the hierarchical clustering based on the cosine distance matrix between c-TF-IDF or semantic embeddings of the topics.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topic_model | | A fitted BERTopic instance. | required |
| orientation | str | The orientation of the figure. Either 'left' or 'bottom' | 'left' |
| topics | List[int] | A selection of topics to visualize | None |
| top_n_topics | int | Only select the top n most frequent topics | None |
| use_ctfidf | bool | Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used. | True |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. NOTE: Custom labels are only generated for the original un-merged topics. | False |
| title | str | Title of the plot. | '<b>Hierarchical Clustering</b>' |
| width | int | The width of the figure. Only works if orientation is set to 'left' | 1000 |
| height | int | The height of the figure. Only works if orientation is set to 'bottom' | 600 |
| hierarchical_topics | DataFrame | A dataframe that contains a hierarchy of topics represented by their parents and their children. NOTE: The hierarchical topic names are only visualized if both topics and top_n_topics are not set. | None |
| linkage_function | Callable[[scipy.sparse._csr.csr_matrix], numpy.ndarray] | The linkage function to use. Default is: lambda x: sch.linkage(x, 'ward', optimal_ordering=True) NOTE: Make sure to use the same linkage_function as used in topic_model.hierarchical_topics. | None |
| distance_function | Callable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix] | The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x) NOTE: Make sure to use the same distance_function as used in topic_model.hierarchical_topics. | None |
| color_threshold | int | Value at which the separation of clusters will be made, which will result in different colors for different clusters. A higher value will typically lead to less colored clusters. | 1 |

Returns:

| Type | Description |
| --- | --- |
| fig | A plotly figure |

        Examples:

        To visualize the hierarchical structure of topics simply run:

        topic_model.visualize_hierarchy()\n

        If you also want the labels of hierarchical topics visualized, run the following:

        # Extract hierarchical topics and their representations\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n\n# Visualize these representations\ntopic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you want to save the resulting figure:

        fig = topic_model.visualize_hierarchy()\nfig.write_html(\"path/to/file.html\")\n
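If you override `linkage_function` or `distance_function`, the NOTE above asks that the same callables be passed to `topic_model.hierarchical_topics`; a sketch (assuming `hierarchical_topics` accepts the same two arguments, as that NOTE implies):

```python
from scipy.cluster import hierarchy as sch
from sklearn.metrics.pairwise import cosine_similarity

# Use identical linkage and distance functions in both steps
linkage_function = lambda x: sch.linkage(x, "ward", optimal_ordering=True)
distance_function = lambda x: 1 - cosine_similarity(x)

hierarchical_topics = topic_model.hierarchical_topics(
    docs, linkage_function=linkage_function, distance_function=distance_function
)
topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
    linkage_function=linkage_function,
    distance_function=distance_function,
)
```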
        Source code in bertopic\\_bertopic.py
        def visualize_hierarchy(\n    self,\n    orientation: str = \"left\",\n    topics: List[int] = None,\n    top_n_topics: int = None,\n    use_ctfidf: bool = True,\n    custom_labels: bool = False,\n    title: str = \"<b>Hierarchical Clustering</b>\",\n    width: int = 1000,\n    height: int = 600,\n    hierarchical_topics: pd.DataFrame = None,\n    linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n    distance_function: Callable[[csr_matrix], csr_matrix] = None,\n    color_threshold: int = 1,\n) -> go.Figure:\n    \"\"\"Visualize a hierarchical structure of the topics.\n\n    A ward linkage function is used to perform the\n    hierarchical clustering based on the cosine distance\n    matrix between c-TF-IDF or semantic embeddings of the topics.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        orientation: The orientation of the figure.\n                     Either 'left' or 'bottom'\n        topics: A selection of topics to visualize\n        top_n_topics: Only select the top n most frequent topics\n        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the\n                    embeddings from the embedding model are used.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       NOTE: Custom labels are only generated for the original\n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure. Only works if orientation is set to 'left'\n        height: The height of the figure. Only works if orientation is set to 'bottom'\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children.\n                             NOTE: The hierarchical topic names are only visualized\n                             if both `topics` and `top_n_topics` are not set.\n        linkage_function: The linkage function to use. Default is:\n                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n                          NOTE: Make sure to use the same `linkage_function` as used\n                          in `topic_model.hierarchical_topics`.\n        distance_function: The distance function to use on the c-TF-IDF matrix. 
Default is:\n                           `lambda x: 1 - cosine_similarity(x)`\n                           NOTE: Make sure to use the same `distance_function` as used\n                           in `topic_model.hierarchical_topics`.\n        color_threshold: Value at which the separation of clusters will be made which\n                         will result in different colors for different clusters.\n                         A higher value will typically lead to less colored clusters.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the hierarchical structure of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchy()\n    ```\n\n    If you also want the labels of hierarchical topics visualized,\n    run the following:\n\n    ```python\n    # Extract hierarchical topics and their representations\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Visualize these representations\n    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n    ```\n\n    If you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchy()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../getting_started/visualization/hierarchy.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_hierarchy(\n        self,\n        orientation=orientation,\n        topics=topics,\n        top_n_topics=top_n_topics,\n        use_ctfidf=use_ctfidf,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n        hierarchical_topics=hierarchical_topics,\n        linkage_function=linkage_function,\n        distance_function=distance_function,\n        color_threshold=color_threshold,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_term_rank","title":"visualize_term_rank(self, topics=None, log_scale=False, custom_labels=False, title='<b>Term score decline per Topic</b>', width=800, height=500)","text":"

        Visualize the ranks of all terms across all topics.

        Each topic is represented by a set of words. These words, however, do not all equally represent the topic. This visualization shows how many words are needed to represent a topic and at which point the beneficial effect of adding words starts to decline.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topics | List[int] | A selection of topics to visualize. These will be colored red where all others will be colored black. | None |
| log_scale | bool | Whether to represent the ranking on a log scale | False |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. | False |
| title | str | Title of the plot. | '<b>Term score decline per Topic</b>' |
| width | int | The width of the figure. | 800 |
| height | int | The height of the figure. | 500 |

Returns:

| Type | Description |
| --- | --- |
| fig | A plotly figure |

        Examples:

        To visualize the ranks of all words across all topics simply run:

        topic_model.visualize_term_rank()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_term_rank()\nfig.write_html(\"path/to/file.html\")\n
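When the score decline is hard to see on a linear axis, the `log_scale` and `topics` parameters described above can help; a small sketch (topic ids are placeholders):

```python
# Show the rank decline on a logarithmic scale and highlight a few topics in red
fig = topic_model.visualize_term_rank(topics=[1, 2, 3], log_scale=True)
```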

        Reference:

This visualization was heavily inspired by the "Term Probability Decline" visualization found in an analysis by the amazing tmtoolkit (https://tmtoolkit.readthedocs.io/). Reference to that specific analysis can be found at https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html.

        Source code in bertopic\\_bertopic.py
        def visualize_term_rank(\n    self,\n    topics: List[int] = None,\n    log_scale: bool = False,\n    custom_labels: bool = False,\n    title: str = \"<b>Term score decline per Topic</b>\",\n    width: int = 800,\n    height: int = 500,\n) -> go.Figure:\n    \"\"\"Visualize the ranks of all terms across all topics.\n\n    Each topic is represented by a set of words. These words, however,\n    do not all equally represent the topic. This visualization shows\n    how many words are needed to represent a topic and at which point\n    the beneficial effect of adding words starts to decline.\n\n    Arguments:\n        topics: A selection of topics to visualize. These will be colored\n                red where all others will be colored black.\n        log_scale: Whether to represent the ranking on a log scale\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the ranks of all words across\n    all topics simply run:\n\n    ```python\n    topic_model.visualize_term_rank()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_term_rank()\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    Reference:\n\n    This visualization was heavily inspired by the\n    \"Term Probability Decline\" visualization found in an\n    analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).\n    Reference to that specific analysis can be found\n    [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_term_rank(\n        self,\n        topics=topics,\n        log_scale=log_scale,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_topics","title":"visualize_topics(self, topics=None, top_n_topics=None, use_ctfidf=False, custom_labels=False, title='<b>Intertopic Distance Map</b>', width=650, height=650)","text":"

        Visualize topics, their sizes, and their corresponding words.

        This visualization is highly inspired by LDAvis, a great visualization technique typically reserved for LDA.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topics | List[int] | A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. | None |
| top_n_topics | int | Only select the top n most frequent topics | None |
| use_ctfidf | bool | Whether to use c-TF-IDF representations instead of the embeddings from the embedding model. | False |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. | False |
| title | str | Title of the plot. | '<b>Intertopic Distance Map</b>' |
| width | int | The width of the figure. | 650 |
| height | int | The height of the figure. | 650 |

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_topics()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics()\nfig.write_html(\"path/to/file.html\")\n
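For models with many topics, the map can be restricted to the largest ones; a sketch:

```python
# Limit the intertopic distance map to the 20 most frequent topics
topic_model.visualize_topics(top_n_topics=20)
```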
        Source code in bertopic\\_bertopic.py
        def visualize_topics(\n    self,\n    topics: List[int] = None,\n    top_n_topics: int = None,\n    use_ctfidf: bool = False,\n    custom_labels: bool = False,\n    title: str = \"<b>Intertopic Distance Map</b>\",\n    width: int = 650,\n    height: int = 650,\n) -> go.Figure:\n    \"\"\"Visualize topics, their sizes, and their corresponding words.\n\n    This visualization is highly inspired by LDAvis, a great visualization\n    technique typically reserved for LDA.\n\n    Arguments:\n        topics: A selection of topics to visualize\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        top_n_topics: Only select the top n most frequent topics\n        use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model.\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_topics()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_topics(\n        self,\n        topics=topics,\n        top_n_topics=top_n_topics,\n        use_ctfidf=use_ctfidf,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_topics_over_time","title":"visualize_topics_over_time(self, topics_over_time, top_n_topics=None, topics=None, normalize_frequency=False, custom_labels=False, title='<b>Topics over Time</b>', width=1250, height=450)","text":"

        Visualize topics over time.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| topics_over_time | DataFrame | The topics you would like to be visualized with the corresponding topic representation | required |
| top_n_topics | int | To visualize the most frequent topics instead of all | None |
| topics | List[int] | Select which topics you would like to be visualized | None |
| normalize_frequency | bool | Whether to normalize each topic's frequency individually | False |
| custom_labels | bool | Whether to use custom topic labels that were defined using topic_model.set_topic_labels. | False |
| title | str | Title of the plot. | '<b>Topics over Time</b>' |
| width | int | The width of the figure. | 1250 |
| height | int | The height of the figure. | 450 |

Returns:

| Type | Description |
| --- | --- |
| Figure | A plotly.graph_objects.Figure including all traces |

        Examples:

        To visualize the topics over time, simply run:

        topics_over_time = topic_model.topics_over_time(docs, timestamps)\ntopic_model.visualize_topics_over_time(topics_over_time)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics_over_time(topics_over_time)\nfig.write_html(\"path/to/file.html\")\n
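To focus the plot, a subset of topics can be selected and their frequencies normalized; a sketch (the topic ids are placeholders):

```python
# Track only topics 1-3 and normalize each topic's frequency over time
topic_model.visualize_topics_over_time(
    topics_over_time, topics=[1, 2, 3], normalize_frequency=True
)
```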
        Source code in bertopic\\_bertopic.py
        def visualize_topics_over_time(\n    self,\n    topics_over_time: pd.DataFrame,\n    top_n_topics: int = None,\n    topics: List[int] = None,\n    normalize_frequency: bool = False,\n    custom_labels: bool = False,\n    title: str = \"<b>Topics over Time</b>\",\n    width: int = 1250,\n    height: int = 450,\n) -> go.Figure:\n    \"\"\"Visualize topics over time.\n\n    Arguments:\n        topics_over_time: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n    To visualize the topics over time, simply run:\n\n    ```python\n    topics_over_time = topic_model.topics_over_time(docs, timestamps)\n    topic_model.visualize_topics_over_time(topics_over_time)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_over_time(topics_over_time)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_topics_over_time(\n        self,\n        topics_over_time=topics_over_time,\n        top_n_topics=top_n_topics,\n        topics=topics,\n        normalize_frequency=normalize_frequency,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/bertopic.html#bertopic._bertopic.BERTopic.visualize_topics_per_class","title":"visualize_topics_per_class(self, topics_per_class, top_n_topics=10, topics=None, normalize_frequency=False, custom_labels=False, title='<b>Topics per Class</b>', width=1250, height=900)","text":"

        Visualize topics per class.

        Parameters:

        Name Type Description Default topics_per_class DataFrame

        The topics you would like to be visualized with the corresponding topic representation

        required top_n_topics int

        To visualize the most frequent topics instead of all

        10 topics List[int]

        Select which topics you would like to be visualized

        None normalize_frequency bool

        Whether to normalize each topic's frequency individually

        False custom_labels bool

        Whether to use custom topic labels that were defined using topic_model.set_topic_labels.

        False title str

        Title of the plot.

        '<b>Topics per Class</b>' width int

        The width of the figure.

        1250 height int

        The height of the figure.

        900

        Returns:

        Type Description Figure

        A plotly.graph_objects.Figure including all traces

        Examples:

        To visualize the topics per class, simply run:

        topics_per_class = topic_model.topics_per_class(docs, classes)\ntopic_model.visualize_topics_per_class(topics_per_class)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics_per_class(topics_per_class)\nfig.write_html(\"path/to/file.html\")\n
        Source code in bertopic\\_bertopic.py
        def visualize_topics_per_class(\n    self,\n    topics_per_class: pd.DataFrame,\n    top_n_topics: int = 10,\n    topics: List[int] = None,\n    normalize_frequency: bool = False,\n    custom_labels: bool = False,\n    title: str = \"<b>Topics per Class</b>\",\n    width: int = 1250,\n    height: int = 900,\n) -> go.Figure:\n    \"\"\"Visualize topics per class.\n\n    Arguments:\n        topics_per_class: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: Whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n    To visualize the topics per class, simply run:\n\n    ```python\n    topics_per_class = topic_model.topics_per_class(docs, classes)\n    topic_model.visualize_topics_per_class(topics_per_class)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_per_class(topics_per_class)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    \"\"\"\n    check_is_fitted(self)\n    return plotting.visualize_topics_per_class(\n        self,\n        topics_per_class=topics_per_class,\n        top_n_topics=top_n_topics,\n        topics=topics,\n        normalize_frequency=normalize_frequency,\n        custom_labels=custom_labels,\n        title=title,\n        width=width,\n        height=height,\n    )\n
        "},{"location":"api/ctfidf.html","title":"c-TF-IDF","text":"

        A Class-based TF-IDF procedure using scikit-learn's TfidfTransformer as a base.

        c-TF-IDF can best be explained as a TF-IDF formula adapted for multiple classes by joining all documents per class. Thus, each class is converted to a single document instead of a set of documents. The frequency of each word x is extracted for each class c and is l1-normalized. This constitutes the term frequency.

        Then, the term frequency is multiplied by the IDF, which is the logarithm of 1 plus the average number of words per class A divided by the frequency of word x across all classes.
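
        As a minimal numerical sketch of this formula (the toy class-by-term count matrix below is a hypothetical illustration, not part of the library), the computation can be reproduced with NumPy:

        import numpy as np\n\n# Hypothetical class-by-term count matrix: 3 classes, 4 words\nX = np.array([[10, 0, 2, 1], [3, 5, 0, 0], [1, 1, 8, 2]])\n\ntf = X / X.sum(axis=1, keepdims=True)        # l1-normalized term frequency per class\ndf = X.sum(axis=0)                           # frequency of each word across all classes\navg_nr_samples = int(X.sum(axis=1).mean())   # average number of words per class (A)\nidf = np.log((avg_nr_samples / df) + 1)      # idf term of the c-TF-IDF formula\nctfidf = tf * idf                            # class-based TF-IDF scores\n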

        Parameters:

        Name Type Description Default bm25_weighting bool

        Uses a BM25-inspired idf-weighting procedure instead of the procedure as defined in the c-TF-IDF formula. It uses the following weighting scheme: log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))

        False reduce_frequent_words bool

        Takes the square root of the bag-of-words after normalizing the matrix. Helps to reduce the impact of words that appear too frequently.

        False seed_words List[str]

        Specific words that will have their idf value increased by multiplying it with seed_multiplier. NOTE: This will only increase the value of words that have an exact match.

        None seed_multiplier float

        The value with which the idf values of the words in seed_words are multiplied.

        2

        Examples:

        transformer = ClassTfidfTransformer()\n
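
        Seed words can be emphasized as well, and the transformer can be passed to BERTopic (a usage sketch; the seed words and multiplier below are illustrative assumptions):

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(seed_words=[\"agent\", \"robot\"], seed_multiplier=3)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n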
        Source code in bertopic\\vectorizers\\_ctfidf.py
        class ClassTfidfTransformer(TfidfTransformer):\n    \"\"\"A Class-based TF-IDF procedure using scikit-learns TfidfTransformer as a base.\n\n    ![](../algorithm/c-TF-IDF.svg)\n\n    c-TF-IDF can best be explained as a TF-IDF formula adopted for multiple classes\n    by joining all documents per class. Thus, each class is converted to a single document\n    instead of set of documents. The frequency of each word **x** is extracted\n    for each class **c** and is **l1** normalized. This constitutes the term frequency.\n\n    Then, the term frequency is multiplied with IDF which is the logarithm of 1 plus\n    the average number of words per class **A** divided by the frequency of word **x**\n    across all classes.\n\n    Arguments:\n        bm25_weighting: Uses BM25-inspired idf-weighting procedure instead of the procedure\n                        as defined in the c-TF-IDF formula. It uses the following weighting scheme:\n                        `log(1+((avg_nr_samples - df + 0.5) / (df+0.5)))`\n        reduce_frequent_words: Takes the square root of the bag-of-words after normalizing the matrix.\n                               Helps to reduce the impact of words that appear too frequently.\n        seed_words: Specific words that will have their idf value increased by\n                    the value of `seed_multiplier`.\n                    NOTE: This will only increase the value of words that have an exact match.\n        seed_multiplier: The value with which the idf values of the words in `seed_words`\n                         are multiplied.\n\n    Examples:\n    ```python\n    transformer = ClassTfidfTransformer()\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        bm25_weighting: bool = False,\n        reduce_frequent_words: bool = False,\n        seed_words: List[str] = None,\n        seed_multiplier: float = 2,\n    ):\n        self.bm25_weighting = bm25_weighting\n        self.reduce_frequent_words = reduce_frequent_words\n        self.seed_words = seed_words\n        self.seed_multiplier = seed_multiplier\n        super(ClassTfidfTransformer, self).__init__()\n\n    def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):\n        \"\"\"Learn the idf vector (global term weights).\n\n        Arguments:\n            X: A matrix of term/token counts.\n            multiplier: A multiplier for increasing/decreasing certain IDF scores\n        \"\"\"\n        X = check_array(X, accept_sparse=(\"csr\", \"csc\"))\n        if not sp.issparse(X):\n            X = sp.csr_matrix(X)\n        dtype = np.float64\n\n        if self.use_idf:\n            _, n_features = X.shape\n\n            # Calculate the frequency of words across all classes\n            df = np.squeeze(np.asarray(X.sum(axis=0)))\n\n            # Calculate the average number of samples as regularization\n            avg_nr_samples = int(X.sum(axis=1).mean())\n\n            # BM25-inspired weighting procedure\n            if self.bm25_weighting:\n                idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5)))\n\n            # Divide the average number of samples by the word frequency\n            # +1 is added to force values to be positive\n            else:\n                idf = np.log((avg_nr_samples / df) + 1)\n\n            # Multiplier to increase/decrease certain idf scores\n            if multiplier is not None:\n                idf = idf * multiplier\n\n            self._idf_diag = sp.diags(\n                idf,\n                offsets=0,\n                
shape=(n_features, n_features),\n                format=\"csr\",\n                dtype=dtype,\n            )\n\n        return self\n\n    def transform(self, X: sp.csr_matrix):\n        \"\"\"Transform a count-based matrix to c-TF-IDF.\n\n        Arguments:\n            X (sparse matrix): A matrix of term/token counts.\n\n        Returns:\n            X (sparse matrix): A c-TF-IDF matrix\n        \"\"\"\n        if self.use_idf:\n            X = normalize(X, axis=1, norm=\"l1\", copy=False)\n\n            if self.reduce_frequent_words:\n                X.data = np.sqrt(X.data)\n\n            X = X * self._idf_diag\n\n        return X\n
        "},{"location":"api/ctfidf.html#bertopic.vectorizers._ctfidf.ClassTfidfTransformer.fit","title":"fit(self, X, multiplier=None)","text":"

        Learn the idf vector (global term weights).

        Parameters:

        Name Type Description Default X csr_matrix

        A matrix of term/token counts.

        required multiplier ndarray

        A multiplier for increasing/decreasing certain IDF scores

        None Source code in bertopic\\vectorizers\\_ctfidf.py
        def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):\n    \"\"\"Learn the idf vector (global term weights).\n\n    Arguments:\n        X: A matrix of term/token counts.\n        multiplier: A multiplier for increasing/decreasing certain IDF scores\n    \"\"\"\n    X = check_array(X, accept_sparse=(\"csr\", \"csc\"))\n    if not sp.issparse(X):\n        X = sp.csr_matrix(X)\n    dtype = np.float64\n\n    if self.use_idf:\n        _, n_features = X.shape\n\n        # Calculate the frequency of words across all classes\n        df = np.squeeze(np.asarray(X.sum(axis=0)))\n\n        # Calculate the average number of samples as regularization\n        avg_nr_samples = int(X.sum(axis=1).mean())\n\n        # BM25-inspired weighting procedure\n        if self.bm25_weighting:\n            idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5)))\n\n        # Divide the average number of samples by the word frequency\n        # +1 is added to force values to be positive\n        else:\n            idf = np.log((avg_nr_samples / df) + 1)\n\n        # Multiplier to increase/decrease certain idf scores\n        if multiplier is not None:\n            idf = idf * multiplier\n\n        self._idf_diag = sp.diags(\n            idf,\n            offsets=0,\n            shape=(n_features, n_features),\n            format=\"csr\",\n            dtype=dtype,\n        )\n\n    return self\n
        "},{"location":"api/ctfidf.html#bertopic.vectorizers._ctfidf.ClassTfidfTransformer.transform","title":"transform(self, X)","text":"

        Transform a count-based matrix to c-TF-IDF.

        Parameters:

        Name Type Description Default X sparse matrix

        A matrix of term/token counts.

        required

        Returns:

        Type Description X (sparse matrix)

        A c-TF-IDF matrix

        Source code in bertopic\\vectorizers\\_ctfidf.py
        def transform(self, X: sp.csr_matrix):\n    \"\"\"Transform a count-based matrix to c-TF-IDF.\n\n    Arguments:\n        X (sparse matrix): A matrix of term/token counts.\n\n    Returns:\n        X (sparse matrix): A c-TF-IDF matrix\n    \"\"\"\n    if self.use_idf:\n        X = normalize(X, axis=1, norm=\"l1\", copy=False)\n\n        if self.reduce_frequent_words:\n            X.data = np.sqrt(X.data)\n\n        X = X * self._idf_diag\n\n    return X\n
        "},{"location":"api/onlinecv.html","title":"OnlineCountVectorizer","text":"

        An online variant of the CountVectorizer with updating vocabulary.

        At each .partial_fit, its vocabulary is updated based on any OOV words it might find. Then, .update_bow can be used to track and update the Bag-of-Words representation. These functions are separated such that the vectorizer can be used iteratively without updating the Bag-of-Words representation, which might speed up the fitting process. However, the .update_bow function is used in BERTopic to track changes in the topic representations and allow for decay.

        This class inherits its parameters and attributes from: sklearn.feature_extraction.text.CountVectorizer

        Parameters:

        Name Type Description Default decay float

        A value between [0, 1] that weights how much the frequencies in the previous bag-of-words matrix should be decreased. For example, a value of .1 will decrease the frequencies in the bag-of-words matrix by 10% at each iteration.

        None delete_min_df float

        Delete words from the vocabulary at each iteration if they fall below this minimum frequency. This keeps the resulting bag-of-words matrix small so that it does not explode in size as the vocabulary grows. If decay is None then this equals min_df.

        None **kwargs

        Set of parameters inherited from: sklearn.feature_extraction.text.CountVectorizer In practice, this means that you can still use parameters from the original CountVectorizer, like stop_words and ngram_range.

        {}

        Attributes:

        Name Type Description X_ scipy.sparse.csr_matrix

        The Bag-of-Words representation

        Examples:

        from bertopic.vectorizers import OnlineCountVectorizer\nvectorizer = OnlineCountVectorizer(stop_words=\"english\")\n\nfor index, doc in enumerate(my_docs):\n    vectorizer.partial_fit(doc)\n\n    # Update and clean the bow every 100 iterations:\n    if index % 100 == 0:\n        X = vectorizer.update_bow()\n

        To use the model in BERTopic:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

        References

        Adapted from: https://github.com/idoshlomo/online_vectorizers

        Source code in bertopic\\vectorizers\\_online_cv.py
        class OnlineCountVectorizer(CountVectorizer):\n    \"\"\"An online variant of the CountVectorizer with updating vocabulary.\n\n    At each `.partial_fit`, its vocabulary is updated based on any OOV words\n    it might find. Then, `.update_bow` can be used to track and update\n    the Bag-of-Words representation. These functions are separated such that\n    the vectorizer can be used in iteration without updating the Bag-of-Words\n    representation can might speed up the fitting process. However, the\n    `.update_bow` function is used in BERTopic to track changes in the\n    topic representations and allow for decay.\n\n    This class inherits its parameters and attributes from:\n        `sklearn.feature_extraction.text.CountVectorizer`\n\n    Arguments:\n        decay: A value between [0, 1] to weight the percentage of frequencies\n               the previous bag-of-words should be decreased. For example,\n               a value of `.1` will decrease the frequencies in the bag-of-words\n               matrix with 10% at each iteration.\n        delete_min_df: Delete words at each iteration from its vocabulary\n                       that are below a minimum frequency.\n                       This will keep the resulting bag-of-words matrix small\n                       such that it does not explode in size with increasing\n                       vocabulary. If `decay` is None then this equals `min_df`.\n        **kwargs: Set of parameters inherited from:\n                  `sklearn.feature_extraction.text.CountVectorizer`\n                  In practice, this means that you can still use parameters\n                  from the original CountVectorizer, like `stop_words` and\n                  `ngram_range`.\n\n    Attributes:\n        X_ (scipy.sparse.csr_matrix) : The Bag-of-Words representation\n\n    Examples:\n    ```python\n    from bertopic.vectorizers import OnlineCountVectorizer\n    vectorizer = OnlineCountVectorizer(stop_words=\"english\")\n\n    for index, doc in enumerate(my_docs):\n        vectorizer.partial_fit(doc)\n\n        # Update and clean the bow every 100 iterations:\n        if index % 100 == 0:\n            X = vectorizer.update_bow()\n    ```\n\n    To use the model in BERTopic:\n\n    ```python\n    from bertopic import BERTopic\n    from bertopic.vectorizers import OnlineCountVectorizer\n\n    vectorizer_model = OnlineCountVectorizer(stop_words=\"english\")\n    topic_model = BERTopic(vectorizer_model=vectorizer_model)\n    ```\n\n    References:\n        Adapted from: https://github.com/idoshlomo/online_vectorizers\n    \"\"\"\n\n    def __init__(self, decay: float = None, delete_min_df: float = None, **kwargs):\n        self.decay = decay\n        self.delete_min_df = delete_min_df\n        super(OnlineCountVectorizer, self).__init__(**kwargs)\n\n    def partial_fit(self, raw_documents: List[str]) -> None:\n        \"\"\"Perform a partial fit and update vocabulary with OOV tokens.\n\n        Arguments:\n            raw_documents: A list of documents\n        \"\"\"\n        if not hasattr(self, \"vocabulary_\"):\n            return self.fit(raw_documents)\n\n        analyzer = self.build_analyzer()\n        analyzed_documents = [analyzer(doc) for doc in raw_documents]\n        new_tokens = set(chain.from_iterable(analyzed_documents))\n        oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))\n\n        if oov_tokens:\n            max_index = max(self.vocabulary_.values())\n            oov_vocabulary = dict(\n                zip(\n         
           oov_tokens,\n                    list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)),\n                )\n            )\n            self.vocabulary_.update(oov_vocabulary)\n\n        return self\n\n    def update_bow(self, raw_documents: List[str]) -> csr_matrix:\n        \"\"\"Create or update the bag-of-words matrix.\n\n        Update the bag-of-words matrix by adding the newly transformed\n        documents. This may add empty columns if new words are found and/or\n        add empty rows if new topics are found.\n\n        During this process, the previous bag-of-words matrix might be\n        decayed if `self.decay` has been set during init. Similarly, words\n        that do not exceed `self.delete_min_df` are removed from its\n        vocabulary and bag-of-words matrix.\n\n        Arguments:\n            raw_documents: A list of documents\n\n        Returns:\n            X_: Bag-of-words matrix\n        \"\"\"\n        if hasattr(self, \"X_\"):\n            X = self.transform(raw_documents)\n\n            # Add empty columns if new words are found\n            columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)\n            self.X_ = sparse.hstack([self.X_, columns])\n\n            # Add empty rows if new topics are found\n            rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)\n            self.X_ = sparse.vstack([self.X_, rows])\n\n            # Decay of BoW matrix\n            if self.decay is not None:\n                self.X_ = self.X_ * (1 - self.decay)\n\n            self.X_ += X\n        else:\n            self.X_ = self.transform(raw_documents)\n\n        if self.delete_min_df is not None:\n            self._clean_bow()\n\n        return self.X_\n\n    def _clean_bow(self) -> None:\n        \"\"\"Remove words that do not exceed `self.delete_min_df`.\"\"\"\n        # Only keep words with a minimum frequency\n        indices = np.where(self.X_.sum(0) >= self.delete_min_df)[1]\n        indices_dict = {index: index for index in indices}\n        self.X_ = self.X_[:, indices]\n\n        # Update vocabulary with new words\n        new_vocab = {}\n        vocabulary_dict = {v: k for k, v in self.vocabulary_.items()}\n        for i, index in enumerate(indices):\n            if indices_dict.get(index) is not None:\n                new_vocab[vocabulary_dict[index]] = i\n\n        self.vocabulary_ = new_vocab\n
        "},{"location":"api/onlinecv.html#bertopic.vectorizers._online_cv.OnlineCountVectorizer.partial_fit","title":"partial_fit(self, raw_documents)","text":"

        Perform a partial fit and update vocabulary with OOV tokens.

        Parameters:

        Name Type Description Default raw_documents List[str]

        A list of documents

        required Source code in bertopic\\vectorizers\\_online_cv.py
        def partial_fit(self, raw_documents: List[str]) -> None:\n    \"\"\"Perform a partial fit and update vocabulary with OOV tokens.\n\n    Arguments:\n        raw_documents: A list of documents\n    \"\"\"\n    if not hasattr(self, \"vocabulary_\"):\n        return self.fit(raw_documents)\n\n    analyzer = self.build_analyzer()\n    analyzed_documents = [analyzer(doc) for doc in raw_documents]\n    new_tokens = set(chain.from_iterable(analyzed_documents))\n    oov_tokens = new_tokens.difference(set(self.vocabulary_.keys()))\n\n    if oov_tokens:\n        max_index = max(self.vocabulary_.values())\n        oov_vocabulary = dict(\n            zip(\n                oov_tokens,\n                list(range(max_index + 1, max_index + 1 + len(oov_tokens), 1)),\n            )\n        )\n        self.vocabulary_.update(oov_vocabulary)\n\n    return self\n
        "},{"location":"api/onlinecv.html#bertopic.vectorizers._online_cv.OnlineCountVectorizer.update_bow","title":"update_bow(self, raw_documents)","text":"

        Create or update the bag-of-words matrix.

        Update the bag-of-words matrix by adding the newly transformed documents. This may add empty columns if new words are found and/or add empty rows if new topics are found.

        During this process, the previous bag-of-words matrix might be decayed if self.decay has been set during init. Similarly, words that do not exceed self.delete_min_df are removed from its vocabulary and bag-of-words matrix.

        Parameters:

        Name Type Description Default raw_documents List[str]

        A list of documents

        required

        Returns:

        Type Description X_

        Bag-of-words matrix
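
        A minimal usage sketch (doc_batches is a hypothetical iterable of lists of documents; the decay and delete_min_df values are illustrative):

        from bertopic.vectorizers import OnlineCountVectorizer\n\nvectorizer = OnlineCountVectorizer(decay=.1, delete_min_df=2, stop_words=\"english\")\nfor batch in doc_batches:\n    vectorizer.partial_fit(batch)\n    X = vectorizer.update_bow(batch)  # previous counts decayed by 10%, rare words pruned\n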

        Source code in bertopic\\vectorizers\\_online_cv.py
        def update_bow(self, raw_documents: List[str]) -> csr_matrix:\n    \"\"\"Create or update the bag-of-words matrix.\n\n    Update the bag-of-words matrix by adding the newly transformed\n    documents. This may add empty columns if new words are found and/or\n    add empty rows if new topics are found.\n\n    During this process, the previous bag-of-words matrix might be\n    decayed if `self.decay` has been set during init. Similarly, words\n    that do not exceed `self.delete_min_df` are removed from its\n    vocabulary and bag-of-words matrix.\n\n    Arguments:\n        raw_documents: A list of documents\n\n    Returns:\n        X_: Bag-of-words matrix\n    \"\"\"\n    if hasattr(self, \"X_\"):\n        X = self.transform(raw_documents)\n\n        # Add empty columns if new words are found\n        columns = csr_matrix((self.X_.shape[0], X.shape[1] - self.X_.shape[1]), dtype=int)\n        self.X_ = sparse.hstack([self.X_, columns])\n\n        # Add empty rows if new topics are found\n        rows = csr_matrix((X.shape[0] - self.X_.shape[0], self.X_.shape[1]), dtype=int)\n        self.X_ = sparse.vstack([self.X_, rows])\n\n        # Decay of BoW matrix\n        if self.decay is not None:\n            self.X_ = self.X_ * (1 - self.decay)\n\n        self.X_ += X\n    else:\n        self.X_ = self.transform(raw_documents)\n\n    if self.delete_min_df is not None:\n        self._clean_bow()\n\n    return self.X_\n
        "},{"location":"api/backends/base.html","title":"BaseEmbedder","text":"

        The Base Embedder used for creating embedding models.

        Parameters:

        Name Type Description Default embedding_model

        The main embedding model to be used for extracting document and word embeddings

        None word_embedding_model

        The embedding model used for extracting word embeddings only. If this model is selected, then the embedding_model is purely used for creating document embeddings.

        None Source code in bertopic\\backend\\_base.py
        class BaseEmbedder:\n    \"\"\"The Base Embedder used for creating embedding models.\n\n    Arguments:\n        embedding_model: The main embedding model to be used for extracting\n                         document and word embedding\n        word_embedding_model: The embedding model used for extracting word\n                              embeddings only. If this model is selected,\n                              then the `embedding_model` is purely used for\n                              creating document embeddings.\n    \"\"\"\n\n    def __init__(self, embedding_model=None, word_embedding_model=None):\n        self.embedding_model = embedding_model\n        self.word_embedding_model = word_embedding_model\n\n    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:\n        \"\"\"Embed a list of n documents/words into an n-dimensional\n        matrix of embeddings.\n\n        Arguments:\n            documents: A list of documents or words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document/words embeddings with shape (n, m) with `n` documents/words\n            that each have an embeddings size of `m`\n        \"\"\"\n        pass\n\n    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:\n        \"\"\"Embed a list of n words into an n-dimensional\n        matrix of embeddings.\n\n        Arguments:\n            words: A list of words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Word embeddings with shape (n, m) with `n` words\n            that each have an embeddings size of `m`\n\n        \"\"\"\n        return self.embed(words, verbose)\n\n    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:\n        \"\"\"Embed a list of n words into an n-dimensional\n        matrix of embeddings.\n\n        Arguments:\n            document: A list of documents to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document embeddings with shape (n, m) with `n` documents\n            that each have an embeddings size of `m`\n        \"\"\"\n        return self.embed(document, verbose)\n
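
        A sketch of how a custom backend could be built on top of this base class (the MyEmbedder name and the use of sentence-transformers are illustrative assumptions, not part of this module):

        from bertopic.backend import BaseEmbedder\nfrom sentence_transformers import SentenceTransformer\n\nclass MyEmbedder(BaseEmbedder):  # hypothetical custom backend\n    def __init__(self, model_name=\"all-MiniLM-L6-v2\"):\n        super().__init__()\n        self.embedding_model = SentenceTransformer(model_name)\n\n    def embed(self, documents, verbose=False):\n        # Return an (n, m) array of embeddings, one row per document/word\n        return self.embedding_model.encode(documents, show_progress_bar=verbose)\n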
        "},{"location":"api/backends/base.html#bertopic.backend._base.BaseEmbedder.embed","title":"embed(self, documents, verbose=False)","text":"

        Embed a list of n documents/words into an n-dimensional matrix of embeddings.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents or words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document/word embeddings with shape (n, m), with n documents/words that each have an embedding size of m

        Source code in bertopic\\backend\\_base.py
        def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:\n    \"\"\"Embed a list of n documents/words into an n-dimensional\n    matrix of embeddings.\n\n    Arguments:\n        documents: A list of documents or words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document/words embeddings with shape (n, m) with `n` documents/words\n        that each have an embeddings size of `m`\n    \"\"\"\n    pass\n
        "},{"location":"api/backends/base.html#bertopic.backend._base.BaseEmbedder.embed_documents","title":"embed_documents(self, document, verbose=False)","text":"

        Embed a list of n documents into an n-dimensional matrix of embeddings.

        Parameters:

        Name Type Description Default document List[str]

        A list of documents to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document embeddings with shape (n, m), with n documents that each have an embedding size of m

        Source code in bertopic\\backend\\_base.py
        def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:\n    \"\"\"Embed a list of n words into an n-dimensional\n    matrix of embeddings.\n\n    Arguments:\n        document: A list of documents to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document embeddings with shape (n, m) with `n` documents\n        that each have an embeddings size of `m`\n    \"\"\"\n    return self.embed(document, verbose)\n
        "},{"location":"api/backends/base.html#bertopic.backend._base.BaseEmbedder.embed_words","title":"embed_words(self, words, verbose=False)","text":"

        Embed a list of n words into an n-dimensional matrix of embeddings.

        Parameters:

        Name Type Description Default words List[str]

        A list of words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Word embeddings with shape (n, m), with n words that each have an embedding size of m

        Source code in bertopic\\backend\\_base.py
        def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:\n    \"\"\"Embed a list of n words into an n-dimensional\n    matrix of embeddings.\n\n    Arguments:\n        words: A list of words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Word embeddings with shape (n, m) with `n` words\n        that each have an embeddings size of `m`\n\n    \"\"\"\n    return self.embed(words, verbose)\n
        "},{"location":"api/backends/cohere.html","title":"CohereBackend","text":"

        Cohere Embedding Model.

        Parameters:

        Name Type Description Default client

        A cohere client.

        required embedding_model str

        A Cohere model. Default is \"large\". For an overview of models see: https://docs.cohere.ai/docs/generation-card

        'large' delay_in_seconds float

        If a batch_size is given, use this to set the delay in seconds between batches.

        None batch_size int

        The size of each batch.

        None embed_kwargs Mapping[str, Any]

        Kwargs passed to cohere.Client.embed. Can be used to define additional parameters such as input_type

        {}

        Examples:

        import cohere\nfrom bertopic.backend import CohereBackend\n\nclient = cohere.Client(\"APIKEY\")\ncohere_model = CohereBackend(client)\n

        If you want to specify input_type:

        cohere_model = CohereBackend(\n    client,\n    embedding_model=\"embed-english-v3.0\",\n    embed_kwargs={\"input_type\": \"clustering\"}\n)\n
        Source code in bertopic\\backend\\_cohere.py
        class CohereBackend(BaseEmbedder):\n    \"\"\"Cohere Embedding Model.\n\n    Arguments:\n        client: A `cohere` client.\n        embedding_model: A Cohere model. Default is \"large\".\n                         For an overview of models see:\n                         https://docs.cohere.ai/docs/generation-card\n        delay_in_seconds: If a `batch_size` is given, use this set\n                          the delay in seconds between batches.\n        batch_size: The size of each batch.\n        embed_kwargs: Kwargs passed to `cohere.Client.embed`.\n                            Can be used to define additional parameters\n                            such as `input_type`\n\n    Examples:\n    ```python\n    import cohere\n    from bertopic.backend import CohereBackend\n\n    client = cohere.Client(\"APIKEY\")\n    cohere_model = CohereBackend(client)\n    ```\n\n    If you want to specify `input_type`:\n\n    ```python\n    cohere_model = CohereBackend(\n        client,\n        embedding_model=\"embed-english-v3.0\",\n        embed_kwargs={\"input_type\": \"clustering\"}\n    )\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        client,\n        embedding_model: str = \"large\",\n        delay_in_seconds: float = None,\n        batch_size: int = None,\n        embed_kwargs: Mapping[str, Any] = {},\n    ):\n        super().__init__()\n        self.client = client\n        self.embedding_model = embedding_model\n        self.delay_in_seconds = delay_in_seconds\n        self.batch_size = batch_size\n        self.embed_kwargs = embed_kwargs\n\n        if self.embed_kwargs.get(\"model\"):\n            self.embedding_model = embed_kwargs.get(\"model\")\n        else:\n            self.embed_kwargs[\"model\"] = self.embedding_model\n\n    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:\n        \"\"\"Embed a list of n documents/words into an n-dimensional\n        matrix of embeddings.\n\n        Arguments:\n            documents: A list of documents or words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document/words embeddings with shape (n, m) with `n` documents/words\n            that each have an embeddings size of `m`\n        \"\"\"\n        # Batch-wise embedding extraction\n        if self.batch_size is not None:\n            embeddings = []\n            for batch in tqdm(self._chunks(documents), disable=not verbose):\n                response = self.client.embed(texts=batch, **self.embed_kwargs)\n                embeddings.extend(response.embeddings)\n\n                # Delay subsequent calls\n                if self.delay_in_seconds:\n                    time.sleep(self.delay_in_seconds)\n\n        # Extract embeddings all at once\n        else:\n            response = self.client.embed(texts=documents, **self.embed_kwargs)\n            embeddings = response.embeddings\n        return np.array(embeddings)\n\n    def _chunks(self, documents):\n        for i in range(0, len(documents), self.batch_size):\n            yield documents[i : i + self.batch_size]\n
        "},{"location":"api/backends/cohere.html#bertopic.backend._cohere.CohereBackend.embed","title":"embed(self, documents, verbose=False)","text":"

        Embed a list of n documents/words into an n-dimensional matrix of embeddings.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents or words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document/word embeddings with shape (n, m), with n documents/words that each have an embedding size of m

        Source code in bertopic\\backend\\_cohere.py
        def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:\n    \"\"\"Embed a list of n documents/words into an n-dimensional\n    matrix of embeddings.\n\n    Arguments:\n        documents: A list of documents or words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document/words embeddings with shape (n, m) with `n` documents/words\n        that each have an embeddings size of `m`\n    \"\"\"\n    # Batch-wise embedding extraction\n    if self.batch_size is not None:\n        embeddings = []\n        for batch in tqdm(self._chunks(documents), disable=not verbose):\n            response = self.client.embed(texts=batch, **self.embed_kwargs)\n            embeddings.extend(response.embeddings)\n\n            # Delay subsequent calls\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n    # Extract embeddings all at once\n    else:\n        response = self.client.embed(texts=documents, **self.embed_kwargs)\n        embeddings = response.embeddings\n    return np.array(embeddings)\n
        "},{"location":"api/backends/openai.html","title":"OpenAIBackend","text":"

        OpenAI Embedding Model.

        Parameters:

        Name Type Description Default client OpenAI

        An openai.OpenAI client.

        required embedding_model str

        An OpenAI model. Default is \"text-embedding-ada-002\". For an overview of models see: https://platform.openai.com/docs/models/embeddings

        'text-embedding-ada-002' delay_in_seconds float

        If a batch_size is given, use this to set the delay in seconds between batches.

        None batch_size int

        The size of each batch.

        None generator_kwargs Mapping[str, Any]

        Kwargs passed to openai.Embedding.create. Can be used to define custom engines or deployment_ids.

        {}

        Examples:

        import openai\nfrom bertopic.backend import OpenAIBackend\n\nclient = openai.OpenAI(api_key=\"sk-...\")\nopenai_embedder = OpenAIBackend(client, \"text-embedding-ada-002\")\n
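
        The resulting backend can then be passed to BERTopic (a usage sketch; docs is assumed to be your list of documents):

        from bertopic import BERTopic\n\ntopic_model = BERTopic(embedding_model=openai_embedder)\ntopics, probs = topic_model.fit_transform(docs)\n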
        Source code in bertopic\\backend\\_openai.py
        class OpenAIBackend(BaseEmbedder):\n    \"\"\"OpenAI Embedding Model.\n\n    Arguments:\n        client: A `openai.OpenAI` client.\n        embedding_model: An OpenAI model. Default is\n                         For an overview of models see:\n                         https://platform.openai.com/docs/models/embeddings\n        delay_in_seconds: If a `batch_size` is given, use this set\n                          the delay in seconds between batches.\n        batch_size: The size of each batch.\n        generator_kwargs: Kwargs passed to `openai.Embedding.create`.\n                          Can be used to define custom engines or\n                          deployment_ids.\n\n    Examples:\n    ```python\n    import openai\n    from bertopic.backend import OpenAIBackend\n\n    client = openai.OpenAI(api_key=\"sk-...\")\n    openai_embedder = OpenAIBackend(client, \"text-embedding-ada-002\")\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        client: openai.OpenAI,\n        embedding_model: str = \"text-embedding-ada-002\",\n        delay_in_seconds: float = None,\n        batch_size: int = None,\n        generator_kwargs: Mapping[str, Any] = {},\n    ):\n        super().__init__()\n        self.client = client\n        self.embedding_model = embedding_model\n        self.delay_in_seconds = delay_in_seconds\n        self.batch_size = batch_size\n        self.generator_kwargs = generator_kwargs\n\n        if self.generator_kwargs.get(\"model\"):\n            self.embedding_model = generator_kwargs.get(\"model\")\n        elif not self.generator_kwargs.get(\"engine\"):\n            self.generator_kwargs[\"model\"] = self.embedding_model\n\n    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:\n        \"\"\"Embed a list of n documents/words into an n-dimensional\n        matrix of embeddings.\n\n        Arguments:\n            documents: A list of documents or words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document/words embeddings with shape (n, m) with `n` documents/words\n            that each have an embeddings size of `m`\n        \"\"\"\n        # Prepare documents, replacing empty strings with a single space\n        prepared_documents = [\" \" if doc == \"\" else doc for doc in documents]\n\n        # Batch-wise embedding extraction\n        if self.batch_size is not None:\n            embeddings = []\n            for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):\n                response = self.client.embeddings.create(input=batch, **self.generator_kwargs)\n                embeddings.extend([r.embedding for r in response.data])\n\n                # Delay subsequent calls\n                if self.delay_in_seconds:\n                    time.sleep(self.delay_in_seconds)\n\n        # Extract embeddings all at once\n        else:\n            response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)\n            embeddings = [r.embedding for r in response.data]\n        return np.array(embeddings)\n\n    def _chunks(self, documents):\n        for i in range(0, len(documents), self.batch_size):\n            yield documents[i : i + self.batch_size]\n
        "},{"location":"api/backends/openai.html#bertopic.backend._openai.OpenAIBackend.embed","title":"embed(self, documents, verbose=False)","text":"

        Embed a list of n documents/words into an n-dimensional matrix of embeddings.

        Parameters:

        Name Type Description Default documents List[str]

        A list of documents or words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document/word embeddings with shape (n, m), with n documents/words that each have an embedding size of m

        Source code in bertopic\\backend\\_openai.py
        def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:\n    \"\"\"Embed a list of n documents/words into an n-dimensional\n    matrix of embeddings.\n\n    Arguments:\n        documents: A list of documents or words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document/words embeddings with shape (n, m) with `n` documents/words\n        that each have an embeddings size of `m`\n    \"\"\"\n    # Prepare documents, replacing empty strings with a single space\n    prepared_documents = [\" \" if doc == \"\" else doc for doc in documents]\n\n    # Batch-wise embedding extraction\n    if self.batch_size is not None:\n        embeddings = []\n        for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):\n            response = self.client.embeddings.create(input=batch, **self.generator_kwargs)\n            embeddings.extend([r.embedding for r in response.data])\n\n            # Delay subsequent calls\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n    # Extract embeddings all at once\n    else:\n        response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)\n        embeddings = [r.embedding for r in response.data]\n    return np.array(embeddings)\n
        "},{"location":"api/backends/word_doc.html","title":"WordDocEmbedder","text":"

        Combine a document- and word-level embedder.

        Source code in bertopic\\backend\\_word_doc.py
        class WordDocEmbedder(BaseEmbedder):\n    \"\"\"Combine a document- and word-level embedder.\"\"\"\n\n    def __init__(self, embedding_model, word_embedding_model):\n        super().__init__()\n\n        self.embedding_model = select_backend(embedding_model)\n        self.word_embedding_model = select_backend(word_embedding_model)\n\n    def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:\n        \"\"\"Embed a list of n words into an n-dimensional\n        matrix of embeddings.\n\n        Arguments:\n            words: A list of words to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Word embeddings with shape (n, m) with `n` words\n            that each have an embeddings size of `m`\n\n        \"\"\"\n        return self.word_embedding_model.embed(words, verbose)\n\n    def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:\n        \"\"\"Embed a list of n words into an n-dimensional\n        matrix of embeddings.\n\n        Arguments:\n            document: A list of documents to be embedded\n            verbose: Controls the verbosity of the process\n\n        Returns:\n            Document embeddings with shape (n, m) with `n` documents\n            that each have an embeddings size of `m`\n        \"\"\"\n        return self.embedding_model.embed(document, verbose)\n
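
        A construction sketch (the sentence-transformers model names below are illustrative assumptions; any supported embedding backend could be passed instead):

        from bertopic.backend import WordDocEmbedder\n\nword_doc_embedder = WordDocEmbedder(\n    embedding_model=\"all-MiniLM-L6-v2\",\n    word_embedding_model=\"all-MiniLM-L6-v2\",\n)\n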
        "},{"location":"api/backends/word_doc.html#bertopic.backend._word_doc.WordDocEmbedder.embed_documents","title":"embed_documents(self, document, verbose=False)","text":"

        Embed a list of n documents into an n-dimensional matrix of embeddings.

        Parameters:

        Name Type Description Default document List[str]

        A list of documents to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Document embeddings with shape (n, m), with n documents that each have an embedding size of m

        Source code in bertopic\\backend\\_word_doc.py
        def embed_documents(self, document: List[str], verbose: bool = False) -> np.ndarray:\n    \"\"\"Embed a list of n words into an n-dimensional\n    matrix of embeddings.\n\n    Arguments:\n        document: A list of documents to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Document embeddings with shape (n, m) with `n` documents\n        that each have an embeddings size of `m`\n    \"\"\"\n    return self.embedding_model.embed(document, verbose)\n
        "},{"location":"api/backends/word_doc.html#bertopic.backend._word_doc.WordDocEmbedder.embed_words","title":"embed_words(self, words, verbose=False)","text":"

        Embed a list of n words into an n-dimensional matrix of embeddings.

        Parameters:

        Name Type Description Default words List[str]

        A list of words to be embedded

        required verbose bool

        Controls the verbosity of the process

        False

        Returns:

        Type Description ndarray

        Word embeddings with shape (n, m), with n words that each have an embedding size of m

        Source code in bertopic\\backend\\_word_doc.py
        def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:\n    \"\"\"Embed a list of n words into an n-dimensional\n    matrix of embeddings.\n\n    Arguments:\n        words: A list of words to be embedded\n        verbose: Controls the verbosity of the process\n\n    Returns:\n        Word embeddings with shape (n, m) with `n` words\n        that each have an embeddings size of `m`\n\n    \"\"\"\n    return self.word_embedding_model.embed(words, verbose)\n
        "},{"location":"api/cluster/base.html","title":"BaseCluster","text":"

        The Base Cluster class.

        Using this class directly in BERTopic will make it skip over the cluster step. As a result, topics need to be passed to BERTopic in the form of its y parameter in order to create topic representations.

        Examples:

        This will skip over the cluster step in BERTopic:

        from bertopic import BERTopic\nfrom bertopic.cluster import BaseCluster\n\nempty_cluster_model = BaseCluster()\n\ntopic_model = BERTopic(hdbscan_model=empty_cluster_model)\n

        Then, this class can be used to perform manual topic modeling. That is, topic modeling on topics that were already generated before, without the need to learn them:

        topic_model.fit(docs, y=y)\n
        Source code in bertopic\\cluster\\_base.py
        class BaseCluster:\n    \"\"\"The Base Cluster class.\n\n    Using this class directly in BERTopic will make it skip\n    over the cluster step. As a result, topics need to be passed\n    to BERTopic in the form of its `y` parameter in order to create\n    topic representations.\n\n    Examples:\n    This will skip over the cluster step in BERTopic:\n\n    ```python\n    from bertopic import BERTopic\n    from bertopic.dimensionality import BaseCluster\n\n    empty_cluster_model = BaseCluster()\n\n    topic_model = BERTopic(hdbscan_model=empty_cluster_model)\n    ```\n\n    Then, this class can be used to perform manual topic modeling.\n    That is, topic modeling on a topics that were already generated before\n    without the need to learn them:\n\n    ```python\n    topic_model.fit(docs, y=y)\n    ```\n    \"\"\"\n\n    def fit(self, X, y=None):\n        if y is not None:\n            self.labels_ = y\n        else:\n            self.labels_ = None\n        return self\n\n    def transform(self, X: np.ndarray) -> np.ndarray:\n        return X\n
        "},{"location":"api/dimensionality/base.html","title":"BaseDimensionalityReduction","text":"

        The Base Dimensionality Reduction class.

        You can use this to skip over the dimensionality reduction step in BERTopic.

        Examples:

        This will skip over the reduction step in BERTopic:

        from bertopic import BERTopic\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\nempty_reduction_model = BaseDimensionalityReduction()\n\ntopic_model = BERTopic(umap_model=empty_reduction_model)\n
        Source code in bertopic\\dimensionality\\_base.py
        class BaseDimensionalityReduction:\n    \"\"\"The Base Dimensionality Reduction class.\n\n    You can use this to skip over the dimensionality reduction step in BERTopic.\n\n    Examples:\n    This will skip over the reduction step in BERTopic:\n\n    ```python\n    from bertopic import BERTopic\n    from bertopic.dimensionality import BaseDimensionalityReduction\n\n    empty_reduction_model = BaseDimensionalityReduction()\n\n    topic_model = BERTopic(umap_model=empty_reduction_model)\n    ```\n    \"\"\"\n\n    def fit(self, X: np.ndarray = None):\n        return self\n\n    def transform(self, X: np.ndarray) -> np.ndarray:\n        return X\n
        "},{"location":"api/plotting/barchart.html","title":"Barchart","text":"

        Visualize a barchart of selected topics.

        Parameters:

        Name Type Description Default topic_model

        A fitted BERTopic instance.

        required topics List[int]

        A selection of topics to visualize.

        None top_n_topics int

        Only select the top n most frequent topics.

        8 n_words int

        Number of words to show in a topic

        5 custom_labels Union[bool, str]

        If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\".

        False title str

        Title of the plot.

        '<b>Topic Word Scores</b>' width int

        The width of each figure.

        250 height int

        The height of each figure.

        250 autoscale bool

        Whether to automatically calculate the height of the figures to fit the whole bar text

        False

        Returns:

        Type Description fig

        A plotly figure

        Examples:

        To visualize the barchart of selected topics simply run:

        topic_model.visualize_barchart()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_barchart()\nfig.write_html(\"path/to/file.html\")\n
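
        The listed parameters can be combined as needed; for example, an illustrative call that shows eight words for each of the ten most frequent topics:

        fig = topic_model.visualize_barchart(top_n_topics=10, n_words=8)\n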
        Source code in bertopic\\plotting\\_barchart.py
        def visualize_barchart(\n    topic_model,\n    topics: List[int] = None,\n    top_n_topics: int = 8,\n    n_words: int = 5,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Topic Word Scores</b>\",\n    width: int = 250,\n    height: int = 250,\n    autoscale: bool = False,\n) -> go.Figure:\n    \"\"\"Visualize a barchart of selected topics.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_words: Number of words to show in a topic\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of each figure.\n        height: The height of each figure.\n        autoscale: Whether to automatically calculate the height of the figures to fit the whole bar text\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the barchart of selected topics\n    simply run:\n\n    ```python\n    topic_model.visualize_barchart()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_barchart()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/bar_chart.html\"\n    style=\"width:1100px; height: 660px; border: 0px;\"\"></iframe>\n    \"\"\"\n    colors = itertools.cycle([\"#D55E00\", \"#0072B2\", \"#CC79A7\", \"#E69F00\", \"#56B4E9\", \"#009E73\", \"#F0E442\"])\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list()[0:6])\n\n    # Initialize figure\n    if isinstance(custom_labels, str):\n        subplot_titles = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]\n        subplot_titles = [\"_\".join([label[0] for label in labels[:4]]) for labels in subplot_titles]\n        subplot_titles = [label if len(label) < 30 else label[:27] + \"...\" for label in subplot_titles]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        subplot_titles = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topics]\n    else:\n        subplot_titles = [f\"Topic {topic}\" for topic in topics]\n    columns = 4\n    rows = int(np.ceil(len(topics) / columns))\n    fig = make_subplots(\n        rows=rows,\n        cols=columns,\n        shared_xaxes=False,\n        horizontal_spacing=0.1,\n        vertical_spacing=0.4 / rows if rows > 1 else 0,\n        subplot_titles=subplot_titles,\n    )\n\n    # Add barchart for each topic\n    row = 1\n    column = 1\n    for topic in topics:\n        words = [word + \"  \" for word, _ in topic_model.get_topic(topic)][:n_words][::-1]\n        scores = [score for _, score in topic_model.get_topic(topic)][:n_words][::-1]\n\n        fig.add_trace(\n            go.Bar(x=scores, y=words, orientation=\"h\", marker_color=next(colors)),\n            row=row,\n            col=column,\n        )\n\n        if autoscale:\n            if len(words) > 12:\n                
height = 250 + (len(words) - 12) * 11\n\n            if len(words) > 9:\n                fig.update_yaxes(tickfont=dict(size=(height - 140) // len(words)))\n\n        if column == columns:\n            column = 1\n            row += 1\n        else:\n            column += 1\n\n    # Stylize graph\n    fig.update_layout(\n        template=\"plotly_white\",\n        showlegend=False,\n        title={\n            \"text\": f\"{title}\",\n            \"x\": 0.5,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        width=width * 4,\n        height=height * rows if rows > 1 else height * 1.3,\n        hoverlabel=dict(bgcolor=\"white\", font_size=16, font_family=\"Rockwell\"),\n    )\n\n    fig.update_xaxes(showgrid=True)\n    fig.update_yaxes(showgrid=True)\n\n    return fig\n
        "},{"location":"api/plotting/distribution.html","title":"Distribution","text":"

        Visualize the distribution of topic probabilities.

        Parameters:

        Name Type Description Default topic_model

        A fitted BERTopic instance.

        required probabilities ndarray

        An array of probability scores

        required min_probability float

        The minimum probability score to visualize. All others are ignored.

        0.015 custom_labels Union[bool, str]

        If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\".

        False title str

        Title of the plot.

        '<b>Topic Probability Distribution</b>' width int

        The width of the figure.

        800 height int

        The height of the figure.

        600

        Examples:

        Make sure to fit the model beforehand and to only input the probabilities of a single document:

```python
topic_model.visualize_distribution(probabilities[0])
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_distribution(probabilities[0])
fig.write_html("path/to/file.html")
```
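As a fuller sketch of the workflow this implies: the per-document probabilities are only available when the model is fitted with `calculate_probabilities=True` (with the default HDBSCAN clusterer), and a single row of the resulting matrix is passed to the plot. `docs` is assumed to be your list of documents.

```python
from bertopic import BERTopic

# Fit with calculate_probabilities=True so that a full
# document-topic probability matrix is computed.
topic_model = BERTopic(calculate_probabilities=True)
topics, probabilities = topic_model.fit_transform(docs)

# Visualize the distribution for the first document,
# hiding topics below a minimum probability.
fig = topic_model.visualize_distribution(probabilities[0], min_probability=0.015)
fig.write_html("distribution.html")
```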
        Source code in bertopic\\plotting\\_distribution.py
        def visualize_distribution(\n    topic_model,\n    probabilities: np.ndarray,\n    min_probability: float = 0.015,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Topic Probability Distribution</b>\",\n    width: int = 800,\n    height: int = 600,\n) -> go.Figure:\n    \"\"\"Visualize the distribution of topic probabilities.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        probabilities: An array of probability scores\n        min_probability: The minimum probability score to visualize.\n                         All others are ignored.\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    Make sure to fit the model before and only input the\n    probabilities of a single document:\n\n    ```python\n    topic_model.visualize_distribution(probabilities[0])\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_distribution(probabilities[0])\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/probabilities.html\"\n    style=\"width:1000px; height: 500px; border: 0px;\"\"></iframe>\n    \"\"\"\n    if len(probabilities.shape) != 1:\n        raise ValueError(\n            \"This visualization cannot be used if you have set `calculate_probabilities` to False \"\n            \"as it uses the topic probabilities of all topics. \"\n        )\n    if len(probabilities[probabilities > min_probability]) == 0:\n        raise ValueError(\n            \"There are no values where `min_probability` is higher than the \"\n            \"probabilities that were supplied. 
Lower `min_probability` to prevent this error.\"\n        )\n\n    # Get values and indices equal or exceed the minimum probability\n    labels_idx = np.argwhere(probabilities >= min_probability).flatten()\n    vals = probabilities[labels_idx].tolist()\n\n    # Create labels\n    if isinstance(custom_labels, str):\n        labels = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in labels_idx]\n        labels = [\"_\".join([label[0] for label in l[:4]]) for l in labels]  # noqa: E741\n        labels = [label if len(label) < 30 else label[:27] + \"...\" for label in labels]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        labels = [topic_model.custom_labels_[idx + topic_model._outliers] for idx in labels_idx]\n    else:\n        labels = []\n        for idx in labels_idx:\n            words = topic_model.get_topic(idx)\n            if words:\n                label = [word[0] for word in words[:5]]\n                label = f\"<b>Topic {idx}</b>: {'_'.join(label)}\"\n                label = label[:40] + \"...\" if len(label) > 40 else label\n                labels.append(label)\n            else:\n                vals.remove(probabilities[idx])\n\n    # Create Figure\n    fig = go.Figure(\n        go.Bar(\n            x=vals,\n            y=labels,\n            marker=dict(\n                color=\"#C8D2D7\",\n                line=dict(color=\"#6E8484\", width=1),\n            ),\n            orientation=\"h\",\n        )\n    )\n\n    fig.update_layout(\n        xaxis_title=\"Probability\",\n        title={\n            \"text\": f\"{title}\",\n            \"y\": 0.95,\n            \"x\": 0.5,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        template=\"simple_white\",\n        width=width,\n        height=height,\n        hoverlabel=dict(bgcolor=\"white\", font_size=16, font_family=\"Rockwell\"),\n    )\n\n    return fig\n
        "},{"location":"api/plotting/document_datamap.html","title":"Documents with DataMapPlot","text":"

        Visualize documents and their topics in 2D as a static plot for publication using DataMapPlot.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| topic_model | | A fitted BERTopic instance. | required |
| docs | List[str] | The documents you used when calling either fit or fit_transform | required |
| topics | List[int] | A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. Documents not in these topics will be shown as noise points. | None |
| embeddings | ndarray | The embeddings of all documents in docs. | None |
| reduced_embeddings | ndarray | The 2D reduced embeddings of all documents in docs. | None |
| custom_labels | Union[bool, str] | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | False |
| title | str | Title of the plot. | 'Documents and Topics' |
| sub_title | Optional[str] | Sub-title of the plot. | None |
| width | int | The width of the figure. | 1200 |
| height | int | The height of the figure. | 1200 |
| **datamap_kwds | | All further keyword args will be passed on to DataMapPlot's create_plot function. See the DataMapPlot documentation for more details. | {} |

Returns:

| Type | Description |
|---|---|
| figure | A Matplotlib Figure object. |

        Examples:

        To visualize the topics simply run:

```python
topic_model.visualize_document_datamap(docs)
```

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic
topic_model = BERTopic().fit(docs, embeddings)

# Reduce dimensionality of embeddings, this step is optional
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings
topic_model.visualize_document_datamap(docs, embeddings=embeddings)

# Or, if you have reduced the original embeddings already:
topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
fig.savefig("path/to/file.png", bbox_inches="tight")
```
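The documented `topics` argument can also be used to restrict the static plot to a handful of topics; a sketch, assuming `docs` and `reduced_embeddings` come from the pipeline above:

```python
# Only label topics 1 through 5; documents belonging to other topics
# are rendered as unlabelled noise points.
fig = topic_model.visualize_document_datamap(
    docs,
    topics=[1, 2, 3, 4, 5],
    reduced_embeddings=reduced_embeddings,
)
fig.savefig("datamap_selected_topics.png", bbox_inches="tight")
```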

        Source code in bertopic\\plotting\\_datamap.py
        def visualize_document_datamap(\n    topic_model,\n    docs: List[str],\n    topics: List[int] = None,\n    embeddings: np.ndarray = None,\n    reduced_embeddings: np.ndarray = None,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"Documents and Topics\",\n    sub_title: Union[str, None] = None,\n    width: int = 1200,\n    height: int = 1200,\n    **datamap_kwds,\n) -> Figure:\n    \"\"\"Visualize documents and their topics in 2D as a static plot for publication using\n    DataMapPlot.\n\n    Arguments:\n        topic_model:  A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`. Documents not in these topics will be shown\n                as noise points.\n        embeddings:  The embeddings of all documents in `docs`.\n        reduced_embeddings:  The 2D reduced embeddings of all documents in `docs`.\n        custom_labels:  If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        sub_title: Sub-title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n        **datamap_kwds:  All further keyword args will be passed on to DataMapPlot's\n                         `create_plot` function. See the DataMapPlot documentation\n                         for more details.\n\n    Returns:\n        figure: A Matplotlib Figure object.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_document_datamap(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic().fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n    fig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n    ```\n    <img src=\"../../getting_started/visualization/datamapplot.png\",\n         alt=\"DataMapPlot of 20-Newsgroups\", width=800, height=800></img>\n    \"\"\"\n    topic_per_doc = 
topic_model.topics_\n\n    df = pd.DataFrame({\"topic\": np.array(topic_per_doc)})\n    df[\"doc\"] = docs\n    df[\"topic\"] = topic_per_doc\n\n    # Extract embeddings if not already done\n    if embeddings is None and reduced_embeddings is None:\n        embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n    else:\n        embeddings_to_reduce = embeddings\n\n    # Reduce input embeddings\n    if reduced_embeddings is None:\n        umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.15, metric=\"cosine\").fit(embeddings_to_reduce)\n        embeddings_2d = umap_model.embedding_\n    else:\n        embeddings_2d = reduced_embeddings\n\n    unique_topics = set(topic_per_doc)\n\n    # Prepare text and names\n    if isinstance(custom_labels, str):\n        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]\n        names = [\" \".join([label[0] for label in labels[:4]]) for labels in names]\n        names = [label if len(label) < 30 else label[:27] + \"...\" for label in names]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]\n    else:\n        names = [\n            f\"Topic-{topic}: \" + \" \".join([word for word, value in topic_model.get_topic(topic)][:3])\n            for topic in unique_topics\n        ]\n\n    topic_name_mapping = {topic_num: topic_name for topic_num, topic_name in zip(unique_topics, names)}\n    topic_name_mapping[-1] = \"Unlabelled\"\n\n    # If a set of topics is chosen, set everything else to \"Unlabelled\"\n    if topics is not None:\n        selected_topics = set(topics)\n        for topic_num in topic_name_mapping:\n            if topic_num not in selected_topics:\n                topic_name_mapping[topic_num] = \"Unlabelled\"\n\n    # Map in topic names and plot\n    named_topic_per_doc = pd.Series(topic_per_doc).map(topic_name_mapping).values\n\n    figure, axes = datamapplot.create_plot(\n        embeddings_2d,\n        named_topic_per_doc,\n        figsize=(width / 100, height / 100),\n        dpi=100,\n        title=title,\n        sub_title=sub_title,\n        **datamap_kwds,\n    )\n\n    return figure\n
        "},{"location":"api/plotting/documents.html","title":"Documents","text":"

        Visualize documents and their topics in 2D.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| topic_model | | A fitted BERTopic instance. | required |
| docs | List[str] | The documents you used when calling either fit or fit_transform | required |
| topics | List[int] | A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. | None |
| embeddings | ndarray | The embeddings of all documents in docs. | None |
| reduced_embeddings | ndarray | The 2D reduced embeddings of all documents in docs. | None |
| sample | float | The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen. | None |
| hide_annotations | bool | Hide the names of the traces on top of each cluster. | False |
| hide_document_hover | bool | Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualization. | False |
| custom_labels | Union[bool, str] | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | False |
| title | str | Title of the plot. | '<b>Documents and Topics</b>' |
| width | int | The width of the figure. | 1200 |
| height | int | The height of the figure. | 750 |

        Examples:

        To visualize the topics simply run:

```python
topic_model.visualize_documents(docs)
```

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic
topic_model = BERTopic().fit(docs, embeddings)

# Reduce dimensionality of embeddings, this step is optional
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Or, if you have reduced the original embeddings already:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
fig.write_html("path/to/file.html")
```
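For large corpora, the documented `sample` and `hide_document_hover` arguments help keep the figure responsive; a sketch, assuming `docs` and `reduced_embeddings` from the pipeline above:

```python
# Keep roughly 10% of the documents per topic and skip the per-point
# hover text to speed up rendering of very large corpora.
fig = topic_model.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings,
    sample=0.1,
    hide_document_hover=True,
)
fig.write_html("documents_sampled.html")
```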
        Source code in bertopic\\plotting\\_documents.py
        def visualize_documents(\n    topic_model,\n    docs: List[str],\n    topics: List[int] = None,\n    embeddings: np.ndarray = None,\n    reduced_embeddings: np.ndarray = None,\n    sample: float = None,\n    hide_annotations: bool = False,\n    hide_document_hover: bool = False,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Documents and Topics</b>\",\n    width: int = 1200,\n    height: int = 750,\n):\n    \"\"\"Visualize documents and their topics in 2D.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                             specific points. Helps to speed up generation of visualization.\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_documents(docs)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic\n    topic_model = BERTopic().fit(docs, embeddings)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_documents(docs, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../../getting_started/visualization/documents.html\"\n    
style=\"width:1000px; height: 800px; border: 0px;\"\"></iframe>\n    \"\"\"\n    topic_per_doc = topic_model.topics_\n\n    # Sample the data to optimize for visualization and dimensionality reduction\n    if sample is None or sample > 1:\n        sample = 1\n\n    indices = []\n    for topic in set(topic_per_doc):\n        s = np.where(np.array(topic_per_doc) == topic)[0]\n        size = len(s) if len(s) < 100 else int(len(s) * sample)\n        indices.extend(np.random.choice(s, size=size, replace=False))\n    indices = np.array(indices)\n\n    df = pd.DataFrame({\"topic\": np.array(topic_per_doc)[indices]})\n    df[\"doc\"] = [docs[index] for index in indices]\n    df[\"topic\"] = [topic_per_doc[index] for index in indices]\n\n    # Extract embeddings if not already done\n    if sample is None:\n        if embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n        else:\n            embeddings_to_reduce = embeddings\n    else:\n        if embeddings is not None:\n            embeddings_to_reduce = embeddings[indices]\n        elif embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n\n    # Reduce input embeddings\n    if reduced_embeddings is None:\n        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric=\"cosine\").fit(embeddings_to_reduce)\n        embeddings_2d = umap_model.embedding_\n    elif sample is not None and reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings[indices]\n    elif sample is None and reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings\n\n    unique_topics = set(topic_per_doc)\n    if topics is None:\n        topics = unique_topics\n\n    # Combine data\n    df[\"x\"] = embeddings_2d[:, 0]\n    df[\"y\"] = embeddings_2d[:, 1]\n\n    # Prepare text and names\n    if isinstance(custom_labels, str):\n        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]\n        names = [\"_\".join([label[0] for label in labels[:4]]) for labels in names]\n        names = [label if len(label) < 30 else label[:27] + \"...\" for label in names]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]\n    else:\n        names = [\n            f\"{topic}_\" + \"_\".join([word for word, value in topic_model.get_topic(topic)][:3])\n            for topic in unique_topics\n        ]\n\n    # Visualize\n    fig = go.Figure()\n\n    # Outliers and non-selected topics\n    non_selected_topics = set(unique_topics).difference(topics)\n    if len(non_selected_topics) == 0:\n        non_selected_topics = [-1]\n\n    selection = df.loc[df.topic.isin(non_selected_topics), :]\n    selection[\"text\"] = \"\"\n    selection.loc[len(selection), :] = [\n        None,\n        None,\n        selection.x.mean(),\n        selection.y.mean(),\n        \"Other documents\",\n    ]\n\n    fig.add_trace(\n        go.Scattergl(\n            x=selection.x,\n            y=selection.y,\n            hovertext=selection.doc if not hide_document_hover else None,\n            hoverinfo=\"text\",\n            mode=\"markers+text\",\n            name=\"other\",\n            showlegend=False,\n            marker=dict(color=\"#CFD8DC\", size=5, opacity=0.5),\n        
)\n    )\n\n    # Selected topics\n    for name, topic in zip(names, unique_topics):\n        if topic in topics and topic != -1:\n            selection = df.loc[df.topic == topic, :]\n            selection[\"text\"] = \"\"\n\n            if not hide_annotations:\n                selection.loc[len(selection), :] = [\n                    None,\n                    None,\n                    selection.x.mean(),\n                    selection.y.mean(),\n                    name,\n                ]\n\n            fig.add_trace(\n                go.Scattergl(\n                    x=selection.x,\n                    y=selection.y,\n                    hovertext=selection.doc if not hide_document_hover else None,\n                    hoverinfo=\"text\",\n                    text=selection.text,\n                    mode=\"markers+text\",\n                    name=name,\n                    textfont=dict(\n                        size=12,\n                    ),\n                    marker=dict(size=5, opacity=0.5),\n                )\n            )\n\n    # Add grid in a 'plus' shape\n    x_range = (\n        df.x.min() - abs((df.x.min()) * 0.15),\n        df.x.max() + abs((df.x.max()) * 0.15),\n    )\n    y_range = (\n        df.y.min() - abs((df.y.min()) * 0.15),\n        df.y.max() + abs((df.y.max()) * 0.15),\n    )\n    fig.add_shape(\n        type=\"line\",\n        x0=sum(x_range) / 2,\n        y0=y_range[0],\n        x1=sum(x_range) / 2,\n        y1=y_range[1],\n        line=dict(color=\"#CFD8DC\", width=2),\n    )\n    fig.add_shape(\n        type=\"line\",\n        x0=x_range[0],\n        y0=sum(y_range) / 2,\n        x1=x_range[1],\n        y1=sum(y_range) / 2,\n        line=dict(color=\"#9E9E9E\", width=2),\n    )\n    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text=\"D1\", showarrow=False, yshift=10)\n    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text=\"D2\", showarrow=False, xshift=10)\n\n    # Stylize layout\n    fig.update_layout(\n        template=\"simple_white\",\n        title={\n            \"text\": f\"{title}\",\n            \"x\": 0.5,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        width=width,\n        height=height,\n    )\n\n    fig.update_xaxes(visible=False)\n    fig.update_yaxes(visible=False)\n    return fig\n
        "},{"location":"api/plotting/dtm.html","title":"DTM","text":"

        Visualize topics over time.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| topic_model | | A fitted BERTopic instance. | required |
| topics_over_time | DataFrame | The topics you would like to be visualized with the corresponding topic representation | required |
| top_n_topics | int | To visualize the most frequent topics instead of all | None |
| topics | List[int] | Select which topics you would like to be visualized | None |
| normalize_frequency | bool | Whether to normalize each topic's frequency individually | False |
| custom_labels | Union[bool, str] | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | False |
| title | str | Title of the plot. | '<b>Topics over Time</b>' |
| width | int | The width of the figure. | 1250 |
| height | int | The height of the figure. | 450 |

Returns:

| Type | Description |
|---|---|
| Figure | A plotly.graph_objects.Figure including all traces |

        Examples:

        To visualize the topics over time, simply run:

```python
topics_over_time = topic_model.topics_over_time(docs, timestamps)
topic_model.visualize_topics_over_time(topics_over_time)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_topics_over_time(topics_over_time)
fig.write_html("path/to/file.html")
```
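A slightly fuller sketch using the documented arguments, assuming `docs` and a matching list of `timestamps` are available:

```python
# Compute topic frequencies per timestamp, then plot only the ten most
# frequent topics with each topic's frequency normalized individually.
topics_over_time = topic_model.topics_over_time(docs, timestamps)
fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
    normalize_frequency=True,
)
fig.write_html("topics_over_time.html")
```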
        Source code in bertopic\\plotting\\_topics_over_time.py
        def visualize_topics_over_time(\n    topic_model,\n    topics_over_time: pd.DataFrame,\n    top_n_topics: int = None,\n    topics: List[int] = None,\n    normalize_frequency: bool = False,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Topics over Time</b>\",\n    width: int = 1250,\n    height: int = 450,\n) -> go.Figure:\n    \"\"\"Visualize topics over time.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics_over_time: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n    To visualize the topics over time, simply run:\n\n    ```python\n    topics_over_time = topic_model.topics_over_time(docs, timestamps)\n    topic_model.visualize_topics_over_time(topics_over_time)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_over_time(topics_over_time)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/trump.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    colors = [\n        \"#E69F00\",\n        \"#56B4E9\",\n        \"#009E73\",\n        \"#F0E442\",\n        \"#D55E00\",\n        \"#0072B2\",\n        \"#CC79A7\",\n    ]\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        selected_topics = list(topics)\n    elif top_n_topics is not None:\n        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        selected_topics = sorted(freq_df.Topic.to_list())\n\n    # Prepare data\n    if isinstance(custom_labels, str):\n        topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]\n        topic_names = [\"_\".join([label[0] for label in labels[:4]]) for labels in topic_names]\n        topic_names = [label if len(label) < 30 else label[:27] + \"...\" for label in topic_names]\n        topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())}\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        topic_names = {\n            key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()\n        }\n    else:\n        topic_names = {\n            key: value[:40] + \"...\" if len(value) > 40 else value for key, value in topic_model.topic_labels_.items()\n        }\n    topics_over_time[\"Name\"] = topics_over_time.Topic.map(topic_names)\n    data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values([\"Topic\", \"Timestamp\"])\n\n    # Add traces\n    fig = go.Figure()\n    for index, topic in 
enumerate(data.Topic.unique()):\n        trace_data = data.loc[data.Topic == topic, :]\n        topic_name = trace_data.Name.values[0]\n        words = trace_data.Words.values\n        if normalize_frequency:\n            y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]\n        else:\n            y = trace_data.Frequency\n        fig.add_trace(\n            go.Scatter(\n                x=trace_data.Timestamp,\n                y=y,\n                mode=\"lines\",\n                marker_color=colors[index % 7],\n                hoverinfo=\"text\",\n                name=topic_name,\n                hovertext=[f\"<b>Topic {topic}</b><br>Words: {word}\" for word in words],\n            )\n        )\n\n    # Styling of the visualization\n    fig.update_xaxes(showgrid=True)\n    fig.update_yaxes(showgrid=True)\n    fig.update_layout(\n        yaxis_title=\"Normalized Frequency\" if normalize_frequency else \"Frequency\",\n        title={\n            \"text\": f\"{title}\",\n            \"y\": 0.95,\n            \"x\": 0.40,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        template=\"simple_white\",\n        width=width,\n        height=height,\n        hoverlabel=dict(bgcolor=\"white\", font_size=16, font_family=\"Rockwell\"),\n        legend=dict(\n            title=\"<b>Global Topic Representation\",\n        ),\n    )\n    return fig\n
        "},{"location":"api/plotting/heatmap.html","title":"Heatmap","text":"

Visualize a heatmap of the topics' similarity matrix.

        Based on the cosine similarity matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding model), a heatmap is created showing the similarity between topics.
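Conceptually, the matrix being plotted is the pairwise cosine similarity of the topic vectors. A rough sketch of that computation is shown below; it is not the library's internal code path, which additionally handles outlier topics and the c-TF-IDF/embedding selection:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Topic vectors from a fitted model; one row per topic.
topic_embeddings = np.asarray(topic_model.topic_embeddings_)

# Pairwise cosine similarity between topics, i.e. the values
# shown as cells in the heatmap.
similarity_matrix = cosine_similarity(topic_embeddings)
```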

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| topic_model | | A fitted BERTopic instance. | required |
| topics | List[int] | A selection of topics to visualize. | None |
| top_n_topics | int | Only select the top n most frequent topics. | None |
| n_clusters | int | Create n clusters and order the similarity matrix by those clusters. | None |
| use_ctfidf | bool | Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used. | False |
| custom_labels | Union[bool, str] | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". | False |
| title | str | Title of the plot. | '<b>Similarity Matrix</b>' |
| width | int | The width of the figure. | 800 |
| height | int | The height of the figure. | 800 |

Returns:

| Type | Description |
|---|---|
| fig | A plotly figure |

        Examples:

        To visualize the similarity matrix of topics simply run:

```python
topic_model.visualize_heatmap()
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_heatmap()
fig.write_html("path/to/file.html")
```
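The documented arguments can also be combined, for instance ordering the matrix by clusters of similar topics and basing the similarities on c-TF-IDF representations; a sketch:

```python
# Restrict to the 30 most frequent topics, order the heatmap by 5
# clusters of similar topics, and compute similarities from c-TF-IDF.
fig = topic_model.visualize_heatmap(
    top_n_topics=30,
    n_clusters=5,
    use_ctfidf=True,
)
fig.write_html("heatmap_clustered.html")
```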
        Source code in bertopic\\plotting\\_heatmap.py
        def visualize_heatmap(\n    topic_model,\n    topics: List[int] = None,\n    top_n_topics: int = None,\n    n_clusters: int = None,\n    use_ctfidf: bool = False,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Similarity Matrix</b>\",\n    width: int = 800,\n    height: int = 800,\n) -> go.Figure:\n    \"\"\"Visualize a heatmap of the topic's similarity matrix.\n\n    Based on the cosine similarity matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding\n    model), a heatmap is created showing the similarity between topics.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize.\n        top_n_topics: Only select the top n most frequent topics.\n        n_clusters: Create n clusters and order the similarity\n                    matrix by those clusters.\n        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings\n                    from the embedding model are used.\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the similarity matrix of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_heatmap()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_heatmap()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/heatmap.html\"\n    style=\"width:1000px; height: 720px; border: 0px;\"\"></iframe>\n    \"\"\"\n    embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][\n        topic_model._outliers :\n    ]\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list())\n\n    # Order heatmap by similar clusters of topics\n    sorted_topics = topics\n    if n_clusters:\n        if n_clusters >= len(set(topics)):\n            raise ValueError(\"Make sure to set `n_clusters` lower than \" \"the total number of unique topics.\")\n\n        distance_matrix = cosine_similarity(embeddings[topics])\n        Z = linkage(distance_matrix, \"ward\")\n        clusters = fcluster(Z, t=n_clusters, criterion=\"maxclust\")\n\n        # Extract new order of topics\n        mapping = {cluster: [] for cluster in clusters}\n        for topic, cluster in zip(topics, clusters):\n            mapping[cluster].append(topic)\n        mapping = [cluster for cluster in mapping.values()]\n        sorted_topics = [topic for cluster in mapping for topic in cluster]\n\n    # Select embeddings\n    indices = np.array([topics.index(topic) for topic in sorted_topics])\n    embeddings = embeddings[indices]\n    distance_matrix = cosine_similarity(embeddings)\n\n    # Create labels\n    if isinstance(custom_labels, str):\n        new_labels = [\n            [[str(topic), None]] + 
topic_model.topic_aspects_[custom_labels][topic] for topic in sorted_topics\n        ]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        new_labels = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in sorted_topics]\n    else:\n        new_labels = [[[str(topic), None]] + topic_model.get_topic(topic) for topic in sorted_topics]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n\n    fig = px.imshow(\n        distance_matrix,\n        labels=dict(color=\"Similarity Score\"),\n        x=new_labels,\n        y=new_labels,\n        color_continuous_scale=\"GnBu\",\n    )\n\n    fig.update_layout(\n        title={\n            \"text\": f\"{title}\",\n            \"y\": 0.95,\n            \"x\": 0.55,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        width=width,\n        height=height,\n        hoverlabel=dict(bgcolor=\"white\", font_size=16, font_family=\"Rockwell\"),\n    )\n    fig.update_layout(showlegend=True)\n    fig.update_layout(legend_title_text=\"Trend\")\n\n    return fig\n
        "},{"location":"api/plotting/hierarchical_documents.html","title":"Hierarchical Documents","text":"

        Visualize documents and their topics in 2D at different levels of hierarchy.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| topic_model | | A fitted BERTopic instance. | required |
| docs | List[str] | The documents you used when calling either fit or fit_transform | required |
| hierarchical_topics | DataFrame | A dataframe that contains a hierarchy of topics represented by their parents and their children | required |
| topics | List[int] | A selection of topics to visualize. Not to be confused with the topics that you get from .fit_transform. For example, if you want to visualize only topics 1 through 5: topics = [1, 2, 3, 4, 5]. | None |
| embeddings | ndarray | The embeddings of all documents in docs. | None |
| reduced_embeddings | ndarray | The 2D reduced embeddings of all documents in docs. | None |
| sample | Union[float, int] | The percentage of documents in each topic that you would like to keep. Value can be between 0 and 1. Setting this value to, for example, 0.1 (10% of documents in each topic) makes it easier to visualize millions of documents as a subset is chosen. | None |
| hide_annotations | bool | Hide the names of the traces on top of each cluster. | False |
| hide_document_hover | bool | Hide the content of the documents when hovering over specific points. Helps to speed up generation of visualizations. | True |
| nr_levels | int | The number of levels to be visualized in the hierarchy. First, the distances in hierarchical_topics.Distance are split into nr_levels lists of distances. Then, for each list of distances, the merged topics are selected that have a distance less than or equal to the maximum distance of that list. NOTE: To get all possible merge steps, make sure that nr_levels is equal to the length of hierarchical_topics. | 10 |
| level_scale | str | Whether to apply a linear or logarithmic (log) scale to the levels of the distance vector. Linear scaling performs an equal number of merges at each level, while logarithmic scaling performs more merges in earlier levels to provide more resolution at higher levels (useful when the number of topics is large). | 'linear' |
| custom_labels | Union[bool, str] | If bool, whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. If str, it uses labels from other aspects, e.g., "Aspect1". NOTE: Custom labels are only generated for the original un-merged topics. | False |
| title | str | Title of the plot. | '<b>Hierarchical Documents and Topics</b>' |
| width | int | The width of the figure. | 1200 |
| height | int | The height of the figure. | 750 |

        Examples:

        To visualize the topics simply run:

```python
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)
```

        Do note that this re-calculates the embeddings and reduces them to 2D. The advised and preferred pipeline for using this function is as follows:

```python
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP

# Prepare embeddings
docs = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)

# Train BERTopic and extract hierarchical topics
topic_model = BERTopic().fit(docs, embeddings)
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Reduce dimensionality of embeddings, this step is optional
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)

# Or, if you have reduced the original embeddings already:
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
```

        Or if you want to save the resulting figure:

```python
fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)
fig.write_html("path/to/file.html")
```
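To expose every merge step in the slider, the documented note on `nr_levels` can be applied directly; a sketch, assuming `docs`, `hierarchical_topics`, and `reduced_embeddings` from the pipeline above:

```python
# One slider level per merge step in the topic hierarchy.
fig = topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    reduced_embeddings=reduced_embeddings,
    nr_levels=len(hierarchical_topics),
)
fig.write_html("hierarchical_documents.html")
```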

        Note

        This visualization was inspired by the scatter plot representation of Doc2Map: https://github.com/louisgeisler/Doc2Map

        Source code in bertopic\\plotting\\_hierarchical_documents.py
        def visualize_hierarchical_documents(\n    topic_model,\n    docs: List[str],\n    hierarchical_topics: pd.DataFrame,\n    topics: List[int] = None,\n    embeddings: np.ndarray = None,\n    reduced_embeddings: np.ndarray = None,\n    sample: Union[float, int] = None,\n    hide_annotations: bool = False,\n    hide_document_hover: bool = True,\n    nr_levels: int = 10,\n    level_scale: str = \"linear\",\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Hierarchical Documents and Topics</b>\",\n    width: int = 1200,\n    height: int = 750,\n) -> go.Figure:\n    \"\"\"Visualize documents and their topics in 2D at different levels of hierarchy.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        docs: The documents you used when calling either `fit` or `fit_transform`\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children\n        topics: A selection of topics to visualize.\n                Not to be confused with the topics that you get from `.fit_transform`.\n                For example, if you want to visualize only topics 1 through 5:\n                `topics = [1, 2, 3, 4, 5]`.\n        embeddings: The embeddings of all documents in `docs`.\n        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.\n        sample: The percentage of documents in each topic that you would like to keep.\n                Value can be between 0 and 1. Setting this value to, for example,\n                0.1 (10% of documents in each topic) makes it easier to visualize\n                millions of documents as a subset is chosen.\n        hide_annotations: Hide the names of the traces on top of each cluster.\n        hide_document_hover: Hide the content of the documents when hovering over\n                             specific points. Helps to speed up generation of visualizations.\n        nr_levels: The number of levels to be visualized in the hierarchy. First, the distances\n                   in `hierarchical_topics.Distance` are split in `nr_levels` lists of distances.\n                   Then, for each list of distances, the merged topics are selected that have a\n                   distance less or equal to the maximum distance of the selected list of distances.\n                   NOTE: To get all possible merged steps, make sure that `nr_levels` is equal to\n                   the length of `hierarchical_topics`.\n        level_scale: Whether to apply a linear or logarithmic (log) scale levels of the distance\n                     vector. 
Linear scaling will perform an equal number of merges at each level\n                     while logarithmic scaling will perform more mergers in earlier levels to\n                     provide more resolution at higher levels (this can be used for when the number\n                     of topics is large).\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n                       NOTE: Custom labels are only generated for the original\n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics)\n    ```\n\n    Do note that this re-calculates the embeddings and reduces them to 2D.\n    The advised and preferred pipeline for using this function is as follows:\n\n    ```python\n    from sklearn.datasets import fetch_20newsgroups\n    from sentence_transformers import SentenceTransformer\n    from bertopic import BERTopic\n    from umap import UMAP\n\n    # Prepare embeddings\n    docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n    sentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n    embeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n    # Train BERTopic and extract hierarchical topics\n    topic_model = BERTopic().fit(docs, embeddings)\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Reduce dimensionality of embeddings, this step is optional\n    # reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n    # Run the visualization with the original embeddings\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n    # Or, if you have reduced the original embeddings already:\n    topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    Note:\n        This visualization was inspired by the scatter plot representation of Doc2Map:\n        https://github.com/louisgeisler/Doc2Map\n\n    <iframe src=\"../../getting_started/visualization/hierarchical_documents.html\"\n    style=\"width:1000px; height: 770px; border: 0px;\"\"></iframe>\n    \"\"\"\n    topic_per_doc = topic_model.topics_\n\n    # Sample the data to optimize for visualization and dimensionality reduction\n    if sample is None or sample > 1:\n        sample = 1\n\n    indices = []\n    for topic in set(topic_per_doc):\n        s = np.where(np.array(topic_per_doc) == topic)[0]\n        size = len(s) if len(s) < 100 else int(len(s) * sample)\n        indices.extend(np.random.choice(s, size=size, replace=False))\n    indices = np.array(indices)\n\n    df = pd.DataFrame({\"topic\": np.array(topic_per_doc)[indices]})\n    df[\"doc\"] = [docs[index] for index in indices]\n    df[\"topic\"] = [topic_per_doc[index] for index in indices]\n\n    # Extract embeddings if not already done\n    if sample is 
None:\n        if embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n        else:\n            embeddings_to_reduce = embeddings\n    else:\n        if embeddings is not None:\n            embeddings_to_reduce = embeddings[indices]\n        elif embeddings is None and reduced_embeddings is None:\n            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method=\"document\")\n\n    # Reduce input embeddings\n    if reduced_embeddings is None:\n        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric=\"cosine\").fit(embeddings_to_reduce)\n        embeddings_2d = umap_model.embedding_\n    elif sample is not None and reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings[indices]\n    elif sample is None and reduced_embeddings is not None:\n        embeddings_2d = reduced_embeddings\n\n    # Combine data\n    df[\"x\"] = embeddings_2d[:, 0]\n    df[\"y\"] = embeddings_2d[:, 1]\n\n    # Create topic list for each level, levels are created by calculating the distance\n    distances = hierarchical_topics.Distance.to_list()\n    if level_scale == \"log\" or level_scale == \"logarithmic\":\n        log_indices = (\n            np.round(\n                np.logspace(\n                    start=math.log(1, 10),\n                    stop=math.log(len(distances) - 1, 10),\n                    num=nr_levels,\n                )\n            )\n            .astype(int)\n            .tolist()\n        )\n        log_indices.reverse()\n        max_distances = [distances[i] for i in log_indices]\n    elif level_scale == \"lin\" or level_scale == \"linear\":\n        max_distances = [\n            distances[indices[-1]] for indices in np.array_split(range(len(hierarchical_topics)), nr_levels)\n        ][::-1]\n    else:\n        raise ValueError(\"level_scale needs to be one of 'log' or 'linear'\")\n\n    for index, max_distance in enumerate(max_distances):\n        # Get topics below `max_distance`\n        mapping = {topic: topic for topic in df.topic.unique()}\n        selection = hierarchical_topics.loc[hierarchical_topics.Distance <= max_distance, :]\n        selection.Parent_ID = selection.Parent_ID.astype(int)\n        selection = selection.sort_values(\"Parent_ID\")\n\n        for row in selection.iterrows():\n            for topic in row[1].Topics:\n                mapping[topic] = row[1].Parent_ID\n\n        # Make sure the mappings are mapped 1:1\n        mappings = [True for _ in mapping]\n        while any(mappings):\n            for i, (key, value) in enumerate(mapping.items()):\n                if value in mapping.keys() and key != value:\n                    mapping[key] = mapping[value]\n                else:\n                    mappings[i] = False\n\n        # Create new column\n        df[f\"level_{index+1}\"] = df.topic.map(mapping)\n        df[f\"level_{index+1}\"] = df[f\"level_{index+1}\"].astype(int)\n\n    # Prepare topic names of original and merged topics\n    trace_names = []\n    topic_names = {}\n    for topic in range(hierarchical_topics.Parent_ID.astype(int).max()):\n        if topic < hierarchical_topics.Parent_ID.astype(int).min():\n            if topic_model.get_topic(topic):\n                if isinstance(custom_labels, str):\n                    trace_name = f\"{topic}_\" + \"_\".join(\n                        list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3]\n                  
  )\n                elif topic_model.custom_labels_ is not None and custom_labels:\n                    trace_name = topic_model.custom_labels_[topic + topic_model._outliers]\n                else:\n                    trace_name = f\"{topic}_\" + \"_\".join([word[:20] for word, _ in topic_model.get_topic(topic)][:3])\n                topic_names[topic] = {\n                    \"trace_name\": trace_name[:40],\n                    \"plot_text\": trace_name[:40],\n                }\n                trace_names.append(trace_name)\n        else:\n            trace_name = (\n                f\"{topic}_\"\n                + hierarchical_topics.loc[hierarchical_topics.Parent_ID == str(topic), \"Parent_Name\"].values[0]\n            )\n            plot_text = \"_\".join([name[:20] for name in trace_name.split(\"_\")[:3]])\n            topic_names[topic] = {\n                \"trace_name\": trace_name[:40],\n                \"plot_text\": plot_text[:40],\n            }\n            trace_names.append(trace_name)\n\n    # Prepare traces\n    all_traces = []\n    for level in range(len(max_distances)):\n        traces = []\n\n        # Outliers\n        if topic_model._outliers:\n            traces.append(\n                go.Scattergl(\n                    x=df.loc[(df[f\"level_{level+1}\"] == -1), \"x\"],\n                    y=df.loc[df[f\"level_{level+1}\"] == -1, \"y\"],\n                    mode=\"markers+text\",\n                    name=\"other\",\n                    hoverinfo=\"text\",\n                    hovertext=df.loc[(df[f\"level_{level+1}\"] == -1), \"doc\"] if not hide_document_hover else None,\n                    showlegend=False,\n                    marker=dict(color=\"#CFD8DC\", size=5, opacity=0.5),\n                )\n            )\n\n        # Selected topics\n        if topics:\n            selection = df.loc[(df.topic.isin(topics)), :]\n            unique_topics = sorted([int(topic) for topic in selection[f\"level_{level+1}\"].unique()])\n        else:\n            unique_topics = sorted([int(topic) for topic in df[f\"level_{level+1}\"].unique()])\n\n        for topic in unique_topics:\n            if topic != -1:\n                if topics:\n                    selection = df.loc[(df[f\"level_{level+1}\"] == topic) & (df.topic.isin(topics)), :]\n                else:\n                    selection = df.loc[df[f\"level_{level+1}\"] == topic, :]\n\n                if not hide_annotations:\n                    selection.loc[len(selection), :] = None\n                    selection[\"text\"] = \"\"\n                    selection.loc[len(selection) - 1, \"x\"] = selection.x.mean()\n                    selection.loc[len(selection) - 1, \"y\"] = selection.y.mean()\n                    selection.loc[len(selection) - 1, \"text\"] = topic_names[int(topic)][\"plot_text\"]\n\n                traces.append(\n                    go.Scattergl(\n                        x=selection.x,\n                        y=selection.y,\n                        text=selection.text if not hide_annotations else None,\n                        hovertext=selection.doc if not hide_document_hover else None,\n                        hoverinfo=\"text\",\n                        name=topic_names[int(topic)][\"trace_name\"],\n                        mode=\"markers+text\",\n                        marker=dict(size=5, opacity=0.5),\n                    )\n                )\n\n        all_traces.append(traces)\n\n    # Track and count traces\n    nr_traces_per_set = [len(traces) for traces in all_traces]\n    
trace_indices = [(0, nr_traces_per_set[0])]\n    for index, nr_traces in enumerate(nr_traces_per_set[1:]):\n        start = trace_indices[index][1]\n        end = nr_traces + start\n        trace_indices.append((start, end))\n\n    # Visualization\n    fig = go.Figure()\n    for traces in all_traces:\n        for trace in traces:\n            fig.add_trace(trace)\n\n    for index in range(len(fig.data)):\n        if index >= nr_traces_per_set[0]:\n            fig.data[index].visible = False\n\n    # Create and add slider\n    steps = []\n    for index, indices in enumerate(trace_indices):\n        step = dict(\n            method=\"update\",\n            label=str(index),\n            args=[{\"visible\": [False] * len(fig.data)}],\n        )\n        for index in range(indices[1] - indices[0]):\n            step[\"args\"][0][\"visible\"][index + indices[0]] = True\n        steps.append(step)\n\n    sliders = [dict(currentvalue={\"prefix\": \"Level: \"}, pad={\"t\": 20}, steps=steps)]\n\n    # Add grid in a 'plus' shape\n    x_range = (\n        df.x.min() - abs((df.x.min()) * 0.15),\n        df.x.max() + abs((df.x.max()) * 0.15),\n    )\n    y_range = (\n        df.y.min() - abs((df.y.min()) * 0.15),\n        df.y.max() + abs((df.y.max()) * 0.15),\n    )\n    fig.add_shape(\n        type=\"line\",\n        x0=sum(x_range) / 2,\n        y0=y_range[0],\n        x1=sum(x_range) / 2,\n        y1=y_range[1],\n        line=dict(color=\"#CFD8DC\", width=2),\n    )\n    fig.add_shape(\n        type=\"line\",\n        x0=x_range[0],\n        y0=sum(y_range) / 2,\n        x1=x_range[1],\n        y1=sum(y_range) / 2,\n        line=dict(color=\"#9E9E9E\", width=2),\n    )\n    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text=\"D1\", showarrow=False, yshift=10)\n    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text=\"D2\", showarrow=False, xshift=10)\n\n    # Stylize layout\n    fig.update_layout(\n        sliders=sliders,\n        template=\"simple_white\",\n        title={\n            \"text\": f\"{title}\",\n            \"x\": 0.5,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        width=width,\n        height=height,\n    )\n\n    fig.update_xaxes(visible=False)\n    fig.update_yaxes(visible=False)\n    return fig\n
        "},{"location":"api/plotting/hierarchy.html","title":"Hierarchy","text":"

        Visualize a hierarchical structure of the topics.

        A ward linkage function is used to perform the hierarchical clustering based on the cosine distance matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding model).

        Parameters:

        topic_model: A fitted BERTopic instance. Required.
        orientation (str): The orientation of the figure. Either 'left' or 'bottom'. Default: 'left'.
        topics (List[int]): A selection of topics to visualize. Default: None.
        top_n_topics (int): Only select the top n most frequent topics. Default: None.
        use_ctfidf (bool): Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used. Default: True.
        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". NOTE: Custom labels are only generated for the original un-merged topics. Default: False.
        title (str): Title of the plot. Default: '<b>Hierarchical Clustering</b>'.
        width (int): The width of the figure. Only works if orientation is set to 'left'. Default: 1000.
        height (int): The height of the figure. Only works if orientation is set to 'bottom'. Default: 600.
        hierarchical_topics (DataFrame): A dataframe that contains a hierarchy of topics represented by their parents and their children. NOTE: The hierarchical topic names are only visualized if both topics and top_n_topics are not set. Default: None.
        linkage_function (Callable[[scipy.sparse._csr.csr_matrix], numpy.ndarray]): The linkage function to use. Default is: lambda x: sch.linkage(x, 'ward', optimal_ordering=True). NOTE: Make sure to use the same linkage_function as used in topic_model.hierarchical_topics. Default: None.
        distance_function (Callable[[scipy.sparse._csr.csr_matrix], scipy.sparse._csr.csr_matrix]): The distance function to use on the c-TF-IDF matrix. Default is: lambda x: 1 - cosine_similarity(x). You can pass any function that returns either a square matrix of shape (n_samples, n_samples) with zeros on the diagonal and non-negative values, or a condensed distance matrix of shape (n_samples * (n_samples - 1) / 2,) containing the upper triangular part of the distance matrix. NOTE: Make sure to use the same distance_function as used in topic_model.hierarchical_topics. Default: None.
        color_threshold (int): Value at which the separation of clusters will be made, resulting in different colors for different clusters. A higher value will typically lead to fewer colored clusters. Default: 1.

        Returns:

        fig: A plotly figure

        Examples:

        To visualize the hierarchical structure of topics simply run:

        topic_model.visualize_hierarchy()\n

        If you also want the labels visualized of hierarchical topics, run the following:

        # Extract hierarchical topics and their representations\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n\n# Visualize these representations\ntopic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you want to save the resulting figure:

        fig = topic_model.visualize_hierarchy()\nfig.write_html(\"path/to/file.html\")\n
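
        If you used a custom linkage_function or distance_function when calling topic_model.hierarchical_topics, pass the same functions here as well. A minimal sketch, assuming docs is the list of documents the model was fitted on and that 'single' linkage is just an illustrative choice:

        import scipy.cluster.hierarchy as sch\nfrom sklearn.metrics.pairwise import cosine_similarity\n\n# Use the exact same functions for extracting and visualizing the hierarchy\nlinkage_function = lambda x: sch.linkage(x, \"single\", optimal_ordering=True)\ndistance_function = lambda x: 1 - cosine_similarity(x)\n\nhierarchical_topics = topic_model.hierarchical_topics(\n    docs,\n    linkage_function=linkage_function,\n    distance_function=distance_function,\n)\ntopic_model.visualize_hierarchy(\n    hierarchical_topics=hierarchical_topics,\n    linkage_function=linkage_function,\n    distance_function=distance_function,\n)\n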
        Source code in bertopic\\plotting\\_hierarchy.py
        def visualize_hierarchy(\n    topic_model,\n    orientation: str = \"left\",\n    topics: List[int] = None,\n    top_n_topics: int = None,\n    use_ctfidf: bool = True,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Hierarchical Clustering</b>\",\n    width: int = 1000,\n    height: int = 600,\n    hierarchical_topics: pd.DataFrame = None,\n    linkage_function: Callable[[csr_matrix], np.ndarray] = None,\n    distance_function: Callable[[csr_matrix], csr_matrix] = None,\n    color_threshold: int = 1,\n) -> go.Figure:\n    \"\"\"Visualize a hierarchical structure of the topics.\n\n    A ward linkage function is used to perform the\n    hierarchical clustering based on the cosine distance\n    matrix between topic embeddings (either c-TF-IDF or the embeddings from the embedding model).\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        orientation: The orientation of the figure.\n                     Either 'left' or 'bottom'\n        topics: A selection of topics to visualize\n        top_n_topics: Only select the top n most frequent topics\n        use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings\n                    from the embedding model are used.\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n                       NOTE: Custom labels are only generated for the original\n                       un-merged topics.\n        title: Title of the plot.\n        width: The width of the figure. Only works if orientation is set to 'left'\n        height: The height of the figure. Only works if orientation is set to 'bottom'\n        hierarchical_topics: A dataframe that contains a hierarchy of topics\n                             represented by their parents and their children.\n                             NOTE: The hierarchical topic names are only visualized\n                             if both `topics` and `top_n_topics` are not set.\n        linkage_function: The linkage function to use. Default is:\n                          `lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`\n                          NOTE: Make sure to use the same `linkage_function` as used\n                          in `topic_model.hierarchical_topics`.\n        distance_function: The distance function to use on the c-TF-IDF matrix. 
Default is:\n                           `lambda x: 1 - cosine_similarity(x)`.\n                            You can pass any function that returns either a square matrix of\n                            shape (n_samples, n_samples) with zeros on the diagonal and\n                            non-negative values or condensed distance matrix of shape\n                            (n_samples * (n_samples - 1) / 2,) containing the upper\n                            triangular of the distance matrix.\n                           NOTE: Make sure to use the same `distance_function` as used\n                           in `topic_model.hierarchical_topics`.\n        color_threshold: Value at which the separation of clusters will be made which\n                         will result in different colors for different clusters.\n                         A higher value will typically lead in less colored clusters.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the hierarchical structure of\n    topics simply run:\n\n    ```python\n    topic_model.visualize_hierarchy()\n    ```\n\n    If you also want the labels visualized of hierarchical topics,\n    run the following:\n\n    ```python\n    # Extract hierarchical topics and their representations\n    hierarchical_topics = topic_model.hierarchical_topics(docs)\n\n    # Visualize these representations\n    topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n    ```\n\n    If you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_hierarchy()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/hierarchy.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    if distance_function is None:\n        distance_function = lambda x: 1 - cosine_similarity(x)\n\n    if linkage_function is None:\n        linkage_function = lambda x: sch.linkage(x, \"ward\", optimal_ordering=True)\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list())\n\n    # Select embeddings\n    all_topics = sorted(list(topic_model.get_topics().keys()))\n    indices = np.array([all_topics.index(topic) for topic in topics])\n\n    # Select topic embeddings\n    embeddings = select_topic_representation(topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf)[0][\n        indices\n    ]\n\n    # Annotations\n    if hierarchical_topics is not None and len(topics) == len(freq_df.Topic.to_list()):\n        annotations = _get_annotations(\n            topic_model=topic_model,\n            hierarchical_topics=hierarchical_topics,\n            embeddings=embeddings,\n            distance_function=distance_function,\n            linkage_function=linkage_function,\n            orientation=orientation,\n            custom_labels=custom_labels,\n        )\n    else:\n        annotations = None\n\n    # wrap distance function to validate input and return a condensed distance matrix\n    distance_function_viz = lambda x: validate_distance_matrix(distance_function(x), embeddings.shape[0])\n    # Create dendogram\n    fig = ff.create_dendrogram(\n        embeddings,\n        orientation=orientation,\n        
distfun=distance_function_viz,\n        linkagefun=linkage_function,\n        hovertext=annotations,\n        color_threshold=color_threshold,\n    )\n\n    # Create nicer labels\n    axis = \"yaxis\" if orientation == \"left\" else \"xaxis\"\n    if isinstance(custom_labels, str):\n        new_labels = [\n            [[str(x), None]] + topic_model.topic_aspects_[custom_labels][x] for x in fig.layout[axis][\"ticktext\"]\n        ]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        new_labels = [\n            topic_model.custom_labels_[topics[int(x)] + topic_model._outliers] for x in fig.layout[axis][\"ticktext\"]\n        ]\n    else:\n        new_labels = [\n            [[str(topics[int(x)]), None]] + topic_model.get_topic(topics[int(x)]) for x in fig.layout[axis][\"ticktext\"]\n        ]\n        new_labels = [\"_\".join([label[0] for label in labels[:4]]) for labels in new_labels]\n        new_labels = [label if len(label) < 30 else label[:27] + \"...\" for label in new_labels]\n\n    # Stylize layout\n    fig.update_layout(\n        plot_bgcolor=\"#ECEFF1\",\n        template=\"plotly_white\",\n        title={\n            \"text\": f\"{title}\",\n            \"x\": 0.5,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        hoverlabel=dict(bgcolor=\"white\", font_size=16, font_family=\"Rockwell\"),\n    )\n\n    # Stylize orientation\n    if orientation == \"left\":\n        fig.update_layout(\n            height=200 + (15 * len(topics)),\n            width=width,\n            yaxis=dict(tickmode=\"array\", ticktext=new_labels),\n        )\n\n        # Fix empty space on the bottom of the graph\n        y_max = max([trace[\"y\"].max() + 5 for trace in fig[\"data\"]])\n        y_min = min([trace[\"y\"].min() - 5 for trace in fig[\"data\"]])\n        fig.update_layout(yaxis=dict(range=[y_min, y_max]))\n\n    else:\n        fig.update_layout(\n            width=200 + (15 * len(topics)),\n            height=height,\n            xaxis=dict(tickmode=\"array\", ticktext=new_labels),\n        )\n\n    if hierarchical_topics is not None:\n        for index in [0, 3]:\n            axis = \"x\" if orientation == \"left\" else \"y\"\n            xs = [data[\"x\"][index] for data in fig.data if (data[\"text\"] and data[axis][index] > 0)]\n            ys = [data[\"y\"][index] for data in fig.data if (data[\"text\"] and data[axis][index] > 0)]\n            hovertext = [data[\"text\"][index] for data in fig.data if (data[\"text\"] and data[axis][index] > 0)]\n\n            fig.add_trace(\n                go.Scatter(\n                    x=xs,\n                    y=ys,\n                    marker_color=\"black\",\n                    hovertext=hovertext,\n                    hoverinfo=\"text\",\n                    mode=\"markers\",\n                    showlegend=False,\n                )\n            )\n    return fig\n
        "},{"location":"api/plotting/term.html","title":"Term Score Decline","text":"

        Visualize the ranks of all terms across all topics.

        Each topic is represented by a set of words. These words, however, do not all equally represent the topic. This visualization shows how many words are needed to represent a topic and at which point the beneficial effect of adding words starts to decline.

        Parameters:

        topic_model: A fitted BERTopic instance. Required.
        topics (List[int]): A selection of topics to visualize. These will be colored red where all others will be colored black. Default: None.
        log_scale (bool): Whether to represent the ranking on a log scale. Default: False.
        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". Default: False.
        title (str): Title of the plot. Default: '<b>Term score decline per Topic</b>'.
        width (int): The width of the figure. Default: 800.
        height (int): The height of the figure. Default: 500.

        Returns:

        fig: A plotly figure

        Examples:

        To visualize the ranks of all words across all topics simply run:

        topic_model.visualize_term_rank()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_term_rank()\nfig.write_html(\"path/to/file.html\")\n
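
        For instance, to put the scores on a log scale and highlight a few topics in red (the topic ids are illustrative):

        topic_model.visualize_term_rank(topics=[1, 2, 3], log_scale=True)\n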

        Reference:

        This visualization was heavily inspired by the \"Term Probability Decline\" visualization found in an analysis by the amazing tmtoolkit (https://tmtoolkit.readthedocs.io/). That specific analysis can be found at https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html.

        Source code in bertopic\\plotting\\_term_rank.py
        def visualize_term_rank(\n    topic_model,\n    topics: List[int] = None,\n    log_scale: bool = False,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Term score decline per Topic</b>\",\n    width: int = 800,\n    height: int = 500,\n) -> go.Figure:\n    \"\"\"Visualize the ranks of all terms across all topics.\n\n    Each topic is represented by a set of words. These words, however,\n    do not all equally represent the topic. This visualization shows\n    how many words are needed to represent a topic and at which point\n    the beneficial effect of adding words starts to decline.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize. These will be colored\n                red where all others will be colored black.\n        log_scale: Whether to represent the ranking on a log scale\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        fig: A plotly figure\n\n    Examples:\n    To visualize the ranks of all words across\n    all topics simply run:\n\n    ```python\n    topic_model.visualize_term_rank()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_term_rank()\n    fig.write_html(\"path/to/file.html\")\n    ```\n\n    <iframe src=\"../../getting_started/visualization/term_rank.html\"\n    style=\"width:1000px; height: 530px; border: 0px;\"\"></iframe>\n\n    <iframe src=\"../../getting_started/visualization/term_rank_log.html\"\n    style=\"width:1000px; height: 530px; border: 0px;\"\"></iframe>\n\n    Reference:\n\n    This visualization was heavily inspired by the\n    \"Term Probability Decline\" visualization found in an\n    analysis by the amazing [tmtoolkit](https://tmtoolkit.readthedocs.io/).\n    Reference to that specific analysis can be found\n    [here](https://wzbsocialsciencecenter.github.io/tm_corona/tm_analysis.html).\n    \"\"\"\n    topics = [] if topics is None else topics\n\n    topic_ids = topic_model.get_topic_info().Topic.unique().tolist()\n    topic_words = [topic_model.get_topic(topic) for topic in topic_ids]\n\n    values = np.array([[value[1] for value in values] for values in topic_words])\n    indices = np.array([[value + 1 for value in range(len(values))] for values in topic_words])\n\n    # Create figure\n    lines = []\n    for topic, x, y in zip(topic_ids, indices, values):\n        if not any(y > 1.5):\n            # labels\n            if isinstance(custom_labels, str):\n                label = f\"{topic}_\" + \"_\".join(list(zip(*topic_model.topic_aspects_[custom_labels][topic]))[0][:3])\n            elif topic_model.custom_labels_ is not None and custom_labels:\n                label = topic_model.custom_labels_[topic + topic_model._outliers]\n            else:\n                label = f\"<b>Topic {topic}</b>:\" + \"_\".join([word[0] for word in topic_model.get_topic(topic)])\n                label = label[:50]\n\n            # line parameters\n            color = \"red\" if topic in topics else \"black\"\n            opacity = 1 if topic in topics else 0.1\n            if any(y == 0):\n                y[y == 0] = min(values[values > 0])\n            y = np.log10(y, out=y, 
where=y > 0) if log_scale else y\n\n            line = go.Scatter(\n                x=x,\n                y=y,\n                name=\"\",\n                hovertext=label,\n                mode=\"lines+lines\",\n                opacity=opacity,\n                line=dict(color=color, width=1.5),\n            )\n            lines.append(line)\n\n    fig = go.Figure(data=lines)\n\n    # Stylize layout\n    fig.update_xaxes(range=[0, len(indices[0])], tick0=1, dtick=2)\n    fig.update_layout(\n        showlegend=False,\n        template=\"plotly_white\",\n        title={\n            \"text\": f\"{title}\",\n            \"y\": 0.9,\n            \"x\": 0.5,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        width=width,\n        height=height,\n        hoverlabel=dict(bgcolor=\"white\", font_size=16, font_family=\"Rockwell\"),\n    )\n\n    fig.update_xaxes(title_text=\"Term Rank\")\n    if log_scale:\n        fig.update_yaxes(title_text=\"c-TF-IDF score (log scale)\")\n    else:\n        fig.update_yaxes(title_text=\"c-TF-IDF score\")\n\n    return fig\n
        "},{"location":"api/plotting/topics.html","title":"Topics","text":"

        Visualize topics, their sizes, and their corresponding words.

        This visualization is highly inspired by LDAvis, a great visualization technique typically reserved for LDA.

        Parameters:

        topic_model: A fitted BERTopic instance. Required.
        topics (List[int]): A selection of topics to visualize. Default: None.
        top_n_topics (int): Only select the top n most frequent topics. Default: None.
        use_ctfidf (bool): Whether to use c-TF-IDF representations instead of the embeddings from the embedding model. Default: False.
        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". Default: False.
        title (str): Title of the plot. Default: '<b>Intertopic Distance Map</b>'.
        width (int): The width of the figure. Default: 650.
        height (int): The height of the figure. Default: 650.

        Examples:

        To visualize the topics simply run:

        topic_model.visualize_topics()\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics()\nfig.write_html(\"path/to/file.html\")\n
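
        To restrict the map to, say, the 20 most frequent topics and base the 2D projection on c-TF-IDF instead of the embedding model:

        topic_model.visualize_topics(top_n_topics=20, use_ctfidf=True)\n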
        Source code in bertopic\\plotting\\_topics.py
        def visualize_topics(\n    topic_model,\n    topics: List[int] = None,\n    top_n_topics: int = None,\n    use_ctfidf: bool = False,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Intertopic Distance Map</b>\",\n    width: int = 650,\n    height: int = 650,\n) -> go.Figure:\n    \"\"\"Visualize topics, their sizes, and their corresponding words.\n\n    This visualization is highly inspired by LDAvis, a great visualization\n    technique typically reserved for LDA.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics: A selection of topics to visualize\n        top_n_topics: Only select the top n most frequent topics\n        use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model.\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Examples:\n    To visualize the topics simply run:\n\n    ```python\n    topic_model.visualize_topics()\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics()\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/viz.html\"\n    style=\"width:1000px; height: 680px; border: 0px;\"\"></iframe>\n    \"\"\"\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        topics = list(topics)\n    elif top_n_topics is not None:\n        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        topics = sorted(freq_df.Topic.to_list())\n\n    # Extract topic words and their frequencies\n    topic_list = sorted(topics)\n    frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list]\n    if isinstance(custom_labels, str):\n        words = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topic_list]\n        words = [\"_\".join([label[0] for label in labels[:4]]) for labels in words]\n        words = [label if len(label) < 30 else label[:27] + \"...\" for label in words]\n    elif custom_labels and topic_model.custom_labels_ is not None:\n        words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list]\n    else:\n        words = [\" | \".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]\n\n    # Embed c-TF-IDF into 2D\n    all_topics = sorted(list(topic_model.get_topics().keys()))\n    indices = np.array([all_topics.index(topic) for topic in topics])\n\n    embeddings, c_tfidf_used = select_topic_representation(\n        topic_model.c_tf_idf_,\n        topic_model.topic_embeddings_,\n        use_ctfidf=use_ctfidf,\n        output_ndarray=True,\n    )\n    embeddings = embeddings[indices]\n\n    if c_tfidf_used:\n        embeddings = MinMaxScaler().fit_transform(embeddings)\n        embeddings = UMAP(n_neighbors=2, n_components=2, metric=\"hellinger\", random_state=42).fit_transform(embeddings)\n    else:\n        embeddings = UMAP(n_neighbors=2, n_components=2, metric=\"cosine\", random_state=42).fit_transform(embeddings)\n\n    # Visualize with plotly\n    df = pd.DataFrame(\n        {\n    
        \"x\": embeddings[:, 0],\n            \"y\": embeddings[:, 1],\n            \"Topic\": topic_list,\n            \"Words\": words,\n            \"Size\": frequencies,\n        }\n    )\n    return _plotly_topic_visualization(df, topic_list, title, width, height)\n
        "},{"location":"api/plotting/topics_per_class.html","title":"Topics per Class","text":"

        Visualize topics per class.

        Parameters:

        topic_model: A fitted BERTopic instance. Required.
        topics_per_class (DataFrame): The topics you would like to be visualized with the corresponding topic representation. Required.
        top_n_topics (int): To visualize the most frequent topics instead of all. Default: 10.
        topics (List[int]): Select which topics you would like to be visualized. Default: None.
        normalize_frequency (bool): Whether to normalize each topic's frequency individually. Default: False.
        custom_labels (Union[bool, str]): If bool, whether to use custom topic labels that were defined using topic_model.set_topic_labels. If str, it uses labels from other aspects, e.g., \"Aspect1\". Default: False.
        title (str): Title of the plot. Default: '<b>Topics per Class</b>'.
        width (int): The width of the figure. Default: 1250.
        height (int): The height of the figure. Default: 900.

        Returns:

        Figure: A plotly.graph_objects.Figure including all traces

        Examples:

        To visualize the topics per class, simply run:

        topics_per_class = topic_model.topics_per_class(docs, classes)\ntopic_model.visualize_topics_per_class(topics_per_class)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_topics_per_class(topics_per_class)\nfig.write_html(\"path/to/file.html\")\n
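
        Since classes can differ considerably in size, normalizing each topic's frequency often makes them easier to compare. A small sketch, assuming classes holds one class label per document:

        topics_per_class = topic_model.topics_per_class(docs, classes=classes)\ntopic_model.visualize_topics_per_class(\n    topics_per_class,\n    top_n_topics=10,\n    normalize_frequency=True,\n)\n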
        Source code in bertopic\\plotting\\_topics_per_class.py
        def visualize_topics_per_class(\n    topic_model,\n    topics_per_class: pd.DataFrame,\n    top_n_topics: int = 10,\n    topics: List[int] = None,\n    normalize_frequency: bool = False,\n    custom_labels: Union[bool, str] = False,\n    title: str = \"<b>Topics per Class</b>\",\n    width: int = 1250,\n    height: int = 900,\n) -> go.Figure:\n    \"\"\"Visualize topics per class.\n\n    Arguments:\n        topic_model: A fitted BERTopic instance.\n        topics_per_class: The topics you would like to be visualized with the\n                          corresponding topic representation\n        top_n_topics: To visualize the most frequent topics instead of all\n        topics: Select which topics you would like to be visualized\n        normalize_frequency: Whether to normalize each topic's frequency individually\n        custom_labels: If bool, whether to use custom topic labels that were defined using\n                       `topic_model.set_topic_labels`.\n                       If `str`, it uses labels from other aspects, e.g., \"Aspect1\".\n        title: Title of the plot.\n        width: The width of the figure.\n        height: The height of the figure.\n\n    Returns:\n        A plotly.graph_objects.Figure including all traces\n\n    Examples:\n    To visualize the topics per class, simply run:\n\n    ```python\n    topics_per_class = topic_model.topics_per_class(docs, classes)\n    topic_model.visualize_topics_per_class(topics_per_class)\n    ```\n\n    Or if you want to save the resulting figure:\n\n    ```python\n    fig = topic_model.visualize_topics_per_class(topics_per_class)\n    fig.write_html(\"path/to/file.html\")\n    ```\n    <iframe src=\"../../getting_started/visualization/topics_per_class.html\"\n    style=\"width:1400px; height: 1000px; border: 0px;\"\"></iframe>\n    \"\"\"\n    colors = [\n        \"#E69F00\",\n        \"#56B4E9\",\n        \"#009E73\",\n        \"#F0E442\",\n        \"#D55E00\",\n        \"#0072B2\",\n        \"#CC79A7\",\n    ]\n\n    # Select topics based on top_n and topics args\n    freq_df = topic_model.get_topic_freq()\n    freq_df = freq_df.loc[freq_df.Topic != -1, :]\n    if topics is not None:\n        selected_topics = list(topics)\n    elif top_n_topics is not None:\n        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])\n    else:\n        selected_topics = sorted(freq_df.Topic.to_list())\n\n    # Prepare data\n    if isinstance(custom_labels, str):\n        topic_names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topics]\n        topic_names = [\"_\".join([label[0] for label in labels[:4]]) for labels in topic_names]\n        topic_names = [label if len(label) < 30 else label[:27] + \"...\" for label in topic_names]\n        topic_names = {key: topic_names[index] for index, key in enumerate(topic_model.topic_labels_.keys())}\n    elif topic_model.custom_labels_ is not None and custom_labels:\n        topic_names = {\n            key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()\n        }\n    else:\n        topic_names = {\n            key: value[:40] + \"...\" if len(value) > 40 else value for key, value in topic_model.topic_labels_.items()\n        }\n    topics_per_class[\"Name\"] = topics_per_class.Topic.map(topic_names)\n    data = topics_per_class.loc[topics_per_class.Topic.isin(selected_topics), :]\n\n    # Add traces\n    fig = go.Figure()\n    for index, topic in enumerate(selected_topics):\n      
  if index == 0:\n            visible = True\n        else:\n            visible = \"legendonly\"\n        trace_data = data.loc[data.Topic == topic, :]\n        topic_name = trace_data.Name.values[0]\n        words = trace_data.Words.values\n        if normalize_frequency:\n            x = normalize(trace_data.Frequency.values.reshape(1, -1))[0]\n        else:\n            x = trace_data.Frequency\n        fig.add_trace(\n            go.Bar(\n                y=trace_data.Class,\n                x=x,\n                visible=visible,\n                marker_color=colors[index % 7],\n                hoverinfo=\"text\",\n                name=topic_name,\n                orientation=\"h\",\n                hovertext=[f\"<b>Topic {topic}</b><br>Words: {word}\" for word in words],\n            )\n        )\n\n    # Styling of the visualization\n    fig.update_xaxes(showgrid=True)\n    fig.update_yaxes(showgrid=True)\n    fig.update_layout(\n        xaxis_title=\"Normalized Frequency\" if normalize_frequency else \"Frequency\",\n        yaxis_title=\"Class\",\n        title={\n            \"text\": f\"{title}\",\n            \"y\": 0.95,\n            \"x\": 0.40,\n            \"xanchor\": \"center\",\n            \"yanchor\": \"top\",\n            \"font\": dict(size=22, color=\"Black\"),\n        },\n        template=\"simple_white\",\n        width=width,\n        height=height,\n        hoverlabel=dict(bgcolor=\"white\", font_size=16, font_family=\"Rockwell\"),\n        legend=dict(\n            title=\"<b>Global Topic Representation\",\n        ),\n    )\n    return fig\n
        "},{"location":"api/representation/base.html","title":"BaseRepresentation","text":"

        The base representation model for fine-tuning topic representations.

        Source code in bertopic\\representation\\_base.py
        class BaseRepresentation(BaseEstimator):\n    \"\"\"The base representation model for fine-tuning topic representations.\"\"\"\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topics.\n\n        Each representation model that inherits this class will have\n        its arguments (topic_model, documents, c_tf_idf, topics)\n        automatically passed. Therefore, the representation model\n        will only have access to the information about topics related\n        to those arguments.\n\n        Arguments:\n            topic_model: The BERTopic model that is fitted until topic\n                         representations are calculated.\n            documents: A dataframe with columns \"Document\" and \"Topic\"\n                       that contains all documents with each corresponding\n                       topic.\n            c_tf_idf: A c-TF-IDF representation that is typically\n                      identical to `topic_model.c_tf_idf_` except for\n                      dynamic, class-based, and hierarchical topic modeling\n                      where it is calculated on a subset of the documents.\n            topics: A dictionary with topic (key) and tuple of word and\n                    weight (value) as calculated by c-TF-IDF. This is the\n                    default topics that are returned if no representation\n                    model is used.\n        \"\"\"\n        return topic_model.topic_representations_\n
        "},{"location":"api/representation/base.html#bertopic.representation._base.BaseRepresentation.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics.

        Each representation model that inherits this class will have its arguments (topic_model, documents, c_tf_idf, topics) automatically passed. Therefore, the representation model will only have access to the information about topics related to those arguments.
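
        As an illustration only (this class is not part of the library), a custom representation model can subclass BaseRepresentation and override extract_topics. The toy sketch below simply uppercases the default c-TF-IDF keywords:

        from bertopic.representation import BaseRepresentation\n\n\nclass UppercaseRepresentation(BaseRepresentation):\n    \"\"\"Toy example: return the default c-TF-IDF words in uppercase.\"\"\"\n\n    def extract_topics(self, topic_model, documents, c_tf_idf, topics):\n        # `topics` maps each topic id to its (word, weight) tuples\n        return {\n            topic: [(word.upper(), weight) for word, weight in words]\n            for topic, words in topics.items()\n        }\n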

        Parameters:

        topic_model: The BERTopic model that is fitted until topic representations are calculated. Required.
        documents (DataFrame): A dataframe with columns \"Document\" and \"Topic\" that contains all documents with each corresponding topic. Required.
        c_tf_idf (csr_matrix): A c-TF-IDF representation that is typically identical to topic_model.c_tf_idf_ except for dynamic, class-based, and hierarchical topic modeling, where it is calculated on a subset of the documents. Required.
        topics (Mapping[str, List[Tuple[str, float]]]): A dictionary with topic (key) and tuple of word and weight (value) as calculated by c-TF-IDF. These are the default topics that are returned if no representation model is used. Required.

        Source code in bertopic\\representation\\_base.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topics.\n\n    Each representation model that inherits this class will have\n    its arguments (topic_model, documents, c_tf_idf, topics)\n    automatically passed. Therefore, the representation model\n    will only have access to the information about topics related\n    to those arguments.\n\n    Arguments:\n        topic_model: The BERTopic model that is fitted until topic\n                     representations are calculated.\n        documents: A dataframe with columns \"Document\" and \"Topic\"\n                   that contains all documents with each corresponding\n                   topic.\n        c_tf_idf: A c-TF-IDF representation that is typically\n                  identical to `topic_model.c_tf_idf_` except for\n                  dynamic, class-based, and hierarchical topic modeling\n                  where it is calculated on a subset of the documents.\n        topics: A dictionary with topic (key) and tuple of word and\n                weight (value) as calculated by c-TF-IDF. This is the\n                default topics that are returned if no representation\n                model is used.\n    \"\"\"\n    return topic_model.topic_representations_\n
        "},{"location":"api/representation/cohere.html","title":"Cohere","text":"

        Use the Cohere API to generate topic labels based on their generative model.

        Find more about their models here: https://docs.cohere.ai/docs

        Parameters:

        client: A cohere.Client. Required.
        model (str): Model to use within Cohere, defaults to \"xlarge\". Default: 'xlarge'.
        prompt (str): The prompt to be used in the model. If no prompt is given, self.default_prompt_ is used instead. NOTE: Use \"[KEYWORDS]\" and \"[DOCUMENTS]\" in the prompt to decide where the keywords and documents need to be inserted. Default: None.
        delay_in_seconds (float): The delay in seconds between consecutive prompts in order to prevent RateLimitErrors. Default: None.
        nr_docs (int): The number of documents to pass to Cohere if a prompt with the [\"DOCUMENTS\"] tag is used. Default: 4.
        diversity (float): The diversity of documents to pass to Cohere. Accepts values between 0 and 1. Higher values result in passing more diverse documents, whereas lower values pass more similar documents. Default: None.
        doc_length (int): The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. Default: None.
        tokenizer (Union[str, Callable]): The tokenizer used to split the document into segments whose length is counted against doc_length. If tokenizer is 'char', the document is split into characters. If 'whitespace', the document is split into whitespace-separated words. If 'vectorizer', the internal CountVectorizer is used to tokenize the document. If a callable, that callable is used to tokenize the document. In each case, the resulting tokens are counted and truncated depending on doc_length. Default: None.

        Usage:

        To use this, you will need to install cohere first:

        pip install cohere

        Then, get yourself an API key and use Cohere's API as follows:

        import cohere\nfrom bertopic.representation import Cohere\nfrom bertopic import BERTopic\n\n# Create your representation model\nco = cohere.Client(my_api_key)\nrepresentation_model = Cohere(co)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can also use a custom prompt:

        prompt = \"I have the following documents: [DOCUMENTS]. What topic do they contain?\"\nrepresentation_model = Cohere(co, prompt=prompt)\n
        Source code in bertopic\\representation\\_cohere.py
        class Cohere(BaseRepresentation):\n    \"\"\"Use the Cohere API to generate topic labels based on their\n    generative model.\n\n    Find more about their models here:\n    https://docs.cohere.ai/docs\n\n    Arguments:\n        client: A `cohere.Client`\n        model: Model to use within Cohere, defaults to `\"xlarge\"`.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                NOTE: Use `\"[KEYWORDS]\"` and `\"[DOCUMENTS]\"` in the prompt\n                to decide where the keywords and documents need to be\n                inserted.\n        delay_in_seconds: The delay in seconds between consecutive prompts\n                                in order to prevent RateLimitErrors.\n        nr_docs: The number of documents to pass to OpenAI if a prompt\n                 with the `[\"DOCUMENTS\"]` tag is used.\n        diversity: The diversity of documents to pass to OpenAI.\n                   Accepts values between 0 and 1. A higher\n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document.\n                       * If tokenizer is 'char', then the document is split up\n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. These tokens are counted and truncated depending\n                         on `doc_length`\n\n    Usage:\n\n    To use this, you will need to install cohere first:\n\n    `pip install cohere`\n\n    Then, get yourself an API key and use Cohere's API as follows:\n\n    ```python\n    import cohere\n    from bertopic.representation import Cohere\n    from bertopic import BERTopic\n\n    # Create your representation model\n    co = cohere.Client(my_api_key)\n    representation_model = Cohere(co)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can also use a custom prompt:\n\n    ```python\n    prompt = \"I have the following documents: [DOCUMENTS]. 
What topic do they contain?\"\n    representation_model = Cohere(co, prompt=prompt)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        client,\n        model: str = \"xlarge\",\n        prompt: str = None,\n        delay_in_seconds: float = None,\n        nr_docs: int = 4,\n        diversity: float = None,\n        doc_length: int = None,\n        tokenizer: Union[str, Callable] = None,\n    ):\n        self.client = client\n        self.model = model\n        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT\n        self.default_prompt_ = DEFAULT_PROMPT\n        self.delay_in_seconds = delay_in_seconds\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n        self.prompts_ = []\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topics.\n\n        Arguments:\n            topic_model: Not used\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top 4 representative documents per topic\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity\n        )\n\n        # Generate using Cohere's Language Model\n        updated_topics = {}\n        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n            prompt = self._create_prompt(truncated_docs, topic, topics)\n            self.prompts_.append(prompt)\n\n            # Delay\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n            request = self.client.generate(\n                model=self.model,\n                prompt=prompt,\n                max_tokens=50,\n                num_generations=1,\n                stop_sequences=[\"\\n\"],\n            )\n            label = request.generations[0].text.strip()\n            updated_topics[topic] = [(label, 1)] + [(\"\", 0) for _ in range(9)]\n\n        return updated_topics\n\n    def _create_prompt(self, docs, topic, topics):\n        keywords = list(zip(*topics[topic]))[0]\n\n        # Use the Default Chat Prompt\n        if self.prompt == DEFAULT_PROMPT:\n            prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            prompt = self._replace_documents(prompt, docs)\n\n        # Use a custom prompt that leverages keywords, documents or both using\n        # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively\n        else:\n            prompt = self.prompt\n            if \"[KEYWORDS]\" in prompt:\n                prompt = prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            if \"[DOCUMENTS]\" in prompt:\n                prompt = self._replace_documents(prompt, docs)\n\n        return prompt\n\n    @staticmethod\n    def _replace_documents(prompt, docs):\n        to_replace = \"\"\n        for doc in docs:\n            to_replace += f\"- {doc}\\n\"\n        prompt = prompt.replace(\"[DOCUMENTS]\", to_replace)\n        return 
prompt\n
        "},{"location":"api/representation/cohere.html#bertopic.representation._cohere.Cohere.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics.

        Parameters:

        topic_model: Not used. Required.
        documents (DataFrame): Not used. Required.
        c_tf_idf (csr_matrix): Not used. Required.
        topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF. Required.

        Returns:

        updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_cohere.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topics.\n\n    Arguments:\n        topic_model: Not used\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top 4 representative documents per topic\n    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n        c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity\n    )\n\n    # Generate using Cohere's Language Model\n    updated_topics = {}\n    for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n        truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n        prompt = self._create_prompt(truncated_docs, topic, topics)\n        self.prompts_.append(prompt)\n\n        # Delay\n        if self.delay_in_seconds:\n            time.sleep(self.delay_in_seconds)\n\n        request = self.client.generate(\n            model=self.model,\n            prompt=prompt,\n            max_tokens=50,\n            num_generations=1,\n            stop_sequences=[\"\\n\"],\n        )\n        label = request.generations[0].text.strip()\n        updated_topics[topic] = [(label, 1)] + [(\"\", 0) for _ in range(9)]\n\n    return updated_topics\n
        "},{"location":"api/representation/generation.html","title":"TextGeneration","text":"

        Text2Text or text generation with transformers.

        Parameters:

        model (Union[str, pipeline]): A transformers pipeline that should be initialized as \"text-generation\" for gpt-like models or \"text2text-generation\" for T5-like models. For example, pipeline('text-generation', model='gpt2'). If a string is passed, \"text-generation\" will be selected by default. Required.
        prompt (str): The prompt to be used in the model. If no prompt is given, self.default_prompt_ is used instead. NOTE: Use \"[KEYWORDS]\" and \"[DOCUMENTS]\" in the prompt to decide where the keywords and documents need to be inserted. Default: None.
        pipeline_kwargs (Mapping[str, Any]): Kwargs that you can pass to the transformers.pipeline when it is called. Default: {}.
        random_state (int): A random state to be passed to transformers.set_seed. Default: 42.
        nr_docs (int): The number of documents to pass to the prompt if it contains the [\"DOCUMENTS\"] tag. Default: 4.
        diversity (float): The diversity of documents to pass to the prompt. Accepts values between 0 and 1. Higher values result in passing more diverse documents, whereas lower values pass more similar documents. Default: None.
        doc_length (int): The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. Default: None.
        tokenizer (Union[str, Callable]): The tokenizer used to split the document into segments whose length is counted against doc_length. If tokenizer is 'char', the document is split into characters. If 'whitespace', the document is split into whitespace-separated words. If 'vectorizer', the internal CountVectorizer is used to tokenize the document. If a callable, that callable is used to tokenize the document. In each case, the resulting tokens are counted and truncated depending on doc_length. Default: None.

        Usage:

        To use a gpt-like model:

        from transformers import pipeline\nfrom bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\n\n# Create your representation model\ngenerator = pipeline('text-generation', model='gpt2')\nrepresentation_model = TextGeneration(generator)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can use a custom prompt and decide where the keywords should be inserted by using the [KEYWORDS] tag, or where documents should be inserted with the [DOCUMENTS] tag:

        from transformers import pipeline\nfrom bertopic.representation import TextGeneration\n\nprompt = \"I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?\"\n\n# Create your representation model\ngenerator = pipeline('text2text-generation', model='google/flan-t5-base')\nrepresentation_model = TextGeneration(generator, prompt=prompt)\n
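
        Generation settings can be forwarded through pipeline_kwargs, for example to limit the number of generated tokens (the value shown is illustrative):

        from transformers import pipeline\nfrom bertopic.representation import TextGeneration\n\ngenerator = pipeline('text2text-generation', model='google/flan-t5-base')\nrepresentation_model = TextGeneration(\n    generator,\n    pipeline_kwargs={\"max_new_tokens\": 50},\n)\n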
        Source code in bertopic\\representation\\_textgeneration.py
        class TextGeneration(BaseRepresentation):\n    \"\"\"Text2Text or text generation with transformers.\n\n    Arguments:\n        model: A transformers pipeline that should be initialized as \"text-generation\"\n               for gpt-like models or \"text2text-generation\" for T5-like models.\n               For example, `pipeline('text-generation', model='gpt2')`. If a string\n               is passed, \"text-generation\" will be selected by default.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                NOTE: Use `\"[KEYWORDS]\"` and `\"[DOCUMENTS]\"` in the prompt\n                to decide where the keywords and documents need to be\n                inserted.\n        pipeline_kwargs: Kwargs that you can pass to the transformers.pipeline\n                         when it is called.\n        random_state: A random state to be passed to `transformers.set_seed`\n        nr_docs: The number of documents to pass to OpenAI if a prompt\n                 with the `[\"DOCUMENTS\"]` tag is used.\n        diversity: The diversity of documents to pass to OpenAI.\n                   Accepts values between 0 and 1. A higher\n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document.\n                       * If tokenizer is 'char', then the document is split up\n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. These tokens are counted and truncated depending\n                         on `doc_length`\n\n    Usage:\n\n    To use a gpt-like model:\n\n    ```python\n    from bertopic.representation import TextGeneration\n    from bertopic import BERTopic\n\n    # Create your representation model\n    generator = pipeline('text-generation', model='gpt2')\n    representation_model = TextGeneration(generator)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTo pic(representation_model=representation_model)\n    ```\n\n    You can use a custom prompt and decide where the keywords should\n    be inserted by using the `[KEYWORDS]` or documents with thte `[DOCUMENTS]` tag:\n\n    ```python\n    from bertopic.representation import TextGeneration\n\n    prompt = \"I have a topic described by the following keywords: [KEYWORDS]. 
Based on the previous keywords, what is this topic about?\"\"\n\n    # Create your representation model\n    generator = pipeline('text2text-generation', model='google/flan-t5-base')\n    representation_model = TextGeneration(generator)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        model: Union[str, pipeline],\n        prompt: str = None,\n        pipeline_kwargs: Mapping[str, Any] = {},\n        random_state: int = 42,\n        nr_docs: int = 4,\n        diversity: float = None,\n        doc_length: int = None,\n        tokenizer: Union[str, Callable] = None,\n    ):\n        self.random_state = random_state\n        set_seed(random_state)\n        if isinstance(model, str):\n            self.model = pipeline(\"text-generation\", model=model)\n        elif isinstance(model, Pipeline):\n            self.model = model\n        else:\n            raise ValueError(\n                \"Make sure that the HF model that you\"\n                \"pass is either a string referring to a\"\n                \"HF model or a `transformers.pipeline` object.\"\n            )\n        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT\n        self.default_prompt_ = DEFAULT_PROMPT\n        self.pipeline_kwargs = pipeline_kwargs\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n\n        self.prompts_ = []\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topic representations and return a single label.\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top 4 representative documents per topic\n        if self.prompt != DEFAULT_PROMPT and \"[DOCUMENTS]\" in self.prompt:\n            repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n                c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity\n            )\n        else:\n            repr_docs_mappings = {topic: None for topic in topics.keys()}\n\n        updated_topics = {}\n        for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n            # Prepare prompt\n            truncated_docs = (\n                [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n                if docs is not None\n                else docs\n            )\n            prompt = self._create_prompt(truncated_docs, topic, topics)\n            self.prompts_.append(prompt)\n\n            # Extract result from generator and use that as label\n            topic_description = self.model(prompt, **self.pipeline_kwargs)\n            topic_description = [\n                (description[\"generated_text\"].replace(prompt, \"\"), 1) for description in topic_description\n            ]\n\n            if len(topic_description) < 10:\n                topic_description += [(\"\", 0) for _ in range(10 - len(topic_description))]\n\n            updated_topics[topic] = topic_description\n\n        return updated_topics\n\n    def _create_prompt(self, docs, topic, topics):\n        keywords = \", 
\".join(list(zip(*topics[topic]))[0])\n\n        # Use the default prompt and replace keywords\n        if self.prompt == DEFAULT_PROMPT:\n            prompt = self.prompt.replace(\"[KEYWORDS]\", keywords)\n\n        # Use a prompt that leverages either keywords or documents in\n        # a custom location\n        else:\n            prompt = self.prompt\n            if \"[KEYWORDS]\" in prompt:\n                prompt = prompt.replace(\"[KEYWORDS]\", keywords)\n            if \"[DOCUMENTS]\" in prompt:\n                to_replace = \"\"\n                for doc in docs:\n                    to_replace += f\"- {doc}\\n\"\n                prompt = prompt.replace(\"[DOCUMENTS]\", to_replace)\n\n        return prompt\n
        "},{"location":"api/representation/generation.html#bertopic.representation._textgeneration.TextGeneration.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topic representations and return a single label.

        Parameters:

        topic_model: A BERTopic model (required)
        documents (DataFrame): Not used (required)
        c_tf_idf (csr_matrix): Not used (required)
        topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

        Returns:

        updated_topics: Updated topic representations
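
        TextGeneration builds one prompt per topic: the [KEYWORDS] tag is replaced with the topic's top words and, when a custom prompt contains it, the [DOCUMENTS] tag is replaced with a bulleted list of representative documents (see the source code below). The snippet that follows is only a made-up illustration of that substitution; the prompt, keywords, and documents are placeholders.

        ```python
        # Illustrative sketch of how the [KEYWORDS] and [DOCUMENTS] prompt tags
        # are filled in before the prompt is sent to the generator.
        prompt = (
            "I have a topic described by the keywords: [KEYWORDS].\n"
            "The topic contains these documents:\n[DOCUMENTS]\n"
            "Give a short topic label."
        )

        keywords = ["meat", "beef", "eat", "eating"]          # placeholder topic words
        docs = ["I love eating beef.", "Meat is tasty."]       # placeholder documents

        filled = prompt.replace("[KEYWORDS]", ", ".join(keywords))
        filled = filled.replace("[DOCUMENTS]", "".join(f"- {doc}\n" for doc in docs))
        print(filled)
        ```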

        Source code in bertopic\\representation\\_textgeneration.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topic representations and return a single label.\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top 4 representative documents per topic\n    if self.prompt != DEFAULT_PROMPT and \"[DOCUMENTS]\" in self.prompt:\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity\n        )\n    else:\n        repr_docs_mappings = {topic: None for topic in topics.keys()}\n\n    updated_topics = {}\n    for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n        # Prepare prompt\n        truncated_docs = (\n            [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n            if docs is not None\n            else docs\n        )\n        prompt = self._create_prompt(truncated_docs, topic, topics)\n        self.prompts_.append(prompt)\n\n        # Extract result from generator and use that as label\n        topic_description = self.model(prompt, **self.pipeline_kwargs)\n        topic_description = [\n            (description[\"generated_text\"].replace(prompt, \"\"), 1) for description in topic_description\n        ]\n\n        if len(topic_description) < 10:\n            topic_description += [(\"\", 0) for _ in range(10 - len(topic_description))]\n\n        updated_topics[topic] = topic_description\n\n    return updated_topics\n
        "},{"location":"api/representation/keybert.html","title":"KeyBERTInspired","text":"Source code in bertopic\\representation\\_keybert.py
        class KeyBERTInspired(BaseRepresentation):\n    def __init__(\n        self,\n        top_n_words: int = 10,\n        nr_repr_docs: int = 5,\n        nr_samples: int = 500,\n        nr_candidate_words: int = 100,\n        random_state: int = 42,\n    ):\n        \"\"\"Use a KeyBERT-like model to fine-tune the topic representations.\n\n        The algorithm follows KeyBERT but does some optimization in\n        order to speed up inference.\n\n        The steps are as follows. First, we extract the top n representative\n        documents per topic. To extract the representative documents, we\n        randomly sample a number of candidate documents per cluster\n        which is controlled by the `nr_samples` parameter. Then,\n        the top n representative documents  are extracted by calculating\n        the c-TF-IDF representation for the  candidate documents and finding,\n        through cosine similarity, which are closest to the topic c-TF-IDF representation.\n        Next, the top n words per topic are extracted based on their\n        c-TF-IDF representation, which is controlled by the `nr_repr_docs`\n        parameter.\n\n        Then, we extract the embeddings for words and representative documents\n        and create topic embeddings by averaging the representative documents.\n        Finally, the most similar words to each topic are extracted by\n        calculating the cosine similarity between word and topic embeddings.\n\n        Arguments:\n            top_n_words: The top n words to extract per topic.\n            nr_repr_docs: The number of representative documents to extract per cluster.\n            nr_samples: The number of candidate documents to extract per cluster.\n            nr_candidate_words: The number of candidate words per cluster.\n            random_state: The random state for randomly sampling candidate documents.\n\n        Usage:\n\n        ```python\n        from bertopic.representation import KeyBERTInspired\n        from bertopic import BERTopic\n\n        # Create your representation model\n        representation_model = KeyBERTInspired()\n\n        # Use the representation model in BERTopic on top of the default pipeline\n        topic_model = BERTopic(representation_model=representation_model)\n        ```\n        \"\"\"\n        self.top_n_words = top_n_words\n        self.nr_repr_docs = nr_repr_docs\n        self.nr_samples = nr_samples\n        self.nr_candidate_words = nr_candidate_words\n        self.random_state = random_state\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topics.\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # We extract the top n representative documents per class\n        _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(\n            c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs\n        )\n\n        # We extract the top n words per class\n        topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)\n\n        # We calculate the similarity between 
word and document embeddings and create\n        # topic embeddings from the representative document embeddings\n        sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)\n\n        # Find the best matching words based on the similarity matrix for each topic\n        updated_topics = self._extract_top_words(words, topics, sim_matrix)\n\n        return updated_topics\n\n    def _extract_candidate_words(\n        self,\n        topic_model,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"For each topic, extract candidate words based on the c-TF-IDF\n        representation.\n\n        Arguments:\n            topic_model: A BERTopic model\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The top words per topic\n\n        Returns:\n            topics: The `self.top_n_words` per topic\n        \"\"\"\n        labels = [int(label) for label in sorted(list(topics.keys()))]\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = topic_model.vectorizer_model.get_feature_names_out()\n        else:\n            words = topic_model.vectorizer_model.get_feature_names()\n\n        indices = topic_model._top_n_idx_sparse(c_tf_idf, self.nr_candidate_words)\n        scores = topic_model._top_n_values_sparse(c_tf_idf, indices)\n        sorted_indices = np.argsort(scores, 1)\n        indices = np.take_along_axis(indices, sorted_indices, axis=1)\n        scores = np.take_along_axis(scores, sorted_indices, axis=1)\n\n        # Get top 30 words per topic based on c-TF-IDF score\n        topics = {\n            label: [\n                (words[word_index], score) if word_index is not None and score > 0 else (\"\", 0.00001)\n                for word_index, score in zip(indices[index][::-1], scores[index][::-1])\n            ]\n            for index, label in enumerate(labels)\n        }\n        topics = {label: list(zip(*values[: self.nr_candidate_words]))[0] for label, values in topics.items()}\n\n        return topics\n\n    def _extract_embeddings(\n        self,\n        topic_model,\n        topics: Mapping[str, List[Tuple[str, float]]],\n        representative_docs: List[str],\n        repr_doc_indices: List[List[int]],\n    ) -> Union[np.ndarray, List[str]]:\n        \"\"\"Extract the representative document embeddings and create topic embeddings.\n        Then extract word embeddings and calculate the cosine similarity between topic\n        embeddings and the word embeddings. 
Topic embeddings are the average of\n        representative document embeddings.\n\n        Arguments:\n            topic_model: A BERTopic model\n            topics: The top words per topic\n            representative_docs: A flat list of representative documents\n            repr_doc_indices: The indices of representative documents\n                              that belong to each topic\n\n        Returns:\n            sim: The similarity matrix between word and topic embeddings\n            vocab: The complete vocabulary of input documents\n        \"\"\"\n        # Calculate representative docs embeddings and create topic embeddings\n        repr_embeddings = topic_model._extract_embeddings(representative_docs, method=\"document\", verbose=False)\n        topic_embeddings = [np.mean(repr_embeddings[i[0] : i[-1] + 1], axis=0) for i in repr_doc_indices]\n\n        # Calculate word embeddings and extract best matching with updated topic_embeddings\n        vocab = list(set([word for words in topics.values() for word in words]))\n        word_embeddings = topic_model._extract_embeddings(vocab, method=\"document\", verbose=False)\n        sim = cosine_similarity(topic_embeddings, word_embeddings)\n\n        return sim, vocab\n\n    def _extract_top_words(\n        self,\n        vocab: List[str],\n        topics: Mapping[str, List[Tuple[str, float]]],\n        sim: np.ndarray,\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract the top n words per topic based on the\n        similarity matrix between topics and words.\n\n        Arguments:\n            vocab: The complete vocabulary of input documents\n            labels: All topic labels\n            topics: The top words per topic\n            sim: The similarity matrix between word and topic embeddings\n\n        Returns:\n            updated_topics: The updated topic representations\n        \"\"\"\n        labels = [int(label) for label in sorted(list(topics.keys()))]\n        updated_topics = {}\n        for i, topic in enumerate(labels):\n            indices = [vocab.index(word) for word in topics[topic]]\n            values = sim[:, indices][i]\n            word_indices = [indices[index] for index in np.argsort(values)[-self.top_n_words :]]\n            updated_topics[topic] = [\n                (vocab[index], val) for val, index in zip(np.sort(values)[-self.top_n_words :], word_indices)\n            ][::-1]\n\n        return updated_topics\n
        "},{"location":"api/representation/keybert.html#bertopic.representation._keybert.KeyBERTInspired.__init__","title":"__init__(self, top_n_words=10, nr_repr_docs=5, nr_samples=500, nr_candidate_words=100, random_state=42) special","text":"

        Use a KeyBERT-like model to fine-tune the topic representations.

        The algorithm follows KeyBERT but does some optimization in order to speed up inference.

        The steps are as follows. First, we extract the top n representative documents per topic. To extract the representative documents, we randomly sample a number of candidate documents per cluster, which is controlled by the nr_samples parameter. Then, the top n representative documents (controlled by the nr_repr_docs parameter) are extracted by calculating the c-TF-IDF representation for the candidate documents and finding, through cosine similarity, which are closest to the topic c-TF-IDF representation. Next, the top candidate words per topic are extracted based on their c-TF-IDF representation, which is controlled by the nr_candidate_words parameter.

        Then, we extract the embeddings for the words and representative documents and create topic embeddings by averaging the representative document embeddings. Finally, the most similar words to each topic are extracted by calculating the cosine similarity between word and topic embeddings.
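
        The reranking described above can be condensed into a few lines. The sketch below is only an illustration of the idea, not the library's internal code; candidate_words, representative_docs, and embed are hypothetical inputs standing in for the c-TF-IDF candidates, the selected documents, and the topic model's embedding backend.

        ```python
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        def rerank_topic_words(candidate_words, representative_docs, embed, top_n_words=10):
            # Topic embedding = average of the representative document embeddings
            doc_embeddings = embed(representative_docs)
            topic_embedding = doc_embeddings.mean(axis=0, keepdims=True)

            # Embed the candidate words and compare them with the topic embedding
            word_embeddings = embed(candidate_words)
            sims = cosine_similarity(topic_embedding, word_embeddings)[0]

            # Keep the candidate words most similar to the topic embedding
            top_idx = np.argsort(sims)[-top_n_words:][::-1]
            return [(candidate_words[i], float(sims[i])) for i in top_idx]
        ```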

        Parameters:

        top_n_words (int): The top n words to extract per topic. Default: 10
        nr_repr_docs (int): The number of representative documents to extract per cluster. Default: 5
        nr_samples (int): The number of candidate documents to extract per cluster. Default: 500
        nr_candidate_words (int): The number of candidate words per cluster. Default: 100
        random_state (int): The random state for randomly sampling candidate documents. Default: 42

        Usage:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        Source code in bertopic\\representation\\_keybert.py
        def __init__(\n    self,\n    top_n_words: int = 10,\n    nr_repr_docs: int = 5,\n    nr_samples: int = 500,\n    nr_candidate_words: int = 100,\n    random_state: int = 42,\n):\n    \"\"\"Use a KeyBERT-like model to fine-tune the topic representations.\n\n    The algorithm follows KeyBERT but does some optimization in\n    order to speed up inference.\n\n    The steps are as follows. First, we extract the top n representative\n    documents per topic. To extract the representative documents, we\n    randomly sample a number of candidate documents per cluster\n    which is controlled by the `nr_samples` parameter. Then,\n    the top n representative documents  are extracted by calculating\n    the c-TF-IDF representation for the  candidate documents and finding,\n    through cosine similarity, which are closest to the topic c-TF-IDF representation.\n    Next, the top n words per topic are extracted based on their\n    c-TF-IDF representation, which is controlled by the `nr_repr_docs`\n    parameter.\n\n    Then, we extract the embeddings for words and representative documents\n    and create topic embeddings by averaging the representative documents.\n    Finally, the most similar words to each topic are extracted by\n    calculating the cosine similarity between word and topic embeddings.\n\n    Arguments:\n        top_n_words: The top n words to extract per topic.\n        nr_repr_docs: The number of representative documents to extract per cluster.\n        nr_samples: The number of candidate documents to extract per cluster.\n        nr_candidate_words: The number of candidate words per cluster.\n        random_state: The random state for randomly sampling candidate documents.\n\n    Usage:\n\n    ```python\n    from bertopic.representation import KeyBERTInspired\n    from bertopic import BERTopic\n\n    # Create your representation model\n    representation_model = KeyBERTInspired()\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n    \"\"\"\n    self.top_n_words = top_n_words\n    self.nr_repr_docs = nr_repr_docs\n    self.nr_samples = nr_samples\n    self.nr_candidate_words = nr_candidate_words\n    self.random_state = random_state\n
        "},{"location":"api/representation/keybert.html#bertopic.representation._keybert.KeyBERTInspired.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics.

        Parameters:

        topic_model: A BERTopic model (required)
        documents (DataFrame): All input documents (required)
        c_tf_idf (csr_matrix): The topic c-TF-IDF representation (required)
        topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

        Returns:

        updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_keybert.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topics.\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: The topic c-TF-IDF representation\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # We extract the top n representative documents per class\n    _, representative_docs, repr_doc_indices, _ = topic_model._extract_representative_docs(\n        c_tf_idf, documents, topics, self.nr_samples, self.nr_repr_docs\n    )\n\n    # We extract the top n words per class\n    topics = self._extract_candidate_words(topic_model, c_tf_idf, topics)\n\n    # We calculate the similarity between word and document embeddings and create\n    # topic embeddings from the representative document embeddings\n    sim_matrix, words = self._extract_embeddings(topic_model, topics, representative_docs, repr_doc_indices)\n\n    # Find the best matching words based on the similarity matrix for each topic\n    updated_topics = self._extract_top_words(words, topics, sim_matrix)\n\n    return updated_topics\n
        "},{"location":"api/representation/langchain.html","title":"LangChain","text":"

        Using chains in langchain to generate topic labels.

        The classic example uses langchain.chains.question_answering.load_qa_chain. This returns a chain that takes a list of documents and a question as input.

        You can also use Runnables such as those composed using the LangChain Expression Language.

        Parameters:

        chain: The langchain chain or Runnable with a batch method. Input keys must be input_documents and question. Output key must be output_text. (required)
        prompt (str): The prompt to be used in the model. If no prompt is given, self.default_prompt_ is used instead. NOTE: Use "[KEYWORDS]" in the prompt to decide where the keywords need to be inserted. Keywords won't be included unless indicated. Unlike other representation models, LangChain does not use the "[DOCUMENTS]" tag to insert documents into the prompt; the load_qa_chain function formats the representative documents within the prompt. Default: None
        nr_docs (int): The number of documents to pass to LangChain. Default: 4
        diversity (float): The diversity of documents to pass to LangChain. Accepts values between 0 and 1. Higher values result in passing more diverse documents, whereas lower values pass more similar documents. Default: None
        doc_length (int): The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. Default: None
        tokenizer (Union[str, Callable]): The tokenizer used to split the document into segments that are counted to determine a document's length (see the truncation sketch after this parameter list). Default: None
            * If tokenizer is 'char', the document is split into characters, which are counted to adhere to doc_length.
            * If tokenizer is 'whitespace', the document is split into words separated by whitespaces. These words are counted and truncated depending on doc_length.
            * If tokenizer is 'vectorizer', the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on doc_length. They are decoded with whitespaces.
            * If tokenizer is a callable, that callable is used to tokenize the document. These tokens are counted and truncated depending on doc_length.
        chain_config: The configuration for the langchain chain. Can be used to set options like max_concurrency to avoid rate limiting errors. Default: None
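
        As referenced in the tokenizer entry above, the doc_length/tokenizer combination boils down to counting segments and cutting a document off once the budget is reached. The following is a simplified, illustrative sketch of that behaviour; it is not BERTopic's internal helper and it omits the 'vectorizer' option.

        ```python
        from typing import Callable, Optional, Union

        def truncate(document: str, doc_length: Optional[int], tokenizer: Union[str, Callable] = "whitespace") -> str:
            if doc_length is None:
                return document  # no limit: pass the entire document

            if tokenizer == "char":
                # Count characters and keep at most `doc_length` of them
                return document[:doc_length]

            if tokenizer == "whitespace":
                # Count whitespace-separated words and keep at most `doc_length` of them
                return " ".join(document.split()[:doc_length])

            if callable(tokenizer):
                # A callable returns the tokens; keep at most `doc_length` of them
                return " ".join(tokenizer(document)[:doc_length])

            raise ValueError("The 'vectorizer' option is omitted from this sketch")

        print(truncate("one two three four five", doc_length=3))  # -> "one two three"
        ```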

        Usage:

        To use this, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, like openai:

        pip install langchain
        pip install openai

        Then, you can create your chain as follows:

        from langchain.chains.question_answering import load_qa_chain\nfrom langchain.llms import OpenAI\nchain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type=\"stuff\")\n

        Finally, you can pass the chain to BERTopic as follows:

        from bertopic.representation import LangChain\n\n# Create your representation model\nrepresentation_model = LangChain(chain)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can also use a custom prompt:

        prompt = \"What are these documents about? Please give a single label.\"\nrepresentation_model = LangChain(chain, prompt=prompt)\n

        You can also use a Runnable instead of a chain. The example below uses the LangChain Expression Language:

        from bertopic.representation import LangChain\nfrom langchain.chains.question_answering import load_qa_chain\nfrom langchain.chat_models import ChatAnthropic\nfrom langchain.schema.document import Document\nfrom langchain.schema.runnable import RunnablePassthrough\nfrom langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer\n\nprompt = ...\nllm = ...\n\n# We will construct a special privacy-preserving chain using Microsoft Presidio\n\npii_handler = PresidioReversibleAnonymizer(analyzed_fields=[\"PERSON\"])\n\nchain = (\n    {\n        \"input_documents\": (\n            lambda inp: [\n                Document(\n                    page_content=pii_handler.anonymize(\n                        d.page_content,\n                        language=\"en\",\n                    ),\n                )\n                for d in inp[\"input_documents\"]\n            ]\n        ),\n        \"question\": RunnablePassthrough(),\n    }\n    | load_qa_chain(representation_llm, chain_type=\"stuff\")\n    | (lambda output: {\"output_text\": pii_handler.deanonymize(output[\"output_text\"])})\n)\n\nrepresentation_model = LangChain(chain, prompt=representation_prompt)\n
        Source code in bertopic\\representation\\_langchain.py
        class LangChain(BaseRepresentation):\n    \"\"\"Using chains in langchain to generate topic labels.\n\n    The classic example uses `langchain.chains.question_answering.load_qa_chain`.\n    This returns a chain that takes a list of documents and a question as input.\n\n    You can also use Runnables such as those composed using the LangChain Expression Language.\n\n    Arguments:\n        chain: The langchain chain or Runnable with a `batch` method.\n               Input keys must be `input_documents` and `question`.\n               Output key must be `output_text`.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                 NOTE: Use `\"[KEYWORDS]\"` in the prompt\n                 to decide where the keywords need to be\n                 inserted. Keywords won't be included unless\n                 indicated. Unlike other representation models,\n                 Langchain does not use the `\"[DOCUMENTS]\"` tag\n                 to insert documents into the prompt. The load_qa_chain function\n                 formats the representative documents within the prompt.\n        nr_docs: The number of documents to pass to LangChain\n        diversity: The diversity of documents to pass to LangChain.\n                   Accepts values between 0 and 1. A higher\n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document.\n                       * If tokenizer is 'char', then the document is split up\n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and truncated depending on `doc_length`. They are decoded with\n                         whitespaces.\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. These tokens are counted and truncated depending\n                         on `doc_length`\n        chain_config: The configuration for the langchain chain. 
Can be used to set options\n                      like max_concurrency to avoid rate limiting errors.\n    Usage:\n\n    To use this, you will need to install the langchain package first.\n    Additionally, you will need an underlying LLM to support langchain,\n    like openai:\n\n    `pip install langchain`\n    `pip install openai`\n\n    Then, you can create your chain as follows:\n\n    ```python\n    from langchain.chains.question_answering import load_qa_chain\n    from langchain.llms import OpenAI\n    chain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type=\"stuff\")\n    ```\n\n    Finally, you can pass the chain to BERTopic as follows:\n\n    ```python\n    from bertopic.representation import LangChain\n\n    # Create your representation model\n    representation_model = LangChain(chain)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can also use a custom prompt:\n\n    ```python\n    prompt = \"What are these documents about? Please give a single label.\"\n    representation_model = LangChain(chain, prompt=prompt)\n    ```\n\n    You can also use a Runnable instead of a chain.\n    The example below uses the LangChain Expression Language:\n\n    ```python\n    from bertopic.representation import LangChain\n    from langchain.chains.question_answering import load_qa_chain\n    from langchain.chat_models import ChatAnthropic\n    from langchain.schema.document import Document\n    from langchain.schema.runnable import RunnablePassthrough\n    from langchain_experimental.data_anonymizer.presidio import PresidioReversibleAnonymizer\n\n    prompt = ...\n    llm = ...\n\n    # We will construct a special privacy-preserving chain using Microsoft Presidio\n\n    pii_handler = PresidioReversibleAnonymizer(analyzed_fields=[\"PERSON\"])\n\n    chain = (\n        {\n            \"input_documents\": (\n                lambda inp: [\n                    Document(\n                        page_content=pii_handler.anonymize(\n                            d.page_content,\n                            language=\"en\",\n                        ),\n                    )\n                    for d in inp[\"input_documents\"]\n                ]\n            ),\n            \"question\": RunnablePassthrough(),\n        }\n        | load_qa_chain(representation_llm, chain_type=\"stuff\")\n        | (lambda output: {\"output_text\": pii_handler.deanonymize(output[\"output_text\"])})\n    )\n\n    representation_model = LangChain(chain, prompt=representation_prompt)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        chain,\n        prompt: str = None,\n        nr_docs: int = 4,\n        diversity: float = None,\n        doc_length: int = None,\n        tokenizer: Union[str, Callable] = None,\n        chain_config=None,\n    ):\n        self.chain = chain\n        self.prompt = prompt if prompt is not None else DEFAULT_PROMPT\n        self.default_prompt_ = DEFAULT_PROMPT\n        self.chain_config = chain_config\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, int]]]:\n        \"\"\"Extract topics.\n\n        Arguments:\n    
        topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top 4 representative documents per topic\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n            c_tf_idf=c_tf_idf,\n            documents=documents,\n            topics=topics,\n            nr_samples=500,\n            nr_repr_docs=self.nr_docs,\n            diversity=self.diversity,\n        )\n\n        # Generate label using langchain's batch functionality\n        chain_docs: List[List[Document]] = [\n            [\n                Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc))\n                for doc in docs\n            ]\n            for docs in repr_docs_mappings.values()\n        ]\n\n        # `self.chain` must take `input_documents` and `question` as input keys\n        # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS]\n        if \"[KEYWORDS]\" in self.prompt:\n            prompts = []\n            for topic in topics:\n                keywords = list(zip(*topics[topic]))[0]\n                prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n                prompts.append(prompt)\n\n            inputs = [{\"input_documents\": docs, \"question\": prompt} for docs, prompt in zip(chain_docs, prompts)]\n\n        else:\n            inputs = [{\"input_documents\": docs, \"question\": self.prompt} for docs in chain_docs]\n\n        # `self.chain` must return a dict with an `output_text` key\n        # same output key as the `StuffDocumentsChain` returned by `load_qa_chain`\n        outputs = self.chain.batch(inputs=inputs, config=self.chain_config)\n        labels = [output[\"output_text\"].strip() for output in outputs]\n\n        updated_topics = {\n            topic: [(label, 1)] + [(\"\", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels)\n        }\n\n        return updated_topics\n
        "},{"location":"api/representation/langchain.html#bertopic.representation._langchain.LangChain.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics.

        Parameters:

        topic_model: A BERTopic model (required)
        documents (DataFrame): All input documents (required)
        c_tf_idf (csr_matrix): The topic c-TF-IDF representation (required)
        topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

        Returns:

        updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_langchain.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, int]]]:\n    \"\"\"Extract topics.\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: The topic c-TF-IDF representation\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top 4 representative documents per topic\n    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n        c_tf_idf=c_tf_idf,\n        documents=documents,\n        topics=topics,\n        nr_samples=500,\n        nr_repr_docs=self.nr_docs,\n        diversity=self.diversity,\n    )\n\n    # Generate label using langchain's batch functionality\n    chain_docs: List[List[Document]] = [\n        [\n            Document(page_content=truncate_document(topic_model, self.doc_length, self.tokenizer, doc))\n            for doc in docs\n        ]\n        for docs in repr_docs_mappings.values()\n    ]\n\n    # `self.chain` must take `input_documents` and `question` as input keys\n    # Use a custom prompt that leverages keywords, using the tag: [KEYWORDS]\n    if \"[KEYWORDS]\" in self.prompt:\n        prompts = []\n        for topic in topics:\n            keywords = list(zip(*topics[topic]))[0]\n            prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            prompts.append(prompt)\n\n        inputs = [{\"input_documents\": docs, \"question\": prompt} for docs, prompt in zip(chain_docs, prompts)]\n\n    else:\n        inputs = [{\"input_documents\": docs, \"question\": self.prompt} for docs in chain_docs]\n\n    # `self.chain` must return a dict with an `output_text` key\n    # same output key as the `StuffDocumentsChain` returned by `load_qa_chain`\n    outputs = self.chain.batch(inputs=inputs, config=self.chain_config)\n    labels = [output[\"output_text\"].strip() for output in outputs]\n\n    updated_topics = {\n        topic: [(label, 1)] + [(\"\", 0) for _ in range(9)] for topic, label in zip(repr_docs_mappings.keys(), labels)\n    }\n\n    return updated_topics\n
        "},{"location":"api/representation/mmr.html","title":"MaximalMarginalRelevance","text":"

        Calculate Maximal Marginal Relevance (MMR) between candidate keywords and the document.

        MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that are relevant to the document while remaining diverse with respect to one another.
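
        The greedy selection loop behind MMR can be sketched as follows. This is an illustration of the general MMR formulation rather than BERTopic's internal mmr function; doc_embedding (1 x d), word_embeddings (n x d), and words are assumed inputs.

        ```python
        import numpy as np
        from sklearn.metrics.pairwise import cosine_similarity

        def mmr_sketch(doc_embedding, word_embeddings, words, diversity=0.1, top_n=10):
            word_doc_sim = cosine_similarity(word_embeddings, doc_embedding)  # relevance to the document
            word_word_sim = cosine_similarity(word_embeddings)                # redundancy between words

            selected = [int(np.argmax(word_doc_sim))]  # start with the most relevant word
            candidates = [i for i in range(len(words)) if i not in selected]

            for _ in range(min(top_n, len(words)) - 1):
                # Relevance of each remaining candidate to the document
                relevance = word_doc_sim[candidates, 0]
                # Redundancy: highest similarity to any already selected word
                redundancy = np.max(word_word_sim[candidates][:, selected], axis=1)
                # MMR score balances relevance and redundancy via the diversity parameter
                mmr_scores = (1 - diversity) * relevance - diversity * redundancy
                best = candidates[int(np.argmax(mmr_scores))]
                selected.append(best)
                candidates.remove(best)

            return [words[i] for i in selected]
        ```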

        Parameters:

        diversity (float): How diverse the selected keywords/keyphrases are. Values range between 0 and 1, with 0 being not diverse at all and 1 being most diverse. Default: 0.1
        top_n_words (int): The number of keywords/keyphrases to return. Default: 10

        Usage:

        from bertopic.representation import MaximalMarginalRelevance\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = MaximalMarginalRelevance(diversity=0.3)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        Source code in bertopic\\representation\\_mmr.py
        class MaximalMarginalRelevance(BaseRepresentation):\n    \"\"\"Calculate Maximal Marginal Relevance (MMR)\n    between candidate keywords and the document.\n\n    MMR considers the similarity of keywords/keyphrases with the\n    document, along with the similarity of already selected\n    keywords and keyphrases. This results in a selection of keywords\n    that maximize their within diversity with respect to the document.\n\n    Arguments:\n        diversity: How diverse the select keywords/keyphrases are.\n                    Values range between 0 and 1 with 0 being not diverse at all\n                    and 1 being most diverse.\n        top_n_words: The number of keywords/keyhprases to return\n\n    Usage:\n\n    ```python\n    from bertopic.representation import MaximalMarginalRelevance\n    from bertopic import BERTopic\n\n    # Create your representation model\n    representation_model = MaximalMarginalRelevance(diversity=0.3)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n    \"\"\"\n\n    def __init__(self, diversity: float = 0.1, top_n_words: int = 10):\n        self.diversity = diversity\n        self.top_n_words = top_n_words\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topic representations.\n\n        Arguments:\n            topic_model: The BERTopic model\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        if topic_model.embedding_model is None:\n            warnings.warn(\n                \"MaximalMarginalRelevance can only be used BERTopic was instantiated\"\n                \"with the `embedding_model` parameter.\"\n            )\n            return topics\n\n        updated_topics = {}\n        for topic, topic_words in topics.items():\n            words = [word[0] for word in topic_words]\n            word_embeddings = topic_model._extract_embeddings(words, method=\"word\", verbose=False)\n            topic_embedding = topic_model._extract_embeddings(\" \".join(words), method=\"word\", verbose=False).reshape(\n                1, -1\n            )\n            topic_words = mmr(\n                topic_embedding,\n                word_embeddings,\n                words,\n                self.diversity,\n                self.top_n_words,\n            )\n            updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words]\n        return updated_topics\n
        "},{"location":"api/representation/mmr.html#bertopic.representation._mmr.MaximalMarginalRelevance.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topic representations.

        Parameters:

        topic_model: The BERTopic model (required)
        documents (DataFrame): Not used (required)
        c_tf_idf (csr_matrix): Not used (required)
        topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

        Returns:

        updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_mmr.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topic representations.\n\n    Arguments:\n        topic_model: The BERTopic model\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    if topic_model.embedding_model is None:\n        warnings.warn(\n            \"MaximalMarginalRelevance can only be used BERTopic was instantiated\"\n            \"with the `embedding_model` parameter.\"\n        )\n        return topics\n\n    updated_topics = {}\n    for topic, topic_words in topics.items():\n        words = [word[0] for word in topic_words]\n        word_embeddings = topic_model._extract_embeddings(words, method=\"word\", verbose=False)\n        topic_embedding = topic_model._extract_embeddings(\" \".join(words), method=\"word\", verbose=False).reshape(\n            1, -1\n        )\n        topic_words = mmr(\n            topic_embedding,\n            word_embeddings,\n            words,\n            self.diversity,\n            self.top_n_words,\n        )\n        updated_topics[topic] = [(word, value) for word, value in topics[topic] if word in topic_words]\n    return updated_topics\n
        "},{"location":"api/representation/openai.html","title":"OpenAI","text":"

        Using the OpenAI API to generate topic labels based on one of their Completion or ChatCompletion models.

        The default method is openai.Completion if chat=False. The prompts will also need to follow a completion task. If you are looking for more interactive chat, use chat=True with model=gpt-3.5-turbo.

        For an overview see: https://platform.openai.com/docs/models

        Parameters:

        client: An openai.OpenAI client (required)
        model (str): Model to use within OpenAI. NOTE: If a gpt-3.5-turbo model is used, make sure to set chat to True. Default: 'text-embedding-3-small'
        generator_kwargs (Mapping[str, Any]): Kwargs passed to openai.Completion.create for fine-tuning the output. Default: {}
        prompt (str): The prompt to be used in the model. If no prompt is given, self.default_prompt_ is used instead. NOTE: Use "[KEYWORDS]" and "[DOCUMENTS]" in the prompt to decide where the keywords and documents need to be inserted. Default: None
        delay_in_seconds (float): The delay in seconds between consecutive prompts in order to prevent RateLimitErrors. Default: None
        exponential_backoff (bool): Retry requests with a random exponential backoff. A short sleep is used when a rate limit error is hit, then the request is retried. The sleep length increases after each unsuccessful request, up to 10 attempts. If True, overrides delay_in_seconds. See the retry sketch after this parameter list. Default: False
        chat (bool): Set this to True if a GPT-3.5 model is used. See: https://platform.openai.com/docs/models/gpt-3-5 Default: False
        nr_docs (int): The number of documents to pass to OpenAI if a prompt with the [DOCUMENTS] tag is used. Default: 4
        diversity (float): The diversity of documents to pass to OpenAI. Accepts values between 0 and 1. Higher values result in passing more diverse documents, whereas lower values pass more similar documents. Default: None
        doc_length (int): The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed. Default: None
        tokenizer (Union[str, Callable]): The tokenizer used to split the document into segments that are counted to determine a document's length. Default: None
            * If tokenizer is 'char', the document is split into characters, which are counted to adhere to doc_length.
            * If tokenizer is 'whitespace', the document is split into words separated by whitespaces. These words are counted and truncated depending on doc_length.
            * If tokenizer is 'vectorizer', the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on doc_length.
            * If tokenizer is a callable, that callable is used to tokenize the document. These tokens are counted and truncated depending on doc_length.
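
        As noted for exponential_backoff above, retries follow a simple pattern: sleep briefly after a rate-limit error, retry, and lengthen the sleep after every failure up to a maximum number of attempts. The sketch below is a rough, generic illustration of that pattern with placeholder names; it is not the library's completions_with_backoff helper.

        ```python
        import random
        import time

        def call_with_backoff(request, max_retries=10, base_delay=1.0):
            """Retry `request()` with a random exponential backoff (illustrative only)."""
            delay = base_delay
            for attempt in range(max_retries):
                try:
                    return request()
                except Exception:  # in practice: the OpenAI client's RateLimitError
                    if attempt == max_retries - 1:
                        raise
                    # Sleep, then double the delay and add jitter before retrying
                    time.sleep(delay)
                    delay = delay * 2 * (1 + random.random())
        ```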

        Usage:

        To use this, you will need to install the openai package first:

        pip install openai

        Then, get yourself an API key and use OpenAI's API as follows:

        import openai\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Create your representation model\nclient = openai.OpenAI(api_key=MY_API_KEY)\nrepresentation_model = OpenAI(client, delay_in_seconds=5)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can also use a custom prompt:

        prompt = \"I have the following documents: [DOCUMENTS] \\nThese documents are about the following topic: '\"\nrepresentation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)\n

        If you want to use OpenAI's ChatGPT model:

        representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n
        Source code in bertopic\\representation\\_openai.py
        class OpenAI(BaseRepresentation):\n    r\"\"\"Using the OpenAI API to generate topic labels based\n    on one of their Completion of ChatCompletion models.\n\n    The default method is `openai.Completion` if `chat=False`.\n    The prompts will also need to follow a completion task. If you\n    are looking for a more interactive chats, use `chat=True`\n    with `model=gpt-3.5-turbo`.\n\n    For an overview see:\n    https://platform.openai.com/docs/models\n\n    Arguments:\n        client: A `openai.OpenAI` client\n        model: Model to use within OpenAI, defaults to `\"text-ada-001\"`.\n               NOTE: If a `gpt-3.5-turbo` model is used, make sure to set\n               `chat` to True.\n        generator_kwargs: Kwargs passed to `openai.Completion.create`\n                          for fine-tuning the output.\n        prompt: The prompt to be used in the model. If no prompt is given,\n                `self.default_prompt_` is used instead.\n                NOTE: Use `\"[KEYWORDS]\"` and `\"[DOCUMENTS]\"` in the prompt\n                to decide where the keywords and documents need to be\n                inserted.\n        delay_in_seconds: The delay in seconds between consecutive prompts\n                          in order to prevent RateLimitErrors.\n        exponential_backoff: Retry requests with a random exponential backoff.\n                             A short sleep is used when a rate limit error is hit,\n                             then the requests is retried. Increase the sleep length\n                             if errors are hit until 10 unsuccessful requests.\n                             If True, overrides `delay_in_seconds`.\n        chat: Set this to True if a GPT-3.5 model is used.\n              See: https://platform.openai.com/docs/models/gpt-3-5\n        nr_docs: The number of documents to pass to OpenAI if a prompt\n                 with the `[\"DOCUMENTS\"]` tag is used.\n        diversity: The diversity of documents to pass to OpenAI.\n                   Accepts values between 0 and 1. A higher\n                   values results in passing more diverse documents\n                   whereas lower values passes more similar documents.\n        doc_length: The maximum length of each document. If a document is longer,\n                    it will be truncated. If None, the entire document is passed.\n        tokenizer: The tokenizer used to calculate to split the document into segments\n                   used to count the length of a document.\n                       * If tokenizer is 'char', then the document is split up\n                         into characters which are counted to adhere to `doc_length`\n                       * If tokenizer is 'whitespace', the document is split up\n                         into words separated by whitespaces. These words are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is 'vectorizer', then the internal CountVectorizer\n                         is used to tokenize the document. These tokens are counted\n                         and truncated depending on `doc_length`\n                       * If tokenizer is a callable, then that callable is used to tokenize\n                         the document. 
These tokens are counted and truncated depending\n                         on `doc_length`\n\n    Usage:\n\n    To use this, you will need to install the openai package first:\n\n    `pip install openai`\n\n    Then, get yourself an API key and use OpenAI's API as follows:\n\n    ```python\n    import openai\n    from bertopic.representation import OpenAI\n    from bertopic import BERTopic\n\n    # Create your representation model\n    client = openai.OpenAI(api_key=MY_API_KEY)\n    representation_model = OpenAI(client, delay_in_seconds=5)\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can also use a custom prompt:\n\n    ```python\n    prompt = \"I have the following documents: [DOCUMENTS] \\nThese documents are about the following topic: '\"\n    representation_model = OpenAI(client, prompt=prompt, delay_in_seconds=5)\n    ```\n\n    If you want to use OpenAI's ChatGPT model:\n\n    ```python\n    representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        client,\n        model: str = \"text-embedding-3-small\",\n        prompt: str = None,\n        generator_kwargs: Mapping[str, Any] = {},\n        delay_in_seconds: float = None,\n        exponential_backoff: bool = False,\n        chat: bool = False,\n        nr_docs: int = 4,\n        diversity: float = None,\n        doc_length: int = None,\n        tokenizer: Union[str, Callable] = None,\n    ):\n        self.client = client\n        self.model = model\n\n        if prompt is None:\n            self.prompt = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT\n        else:\n            self.prompt = prompt\n\n        self.default_prompt_ = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT\n        self.delay_in_seconds = delay_in_seconds\n        self.exponential_backoff = exponential_backoff\n        self.chat = chat\n        self.nr_docs = nr_docs\n        self.diversity = diversity\n        self.doc_length = doc_length\n        self.tokenizer = tokenizer\n        self.prompts_ = []\n\n        self.generator_kwargs = generator_kwargs\n        if self.generator_kwargs.get(\"model\"):\n            self.model = generator_kwargs.get(\"model\")\n            del self.generator_kwargs[\"model\"]\n        if self.generator_kwargs.get(\"prompt\"):\n            del self.generator_kwargs[\"prompt\"]\n        if not self.generator_kwargs.get(\"stop\") and not chat:\n            self.generator_kwargs[\"stop\"] = \"\\n\"\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topics.\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: The topic c-TF-IDF representation\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Extract the top n representative documents per topic\n        repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n            c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity\n        )\n\n        # Generate using OpenAI's Language Model\n        updated_topics = {}\n        for 
topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n            truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n            prompt = self._create_prompt(truncated_docs, topic, topics)\n            self.prompts_.append(prompt)\n\n            # Delay\n            if self.delay_in_seconds:\n                time.sleep(self.delay_in_seconds)\n\n            if self.chat:\n                messages = [\n                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n                    {\"role\": \"user\", \"content\": prompt},\n                ]\n                kwargs = {\n                    \"model\": self.model,\n                    \"messages\": messages,\n                    **self.generator_kwargs,\n                }\n                if self.exponential_backoff:\n                    response = chat_completions_with_backoff(self.client, **kwargs)\n                else:\n                    response = self.client.chat.completions.create(**kwargs)\n\n                # Check whether content was actually generated\n                # Addresses #1570 for potential issues with OpenAI's content filter\n                if hasattr(response.choices[0].message, \"content\"):\n                    label = response.choices[0].message.content.strip().replace(\"topic: \", \"\")\n                else:\n                    label = \"No label returned\"\n            else:\n                if self.exponential_backoff:\n                    response = completions_with_backoff(\n                        self.client,\n                        model=self.model,\n                        prompt=prompt,\n                        **self.generator_kwargs,\n                    )\n                else:\n                    response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)\n                label = response.choices[0].text.strip()\n\n            updated_topics[topic] = [(label, 1)]\n\n        return updated_topics\n\n    def _create_prompt(self, docs, topic, topics):\n        keywords = list(zip(*topics[topic]))[0]\n\n        # Use the Default Chat Prompt\n        if self.prompt == DEFAULT_CHAT_PROMPT or self.prompt == DEFAULT_PROMPT:\n            prompt = self.prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            prompt = self._replace_documents(prompt, docs)\n\n        # Use a custom prompt that leverages keywords, documents or both using\n        # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively\n        else:\n            prompt = self.prompt\n            if \"[KEYWORDS]\" in prompt:\n                prompt = prompt.replace(\"[KEYWORDS]\", \", \".join(keywords))\n            if \"[DOCUMENTS]\" in prompt:\n                prompt = self._replace_documents(prompt, docs)\n\n        return prompt\n\n    @staticmethod\n    def _replace_documents(prompt, docs):\n        to_replace = \"\"\n        for doc in docs:\n            to_replace += f\"- {doc}\\n\"\n        prompt = prompt.replace(\"[DOCUMENTS]\", to_replace)\n        return prompt\n
        "},{"location":"api/representation/openai.html#bertopic.representation._openai.OpenAI.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics.

        Parameters:

        topic_model: A BERTopic model (required)
        documents (DataFrame): All input documents (required)
        c_tf_idf (csr_matrix): The topic c-TF-IDF representation (required)
        topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

        Returns:

        updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_openai.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topics.\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: The topic c-TF-IDF representation\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Extract the top n representative documents per topic\n    repr_docs_mappings, _, _, _ = topic_model._extract_representative_docs(\n        c_tf_idf, documents, topics, 500, self.nr_docs, self.diversity\n    )\n\n    # Generate using OpenAI's Language Model\n    updated_topics = {}\n    for topic, docs in tqdm(repr_docs_mappings.items(), disable=not topic_model.verbose):\n        truncated_docs = [truncate_document(topic_model, self.doc_length, self.tokenizer, doc) for doc in docs]\n        prompt = self._create_prompt(truncated_docs, topic, topics)\n        self.prompts_.append(prompt)\n\n        # Delay\n        if self.delay_in_seconds:\n            time.sleep(self.delay_in_seconds)\n\n        if self.chat:\n            messages = [\n                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n                {\"role\": \"user\", \"content\": prompt},\n            ]\n            kwargs = {\n                \"model\": self.model,\n                \"messages\": messages,\n                **self.generator_kwargs,\n            }\n            if self.exponential_backoff:\n                response = chat_completions_with_backoff(self.client, **kwargs)\n            else:\n                response = self.client.chat.completions.create(**kwargs)\n\n            # Check whether content was actually generated\n            # Addresses #1570 for potential issues with OpenAI's content filter\n            if hasattr(response.choices[0].message, \"content\"):\n                label = response.choices[0].message.content.strip().replace(\"topic: \", \"\")\n            else:\n                label = \"No label returned\"\n        else:\n            if self.exponential_backoff:\n                response = completions_with_backoff(\n                    self.client,\n                    model=self.model,\n                    prompt=prompt,\n                    **self.generator_kwargs,\n                )\n            else:\n                response = self.client.completions.create(model=self.model, prompt=prompt, **self.generator_kwargs)\n            label = response.choices[0].text.strip()\n\n        updated_topics[topic] = [(label, 1)]\n\n    return updated_topics\n
        "},{"location":"api/representation/pos.html","title":"PartOfSpeech","text":"

        Extract Topic Keywords based on their Part-of-Speech.

        DEFAULT_PATTERNS = [ [{'POS': 'ADJ'}, {'POS': 'NOUN'}], [{'POS': 'NOUN'}], [{'POS': 'ADJ'}] ]

        From candidate topics, as extracted with c-TF-IDF, find documents that contain keywords found in the candidate topics. These candidate documents then serve as the representative set of documents from which the Spacy model can extract a set of candidate keywords for each topic.

        These candidate keywords are first judged by whether they fall within the DEFAULT_PATTERNS or the user-defined pattern. Then, the resulting keywords are sorted by their respective c-TF-IDF values.

        Parameters:

        • model (Union[str, spacy.language.Language]): The Spacy model to use. Default: 'en_core_web_sm'
        • top_n_words (int): The top n words to extract. Default: 10
        • pos_patterns (List[str]): Patterns for Spacy to use. See https://spacy.io/usage/rule-based-matching. Default: None

        Usage:

        from bertopic.representation import PartOfSpeech\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can define custom POS patterns to be extracted:

        pos_patterns = [\n            [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n            [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]\n]\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\", pos_patterns=pos_patterns)\n
        Source code in bertopic\\representation\\_pos.py
        class PartOfSpeech(BaseRepresentation):\n    \"\"\"Extract Topic Keywords based on their Part-of-Speech.\n\n    DEFAULT_PATTERNS = [\n                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n                [{'POS': 'NOUN'}],\n                [{'POS': 'ADJ'}]\n    ]\n\n    From candidate topics, as extracted with c-TF-IDF,\n    find documents that contain keywords found in the\n    candidate topics. These candidate documents then\n    serve as the representative set of documents from\n    which the Spacy model can extract a set of candidate\n    keywords for each topic.\n\n    These candidate keywords are first judged by whether\n    they fall within the DEFAULT_PATTERNS or the user-defined\n    pattern. Then, the resulting keywords are sorted by\n    their respective c-TF-IDF values.\n\n    Arguments:\n        model: The Spacy model to use\n        top_n_words: The top n words to extract\n        pos_patterns: Patterns for Spacy to use.\n                      See https://spacy.io/usage/rule-based-matching\n\n    Usage:\n\n    ```python\n    from bertopic.representation import PartOfSpeech\n    from bertopic import BERTopic\n\n    # Create your representation model\n    representation_model = PartOfSpeech(\"en_core_web_sm\")\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n\n    You can define custom POS patterns to be extracted:\n\n    ```python\n    pos_patterns = [\n                [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n                [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]\n    ]\n    representation_model = PartOfSpeech(\"en_core_web_sm\", pos_patterns=pos_patterns)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        model: Union[str, Language] = \"en_core_web_sm\",\n        top_n_words: int = 10,\n        pos_patterns: List[str] = None,\n    ):\n        if isinstance(model, str):\n            self.model = spacy.load(model)\n        elif isinstance(model, Language):\n            self.model = model\n        else:\n            raise ValueError(\n                \"Make sure that the Spacy model that you\"\n                \"pass is either a string referring to a\"\n                \"Spacy model or a Spacy nlp object.\"\n            )\n\n        self.top_n_words = top_n_words\n\n        if pos_patterns is None:\n            self.pos_patterns = [\n                [{\"POS\": \"ADJ\"}, {\"POS\": \"NOUN\"}],\n                [{\"POS\": \"NOUN\"}],\n                [{\"POS\": \"ADJ\"}],\n            ]\n        else:\n            self.pos_patterns = pos_patterns\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topics.\n\n        Arguments:\n            topic_model: A BERTopic model\n            documents: All input documents\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        matcher = Matcher(self.model.vocab)\n        matcher.add(\"Pattern\", self.pos_patterns)\n\n        candidate_topics = {}\n        for topic, values in topics.items():\n            keywords = list(zip(*values))[0]\n\n            # Extract candidate documents\n            candidate_documents = []\n            for keyword in keywords:\n              
  selection = documents.loc[documents.Topic == topic, :]\n                selection = selection.loc[selection.Document.str.contains(keyword), \"Document\"]\n                if len(selection) > 0:\n                    for document in selection[:2]:\n                        candidate_documents.append(document)\n            candidate_documents = list(set(candidate_documents))\n\n            # Extract keywords\n            docs_pipeline = self.model.pipe(candidate_documents)\n            updated_keywords = []\n            for doc in docs_pipeline:\n                matches = matcher(doc)\n                for _, start, end in matches:\n                    updated_keywords.append(doc[start:end].text)\n            candidate_topics[topic] = list(set(updated_keywords))\n\n        # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n        # and will be removed in 1.2. Please use get_feature_names_out instead.\n        if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n            words = list(topic_model.vectorizer_model.get_feature_names_out())\n        else:\n            words = list(topic_model.vectorizer_model.get_feature_names())\n\n        # Match updated keywords with c-TF-IDF values\n        words_lookup = dict(zip(words, range(len(words))))\n        updated_topics = {topic: [] for topic in topics.keys()}\n\n        for topic, candidate_keywords in candidate_topics.items():\n            word_indices = np.sort(\n                [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup]\n            )\n            vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]\n            indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]\n            vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]\n            topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]\n            updated_topics[topic] = topic_words\n            if len(updated_topics[topic]) < self.top_n_words:\n                updated_topics[topic] += [(\"\", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))]\n\n        return updated_topics\n
        "},{"location":"api/representation/pos.html#bertopic.representation._pos.PartOfSpeech.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics.

        Parameters:

        • topic_model: A BERTopic model (required)
        • documents (DataFrame): All input documents (required)
        • c_tf_idf (csr_matrix): Not used (required)
        • topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

        Returns:

        • updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_pos.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topics.\n\n    Arguments:\n        topic_model: A BERTopic model\n        documents: All input documents\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    matcher = Matcher(self.model.vocab)\n    matcher.add(\"Pattern\", self.pos_patterns)\n\n    candidate_topics = {}\n    for topic, values in topics.items():\n        keywords = list(zip(*values))[0]\n\n        # Extract candidate documents\n        candidate_documents = []\n        for keyword in keywords:\n            selection = documents.loc[documents.Topic == topic, :]\n            selection = selection.loc[selection.Document.str.contains(keyword), \"Document\"]\n            if len(selection) > 0:\n                for document in selection[:2]:\n                    candidate_documents.append(document)\n        candidate_documents = list(set(candidate_documents))\n\n        # Extract keywords\n        docs_pipeline = self.model.pipe(candidate_documents)\n        updated_keywords = []\n        for doc in docs_pipeline:\n            matches = matcher(doc)\n            for _, start, end in matches:\n                updated_keywords.append(doc[start:end].text)\n        candidate_topics[topic] = list(set(updated_keywords))\n\n    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0\n    # and will be removed in 1.2. Please use get_feature_names_out instead.\n    if version.parse(sklearn_version) >= version.parse(\"1.0.0\"):\n        words = list(topic_model.vectorizer_model.get_feature_names_out())\n    else:\n        words = list(topic_model.vectorizer_model.get_feature_names())\n\n    # Match updated keywords with c-TF-IDF values\n    words_lookup = dict(zip(words, range(len(words))))\n    updated_topics = {topic: [] for topic in topics.keys()}\n\n    for topic, candidate_keywords in candidate_topics.items():\n        word_indices = np.sort(\n            [words_lookup.get(keyword) for keyword in candidate_keywords if keyword in words_lookup]\n        )\n        vals = topic_model.c_tf_idf_[:, word_indices][topic + topic_model._outliers]\n        indices = np.argsort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]\n        vals = np.sort(np.array(vals.todense().reshape(1, -1))[0])[-self.top_n_words :][::-1]\n        topic_words = [(words[word_indices[index]], val) for index, val in zip(indices, vals)]\n        updated_topics[topic] = topic_words\n        if len(updated_topics[topic]) < self.top_n_words:\n            updated_topics[topic] += [(\"\", 0) for _ in range(self.top_n_words - len(updated_topics[topic]))]\n\n    return updated_topics\n
        "},{"location":"api/representation/zeroshot.html","title":"ZeroShotClassification","text":"

        Zero-shot Classification on topic keywords with candidate labels.

        Parameters:

        • candidate_topics (List[str]): A list of labels to assign to the topics if they exceed min_prob (required)
        • model (str): A transformers pipeline that should be initialized as \"zero-shot-classification\". For example, pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\"). Default: 'facebook/bart-large-mnli'
        • pipeline_kwargs (Mapping[str, Any]): Kwargs that you can pass to the transformers.pipeline when it is called. NOTE: Use {\"multi_label\": True} to extract multiple labels for each topic. Default: {}
        • min_prob (float): The minimum probability to assign a candidate label to a topic. Default: 0.8

        Usage:

        from bertopic.representation import ZeroShotClassification\nfrom bertopic import BERTopic\n\n# Create your representation model\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
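
        To extract multiple labels per topic, a sketch that forwards multi_label through pipeline_kwargs (parameter values are illustrative):

        from bertopic.representation import ZeroShotClassification\n\n# Extract multiple labels per topic by forwarding `multi_label` through `pipeline_kwargs`\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(\n    candidate_topics,\n    model=\"facebook/bart-large-mnli\",\n    pipeline_kwargs={\"multi_label\": True},\n    min_prob=0.8,\n)\n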
        Source code in bertopic\\representation\\_zeroshot.py
        class ZeroShotClassification(BaseRepresentation):\n    \"\"\"Zero-shot Classification on topic keywords with candidate labels.\n\n    Arguments:\n        candidate_topics: A list of labels to assign to the topics if they\n                          exceed `min_prob`\n        model: A transformers pipeline that should be initialized as\n               \"zero-shot-classification\". For example,\n               `pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")`\n        pipeline_kwargs: Kwargs that you can pass to the transformers.pipeline\n                         when it is called. NOTE: Use `{\"multi_label\": True}`\n                         to extract multiple labels for each topic.\n        min_prob: The minimum probability to assign a candidate label to a topic\n\n    Usage:\n\n    ```python\n    from bertopic.representation import ZeroShotClassification\n    from bertopic import BERTopic\n\n    # Create your representation model\n    candidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\n    representation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n    # Use the representation model in BERTopic on top of the default pipeline\n    topic_model = BERTopic(representation_model=representation_model)\n    ```\n    \"\"\"\n\n    def __init__(\n        self,\n        candidate_topics: List[str],\n        model: str = \"facebook/bart-large-mnli\",\n        pipeline_kwargs: Mapping[str, Any] = {},\n        min_prob: float = 0.8,\n    ):\n        self.candidate_topics = candidate_topics\n        if isinstance(model, str):\n            self.model = pipeline(\"zero-shot-classification\", model=model)\n        elif isinstance(model, Pipeline):\n            self.model = model\n        else:\n            raise ValueError(\n                \"Make sure that the HF model that you\"\n                \"pass is either a string referring to a\"\n                \"HF model or a `transformers.pipeline` object.\"\n            )\n        self.pipeline_kwargs = pipeline_kwargs\n        self.min_prob = min_prob\n\n    def extract_topics(\n        self,\n        topic_model,\n        documents: pd.DataFrame,\n        c_tf_idf: csr_matrix,\n        topics: Mapping[str, List[Tuple[str, float]]],\n    ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\"Extract topics.\n\n        Arguments:\n            topic_model: Not used\n            documents: Not used\n            c_tf_idf: Not used\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        # Classify topics\n        topic_descriptions = [\" \".join(list(zip(*topics[topic]))[0]) for topic in topics.keys()]\n        classifications = self.model(topic_descriptions, self.candidate_topics, **self.pipeline_kwargs)\n\n        # Extract labels\n        updated_topics = {}\n        for topic, classification in zip(topics.keys(), classifications):\n            topic_description = topics[topic]\n\n            # Multi-label assignment\n            if self.pipeline_kwargs.get(\"multi_label\"):\n                topic_description = []\n                for label, score in zip(classification[\"labels\"], classification[\"scores\"]):\n                    if score > self.min_prob:\n                        topic_description.append((label, score))\n\n            # Single label assignment\n            elif classification[\"scores\"][0] > self.min_prob:\n                
topic_description = [(classification[\"labels\"][0], classification[\"scores\"][0])]\n\n            # Make sure that 10 items are returned\n            if len(topic_description) == 0:\n                topic_description = topics[topic]\n            elif len(topic_description) < 10:\n                topic_description += [(\"\", 0) for _ in range(10 - len(topic_description))]\n            updated_topics[topic] = topic_description\n\n        return updated_topics\n
        "},{"location":"api/representation/zeroshot.html#bertopic.representation._zeroshot.ZeroShotClassification.extract_topics","title":"extract_topics(self, topic_model, documents, c_tf_idf, topics)","text":"

        Extract topics.

        Parameters:

        • topic_model: Not used (required)
        • documents (DataFrame): Not used (required)
        • c_tf_idf (csr_matrix): Not used (required)
        • topics (Mapping[str, List[Tuple[str, float]]]): The candidate topics as calculated with c-TF-IDF (required)

        Returns:

        • updated_topics: Updated topic representations

        Source code in bertopic\\representation\\_zeroshot.py
        def extract_topics(\n    self,\n    topic_model,\n    documents: pd.DataFrame,\n    c_tf_idf: csr_matrix,\n    topics: Mapping[str, List[Tuple[str, float]]],\n) -> Mapping[str, List[Tuple[str, float]]]:\n    \"\"\"Extract topics.\n\n    Arguments:\n        topic_model: Not used\n        documents: Not used\n        c_tf_idf: Not used\n        topics: The candidate topics as calculated with c-TF-IDF\n\n    Returns:\n        updated_topics: Updated topic representations\n    \"\"\"\n    # Classify topics\n    topic_descriptions = [\" \".join(list(zip(*topics[topic]))[0]) for topic in topics.keys()]\n    classifications = self.model(topic_descriptions, self.candidate_topics, **self.pipeline_kwargs)\n\n    # Extract labels\n    updated_topics = {}\n    for topic, classification in zip(topics.keys(), classifications):\n        topic_description = topics[topic]\n\n        # Multi-label assignment\n        if self.pipeline_kwargs.get(\"multi_label\"):\n            topic_description = []\n            for label, score in zip(classification[\"labels\"], classification[\"scores\"]):\n                if score > self.min_prob:\n                    topic_description.append((label, score))\n\n        # Single label assignment\n        elif classification[\"scores\"][0] > self.min_prob:\n            topic_description = [(classification[\"labels\"][0], classification[\"scores\"][0])]\n\n        # Make sure that 10 items are returned\n        if len(topic_description) == 0:\n            topic_description = topics[topic]\n        elif len(topic_description) < 10:\n            topic_description += [(\"\", 0) for _ in range(10 - len(topic_description))]\n        updated_topics[topic] = topic_description\n\n    return updated_topics\n
        "},{"location":"getting_started/best_practices/best_practices.html","title":"Best Practices","text":"

        - Overview of Best Practices

        Due to the modular nature of BERTopic, many variations of the topic modeling technique are possible. However, during the development and usage of the package, a set of best practices has been developed that generally leads to great results.

        The following are a number of steps, parameters, and settings that will generally improve the quality of the resulting topics. In other words, after going through the quick start and getting a feel for the API, these steps should get you to the next level of performance.

        Note

        Although these are called best practices, it does not necessarily mean that they work across all use cases perfectly. The underlying modular nature of BERTopic is meant to take different use cases into account. After going through these practices it is advised to fine-tune wherever necessary.

        To showcase how these \"best practices\" work, we will go through an example dataset and apply all practices to it.

        "},{"location":"getting_started/best_practices/best_practices.html#data","title":"Data","text":"

        For this example, we will use a dataset containing abstracts and metadata from ArXiv articles.

        from datasets import load_dataset\n\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\n\n# Extract abstracts to train on and corresponding titles\nabstracts = dataset[\"abstract\"]\ntitles = dataset[\"title\"]\n

        Sentence Splitter

        Whenever you have large documents, you typically want to split them up into either paragraphs or sentences. A nice way to do so is by using NLTK's sentence splitter which is nothing more than:

        from nltk.tokenize import sent_tokenize, word_tokenize\nsentences = [sent_tokenize(abstract) for abstract in abstracts]\nsentences = [sentence for doc in sentences for sentence in doc]\n
        "},{"location":"getting_started/best_practices/best_practices.html#pre-calculate-embeddings","title":"Pre-calculate Embeddings","text":"

        After having created our data, namely abstracts, we can dive into the very first best practice, pre-calculating embeddings.

        BERTopic works by converting documents into numerical values, called embeddings. This process can be very costly, especially if we want to iterate over parameters. Instead, we can calculate those embeddings once and feed them to BERTopic to skip calculating embeddings each time.

        from sentence_transformers import SentenceTransformer\n\n# Pre-calculate embeddings\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = embedding_model.encode(abstracts, show_progress_bar=True)\n

        Tip

        New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the MTEB leaderboard. It is an excellent place for selecting the embedding model that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.

        "},{"location":"getting_started/best_practices/best_practices.html#preventing-stochastic-behavior","title":"Preventing Stochastic Behavior","text":"

        In BERTopic, we generally use a dimensionality reduction algorithm to reduce the size of the embeddings. This is done to prevent the curse of dimensionality to a certain degree.

        As a default, this is done with UMAP, an incredible algorithm for reducing the dimensionality of the embedding space. However, by default, it shows stochastic behavior, which creates different results each time you run it. To prevent that, we need to set the random_state of the model before passing it to BERTopic.

        As a result, we can now fully reproduce the results each time we run the model.

        from umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n
        "},{"location":"getting_started/best_practices/best_practices.html#controlling-number-of-topics","title":"Controlling Number of Topics","text":"

        There is a parameter to control the number of topics, namely nr_topics. This parameter, however, merges topics after they have been created. It is a parameter that supports creating a fixed number of topics.

        However, it is advised to control the number of topics through the cluster model which is by default HDBSCAN. HDBSCAN has a parameter, namely min_cluster_size that indirectly controls the number of topics that will be created.

        A higher min_cluster_size will generate fewer topics and a lower min_cluster_size will generate more topics.

        Here, we will go with min_cluster_size=150 to prevent too many micro-clusters from being created:

        from hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)\n
        "},{"location":"getting_started/best_practices/best_practices.html#improving-default-representation","title":"Improving Default Representation","text":"

        The default representation of topics is calculated through c-TF-IDF. However, c-TF-IDF is powered by the CountVectorizer which converts text into tokens. Using the CountVectorizer, we can do a number of things:

        • Remove stopwords
        • Ignore infrequent words
        • Increase the n-gram range

        In other words, we can preprocess the topic representations after documents are assigned to topics. This will not influence the clustering process in any way.

        Here, we will ignore English stopwords and infrequent words. Moreover, by increasing the n-gram range we will consider topic representations that are made up of one or two words.

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(stop_words=\"english\", min_df=2, ngram_range=(1, 2))\n
        "},{"location":"getting_started/best_practices/best_practices.html#additional-representations","title":"Additional Representations","text":"

        Previously, we have tuned the default representation but there are quite a number of other topic representations in BERTopic that we can choose from. From KeyBERTInspired and PartOfSpeech, to OpenAI's ChatGPT and open-source alternatives, many representations are possible.

        In BERTopic, you can model many different topic representations simultaneously to test them out and get different perspectives of topic descriptions. This is called multi-aspect topic modeling.

        Here, we will demonstrate a number of interesting and useful representations in BERTopic:

        • KeyBERTInspired
        • A method that derives inspiration from how KeyBERT works
        • PartOfSpeech
        • Using SpaCy's POS tagging to extract words
        • MaximalMarginalRelevance
        • Diversify the topic words
        • OpenAI
        • Use ChatGPT to label our topics
        import openai\nfrom bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech\n\n# KeyBERT\nkeybert_model = KeyBERTInspired()\n\n# Part-of-Speech\npos_model = PartOfSpeech(\"en_core_web_sm\")\n\n# MMR\nmmr_model = MaximalMarginalRelevance(diversity=0.3)\n\n# GPT-3.5\nclient = openai.OpenAI(api_key=\"sk-...\")\nprompt = \"\"\"\nI have a topic that contains the following documents: \n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:\ntopic: <topic label>\n\"\"\"\nopenai_model = OpenAI(client, model=\"gpt-3.5-turbo\", exponential_backoff=True, chat=True, prompt=prompt)\n\n# All representation models\nrepresentation_model = {\n    \"KeyBERT\": keybert_model,\n    # \"OpenAI\": openai_model,  # Uncomment if you will use OpenAI\n    \"MMR\": mmr_model,\n    \"POS\": pos_model\n}\n
        "},{"location":"getting_started/best_practices/best_practices.html#training","title":"Training","text":"

        Now that we have a set of best practices, we can use them in our training loop. Here, several different representations, keywords and labels for our topics will be created. If you want to iterate over the topic model it is advised to use the pre-calculated embeddings as that significantly speeds up training.

        from bertopic import BERTopic\n\ntopic_model = BERTopic(\n\n  # Pipeline models\n  embedding_model=embedding_model,\n  umap_model=umap_model,\n  hdbscan_model=hdbscan_model,\n  vectorizer_model=vectorizer_model,\n  representation_model=representation_model,\n\n  # Hyperparameters\n  top_n_words=10,\n  verbose=True\n)\n\n# Train model\ntopics, probs = topic_model.fit_transform(abstracts, embeddings)\n\n# Show topics\ntopic_model.get_topic_info()\n

        To get all representations for a single topic, we simply run the following:

        >>> topic_model.get_topic(1, full=True)\n{'Main': [('adversarial', 0.028838938990764302),\n  ('attacks', 0.021726302042463556),\n  ('attack', 0.016803574415028524),\n  ('robustness', 0.013046135743326167),\n  ('adversarial examples', 0.01151254557995679),\n  ('examples', 0.009920962487998853),\n  ('perturbations', 0.009053305826870773),\n  ('adversarial attacks', 0.008747627064844006),\n  ('malware', 0.007675131707700338),\n  ('defense', 0.007365955840313783)],\n 'KeyBERT': [('adversarial training', 0.76427937),\n  ('adversarial attack', 0.74271905),\n  ('vulnerable adversarial', 0.73302543),\n  ('adversarial', 0.7311052),\n  ('adversarial examples', 0.7179245),\n  ('adversarial attacks', 0.7082),\n  ('adversarially', 0.7005141),\n  ('adversarial robustness', 0.69911957),\n  ('adversarial perturbations', 0.6588783),\n  ('adversary', 0.4467769)],\n 'OpenAI': [('Adversarial attacks and defense', 1)],\n 'MMR': [('adversarial', 0.028838938990764302),\n  ('attacks', 0.021726302042463556),\n  ('attack', 0.016803574415028524),\n  ('robustness', 0.013046135743326167),\n  ('adversarial examples', 0.01151254557995679),\n  ('examples', 0.009920962487998853),\n  ('perturbations', 0.009053305826870773),\n  ('adversarial attacks', 0.008747627064844006),\n  ('malware', 0.007675131707700338),\n  ('defense', 0.007365955840313783)],\n 'POS': [('adversarial', 0.028838938990764302),\n  ('attacks', 0.021726302042463556),\n  ('attack', 0.016803574415028524),\n  ('robustness', 0.013046135743326167),\n  ('adversarial examples', 0.01151254557995679),\n  ('examples', 0.009920962487998853),\n  ('perturbations', 0.009053305826870773),\n  ('adversarial attacks', 0.008747627064844006),\n  ('malware', 0.007675131707700338),\n  ('defense', 0.007365955840313783)]}\n

        NOTE: The labels generated by OpenAI's ChatGPT are especially interesting to use throughout your model. Below, we will go into more detail on how to set those as custom labels.

        Parameters

        If you would like to return the topic-document probability matrix, then it is advised to use calculate_probabilities=True. Do note that this can significantly slow down training. To speed it up, use cuML's HDBSCAN instead. You could also approximate the topic-document probability matrix with .approximate_distribution which will be discussed later.
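
        For example, a minimal sketch of enabling the probability matrix (reusing the abstracts and embeddings from the earlier steps):

        from bertopic import BERTopic\n\n# Return the topic-document probability matrix alongside the topic assignments\ntopic_model = BERTopic(calculate_probabilities=True, verbose=True)\ntopics, probs = topic_model.fit_transform(abstracts, embeddings)\n\n# With the default HDBSCAN, `probs` has shape (n_documents, n_topics)\n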

        "},{"location":"getting_started/best_practices/best_practices.html#custom-labels","title":"(Custom) Labels","text":"

        The default label of each topic is the top 3 words of that topic, joined by underscores.

        This, of course, might not be the best label that you can think of for a certain topic. Instead, we can use .set_topic_labels to manually label all or certain topics.

        We can also use .set_topic_labels to use one of the other topic representations that we had before, like KeyBERTInspired or even OpenAI.

        # Label the topics yourself\ntopic_model.set_topic_labels({1: \"Space Travel\", 7: \"Religion\"})\n\n# or use one of the other topic representations, like KeyBERTInspired\nkeybert_topic_labels = {topic: \" | \".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_[\"KeyBERT\"].items()}\ntopic_model.set_topic_labels(keybert_topic_labels)\n\n# or ChatGPT's labels\nchatgpt_topic_labels = {topic: \" | \".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_[\"OpenAI\"].items()}\nchatgpt_topic_labels[-1] = \"Outlier Topic\"\ntopic_model.set_topic_labels(chatgpt_topic_labels)\n

        Now that we have set the updated topic labels, we can access them with the many functions used throughout BERTopic. Most notably, you can show the updated labels in visualizations with the custom_labels=True parameter.

        If we now run topic_model.get_topic_info(), it will include the column CustomName, which holds the custom label that we just created for each topic.
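
        As a quick sketch (assuming the custom labels above were set), the custom labels can be inspected alongside the default names:

        # Inspect the custom labels alongside the default topic names\ntopic_model.get_topic_info()[[\"Topic\", \"Name\", \"CustomName\"]].head()\n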

        "},{"location":"getting_started/best_practices/best_practices.html#topic-document-distribution","title":"Topic-Document Distribution","text":"

        If using calculate_probabilities=True is not possible, then you can approximate the topic-document distributions using .approximate_distribution. It is a fast and flexible method for creating different topic-document distributions.

        # `topic_distr` contains the distribution of topics in each document\ntopic_distr, _ = topic_model.approximate_distribution(abstracts, window=8, stride=4)\n

        Next, let's take a look at a specific abstract and see how the topic distribution was extracted:

        # Visualize the topic-document distribution for a single document\ntopic_model.visualize_distribution(topic_distr[abstract_id], custom_labels=True)\n

        It seems to have extracted a number of topics that are relevant and shows the distributions of these topics across the abstract. We can go one step further and visualize them on a token-level:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(abstracts[abstract_id], calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(abstracts[abstract_id], topic_token_distr[0])\ndf\n

        use_embedding_model

        As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-word representation, this is quite fast. However, you might want to use the selected embedding_model instead to do this comparison. Do note that due to the many token sets, it is often computationally quite a bit slower:

        topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True)\n
        "},{"location":"getting_started/best_practices/best_practices.html#outlier-reduction","title":"Outlier Reduction","text":"

        By default, HDBSCAN generates outliers, which is a helpful mechanism for creating accurate topic representations. However, you might want to assign every single document to a topic. We can use .reduce_outliers to map some or all outliers to a topic:

        # Reduce outliers\nnew_topics = topic_model.reduce_outliers(abstracts, topics)\n\n# Reduce outliers with pre-calculate embeddings instead\nnew_topics = topic_model.reduce_outliers(abstracts, topics, strategy=\"embeddings\", embeddings=embeddings)\n

        Update Topics with Outlier Reduction

        After having generated updated topic assignments, we can pass them to BERTopic in order to update the topic representations:

        topic_model.update_topics(docs, topics=new_topics)\n

        It is important to realize that updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason for this is that when you assign a -1 document to topic 1 and another -1 document to topic 2, it is unclear how the -1 documents should be mapped. Should they be matched to topic 1 or to topic 2?

        "},{"location":"getting_started/best_practices/best_practices.html#visualize-topics","title":"Visualize Topics","text":"

        With visualizations, we are closing in on the realm of subjective \"best practices\". These are things that I generally do because I like the resulting representations, but your experience might differ.

        Having said that, there are two visualizations that are my go-to when visualizing the topics themselves:

        • topic_model.visualize_topics()
        • topic_model.visualize_hierarchy()
        # Visualize topics with custom labels\ntopic_model.visualize_topics(custom_labels=True)\n\n# Visualize hierarchy with custom labels\ntopic_model.visualize_hierarchy(custom_labels=True)\n
        "},{"location":"getting_started/best_practices/best_practices.html#visualize-documents","title":"Visualize Documents","text":"

        When visualizing documents, it helps to have embedded the documents beforehand to speed up computation. Fortunately, we have already done that as a \"best practice\".

        Visualizing documents in 2-dimensional space helps in understanding the underlying structure of the documents and topics.

        # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\n

        The following plot is interactive which means that you can zoom in, double click on a label to only see that one and generally interact with the plot:

        # Visualize the documents in 2-dimensional space and show the titles on hover instead of the abstracts\n# NOTE: You can hide the hover with `hide_document_hover=True` which is especially helpful if you have a large dataset\n# NOTE: You can also hide the annotations with `hide_annotations=True` which is helpful to see the larger structure\ntopic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, custom_labels=True)\n

        2-dimensional space

        Although visualizing the documents in 2-dimensional space gives an idea of their underlying structure, there is a risk involved.

        Visualizing the documents in 2-dimensional space means that we have lost significant information, since the original embeddings have 384 dimensions or more. Condensing all that information into 2 dimensions is simply not possible. In other words, it is merely an approximation, albeit quite an accurate one.

        "},{"location":"getting_started/best_practices/best_practices.html#serialization","title":"Serialization","text":"

        When saving a BERTopic model, there are several ways of doing so. You can save the entire model with pickle, pytorch, or safetensors.

        Personally, I would advise going with safetensors whenever possible. The reason for this is that the format allows for a very small topic model to be saved and shared.

        When saving a model with safetensors, it skips over saving the dimensionality reduction and clustering models. The .transform function will still work without these models but will instead assign topics based on the similarity between the document embeddings and the topic embeddings.

        As a result, the .transform step might give different results but it is generally worth it considering the smaller and significantly faster model.
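
        As a conceptual sketch of that similarity-based assignment (not the library's exact implementation; doc_embeddings and topic_embeddings are hypothetical arrays):

        import numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\n\n# Conceptual sketch only (not the library's exact implementation):\n# `doc_embeddings` is a hypothetical (n_docs, dim) array, `topic_embeddings` a (n_topics, dim) array\nsims = cosine_similarity(doc_embeddings, topic_embeddings)\nassigned_topics = np.argmax(sims, axis=1)\n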

        embedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"my_model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n

        Embedding Model

        Using safetensors, we are not saving the underlying embedding model but merely a pointer to the model. For example, in the above example we are saving the string \"sentence-transformers/all-MiniLM-L6-v2\" so that we can load in the embedding model alongside the topic model.

        This currently only works if you are using a sentence transformer model. If you are using a different model, you can load it in when loading the topic model like this:

        from sentence_transformers import SentenceTransformer\n\n# Define embedding model\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n\n# Load model and add embedding model\nloaded_model = BERTopic.load(\"my_model_dir\", embedding_model=embedding_model)\n
        "},{"location":"getting_started/best_practices/best_practices.html#inference","title":"Inference","text":"

        To speed up inference, we can leverage a \"best practice\" that we used before, namely serialization. When we save a model as safetensors and then load it in, the dimensionality reduction and clustering steps are removed from the pipeline.

        Instead, topics are assigned through the cosine similarity between document embeddings and topic embeddings. This speeds up inference significantly.

        To show its effect, let's start by disabling the logger:

        from bertopic._utils import MyLogger\nlogger = MyLogger()\nlogger.configure(\"ERROR\")\nloaded_model.verbose = False\ntopic_model.verbose = False\n

        Then, we run inference on both the loaded model and the non-loaded model:

        >>> %timeit loaded_model.transform(abstracts[:100])\n343 ms \u00b1 31.1 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)\n
        >>> %timeit topic_model.transform(abstracts[:100])\n1.37 s \u00b1 166 ms per loop (mean \u00b1 std. dev. of 7 runs, 1 loop each)\n

        Based on the above, the loaded_model seems to be quite a bit faster for inference than the original topic_model.

        "},{"location":"getting_started/clustering/clustering.html","title":"3. Clustering","text":"

        After reducing the dimensionality of our input embeddings, we need to cluster them into groups of similar embeddings to extract our topics. This clustering step is quite important because the more performant our clustering technique is, the more accurate our topic representations will be.

        In BERTopic, we typically use HDBSCAN as it is quite capable of capturing structures with different densities. However, there is not one perfect clustering model and you might want to use something entirely different for your use case. Moreover, what if a new state-of-the-art model is released tomorrow? We would like to be able to use that in BERTopic, right? Since BERTopic assumes some independence among steps, we can allow for this modularity:

        As a result, the hdbscan_model parameter in BERTopic now allows for a variety of clustering models. To do so, the class should have the following attributes:

        • .fit(X)
          • A function that can be used to fit the model
        • .predict(X)
          • A predict function that transforms the input to cluster labels
        • .labels_
          • The labels after fitting the model

        In other words, it should have the following structure:

        class ClusterModel:\n    def fit(self, X):\n        self.labels_ = None\n        return self\n\n    def predict(self, X):\n        return X\n

        In this section, we will go through several examples of clustering algorithms and how they can be implemented.

        "},{"location":"getting_started/clustering/clustering.html#hdbscan","title":"HDBSCAN","text":"

        As a default, BERTopic uses HDBSCAN to perform its clustering. To use an HDBSCAN model with custom parameters, we simply define it and pass it to BERTopic:

        from bertopic import BERTopic\nfrom hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\n

        Here, we can define any parameters in HDBSCAN to optimize for the best performance based on whatever validation metrics you are using.

        "},{"location":"getting_started/clustering/clustering.html#k-means","title":"k-Means","text":"

        Although HDBSCAN works quite well in BERTopic and is typically advised, you might want to use k-Means instead. It allows you to select how many clusters you would like and forces every single point to be in a cluster. Therefore, no outliers will be created. This also has disadvantages: when you force every single point into a cluster, the clusters are highly likely to contain noise, which can hurt the topic representations. As a small tip, using vectorizer_model=CountVectorizer(stop_words=\"english\") helps quite a bit to improve the topic representations (see the combined example below).

        Having said that, using k-Means is quite straightforward:

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\n\ncluster_model = KMeans(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n
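
        Building on the tip above, a sketch that combines k-Means with a CountVectorizer to reduce noise in the topic representations (parameter values are illustrative):

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Combine k-Means with a vectorizer that removes English stop words to reduce noise\ncluster_model = KMeans(n_clusters=50)\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(hdbscan_model=cluster_model, vectorizer_model=vectorizer_model)\n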

        Note

        As you might have noticed, the cluster_model is passed to hdbscan_model which might be a bit confusing considering you are not passing an HDBSCAN model. For now, the name of the parameter is kept the same to adhere to the current state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible.

        "},{"location":"getting_started/clustering/clustering.html#agglomerative-clustering","title":"Agglomerative Clustering","text":"

        Like k-Means, there are a bunch more clustering algorithms in sklearn that you can use. Some of these models do not have a .predict() method but can still be used in BERTopic. However, using BERTopic's .transform() function will then give errors.

        Here, we will demonstrate Agglomerative Clustering:

        from bertopic import BERTopic\nfrom sklearn.cluster import AgglomerativeClustering\n\ncluster_model = AgglomerativeClustering(n_clusters=50)\ntopic_model = BERTopic(hdbscan_model=cluster_model)\n
        "},{"location":"getting_started/clustering/clustering.html#cuml-hdbscan","title":"cuML HDBSCAN","text":"

        Although the original HDBSCAN implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, we can use cuML to speed up HDBSCAN through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\n

        The great thing about using cuML's HDBSCAN implementation is that it supports many features of the original implementation. In other words, calculate_probabilities=True also works!

        Note

        As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., .transform) using cuML's HDBSCAN. However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., .fit and .fit_transform).

        Note

        If you want to install cuML together with BERTopic using Google Colab, you can run the following code:

        !pip install bertopic\n!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64\n
        "},{"location":"getting_started/ctfidf/ctfidf.html","title":"c-TF-IDF","text":"

        In BERTopic, in order to get an accurate representation of the topics from our bag-of-words matrix, TF-IDF was adjusted to work on a cluster/categorical/topic level instead of a document level. This adjusted TF-IDF representation is called c-TF-IDF and takes into account what makes the documents in one cluster different from documents in another cluster:

        Each cluster is converted to a single document instead of a set of documents. Then, we extract the frequency of word x in class c, where c refers to the cluster we created before. This results in our class-based tf representation. This representation is L1-normalized to account for the differences in topic sizes. Then, we take the logarithm of one plus the average number of words per class A divided by the frequency of word x across all classes. We add plus one within the logarithm to force values to be positive. This results in our class-based idf representation. Like with the classic TF-IDF, we then multiply tf with idf to get the importance score per word in each class. In other words, the classical TF-IDF procedure is not used here but a modified version of the algorithm that allows for a much better representation.
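
        Putting the description above into a formula (a reconstruction, where tf_{x,c} is the frequency of word x in class c, f_x its frequency across all classes, and A the average number of words per class):

        W_{x,c} = \mathrm{tf}_{x,c} \cdot \log\left(1 + \frac{A}{f_x}\right)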

        Since the topic representation is somewhat independent of the clustering step, we can change what the c-TF-IDF representation looks like. This can be in the form of parameter tuning, different weighting schemes, or using a diversity metric on top of it. This allows for some modularity concerning the weighting scheme:

        This class-based TF-IDF representation is enabled by default in BERTopic. However, we can explicitly pass it to BERTopic through the ctfidf_model allowing for parameter tuning and the customization of the topic extraction technique:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer()\ntopic_model = BERTopic(ctfidf_model=ctfidf_model )\n
        "},{"location":"getting_started/ctfidf/ctfidf.html#parameters","title":"Parameters","text":"

        There are two parameters worth exploring in the ClassTfidfTransformer, namely bm25_weighting and reduce_frequent_words.

        "},{"location":"getting_started/ctfidf/ctfidf.html#bm25_weighting","title":"bm25_weighting","text":"

        The bm25_weighting is a boolean parameter that indicates whether a class-based BM-25 weighting measure is used instead of the default method as defined in the formula at the beginning of this page.

        Instead of using the following weighting scheme:
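
        (the default idf term, reconstructed from the description above)

        \log\left(1 + \frac{A}{f_x}\right)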

        the class-based BM-25 weighting is used instead:
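
        (a sketch of the class-based BM-25 variant, assuming the standard BM-25-style smoothing of the same quantities)

        \log\left(1 + \frac{A - f_x + 0.5}{f_x + 0.5}\right)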

        On smaller datasets, this variant can be more robust to stop words that appear in your data. It can be enabled as follows:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(bm25_weighting=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model )\n
        "},{"location":"getting_started/ctfidf/ctfidf.html#reduce_frequent_words","title":"reduce_frequent_words","text":"

        Some words appear quite often in every topic but are generally not considered stop words as found in the CountVectorizer(stop_words=\"english\") list. To further reduce these frequent words, we can use reduce_frequent_words to take the square root of the term frequency after applying the weighting scheme.

        Instead of the default term frequency:

        we take the square root of the term frequency after normalizing the frequency matrix:
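
        (a reconstruction of the change described above)

        \mathrm{tf}_{x,c} \;\rightarrow\; \sqrt{\mathrm{tf}_{x,c}}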

        Although seemingly a small change, it can have quite a large effect on the number of stop words in the resulting topic representations. It can be enabled as follows:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model )\n

        Tip

        Both parameters can be used simultaneously: ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

        "},{"location":"getting_started/dim_reduction/dim_reduction.html","title":"2. Dimensionality Reduction","text":"

        An important aspect of BERTopic is the dimensionality reduction of the input embeddings. As embeddings are often high in dimensionality, clustering becomes difficult due to the curse of dimensionality.

        A solution is to reduce the dimensionality of the embeddings to a workable dimensional space (e.g., 5 dimensions) for clustering algorithms to work with. UMAP is used as a default in BERTopic since it can capture both the local and global structure of the high-dimensional space in lower dimensions. However, there are other solutions out there, such as PCA, that users might be interested in trying out. Since BERTopic assumes some independence between steps, we can use any other dimensionality reduction algorithm. The image below illustrates this modularity:

        As a result, the umap_model parameter in BERTopic now allows for a variety of dimensionality reduction models. To do so, the class should have the following attributes:

        • .fit(X)
          • A function that can be used to fit the model
        • .transform(X)
          • A transform function that transforms the input to a lower dimensional size

        In other words, it should have the following structure:

        class DimensionalityReduction:\n    def fit(self, X):\n        return self\n\n    def transform(self, X):\n        return X\n

        In this section, we will go through several examples of dimensionality reduction techniques and how they can be implemented.

        "},{"location":"getting_started/dim_reduction/dim_reduction.html#umap","title":"UMAP","text":"

        As a default, BERTopic uses UMAP to perform its dimensionality reduction. To use a UMAP model with custom parameters, we simply define it and pass it to BERTopic:

        from bertopic import BERTopic\nfrom umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')\ntopic_model = BERTopic(umap_model=umap_model)\n

        Here, we can define any parameters in UMAP to optimize for the best performance based on whatever validation metrics you are using.

        "},{"location":"getting_started/dim_reduction/dim_reduction.html#pca","title":"PCA","text":"

        Although UMAP works quite well in BERTopic and is typically advised, you might want to be using PCA instead. It can be faster to train and perform inference. To use PCA, we can simply import it from sklearn and pass it to the umap_model parameter:

        from bertopic import BERTopic\nfrom sklearn.decomposition import PCA\n\ndim_model = PCA(n_components=5)\ntopic_model = BERTopic(umap_model=dim_model)\n

        As a small note, PCA and k-Means have worked quite well in my experiments and might be interesting to use instead of UMAP and HDBSCAN.

        Note

        As you might have noticed, the dim_model is passed to umap_model which might be a bit confusing considering you are not passing a UMAP model. For now, the name of the parameter is kept the same to adhere to the current state of the API. Changing the name could lead to deprecation issues, which I want to prevent as much as possible.

        "},{"location":"getting_started/dim_reduction/dim_reduction.html#truncated-svd","title":"Truncated SVD","text":"

        Like PCA, there are a bunch more dimensionality reduction techniques in sklearn that you can use. Here, we will demonstrate Truncated SVD, but any model can be used as long as it has both a .fit() and .transform() method:

        from bertopic import BERTopic\nfrom sklearn.decomposition import TruncatedSVD\n\ndim_model = TruncatedSVD(n_components=5)\ntopic_model = BERTopic(umap_model=dim_model)\n
        "},{"location":"getting_started/dim_reduction/dim_reduction.html#cuml-umap","title":"cuML UMAP","text":"

        Although the original UMAP implementation is an amazing technique, it may have difficulty handling large amounts of data. Instead, we can use cuML to speed up UMAP through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.manifold import UMAP\n\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\ntopic_model = BERTopic(umap_model=umap_model)\n

        Note

        If you want to install cuML together with BERTopic using Google Colab, you can run the following code:

        !pip install bertopic\n!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64\n
        "},{"location":"getting_started/dim_reduction/dim_reduction.html#skip-dimensionality-reduction","title":"Skip dimensionality reduction","text":"

        Although BERTopic applies dimensionality reduction as a default in its pipeline, this is a step that you might want to skip. To do so, we use an \"empty\" model that simply returns the data that is passed to it:

        from bertopic import BERTopic\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\n# Fit BERTopic without actually performing any dimensionality reduction\nempty_dimensionality_model = BaseDimensionalityReduction()\ntopic_model = BERTopic(umap_model=empty_dimensionality_model)\n

        In other words, we go from this pipeline:

        SBERT (Embeddings) → UMAP (Dimensionality reduction) → HDBSCAN (Clustering) → c-TF-IDF (Topic representation)

        To the following pipeline:

        SBERT (Embeddings) → HDBSCAN (Clustering) → c-TF-IDF (Topic representation)

        "},{"location":"getting_started/distribution/distribution.html","title":"Topic Distributions","text":"

        BERTopic approaches topic modeling as a clustering task and attempts to cluster semantically similar documents to extract common topics. A disadvantage of such a method is that each document is assigned to a single cluster and therefore also a single topic. In practice, documents may contain a mixture of topics. This can be accounted for by splitting up the documents into sentences and feeding those to BERTopic.

        Another option is to use a cluster model that can perform soft clustering, like HDBSCAN. As BERTopic focuses on modularity, we may still want to model that mixture of topics even when we are using a hard-clustering model, like k-Means, without the need to split up our documents. This is where .approximate_distribution comes in!

        Figure: create token sets -> topic-token set similarity -> document-topic distribution -> multi-topic assignment on a token level.

        To perform this approximation, each document is split into tokens according to the provided tokenizer in the CountVectorizer. Then, a sliding window is applied on each document creating subsets of the document. For example, with a window size of 3 and stride of 1, the document:

        Solving the right problem is difficult.

        can be split up into solving the right, the right problem, right problem is, and problem is difficult. These are called token sets. For each of these token sets, we calculate their c-TF-IDF representation and find out how similar they are to the previously generated topics. Then, the similarities to the topics for each token set are summed to create a topic distribution for the entire document.

        Although it is often said that documents can contain a mixture of topics, these are often modeled by assigning each word to a single topic. With this approach, we take into account that there may be multiple topics for a single word.

        We can make this multiple-topic word assignment a bit more accurate by then splitting these token sets up into individual tokens and assigning the topic distributions for each token set to each individual token. That way, we can visualize the extent to which a certain word contributes to a document's topic distribution.
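        To make the sliding window concrete, here is a minimal, illustrative sketch of how token sets could be created (the whitespace tokenizer and variable names are assumptions for illustration only, not BERTopic's internal implementation):

        # Illustrative only: build token sets with a sliding window (window=3, stride=1)\ndocument = \"Solving the right problem is difficult\"\ntokens = document.lower().split()\n\nwindow, stride = 3, 1\ntoken_sets = [tokens[i:i + window] for i in range(0, len(tokens) - window + 1, stride)]\nprint(token_sets)\n# [['solving', 'the', 'right'], ['the', 'right', 'problem'],\n#  ['right', 'problem', 'is'], ['problem', 'is', 'difficult']]\n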

        "},{"location":"getting_started/distribution/distribution.html#example","title":"Example","text":"

        To calculate our topic distributions, we first need to fit a basic topic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic().fit(docs)\n

        After doing so, we can approximate the topic distributions for our documents:

        topic_distr, _ = topic_model.approximate_distribution(docs)\n

        The resulting topic_distr is an n x m matrix where n is the number of documents and m the number of topics. We can then visualize the distribution of topics in a document:

        topic_model.visualize_distribution(topic_distr[1])\n

        Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])\ndf\n

        Tip

        You can also approximate the topic distributions for unseen documents. It will not be as accurate as .transform but it is quite fast and can serve you well in a production setting.
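        For example, assuming new_docs is a list of documents that were not part of training, this could look like:

        # Approximate topic distributions for previously unseen documents\nnew_topic_distr, _ = topic_model.approximate_distribution(new_docs)\n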

        Note

        To get the stylized dataframe for .visualize_approximate_distribution you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via pip install jinja2

        "},{"location":"getting_started/distribution/distribution.html#parameters","title":"Parameters","text":"

        There are a few parameters of interest, which are discussed below.

        "},{"location":"getting_started/distribution/distribution.html#batch_size","title":"batch_size","text":"

        Creating token sets for each document can result in quite a large list of token sets. Calculating the similarity of these token sets with the topics can result in a large matrix that might not fit into memory anymore. To circumvent this, we can process documents in batches to reduce the memory load. The value for batch_size indicates the number of documents that will be processed at once:

        topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=500)\n
        "},{"location":"getting_started/distribution/distribution.html#window","title":"window","text":"

        The number of tokens that are combined into token sets is defined by the window parameter. Seeing as we are performing a sliding window, we can change the size of the window. A larger window takes more tokens into account, but setting it too large can result in considering too much information. Personally, I like to have this window between 4 and 8:

        topic_distr, _ = topic_model.approximate_distribution(docs, window=4)\n
        "},{"location":"getting_started/distribution/distribution.html#stride","title":"stride","text":"

        The sliding window that is performed on a document shifts, as a default, 1 token to the right each time to create its token sets. As a result, especially with large windows, a single token gets judged several times. We can use the stride parameter to increase the number of tokens the window shifts to the right. By increasing this value, we are judging each token less frequently which often results in a much faster calculation. Combining this parameter with window is preferred. For example, if we have a very large dataset, we can set stride=4 and window=8 to judge token sets that contain 8 tokens but that are shifted with 4 steps each time. As a result, this increases the computational speed quite a bit:

        topic_distr, _ = topic_model.approximate_distribution(docs, window=8, stride=4)\n
        "},{"location":"getting_started/distribution/distribution.html#use_embedding_model","title":"use_embedding_model","text":"

        As a default, we compare the c-TF-IDF calculations between the token sets and all topics. Due to its bag-of-words representation, this is quite fast. However, you might want to use the selected embedding_model instead to do this comparison. Do note that, due to the many token sets, this is often computationally quite a bit slower:

        topic_distr, _ = topic_model.approximate_distribution(docs, use_embedding_model=True)\n
        "},{"location":"getting_started/embeddings/embeddings.html","title":"Embedding Models","text":"

        BERTopic starts with transforming our input documents into numerical representations. Although there are many ways this can be achieved, we typically use sentence-transformers (\"all-MiniLM-L6-v2\") as it is quite capable of capturing the semantic similarity between documents.

        However, there is no single perfect embedding model and you might want to use something entirely different for your use case. Since BERTopic assumes some independence among steps, we can allow for this modularity:

        This modularity not only allows us to choose any embedding model to convert our documents into numerical representations, it also means we can use essentially any data to perform our clustering. When new state-of-the-art pre-trained embedding models are released, BERTopic will be able to use them. As a result, BERTopic grows with any new models being released. Out of the box, BERTopic supports several embedding techniques. In this section, we will go through several of them and how they can be implemented.

        "},{"location":"getting_started/embeddings/embeddings.html#sentence-transformers","title":"Sentence Transformers","text":"

        You can select any model from sentence-transformers here and pass it through BERTopic with embedding_model:

        from bertopic import BERTopic\ntopic_model = BERTopic(embedding_model=\"all-MiniLM-L6-v2\")\n

        Or select a SentenceTransformer model with your parameters:

        from sentence_transformers import SentenceTransformer\n\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\ntopic_model = BERTopic(embedding_model=sentence_model)\n

        Tip 1!

        This embedding back-end was put here first for a reason: sentence-transformers works amazingly well out of the box! Playing around with different models can give you great results. Also, make sure to visit this page frequently as new models are often released.

        Tip 2!

        New embedding models are released frequently and their performance keeps getting better. To keep track of the best embedding models out there, you can visit the MTEB leaderboard. It is an excellent place for selecting the embedding model that works best for you. For example, if you want the best of the best, then the top 5 models might be the place to look.

        Many of these models can be used with SentenceTransformers in BERTopic, like so:

        from sentence_transformers import SentenceTransformer\n\nembedding_model = SentenceTransformer(\"BAAI/bge-base-en-v1.5\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#hugging-face-transformers","title":"\ud83e\udd17 Hugging Face Transformers","text":"

        To use a Hugging Face transformers model, load in a pipeline and point to any model found on their model hub (https://huggingface.co/models):

        from transformers.pipelines import pipeline\n\nembedding_model = pipeline(\"feature-extraction\", model=\"distilbert-base-cased\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n

        Tip!

        These transformers also work quite well with sentence-transformers, which has great optimization tricks that make using it a bit faster.

        "},{"location":"getting_started/embeddings/embeddings.html#flair","title":"Flair","text":"

        Flair allows you to choose almost any embedding model that is publicly available. Flair can be used as follows:

        from flair.embeddings import TransformerDocumentEmbeddings\n\nroberta = TransformerDocumentEmbeddings('roberta-base')\ntopic_model = BERTopic(embedding_model=roberta)\n

        You can select any \ud83e\udd17 transformers model here.

        Moreover, you can also use Flair's word embeddings and pool them to create document embeddings. Under the hood, Flair simply averages all word embeddings in a document. Then, we can easily pass it to BERTopic to use those word embeddings as document embeddings:

        from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings\n\nglove_embedding = WordEmbeddings('crawl')\ndocument_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])\n\ntopic_model = BERTopic(embedding_model=document_glove_embeddings)\n
        "},{"location":"getting_started/embeddings/embeddings.html#spacy","title":"Spacy","text":"

        Spacy is an amazing framework for processing text. There are many models available across many languages for modeling text.

        To use Spacy's non-transformer models in BERTopic:

        import spacy\n\nnlp = spacy.load(\"en_core_web_md\", exclude=['tagger', 'parser', 'ner', \n                                            'attribute_ruler', 'lemmatizer'])\n\ntopic_model = BERTopic(embedding_model=nlp)\n

        Using spacy-transformer models:

        import spacy\n\nspacy.prefer_gpu()\nnlp = spacy.load(\"en_core_web_trf\", exclude=['tagger', 'parser', 'ner', \n                                             'attribute_ruler', 'lemmatizer'])\n\ntopic_model = BERTopic(embedding_model=nlp)\n

        If you run into memory issues with spacy-transformer models, try:

        import spacy\nfrom thinc.api import set_gpu_allocator, require_gpu\n\nnlp = spacy.load(\"en_core_web_trf\", exclude=['tagger', 'parser', 'ner', \n                                             'attribute_ruler', 'lemmatizer'])\nset_gpu_allocator(\"pytorch\")\nrequire_gpu(0)\n\ntopic_model = BERTopic(embedding_model=nlp)\n
        "},{"location":"getting_started/embeddings/embeddings.html#universal-sentence-encoder-use","title":"Universal Sentence Encoder (USE)","text":"

        The Universal Sentence Encoder encodes text into high-dimensional vectors that are used here for embedding the documents. The model is trained and optimized for greater-than-word length text, such as sentences, phrases, or short paragraphs.

        Using USE in BERTopic is rather straightforward:

        import tensorflow_hub\nembedding_model = tensorflow_hub.load(\"https://tfhub.dev/google/universal-sentence-encoder/4\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#gensim","title":"Gensim","text":"

        BERTopic supports the gensim.downloader module, which allows it to download any word embedding model supported by Gensim. Typically, these are GloVe, Word2Vec, or FastText embeddings:

        import gensim.downloader as api\nft = api.load('fasttext-wiki-news-subwords-300')\ntopic_model = BERTopic(embedding_model=ft)\n

        Tip!

        Gensim is primarily used for word embedding models. This typically works best for short documents since the word embeddings are pooled.

        "},{"location":"getting_started/embeddings/embeddings.html#scikit-learn-embeddings","title":"Scikit-Learn Embeddings","text":"

        Scikit-Learn is a framework for more than just machine learning. It offers many preprocessing tools, some of which can be used to create representations for text. Many of these tools are relatively lightweight and do not require a GPU. While the representations may be less expressive than many BERT models, the fact that they run much faster can make them relevant candidates to consider.

        If you have a scikit-learn compatible pipeline that you'd like to use to embed text then you can also pass this to BERTopic.

        from sklearn.pipeline import make_pipeline\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\npipe = make_pipeline(\n    TfidfVectorizer(),\n    TruncatedSVD(100)\n)\n\ntopic_model = BERTopic(embedding_model=pipe)\n

        Warning

        One caveat to be aware of is that scikit-learn's base Pipeline class does not support the .partial_fit()-API. If you have a pipeline that theoretically should be able to support online learning, then you might want to explore the scikit-partial project. Moreover, since this backend does not generate representations on a word level, it does not support the bertopic.representation models.

        "},{"location":"getting_started/embeddings/embeddings.html#openai","title":"OpenAI","text":"

        To use OpenAI's external API, we need to define our key and explicitly call bertopic.backend.OpenAIBackend to be used in our topic model:

        import openai\nfrom bertopic.backend import OpenAIBackend\n\nclient = openai.OpenAI(api_key=\"sk-...\")\nembedding_model = OpenAIBackend(client, \"text-embedding-ada-002\")\n\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#cohere","title":"Cohere","text":"

        To use Cohere's external API, we need to define our key and explicitly call bertopic.backend.CohereBackend to be used in our topic model:

        import cohere\nfrom bertopic.backend import CohereBackend\n\nclient = cohere.Client(\"MY_API_KEY\")\nembedding_model = CohereBackend(client)\n\ntopic_model = BERTopic(embedding_model=embedding_model)\n
        "},{"location":"getting_started/embeddings/embeddings.html#multimodal","title":"Multimodal","text":"

        To create embeddings for both text and images in the same vector space, we can use the MultiModalBackend. This model uses a clip-vit based model that is capable of embedding text, images, or both:

        from bertopic.backend import MultiModalBackend\nmodel = MultiModalBackend('clip-ViT-B-32', batch_size=32)\n\n# Embed documents only\ndoc_embeddings = model.embed_documents(docs)\n\n# Embedding images only\nimage_embeddings = model.embed_images(images)\n\n# Embed both images and documents, then average them\ndoc_image_embeddings = model.embed(docs, images)\n
        "},{"location":"getting_started/embeddings/embeddings.html#custom-backend","title":"Custom Backend","text":"

        If your backend or model cannot be found in the ones currently available, you can use the bertopic.backend.BaseEmbedder class to create your backend. Below, you will find an example of creating a SentenceTransformer backend for BERTopic:

        from bertopic.backend import BaseEmbedder\nfrom sentence_transformers import SentenceTransformer\n\nclass CustomEmbedder(BaseEmbedder):\n    def __init__(self, embedding_model):\n        super().__init__()\n        self.embedding_model = embedding_model\n\n    def embed(self, documents, verbose=False):\n        embeddings = self.embedding_model.encode(documents, show_progress_bar=verbose)\n        return embeddings \n\n# Create custom backend\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\ncustom_embedder = CustomEmbedder(embedding_model=embedding_model)\n\n# Pass custom backend to bertopic\ntopic_model = BERTopic(embedding_model=custom_embedder)\n
        "},{"location":"getting_started/embeddings/embeddings.html#custom-embeddings","title":"Custom Embeddings","text":"

        The base models in BERTopic are BERT-based models that work well with document similarity tasks. Your documents, however, might be too specific for a general pre-trained model to be used. Fortunately, you can use the embedding model in BERTopic to create document features.

        You only need to prepare the document embeddings yourself and pass them through fit_transform of BERTopic:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train our topic model using our pre-trained sentence-transformers embeddings\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n

        As you can see above, we used a SentenceTransformer model to create the embedding. You could also have used \ud83e\udd17 transformers, Doc2Vec, or any other embedding method.

        "},{"location":"getting_started/embeddings/embeddings.html#tf-idf","title":"TF-IDF","text":"

        As mentioned above, any embedding technique can be used. However, when running UMAP, the typical distance metric is cosine, which does not work quite well for a TF-IDF matrix. Instead, BERTopic will recognize that a sparse matrix is passed and use hellinger instead, which works quite well for the similarity between probability distributions.

        We simply create a TF-IDF matrix and use it as embeddings in our fit_transform method:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\n# Create TF-IDF sparse matrix\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nvectorizer = TfidfVectorizer(min_df=5)\nembeddings = vectorizer.fit_transform(docs)\n\n# Train our topic model using TF-IDF vectors\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n

        Here, you will probably notice that creating the embeddings is quite fast whereas fit_transform is quite slow. This is to be expected as reducing the dimensionality of a large sparse matrix takes some time. The opposite is true for transformer embeddings: creating the embeddings is slow whereas fit_transform is quite fast.

        "},{"location":"getting_started/guided/guided.html","title":"Guided Topic Modeling","text":"

        Guided Topic Modeling or Seeded Topic Modeling is a collection of techniques that guides the topic modeling approach by setting several seed topics towards which the model will converge. These techniques allow the user to set a predefined number of topic representations that are sure to be in documents. For example, take an IT business that has a ticket system for the software their clients use. Those tickets may typically contain information about a specific bug regarding login issues that the IT business is aware of.

        To model that bug, we can create a seed topic representation containing the words bug, login, password, and username. By defining those words, a Guided Topic Modeling approach will try to converge at least one topic to those words.

        \"drug cancer drugs doctor\" \"windows drive dos file\" \"space launch orbit lunar\" Concatenate and embed the keywords/keyphrases using the embedding model. For each document, generate labels by finding which seeded topic fits best based on cosine similarity between embeddings. Average the embedding of each document with the selected seeded topic. Define seed topics through keywords or keyphrases. \"drug\", \"cancer\", \"drugs\", \"doctor\" Seed topic 1 Seed topic 2 Seed topic 3 \"windows\", \"drive\", \"dos\", \"file\" \"space\", \"launch\", \"orbit\", \"lunar\" Seed topic 3 Seed topic 2 No seed topic match found Seed topic 2 seed topic embedding document embedding + 2 Multiply the IDF values of the seeded keywords across all topics with 1.2. Word IDF Multiplier Adjusted IDF drug 1.2 .55 .66 1.2 doctor .78 .94 cat 1 .22 .22 1 dog .11 .11 space 1.2 .35 .42 1.2 launch .89 1.07

        Guided BERTopic has two main steps:

        First, we create embeddings for each seeded topic by joining them and passing them through the document embedder. These embeddings will be compared with the existing document embeddings through cosine similarity and assigned a label. If the document is most similar to a seeded topic, then it will get that topic's label. If it is most similar to the average document embedding, it will get the -1 label. These labels are then passed through UMAP to create a semi-supervised approach that should nudge the topic creation to the seeded topics.

        Second, we take all words in seed_topic_list and assign them a multiplier larger than 1. Those multipliers will be used to increase the IDF values of the words across all topics, thereby increasing the likelihood that a seeded topic word will appear in a topic. This does, however, also increase the chance of an irrelevant topic having unrelated words. In practice, this should not be an issue since the IDF value is likely to remain low regardless of the multiplier. The multiplier is currently a fixed value but may change to something more elegant, like taking the distribution of IDF values and its position into account when defining the multiplier.
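        To give a feel for the first step, here is a minimal conceptual sketch of assigning seeded-topic labels through cosine similarity (using sentence-transformers and scikit-learn; the variable names and the absence of the -1 fallback are simplifications, so this is not BERTopic's internal code):

        import numpy as np\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.metrics.pairwise import cosine_similarity\n\nseed_topic_list = [[\"drug\", \"cancer\", \"drugs\", \"doctor\"],\n                   [\"windows\", \"drive\", \"dos\", \"file\"],\n                   [\"space\", \"launch\", \"orbit\", \"lunar\"]]\n\n# Embed each seeded topic by joining its keywords (assuming `docs` is a list of documents)\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nseed_embeddings = embedding_model.encode([\" \".join(seed) for seed in seed_topic_list])\ndoc_embeddings = embedding_model.encode(docs)\n\n# Label each document with its most similar seeded topic\nsimilarity = cosine_similarity(doc_embeddings, seed_embeddings)\nlabels = np.argmax(similarity, axis=1)\n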

        "},{"location":"getting_started/guided/guided.html#example","title":"Example","text":"

        To demonstrate Guided BERTopic, we use the 20 Newsgroups dataset as our example. We have frequently used this dataset in BERTopic examples and we sometimes see a topic generated about health with words such as drug and cancer being important. However, due to the stochastic nature of UMAP, this topic is not always found.

        In order to guide BERTopic to that topic, we create a seed topic list that we pass through our model. However, there may be several other topics that we know should be in the documents. Let's also initialize those:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\n\nseed_topic_list = [[\"drug\", \"cancer\", \"drugs\", \"doctor\"],\n                   [\"windows\", \"drive\", \"dos\", \"file\"],\n                   [\"space\", \"launch\", \"orbit\", \"lunar\"]]\n\ntopic_model = BERTopic(seed_topic_list=seed_topic_list)\ntopics, probs = topic_model.fit_transform(docs)\n

        As you can see above, the seed_topic_list contains a list of topic representations. By defining the above topics, BERTopic is more likely to model the defined seeded topics. However, BERTopic is merely nudged towards creating those topics. In practice, if the seeded topics do not exist or might be divided into smaller topics, then they will not be modeled. Thus, seed topics need to be accurate for the model to converge towards them.

        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html","title":"Hierarchical Topic Modeling","text":"

        When tweaking your topic model, the number of topics that are generated has a large effect on the quality of the topic representations. Some topics could be merged, and understanding the effect of doing so will help you decide which topics should and should not be merged.

        That is where hierarchical topic modeling comes in. It tries to model the possible hierarchical nature of the topics you have created to understand which topics are similar to each other. Moreover, you will have more insight into sub-topics that might exist in your data.

        Figure: (1) Create a distance matrix by calculating the cosine similarity between the c-TF-IDF representations of each topic. (2) Apply a linkage function of choice on the distance matrix to model the hierarchical structure of topics. (3) Re-calculate c-TF-IDF: update the c-TF-IDF representation based on the collection of documents across the merged topics.

        In BERTopic, we can approximate this potential hierarchy by making use of our topic-term matrix (c-TF-IDF matrix). This matrix contains information about the importance of every word in every topic and makes for a nice numerical representation of our topics. The smaller the distance between two c-TF-IDF representations, the more similar we assume they are. In practice, this process of merging topics is done through the hierarchical clustering capabilities of scipy (see here). It allows for several linkage methods through which we can approximate our topic hierarchy. As a default, we use Ward linkage, but many others are available.

        Whenever we merge two topics, we can calculate the c-TF-IDF representation of the merged topic by summing their bag-of-words representations. We assume that two sets of topics are merged and that all others are kept the same, regardless of their location in the hierarchy. This helps us isolate the potential effect of merging sets of topics. As a result, we can see the topic representation at each level in the tree.
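        As a conceptual sketch of this idea (assuming the fitted model exposes its topic-term matrix as .c_tf_idf_ with the outlier topic in the first row; this is an illustration rather than BERTopic's internal code):

        import numpy as np\nfrom scipy.cluster import hierarchy as sch\nfrom scipy.spatial.distance import squareform\nfrom sklearn.metrics.pairwise import cosine_similarity\n\n# Distances between the topics' c-TF-IDF representations (skipping the assumed outlier row)\nc_tf_idf = topic_model.c_tf_idf_.toarray()[1:]\ndistances = 1 - cosine_similarity(c_tf_idf)\nnp.fill_diagonal(distances, 0)\n\n# Model the hierarchy with a linkage function of choice (Ward is BERTopic's default)\nZ = sch.linkage(squareform(distances, checks=False), method=\"ward\")\n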

        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#example","title":"Example","text":"

        To demonstrate hierarchical topic modeling with BERTopic, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents.

        First, we train a basic BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\n

        Next, we can use our fitted BERTopic model to extract possible hierarchies from our c-TF-IDF matrix:

        hierarchical_topics = topic_model.hierarchical_topics(docs)\n

        The resulting hierarchical_topics is a dataframe in which merged topics are described. For example, if you would merge two topics, what would the topic representation of the new topic be?
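        For instance, you can take a quick look at the first merges directly (a minimal sketch; the exact columns depend on your BERTopic version):

        # Each row describes one potential merge of topics in the hierarchy\nhierarchical_topics.head()\n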

        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#linkage-functions","title":"Linkage functions","text":"

        When creating the potential hierarchical nature of topics, we use Scipy's ward linkage function as a default to generate the hierarchy. However, you might want to use a different linkage function for your use case, such as single, complete, average, centroid, or median. In BERTopic, you can define the linkage function yourself, including the distance function that you would like to use:

        from scipy.cluster import hierarchy as sch\nfrom bertopic import BERTopic\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Hierarchical topics\nlinkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)\nhierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)\n
        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#visualizations","title":"Visualizations","text":"

        To visualize these results, we can start by running a familiar function, namely topic_model.visualize_hierarchy:

        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you hover over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes.

        Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use topic_model.get_topic_tree to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can see better which topics could be logically merged:

        >>> tree = topic_model.get_topic_tree(hierarchical_topics)\n>>> print(tree)\n.\n\u2514\u2500atheists_atheism_god_moral_atheist\n     \u251c\u2500atheists_atheism_god_atheist_argument\n     \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n     \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n     \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 Topic: 29\n
        Click here to view the full tree.
          .\n  \u251c\u2500people_armenian_said_god_armenians\n  \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u2502    \u251c\u2500jehovah_lord_mormon_mcconkie_god\n  \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500ra_satan_thou_god_lucifer \u2500\u2500 Topic: 94\n  \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500jehovah_lord_mormon_mcconkie_unto \u2500\u2500 Topic: 78\n  \u2502    \u2502    \u2502    \u2514\u2500jesus_mary_god_hell_sin\n  \u2502    \u2502    \u2502         \u251c\u2500jesus_hell_god_eternal_heaven\n  \u2502    \u2502    \u2502         \u2502    \u251c\u2500hell_jesus_eternal_god_heaven\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500jesus_tomb_disciples_resurrection_john \u2500\u2500 Topic: 69\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500hell_eternal_god_jesus_heaven \u2500\u2500 Topic: 53\n  \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500aaron_baptism_sin_law_god \u2500\u2500 Topic: 89\n  \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500mary_sin_maria_priest_conception \u2500\u2500 Topic: 56\n  \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500marriage_married_marry_ceremony_marriages \u2500\u2500 Topic: 110\n  \u2502    \u2514\u2500people_armenian_armenians_said_mr\n  \u2502         \u251c\u2500people_armenian_armenians_said_israel\n  \u2502         \u2502    \u251c\u2500god_homosexual_homosexuality_atheists_sex\n  \u2502         \u2502    \u2502    \u251c\u2500homosexual_homosexuality_sex_gay_homosexuals\n  \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500kinsey_sex_gay_men_sexual \u2500\u2500 Topic: 44\n  \u2502         \u2502    \u2502    \u2502    \u2514\u2500homosexuality_homosexual_sin_homosexuals_gay\n  \u2502         \u2502    \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500gay_homosexual_homosexuals_sexual_cramer \u2500\u2500 Topic: 50\n  \u2502         \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500homosexuality_homosexual_sin_paul_sex \u2500\u2500 Topic: 27\n  \u2502         \u2502    \u2502    \u2514\u2500god_atheists_atheism_moral_atheist\n  \u2502         \u2502    \u2502         \u251c\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500jim_context_challenges_articles_quote \u2500\u2500 Topic: 36\n  \u2502         \u2502    \u2502         \u2502    \u2514\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500islam_quran_islamic_rushdie_muslims \u2500\u2500 Topic: 31\n  \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500judas_scripture_bible_books_greek \u2500\u2500 Topic: 33\n  \u2502         \u2502    \u2502         \u2514\u2500atheists_atheism_god_moral_atheist\n  \u2502         \u2502    \u2502              \u251c\u2500atheists_atheism_god_atheist_argument\n  \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n  \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n  \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 
Topic: 29\n  \u2502         \u2502    \u2514\u2500armenian_armenians_people_israel_said\n  \u2502         \u2502         \u251c\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502    \u251c\u2500tax_rights_government_income_taxes\n  \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500rights_right_slavery_slaves_residence \u2500\u2500 Topic: 106\n  \u2502         \u2502         \u2502    \u2502    \u2514\u2500tax_government_taxes_income_libertarians\n  \u2502         \u2502         \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500government_libertarians_libertarian_regulation_party \u2500\u2500 Topic: 58\n  \u2502         \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500tax_taxes_income_billion_deficit \u2500\u2500 Topic: 41\n  \u2502         \u2502         \u2502    \u2514\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502         \u251c\u2500gun_guns_militia_firearms_amendment\n  \u2502         \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500blacks_penalty_death_cruel_punishment \u2500\u2500 Topic: 55\n  \u2502         \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500gun_guns_militia_firearms_amendment \u2500\u2500 Topic: 7\n  \u2502         \u2502         \u2502         \u2514\u2500armenian_armenians_israel_jews_turkish\n  \u2502         \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500israel_israeli_jews_arab_jewish \u2500\u2500 Topic: 4\n  \u2502         \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500armenian_armenians_turkish_armenia_azerbaijan \u2500\u2500 Topic: 15\n  \u2502         \u2502         \u2514\u2500stephanopoulos_president_mr_myers_ms\n  \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500serbs_muslims_stephanopoulos_mr_bosnia \u2500\u2500 Topic: 35\n  \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500myers_stephanopoulos_president_ms_mr \u2500\u2500 Topic: 87\n  \u2502         \u2514\u2500batf_fbi_koresh_compound_gas\n  \u2502              \u251c\u2500\u25a0\u2500\u2500reno_workers_janet_clinton_waco \u2500\u2500 Topic: 77\n  \u2502              \u2514\u2500batf_fbi_koresh_gas_compound\n  \u2502                   \u251c\u2500batf_koresh_fbi_warrant_compound\n  \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500batf_warrant_raid_compound_fbi \u2500\u2500 Topic: 42\n  \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500koresh_batf_fbi_children_compound \u2500\u2500 Topic: 61\n  \u2502                   \u2514\u2500\u25a0\u2500\u2500fbi_gas_tear_bds_building \u2500\u2500 Topic: 23\n  \u2514\u2500use_like_just_dont_new\n      \u251c\u2500game_team_year_games_like\n      \u2502    \u251c\u2500game_team_games_25_year\n      \u2502    \u2502    \u251c\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502    \u251c\u2500window_printer_use_problem_mhz\n      \u2502    \u2502    \u2502    \u2502    \u251c\u2500mhz_wire_simms_wiring_battery\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_mhz_battery_cpu_heat\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_pds_simm_vram_lc\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500pds_nubus_lc_slot_card \u2500\u2500 Topic: 119\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500simms_simm_vram_meg_dram 
\u2500\u2500 Topic: 32\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500mhz_battery_cpu_heat_speed\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500fan_cpu_heat_sink_fans \u2500\u2500 Topic: 92\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500mhz_speed_cpu_fpu_clock \u2500\u2500 Topic: 22\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500monitor_turn_power_computer_electricity \u2500\u2500 Topic: 91\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500battery_batteries_concrete_duo_discharge\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500duo_battery_apple_230_problem \u2500\u2500 Topic: 121\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500battery_batteries_concrete_discharge_temperature \u2500\u2500 Topic: 75\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500leds_uv_blue_light_boards \u2500\u2500 Topic: 66\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500wire_wiring_ground_neutral_outlets \u2500\u2500 Topic: 120\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500scope_scopes_phone_dial_number\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500dial_number_phone_line_output \u2500\u2500 Topic: 93\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500scope_scopes_motorola_generator_oscilloscope \u2500\u2500 Topic: 113\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500celp_dsp_sampling_antenna_digital\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500antenna_antennas_receiver_cable_transmitter \u2500\u2500 Topic: 70\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500celp_dsp_sampling_speech_voice \u2500\u2500 Topic: 52\n      \u2502    \u2502    \u2502    \u2502    \u2514\u2500window_printer_xv_mouse_windows\n      \u2502    \u2502    \u2502    \u2502         \u251c\u2500window_xv_error_widget_problem\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500error_symbol_undefined_xterm_rx\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500symbol_error_undefined_doug_parse \u2500\u2500 Topic: 63\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500rx_remote_server_xdm_xterm \u2500\u2500 Topic: 45\n      \u2502    \u2502    \u2502    \u2502         \u2502    
\u2514\u2500window_xv_widget_application_expose\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500window_widget_expose_application_event\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500gc_mydisplay_draw_gxxor_drawing \u2500\u2500 Topic: 103\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500window_widget_application_expose_event \u2500\u2500 Topic: 25\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500xv_den_polygon_points_algorithm\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500den_polygon_points_algorithm_polygons \u2500\u2500 Topic: 28\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500xv_24bit_image_bit_images \u2500\u2500 Topic: 57\n      \u2502    \u2502    \u2502    \u2502         \u2514\u2500printer_fonts_print_mouse_postscript\n      \u2502    \u2502    \u2502    \u2502              \u251c\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500scanner_logitech_grayscale_ocr_scanman \u2500\u2500 Topic: 108\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u2514\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u251c\u2500\u25a0\u2500\u2500printer_print_deskjet_hp_ink \u2500\u2500 Topic: 18\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u2514\u2500\u25a0\u2500\u2500fonts_font_truetype_tt_atm \u2500\u2500 Topic: 49\n      \u2502    \u2502    \u2502    \u2502              \u2514\u2500mouse_ghostscript_midi_driver_postscript\n      \u2502    \u2502    \u2502    \u2502                   \u251c\u2500ghostscript_midi_postscript_files_file\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500ghostscript_postscript_pageview_ghostview_dsc \u2500\u2500 Topic: 104\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u2514\u2500midi_sound_file_windows_driver\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u251c\u2500\u25a0\u2500\u2500location_mar_file_host_rwrr \u2500\u2500 Topic: 83\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u2514\u2500\u25a0\u2500\u2500midi_sound_driver_blaster_soundblaster \u2500\u2500 Topic: 98\n      \u2502    \u2502    \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500mouse_driver_mice_ball_problem \u2500\u2500 Topic: 68\n      \u2502    \u2502    \u2502    \u2514\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502         \u251c\u25001st_sale_condition_comics_hulk\n      \u2502    \u2502    \u2502         \u2502    \u251c\u2500sale_condition_offer_asking_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500condition_stereo_amp_speakers_asking\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500miles_car_amfm_toyota_cassette \u2500\u2500 Topic: 62\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500amp_speakers_condition_stereo_audio \u2500\u2500 Topic: 24\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500games_sale_pom_cds_shipping\n      \u2502    \u2502    \u2502         \u2502    \u2502         
\u251c\u2500pom_cds_sale_shipping_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500size_shipping_sale_condition_mattress \u2500\u2500 Topic: 100\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500pom_cds_cd_sale_picture \u2500\u2500 Topic: 37\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500games_game_snes_sega_genesis \u2500\u2500 Topic: 40\n      \u2502    \u2502    \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u251c\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500lens_tape_camera_backup_lenses\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500tape_backup_tapes_drive_4mm \u2500\u2500 Topic: 107\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500lens_camera_lenses_zoom_pouch \u2500\u2500 Topic: 114\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u25001st_hulk_comics_art_appears \u2500\u2500 Topic: 105\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500books_book_cover_trek_chemistry \u2500\u2500 Topic: 125\n      \u2502    \u2502    \u2502         \u2502         \u2514\u2500tickets_hotel_ticket_voucher_package\n      \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500hotel_voucher_package_vacation_room \u2500\u2500 Topic: 74\n      \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500tickets_ticket_june_airlines_july \u2500\u2500 Topic: 84\n      \u2502    \u2502    \u2502         \u2514\u2500game_team_games_season_hockey\n      \u2502    \u2502    \u2502              \u251c\u2500game_hockey_team_25_550\n      \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500espn_pt_pts_game_la \u2500\u2500 Topic: 17\n      \u2502    \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500team_25_game_hockey_550 \u2500\u2500 Topic: 2\n      \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500year_game_hit_baseball_players \u2500\u2500 Topic: 0\n      \u2502    \u2502    \u2514\u2500bike_car_greek_insurance_msg\n      \u2502    \u2502         \u251c\u2500car_bike_insurance_cars_engine\n      \u2502    \u2502         \u2502    \u251c\u2500car_insurance_cars_radar_engine\n      \u2502    \u2502         \u2502    \u2502    \u251c\u2500insurance_health_private_care_canada\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500insurance_health_private_care_canada \u2500\u2500 Topic: 99\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500insurance_car_accident_rates_sue \u2500\u2500 Topic: 82\n      \u2502    \u2502         \u2502    \u2502    \u2514\u2500car_cars_radar_engine_detector\n      \u2502    \u2502         \u2502    \u2502         \u251c\u2500car_radar_cars_detector_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500radar_detector_detectors_ka_alarm \u2500\u2500 Topic: 39\n      \u2502    \u2502         \u2502    \u2502         
\u2502    \u2514\u2500car_cars_mustang_ford_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500clutch_shift_shifting_transmission_gear \u2500\u2500 Topic: 88\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500car_cars_mustang_ford_v8 \u2500\u2500 Topic: 14\n      \u2502    \u2502         \u2502    \u2502         \u2514\u2500oil_diesel_odometer_diesels_car\n      \u2502    \u2502         \u2502    \u2502              \u251c\u2500odometer_oil_sensor_car_drain\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500odometer_sensor_speedo_gauge_mileage \u2500\u2500 Topic: 96\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500oil_drain_car_leaks_taillights \u2500\u2500 Topic: 102\n      \u2502    \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500diesel_diesels_emissions_fuel_oil \u2500\u2500 Topic: 79\n      \u2502    \u2502         \u2502    \u2514\u2500bike_riding_ride_bikes_motorcycle\n      \u2502    \u2502         \u2502         \u251c\u2500bike_ride_riding_bikes_lane\n      \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500bike_ride_riding_lane_car \u2500\u2500 Topic: 11\n      \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500bike_bikes_miles_honda_motorcycle \u2500\u2500 Topic: 19\n      \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500countersteering_bike_motorcycle_rear_shaft \u2500\u2500 Topic: 46\n      \u2502    \u2502         \u2514\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u251c\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u251c\u2500greek_kuwait_greece_turkish_greeks\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500greek_greece_turkish_greeks_cyprus \u2500\u2500 Topic: 71\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kuwait_iraq_iran_gulf_arabia \u2500\u2500 Topic: 76\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2514\u2500msg_dog_drugs_drug_food\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u251c\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500clinton_bush_quayle_reagan_panicking \u2500\u2500 Topic: 101\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u2514\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500cooper_trial_weaver_spence_witnesses \u2500\u2500 Topic: 90\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500dog_dogs_bike_trained_springer \u2500\u2500 Topic: 67\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2514\u2500msg_drugs_drug_food_chinese\n      \u2502    \u2502              \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500msg_food_chinese_foods_taste \u2500\u2500 Topic: 30\n      
\u2502    \u2502              \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500drugs_drug_marijuana_cocaine_alcohol \u2500\u2500 Topic: 72\n      \u2502    \u2502              \u2502    \u2502    \u2514\u2500water_theory_universe_science_larsons\n      \u2502    \u2502              \u2502    \u2502         \u251c\u2500water_nuclear_cooling_steam_dept\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500rocketry_rockets_engines_nuclear_plutonium \u2500\u2500 Topic: 115\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u2514\u2500water_cooling_steam_dept_plants\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500water_dept_phd_environmental_atmospheric \u2500\u2500 Topic: 97\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500cooling_water_steam_towers_plants \u2500\u2500 Topic: 109\n      \u2502    \u2502              \u2502    \u2502         \u2514\u2500theory_universe_larsons_larson_science\n      \u2502    \u2502              \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500theory_universe_larsons_larson_science \u2500\u2500 Topic: 54\n      \u2502    \u2502              \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500oort_cloud_grbs_gamma_burst \u2500\u2500 Topic: 80\n      \u2502    \u2502              \u2502    \u2514\u2500helmet_kirlian_photography_lock_wax\n      \u2502    \u2502              \u2502         \u251c\u2500helmet_kirlian_photography_leaf_mask\n      \u2502    \u2502              \u2502         \u2502    \u251c\u2500kirlian_photography_leaf_pictures_deleted\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u251c\u2500deleted_joke_stuff_maddi_nickname\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500joke_maddi_nickname_nicknames_frank \u2500\u2500 Topic: 43\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500deleted_stuff_bookstore_joke_motto \u2500\u2500 Topic: 81\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kirlian_photography_leaf_pictures_aura \u2500\u2500 Topic: 85\n      \u2502    \u2502              \u2502         \u2502    \u2514\u2500helmet_mask_liner_foam_cb\n      \u2502    \u2502              \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500helmet_liner_foam_cb_helmets \u2500\u2500 Topic: 112\n      \u2502    \u2502              \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500mask_goalies_77_santore_tl \u2500\u2500 Topic: 123\n      \u2502    \u2502              \u2502         \u2514\u2500lock_wax_paint_plastic_ear\n      \u2502    \u2502              \u2502              \u251c\u2500\u25a0\u2500\u2500lock_cable_locks_bike_600 \u2500\u2500 Topic: 117\n      \u2502    \u2502              \u2502              \u2514\u2500wax_paint_ear_plastic_skin\n      \u2502    \u2502              \u2502                   \u251c\u2500\u25a0\u2500\u2500wax_paint_plastic_scratches_solvent \u2500\u2500 Topic: 65\n      \u2502    \u2502              \u2502                   \u2514\u2500\u25a0\u2500\u2500ear_wax_skin_greasy_acne \u2500\u2500 Topic: 116\n      \u2502    \u2502              \u2514\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u251c\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u2502    
\u251c\u2500\u25a0\u2500\u2500m4_mp_14_mw_mo \u2500\u2500 Topic: 111\n      \u2502    \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500test_ensign_nameless_deane_deanebinahccbrandeisedu \u2500\u2500 Topic: 118\n      \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500ites_cheek_hello_hi_ken \u2500\u2500 Topic: 3\n      \u2502    \u2514\u2500space_medical_health_disease_cancer\n      \u2502         \u251c\u2500medical_health_disease_cancer_patients\n      \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500cancer_centers_center_medical_research \u2500\u2500 Topic: 122\n      \u2502         \u2502    \u2514\u2500health_medical_disease_patients_hiv\n      \u2502         \u2502         \u251c\u2500patients_medical_disease_candida_health\n      \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n      \u2502         \u2502         \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n      \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n      \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n      \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n      \u2502         \u2514\u2500space_launch_nasa_shuttle_orbit\n      \u2502              \u251c\u2500space_moon_station_nasa_launch\n      \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500sky_advertising_billboard_billboards_space \u2500\u2500 Topic: 59\n      \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500space_station_moon_redesign_nasa \u2500\u2500 Topic: 16\n      \u2502              \u2514\u2500space_mission_hst_launch_orbit\n      \u2502                   \u251c\u2500space_launch_nasa_orbit_propulsion\n      \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500space_launch_nasa_propulsion_astronaut \u2500\u2500 Topic: 47\n      \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500orbit_km_jupiter_probe_earth \u2500\u2500 Topic: 86\n      \u2502                   \u2514\u2500\u25a0\u2500\u2500hst_mission_shuttle_orbit_arrays \u2500\u2500 Topic: 60\n      \u2514\u2500drive_file_key_windows_use\n          \u251c\u2500key_file_jpeg_encryption_image\n          \u2502    \u251c\u2500key_encryption_clipper_chip_keys\n          \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500key_clipper_encryption_chip_keys \u2500\u2500 Topic: 1\n          \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500entry_file_ripem_entries_key \u2500\u2500 Topic: 73\n          \u2502    \u2514\u2500jpeg_image_file_gif_images\n          \u2502         \u251c\u2500motif_graphics_ftp_available_3d\n          \u2502         \u2502    \u251c\u2500motif_graphics_openwindows_ftp_available\n          \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500openwindows_motif_xview_windows_mouse \u2500\u2500 Topic: 20\n          \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500graphics_widget_ray_3d_available \u2500\u2500 Topic: 95\n          \u2502         \u2502    \u2514\u2500\u25a0\u2500\u25003d_machines_version_comments_contact \u2500\u2500 Topic: 38\n          \u2502         \u2514\u2500jpeg_image_gif_images_format\n          \u2502              \u251c\u2500\u25a0\u2500\u2500gopher_ftp_files_stuffit_images \u2500\u2500 Topic: 51\n          \u2502        
      \u2514\u2500\u25a0\u2500\u2500jpeg_image_gif_format_images \u2500\u2500 Topic: 13\n          \u2514\u2500drive_db_card_scsi_windows\n              \u251c\u2500db_windows_dos_mov_os2\n              \u2502    \u251c\u2500\u25a0\u2500\u2500copy_protection_program_software_disk \u2500\u2500 Topic: 64\n              \u2502    \u2514\u2500\u25a0\u2500\u2500db_windows_dos_mov_os2 \u2500\u2500 Topic: 8\n              \u2514\u2500drive_card_scsi_drives_ide\n                      \u251c\u2500drive_scsi_drives_ide_disk\n                      \u2502    \u251c\u2500\u25a0\u2500\u2500drive_scsi_drives_ide_disk \u2500\u2500 Topic: 6\n                      \u2502    \u2514\u2500\u25a0\u2500\u2500meg_sale_ram_drive_shipping \u2500\u2500 Topic: 12\n                      \u2514\u2500card_modem_monitor_video_drivers\n                          \u251c\u2500\u25a0\u2500\u2500card_monitor_video_drivers_vga \u2500\u2500 Topic: 5\n                          \u2514\u2500\u25a0\u2500\u2500modem_port_serial_irq_com \u2500\u2500 Topic: 10\n
        "},{"location":"getting_started/hierarchicaltopics/hierarchicaltopics.html#merge-topics","title":"Merge topics","text":"

         After seeing the potential hierarchy of your topics, you might want to merge specific topics. For example, if topic 1 is 1_space_launch_moon_nasa and topic 2 is 2_spacecraft_solar_space_orbit it might make sense to merge those two topics as they are quite similar in meaning. In BERTopic, you can use .merge_topics to manually select and merge those topics. Doing so will update their topic representation which in turn updates the entire model:

        topics_to_merge = [1, 2]\ntopic_model.merge_topics(docs, topics_to_merge)\n

        If you have several groups of topics you want to merge, create a list of lists instead:

        topics_to_merge = [[1, 2],\n                   [3, 4]]\ntopic_model.merge_topics(docs, topics_to_merge)\n
        "},{"location":"getting_started/manual/manual.html","title":"Manual Topic Modeling","text":"

         Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a set of clusters or classes from which you want to model the topics. For example, the often-used 20 NewsGroups dataset is already split up into 20 classes. Here, we might want to see how we can transform those 20 classes into 20 topics. Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic without actually learning them.

         We can view this as a manual topic modeling approach. There is no underlying algorithm for detecting these topics since you have already done that before, whether because the labels are simply available, as with the 20 NewsGroups dataset, or because you have previously created clusters of documents using packages like human-learn, bulk, thisnotthat, or something entirely different.

        In other words, we can pass our labels to BERTopic and it will try to transform those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer.

         Figure: Documents and their Labels are passed directly to c-TF-IDF to create the topic representations.

        To do so, we need to skip over the dimensionality reduction and clustering steps since we already know the labels for our documents. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels:

        from sklearn.datasets import fetch_20newsgroups\n\n# Get labeled data\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n

        Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process:

        from bertopic import BERTopic\nfrom bertopic.backend import BaseEmbedder\nfrom bertopic.cluster import BaseCluster\nfrom bertopic.vectorizers import ClassTfidfTransformer\nfrom bertopic.dimensionality import BaseDimensionalityReduction\n\n# Prepare our empty sub-models and reduce frequent words while we are at it.\nempty_embedding_model = BaseEmbedder()\nempty_dimensionality_model = BaseDimensionalityReduction()\nempty_cluster_model = BaseCluster()\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\n\n# Fit BERTopic without actually performing any clustering\ntopic_model= BERTopic(\n        embedding_model=empty_embedding_model,\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=empty_cluster_model,\n        ctfidf_model=ctfidf_model\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n

        Let's take a look at a few topics that we get out of training this way by running topic_model.get_topic_info():

         Topic  Count  Name
         0      999    0_game_hockey_team_25
         1      997    1_god_church_jesus_christ
         2      996    2_bike_dod_ride_bikes
         3      994    3_baseball_game_he_year
         4      991    4_key_encryption_db_clipper
         5      990    5_car_cars_engine_ford
         6      990    6_medical_patients_cancer_disease
         7      988    7_window_server_widget_motif
         8      988    8_space_launch_nasa_orbit

        We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship:

        # Map input `y` to topics\nmappings = topic_model.topic_mapper_.get_mappings()\nmappings = {value: data[\"target_names\"][key] for key, value in mappings.items()}\n\n# Assign original classes to our topics\ndf = topic_model.get_topic_info()\ndf[\"Class\"] = df.Topic.map(mappings)\ndf\n

         Topic  Count  Name                                Class
         0      999    0_game_hockey_team_25              rec.sport.hockey
         1      997    1_god_church_jesus_christ          soc.religion.christian
         2      996    2_bike_dod_ride_bikes              rec.motorcycles
         3      994    3_baseball_game_he_year            rec.sport.baseball
         4      991    4_key_encryption_db_clipper        sci.crypt
         5      990    5_car_cars_engine_ford             rec.autos
         6      990    6_medical_patients_cancer_disease  sci.med
         7      988    7_window_server_widget_motif       comp.windows.x
         8      988    8_space_launch_nasa_orbit          sci.space

         We can see that the c-TF-IDF representations nicely extract words that are representative of our input classes. This is all done without actually embedding and clustering the data.

        As a result, the entire \"training\" process only takes a couple of seconds. Moreover, we can still perform BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc.

        Note

         The resulting topics may be mapped differently from the original y labels. To map y to topics, we can run the following:

        mappings = topic_model.topic_mapper_.get_mappings()\ny_mapped = [mappings[val] for val in y]\n
        "},{"location":"getting_started/merge/merge.html","title":"Merge Multiple Fitted Models","text":"

         After you have trained a new BERTopic model on your data, new data might still be coming in. Although you can use online BERTopic, you might prefer to keep using the default HDBSCAN and UMAP models, even though they do not support incremental learning out of the box.

         Instead, you can train a new BERTopic model on incoming data and merge it with your base model to detect whether new topics have appeared in the unseen documents. This is a great way of detecting whether your new model contains information that was not previously found in your base topic model.

        Similarly, you might want to train multiple BERTopic models using different sets of settings, even though they might all be using the same underlying embedding model. Merging these models would also allow for a single model that you can use throughout your use cases.

         Lastly, this method also allows for a degree of federated learning, where each node trains a topic model that is then aggregated at a central server.

        "},{"location":"getting_started/merge/merge.html#example","title":"Example","text":"

        To demonstrate merging different topic models with BERTopic, we use the ArXiv paper abstracts to see which topics they generally contain.

        First, we train three separate models on different parts of the data:

        from umap import UMAP\nfrom bertopic import BERTopic\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\n\n# Extract abstracts to train on and corresponding titles\nabstracts_1 = dataset[\"abstract\"][:5_000]\nabstracts_2 = dataset[\"abstract\"][5_000:10_000]\nabstracts_3 = dataset[\"abstract\"][10_000:15_000]\n\n# Create topic models\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\ntopic_model_1 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_1)\ntopic_model_2 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_2)\ntopic_model_3 = BERTopic(umap_model=umap_model, min_topic_size=20).fit(abstracts_3)\n

        Then, we can combine all three models into one with .merge_models:

        # Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3])\n

        When we inspect the first model, we can see it has 52 topics:

        >>> len(topic_model_1.get_topic_info())\n52\n

         When we inspect the merged model, we can see it has 57 topics:

        >>> len(merged_model.get_topic_info())\n57\n

         It seems that by merging these three models, there were 5 previously undiscovered topics that we could add to the very first model.

        Note

        Note that the models are merged sequentially. This means that the comparison starts with topic_model_1 and that each new topic from topic_model_2 and topic_model_3 will be added to topic_model_1.

         We can check the newly added topics in the merged_model by simply looking at the 5 most recently added topics. The order of topics from topic_model_1 remains the same; all new topics are simply added after them.

        Let's inspect them:

        >>> merged_model.get_topic_info().tail(5)\n
                Topic  Count  Name                                     Representation                                                                                                           Representative_Docs
         52     51     47     50_activity_mobile_wearable_sensors     ['activity', 'mobile', 'wearable', 'sensors', 'falls', 'human', 'phone', 'recognition', 'activities', 'accelerometer']    nan
         53     52     48     25_music_musical_audio_chord            ['music', 'musical', 'audio', 'chord', 'and', 'we', 'to', 'that', 'of', 'for']                                           nan
         54     53     32     36_fairness_discrimination_fair_groups  ['fairness', 'discrimination', 'fair', 'groups', 'protected', 'decision', 'we', 'of', 'classifier', 'to']                nan
         55     54     30     38_traffic_driver_prediction_flow       ['traffic', 'driver', 'prediction', 'flow', 'trajectory', 'the', 'and', 'congestion', 'of', 'transportation']            nan
         56     55     22     50_spiking_neurons_networks_learning    ['spiking', 'neurons', 'networks', 'learning', 'neural', 'snn', 'dynamics', 'plasticity', 'snns', 'of']                  nan

         It seems that topics about activity, music, fairness, traffic, and spiking networks were added to the base topic model! There are two things you might have noticed. First, the representative documents were not added to the model. This is for privacy reasons: you might want to combine models that were trained on different stations, which allows for a degree of federated learning. Second, the names of the new topics contain topic ids that refer to one of the old models. They were purposefully left this way so that you can identify which topics were newly added and inspect them in the original models.

        "},{"location":"getting_started/merge/merge.html#min_similarity","title":"min_similarity","text":"

        The way the models are merged is through comparison of their topic embeddings. If topics between models are similar enough, then they will be regarded as the same topics and the topic of the first model in the list will be chosen. However, if topics between models are dissimilar enough, then the topic of the latter model will be added to the former.

         This (dis)similarity can be tweaked using the min_similarity parameter. Increasing this value will increase the chance of adding new topics, since topics need to be more similar before they are considered the same. In contrast, decreasing this value will decrease the chance of adding new topics. The value is set to 0.7 by default, so let's see what happens if we were to increase this value to 0.9:

        # Combine all models into one\nmerged_model = BERTopic.merge_models([topic_model_1, topic_model_2, topic_model_3], min_similarity=0.9)\n

         When we inspect the number of topics in our new model, we can see that it has increased quite a bit:

        >>> len(merged_model.get_topic_info())\n102\n

        This demonstrates the influence of min_similarity on the number of new topics that are added to the base model.

        "},{"location":"getting_started/multiaspect/multiaspect.html","title":"6C. Multiple Representations","text":"

         Over the course of BERTopic's development, many different types of topic representations have become possible, from keywords and phrases to summaries and custom labels. There is a variety of techniques that one can choose from to represent a topic. As such, there are a number of interesting and creative ways one can summarize topics. A topic is more than just a single representation.

        Therefore, multi-aspect topic modeling is introduced! During the .fit or .fit_transform stages, you can now get multiple representations of a single topic. In practice, it works by generating and storing all kinds of different topic representations (see image below).

        The approach is rather straightforward. We might want to represent our topics using a PartOfSpeech representation model but we might also want to try out KeyBERTInspired and compare those representation models. We can do this as follows:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic.representation import PartOfSpeech\nfrom bertopic.representation import MaximalMarginalRelevance\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Documents to train on\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# The main representation of a topic\nmain_representation = KeyBERTInspired()\n\n# Additional ways of representing a topic\naspect_model1 = PartOfSpeech(\"en_core_web_sm\")\naspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]\n\n# Add all models together to be run in a single `fit`\nrepresentation_model = {\n   \"Main\": main_representation,\n   \"Aspect1\":  aspect_model1,\n   \"Aspect2\":  aspect_model2 \n}\ntopic_model = BERTopic(representation_model=representation_model).fit(docs)\n

         As shown above, to perform multi-aspect topic modeling, we make sure that representation_model is a dictionary in which each representation model pipeline is defined. The main pipeline, which is used in most visualization options, is defined with the \"Main\" key. All other aspects can be named however you want. In the example above, the two additional aspects that we are interested in are defined as \"Aspect1\" and \"Aspect2\".

        After we have fitted our model, we can access all representations with topic_model.get_topic_info():

        As you can see, there are a number of different representations for our topics that we can inspect. All aspects are found in topic_model.topic_aspects_.
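
         For example, a minimal sketch of inspecting one of the aspects of the model fitted above (the topic id and slice size are illustrative):

         # Inspect the different aspects of the fitted model\ntopic_model.get_topic_info()\n\n# Each aspect maps a topic id to its representation\naspect_two = topic_model.topic_aspects_[\"Aspect2\"]\nprint(aspect_two[0][:5])\n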

        "},{"location":"getting_started/multimodal/multimodal.html","title":"Multimodal Topic Modeling","text":"

        Documents or text are often accompanied by imagery or the other way around. For example, social media images with captions and products with descriptions. Topic modeling has traditionally focused on creating topics from textual representations. However, as more multimodal representations are created, the need for multimodal topics increases.

        BERTopic can perform multimodal topic modeling in a number of ways during .fit and .fit_transform stages.

        "},{"location":"getting_started/multimodal/multimodal.html#text-images","title":"Text + Images","text":"

         The most basic example of multimodal topic modeling in BERTopic is when you have images that accompany your documents. This means that it is expected that each document has an image and vice versa. Instagram pictures, for example, almost always have some description attached to them.

         In this example, we are going to use images from Flickr that each have a caption associated with them:

        # NOTE: This requires the `datasets` package which you can \n# install with `pip install datasets`\nfrom datasets import load_dataset\n\nds = load_dataset(\"maderix/flickr_bw_rgb\")\nimages = ds[\"train\"][\"image\"]\ndocs = ds[\"train\"][\"caption\"]\n

        The docs variable contains the captions for each image in images. We can now use these variables to run our multimodal example:

        Tip

        Do note that it is better to pass the paths of the images instead of the images themselves as there is no need to keep all images in memory. When passing the paths of the images, they are only opened temporarily when they are needed.

        from bertopic import BERTopic\nfrom bertopic.representation import VisualRepresentation\n\n# Additional ways of representing a topic\nvisual_model = VisualRepresentation()\n\n# Make sure to add the `visual_model` to a dictionary\nrepresentation_model = {\n   \"Visual_Aspect\":  visual_model,\n}\ntopic_model = BERTopic(representation_model=representation_model, verbose=True)\n

         In this example, we cluster the documents and then look for the images that best match the resulting clusters.
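
         A minimal sketch of fitting the model on both the captions and their images, using the docs and images variables loaded earlier:

         # Fit on the captions while attaching the images to the resulting topics\ntopics, probs = topic_model.fit_transform(docs, images=images)\n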

        We can now access our image representations for each topic with topic_model.topic_aspects_[\"Visual_Aspect\"]. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:

         import base64\nfrom io import BytesIO\nfrom PIL import Image\nfrom IPython.display import HTML\n\ndef image_base64(im):\n    # Open the image if a file path was passed instead of a PIL image\n    if isinstance(im, str):\n        im = Image.open(im)\n    with BytesIO() as buffer:\n        im.save(buffer, 'jpeg')\n        return base64.b64encode(buffer.getvalue()).decode()\n\n\ndef image_formatter(im):\n    return f'<img src=\"data:image/jpeg;base64,{image_base64(im)}\">'\n\n# Extract dataframe\ndf = topic_model.get_topic_info().drop(\"Representative_Docs\", axis=1).drop(\"Name\", axis=1)\n\n# Visualize the images\nHTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))\n

        Tip

        In the example above, we are clustering the documents but since you have images, you might want to cluster those or cluster an aggregation of both images and documents. For that, you can use the new MultiModalBackend to generate embeddings:

        from bertopic.backend import MultiModalBackend\nmodel = MultiModalBackend('clip-ViT-B-32', batch_size=32)\n\n# Embed documents only\ndoc_embeddings = model.embed_documents(docs)\n\n# Embedding images only\nimage_embeddings = model.embed_images(images)\n\n# Embed both images and documents, then average them\ndoc_image_embeddings = model.embed(docs, images)\n
        "},{"location":"getting_started/multimodal/multimodal.html#images-only","title":"Images Only","text":"

        Traditional topic modeling techniques can only be run on textual data, as is shown in the example above. However, there are plenty of cases where textual data is not available but images are. BERTopic allows topic modeling to be performed using only images as your input data.

         To run BERTopic on images only, we first need to embed our images and then define a model that converts images to text. To do so, we are going to need some images. We will take the same images as above but instead save them locally and pass the paths to the images. As mentioned before, this will make sure that we do not hold too many images in memory whilst only a small subset is needed:

        import os\nimport glob\nimport zipfile\nimport numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nfrom sentence_transformers import util\n\n# Flickr 8k images\nimg_folder = 'photos/'\ncaps_folder = 'captions/'\nif not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:\n    os.makedirs(img_folder, exist_ok=True)\n\n    if not os.path.exists('Flickr8k_Dataset.zip'):   #Download dataset if does not exist\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip')\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip')\n\n    for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]:\n        with zipfile.ZipFile(file, 'r') as zf:\n            for member in tqdm(zf.infolist(), desc='Extracting'):\n                zf.extract(member, folder)\nimages = list(glob.glob('photos/Flicker8k_Dataset/*.jpg'))\n

         Next, we define the sub-models for our pipeline:

        from bertopic.representation import KeyBERTInspired, VisualRepresentation\nfrom bertopic.backend import MultiModalBackend\n\n# Image embedding model\nembedding_model = MultiModalBackend('clip-ViT-B-32', batch_size=32)\n\n# Image to text representation model\nrepresentation_model = {\n    \"Visual_Aspect\": VisualRepresentation(image_to_text_model=\"nlpconnect/vit-gpt2-image-captioning\")\n}\n

        Using these models, we can run our pipeline:

        from bertopic import BERTopic\n\n# Train our model with images only\ntopic_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model, min_topic_size=30)\ntopics, probs = topic_model.fit_transform(documents=None, images=images)\n

        We can now access our image representations for each topic with topic_model.topic_aspects_[\"Visual_Aspect\"]. If you want an overview of the topic images together with their textual representations in jupyter, you can run the following:

         import base64\nfrom io import BytesIO\nfrom PIL import Image\nfrom IPython.display import HTML\n\ndef image_base64(im):\n    # Open the image if a file path was passed instead of a PIL image\n    if isinstance(im, str):\n        im = Image.open(im)\n    with BytesIO() as buffer:\n        im.save(buffer, 'jpeg')\n        return base64.b64encode(buffer.getvalue()).decode()\n\n\ndef image_formatter(im):\n    return f'<img src=\"data:image/jpeg;base64,{image_base64(im)}\">'\n\n# Extract dataframe\ndf = topic_model.get_topic_info().drop(\"Representative_Docs\", axis=1).drop(\"Name\", axis=1)\n\n# Visualize the images\nHTML(df.to_html(formatters={'Visual_Aspect': image_formatter}, escape=False))\n

        "},{"location":"getting_started/online/online.html","title":"Online Topic Modeling","text":"

        Online topic modeling (sometimes called \"incremental topic modeling\") is the ability to learn incrementally from a mini-batch of instances. Essentially, it is a way to update your topic model with data on which it was not trained before. In Scikit-Learn, this technique is often modeled through a .partial_fit function, which is also used in BERTopic.

        Tip

         Another method for online topic modeling can be found with the .merge_models functionality of BERTopic. It allows for merging multiple BERTopic models to create a single new one. This method can be used to discover new topics by training a new model and exploring whether that new model added new topics to the original model when merging. A major benefit, compared to .partial_fit, is that you can keep using the original UMAP and HDBSCAN models, which tends to result in improved performance and gives you significantly more flexibility.

        In BERTopic, there are three main goals for using this technique.

        • To reduce the memory necessary for training a topic model.
        • To continuously update the topic model as new data comes in.
        • To continuously find new topics as new data comes in.

        In BERTopic, online topic modeling can be a bit tricky as there are several steps involved in which online learning needs to be made available. To recap, BERTopic consists of the following 6 steps:

        1. Extract embeddings
        2. Reduce dimensionality
        3. Cluster reduced embeddings
        4. Tokenize topics
        5. Extract topic words
        6. (Optional) Fine-tune topic words

         For some steps, an online variant is more important than others. Typically, in step 1 we use pre-trained language models that are in less need of continuous updates. This means that we can use an embedding model like Sentence-Transformers for extracting the embeddings and still use it in an online setting. Similarly, steps 5 and 6 do not necessarily need online variants since they are built upon step 4, tokenization. If that tokenization is itself incremental, then so are steps 5 and 6.

         Figure: the online variants used in the main BERTopic pipeline: SBERT for embeddings, IncrementalPCA for dimensionality reduction, MiniBatchKMeans for clustering, an online CountVectorizer for the incremental bag-of-words, and c-TF-IDF for the topic representation. Online variants of these steps are needed in order to enable incremental learning.

         This means that we will need online variants for steps 2 through 4. Steps 2 and 3, dimensionality reduction and clustering, can be modeled through the use of Scikit-Learn's .partial_fit function. In other words, BERTopic supports any algorithm that can be trained using .partial_fit since these algorithms can be trained incrementally. For example, incremental dimensionality reduction can be achieved using Scikit-Learn's IncrementalPCA and incremental clustering with MiniBatchKMeans.

         Lastly, we need to develop an online variant for step 4, tokenization. In this step, a bag-of-words representation is created through the CountVectorizer. However, as new data comes in, its vocabulary will need to be updated. For that purpose, bertopic.vectorizers.OnlineCountVectorizer was created, which not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. Most notably, the decay parameter is a value between 0 and 1 specifying the percentage by which the frequencies in the previous bag-of-words matrix should be reduced at each iteration. For example, a value of .1 will decrease the frequencies in the bag-of-words matrix by 10% at each iteration. This makes sure that recent data has more weight than previous iterations. Similarly, delete_min_df will remove certain words from the vocabulary if their frequency is lower than a set value. This ties together with the decay parameter, as some words will decay over time if not used. For more information regarding the OnlineCountVectorizer, please see the vectorizers documentation.
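
         As an illustration, both parameters could be set as follows (the exact values are only examples and should be tuned to your data):

         from bertopic.vectorizers import OnlineCountVectorizer\n\n# Illustrative values: reduce previous frequencies by 10% each iteration and\n# remove words whose frequency drops below 5\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.1, delete_min_df=5)\n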

        "},{"location":"getting_started/online/online.html#example","title":"Example","text":"

        Online topic modeling in BERTopic is rather straightforward. We first need to have our documents split into chunks such that we can train and update our topic model incrementally.

         from sklearn.datasets import fetch_20newsgroups\n\n# Prepare documents\nall_docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ndoc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)]\n

         Here, we created chunks of 1000 documents to be fed to BERTopic. Then, we will need to define several sub-models that support online learning. Specifically, we are going to be using IncrementalPCA, MiniBatchKMeans, and the OnlineCountVectorizer:

        from sklearn.cluster import MiniBatchKMeans\nfrom sklearn.decomposition import IncrementalPCA\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\n# Prepare sub-models that support online learning\numap_model = IncrementalPCA(n_components=5)\ncluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.01)\n

        After having defined our sub-models, we can start training our topic model incrementally by looping over our document chunks:

        from bertopic import BERTopic\n\ntopic_model = BERTopic(umap_model=umap_model,\n                       hdbscan_model=cluster_model,\n                       vectorizer_model=vectorizer_model)\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n

        And that is it! During each iteration, you can access the predicted topics through the .topics_ attribute.

        Note

         Do note that in BERTopic it is not possible to use .partial_fit after .fit, as they work quite differently with respect to internally updating topics, frequencies, representations, etc.

        Tip

         You can use any other dimensionality reduction and clustering algorithm as long as it has a .partial_fit function. Moreover, you can use a dimensionality reduction algorithm that does not support .partial_fit but does have a .fit function by first training it on a large amount of data and then keeping it fixed while you continuously add documents. The dimensionality reduction will not be updated, but it may already be trained sufficiently to properly reduce the embeddings of new documents.

        Warning

         Only the most recent batch of documents is tracked. If you want to use online topic modeling for low-memory use cases, then it is advised to also update the .topics_ attribute, as otherwise variations such as hierarchical topic modeling will not work.

        # Incrementally fit the topic model by training on 1000 documents at a time and track the topics in each iteration\ntopics = []\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n    topics.extend(topic_model.topics_)\n\ntopic_model.topics_ = topics\n
        "},{"location":"getting_started/online/online.html#river","title":"River","text":"

        To continuously find new topics as they come in, we can use the package river. It contains several clustering models that can create new clusters as new data comes in. To make sure we can use their models, we first need to create a class that has a .partial_fit function and the option to extract labels through .labels_:

        from river import stream\nfrom river import cluster\n\nclass River:\n    def __init__(self, model):\n        self.model = model\n\n    def partial_fit(self, umap_embeddings):\n        for umap_embedding, _ in stream.iter_array(umap_embeddings):\n            self.model.learn_one(umap_embedding)\n\n        labels = []\n        for umap_embedding, _ in stream.iter_array(umap_embeddings):\n            label = self.model.predict_one(umap_embedding)\n            labels.append(label)\n\n        self.labels_ = labels\n        return self\n

        Then, we can choose any river.cluster model that we are interested in and pass it to the River class before using it in BERTopic:

         from bertopic.vectorizers import ClassTfidfTransformer\n\n# Using DBSTREAM to detect new topics as they come in\ncluster_model = River(cluster.DBSTREAM())\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\")\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True, bm25_weighting=True)\n\n# Prepare model\ntopic_model = BERTopic(\n    hdbscan_model=cluster_model, \n    vectorizer_model=vectorizer_model, \n    ctfidf_model=ctfidf_model,\n)\n\n\n# Incrementally fit the topic model by training on 1000 documents at a time\nfor docs in doc_chunks:\n    topic_model.partial_fit(docs)\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html","title":"Outlier reduction","text":"

        When using HDBSCAN, DBSCAN, or OPTICS, a number of outlier documents might be created that do not fall within any of the created topics. These are labeled as -1. Depending on your use case, you might want to decrease the number of documents that are labeled as outliers. Fortunately, there are a number of strategies one might use to reduce the number of outliers after you have trained your BERTopic model.

        The main way to reduce your outliers in BERTopic is by using the .reduce_outliers function. To make it work without too much tweaking, you will only need to pass the docs and their corresponding topics. You can pass outlier and non-outlier documents together since it will only try to reduce outlier documents and label them to a non-outlier topic.

        The following is a minimal example:

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers\nnew_topics = topic_model.reduce_outliers(docs, topics)\n

        Note

         You can use the threshold parameter to select the minimum distance or similarity when matching outlier documents with non-outlier topics. This allows the user to change how many outlier documents are assigned to non-outlier topics.
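
         For example, a minimal sketch with an illustrative threshold value:

         # Only assign an outlier document to a topic if the distance or similarity\n# passes the (illustrative) threshold of 0.3\nnew_topics = topic_model.reduce_outliers(docs, topics, threshold=0.3)\n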

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#strategies","title":"Strategies","text":"

        The default method for reducing outliers is by calculating the c-TF-IDF representations of outlier documents and assigning them to the best matching c-TF-IDF representations of non-outlier topics.

         However, there are a number of other strategies worth exploring, which can be used either separately or in conjunction:

        • Using the topic-document probabilities to assign topics
        • Using the topic-document distributions to assign topics
        • Using c-TF-IDF representations to assign topics
        • Using document and topic embeddings to assign topics
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#probabilities","title":"Probabilities","text":"

        This strategy uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document. To use this, make sure to calculate the probabilities beforehand by instantiating BERTopic with calculate_probabilities=True.

        from bertopic import BERTopic\n\n# Train your BERTopic model and calculate the document-topic probabilities\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `probabilities` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy=\"probabilities\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#topic-distributions","title":"Topic Distributions","text":"

         Use the topic distributions, as calculated with .approximate_distribution, to find the most frequent topic in each outlier document. You can use the distributions_params variable to tweak the parameters of .approximate_distribution.

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `distributions` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"distributions\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#c-tf-idf","title":"c-TF-IDF","text":"

        Calculate the c-TF-IDF representation for each outlier document and find the best matching c-TF-IDF topic representation using cosine similarity.

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `c-tf-idf` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"c-tf-idf\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#embeddings","title":"Embeddings","text":"

         Using the embedding of each outlier document, find the best matching topic embedding using cosine similarity.

        from bertopic import BERTopic\n\n# Train your BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Reduce outliers using the `embeddings` strategy\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"embeddings\")\n

        Note

         If you have pre-calculated the document embeddings, you can speed up the outlier reduction process for the \"embeddings\" strategy, as it prevents re-calculating the document embeddings.
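
         A minimal sketch, assuming the embeddings were pre-computed with the same embedding model that was used for training:

         from sentence_transformers import SentenceTransformer\n\n# Pre-compute document embeddings once and reuse them for outlier reduction\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\nnew_topics = topic_model.reduce_outliers(docs, topics, strategy=\"embeddings\", embeddings=embeddings)\n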

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#chain-strategies","title":"Chain Strategies","text":"

         Since the .reduce_outliers function does not internally update the topics, we can easily try out different strategies and also chain them together. You might want to do a first pass with the \"c-tf-idf\" strategy as it is quite fast. Then, we can perform the \"distributions\" strategy on the outliers that are left since this method is typically much slower:

        # Use the \"c-TF-IDF\" strategy with a threshold\nnew_topics = topic_model.reduce_outliers(docs, topics , strategy=\"c-tf-idf\", threshold=0.1)\n\n# Reduce all outliers that are left with the \"distributions\" strategy\nnew_topics = topic_model.reduce_outliers(docs, new_topics, strategy=\"distributions\")\n
        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#update-topics","title":"Update Topics","text":"

        After generating our updated topics, we can feed them back into BERTopic in one of two ways. We can either update the topic representations themselves based on the documents that now belong to new topics or we can only update the topic frequency without updating the topic representations themselves.

        Warning

         In both cases, it is important to realize that updating the topics this way may lead to errors if topic reduction or topic merging techniques are used afterwards. The reason for this is that when you assign a -1 document to topic 1 and another -1 document to topic 2, it is unclear how the -1 topic itself should be mapped. Should it be matched to topic 1 or to topic 2?

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#update-topic-representation","title":"Update Topic Representation","text":"

        When outlier documents are generated, they are not used when modeling the topic representations. These documents are completely ignored when finding good descriptions of topics. Thus, after having reduced the number of outliers in your topic model, you might want to update the topic representations with the documents that now belong to actual topics. To do so, we can make use of the .update_topics function:

        topic_model.update_topics(docs, topics=new_topics)\n

         As seen above, you only need to pass the documents on which the model was trained, along with the new topics that were generated using one of the four strategies above.

        "},{"location":"getting_started/outlier_reduction/outlier_reduction.html#exploration","title":"Exploration","text":"

         When you are reducing the number of outliers, it might be worthwhile to iteratively visualize the results in order to get an intuitive understanding of the effect of the four strategies above. Making use of .visualize_documents, we can quickly iterate over the different strategies and view their effects. Here, an example will be shown of how to approach such a pipeline.

        First, we train our model:

         from umap import UMAP\nfrom bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Prepare data, extract embeddings, and prepare sub-models\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\nvectorizer_model = CountVectorizer(stop_words=\"english\")\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=True)\n\n# We reduce our embeddings to 2D as it allows us to quickly iterate later on\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, \n                          min_dist=0.0, metric='cosine').fit_transform(embeddings)\n\n# Train our topic model\ntopic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, \n                       vectorizer_model=vectorizer_model, calculate_probabilities=True, nr_topics=40)\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n

        After having trained our model, let us take a look at the 2D representation of the generated topics:

        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, \n                                hide_document_hover=True, hide_annotations=True)\n

        Next, we reduce the number of outliers using the probabilities strategy:

         new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, \n                                         threshold=0.05, strategy=\"probabilities\")\ntopic_model.update_topics(docs, topics=new_topics)\n

        And finally, we visualize the results:

        topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, \n                                hide_document_hover=True, hide_annotations=True)\n
        "},{"location":"getting_started/parameter%20tuning/parametertuning.html","title":"Hyperparameter Tuning","text":"

        Although BERTopic works quite well out of the box, there are a number of hyperparameters to tune according to your use case. This section will focus on important parameters directly accessible in BERTopic but also hyperparameter optimization in sub-models such as HDBSCAN and UMAP.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#bertopic","title":"BERTopic","text":"

        When instantiating BERTopic, there are several hyperparameters that you can directly adjust that could significantly improve the performance of your topic model. In this section, we will go through the most impactful parameters in BERTopic and directions on how to optimize them.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#language","title":"language","text":"

        The language parameter is used to simplify the selection of models for those who are not familiar with sentence-transformers models.

        In essence, there are two options to choose from:

        • language = \"english\" or
        • language = \"multilingual\"

        The English model is \"all-MiniLM-L6-v2\" and can be found here. It is the default model that is used in BERTopic and works great for English documents.

         The multilingual model is \"paraphrase-multilingual-MiniLM-L12-v2\" and supports 50+ languages, which can be found here. The model is very similar to the base model but is trained on many languages and has a slightly different architecture.
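
         For example:

         from bertopic import BERTopic\n\n# Use the multilingual model for non-English or mixed-language documents\ntopic_model = BERTopic(language=\"multilingual\")\n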

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#top_n_words","title":"top_n_words","text":"

        top_n_words refers to the number of words per topic that you want to be extracted. In practice, I would advise you to keep this value below 30 and preferably between 10 and 20. The reasoning for this is that the more words you put in a topic the less coherent it can become. The top words are the most representative of the topic and should be focused on.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#n_gram_range","title":"n_gram_range","text":"

        The n_gram_range parameter refers to the CountVectorizer used when creating the topic representation. It relates to the number of words you want in your topic representation. For example, \"New\" and \"York\" are two separate words but are often used as \"New York\" which represents an n-gram of 2. Thus, the n_gram_range should be set to (1, 2) if you want \"New York\" in your topic representation.
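
         For example, a sketch with illustrative values for both parameters:

         from bertopic import BERTopic\n\n# Illustrative values: 15 words per topic and both unigrams and bigrams in the representation\ntopic_model = BERTopic(top_n_words=15, n_gram_range=(1, 2))\n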

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#min_topic_size","title":"min_topic_size","text":"

        min_topic_size is an important parameter! It is used to specify what the minimum size of a topic can be. The lower this value the more topics are created. If you set this value too high, then it is possible that simply no topics will be created! Set this value too low and you will get many microclusters.

        It is advised to play around with this value depending on the size of your dataset. If it nears a million documents, then it is advised to set it much higher than the default of 10, for example, 100 or even 500.
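
         For example, for a large dataset you might use something like the following (illustrative value):

         from bertopic import BERTopic\n\n# Illustrative value for a dataset with hundreds of thousands of documents\ntopic_model = BERTopic(min_topic_size=100)\n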

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#nr_topics","title":"nr_topics","text":"

         nr_topics can be a tricky parameter. It specifies the number of topics the model will be reduced to after training. For example, if your topic model results in 100 topics but you have set nr_topics to 20, then the topic model will try to reduce the number of topics from 100 to 20.

        This reduction can take a while as each reduction in topics activates a c-TF-IDF calculation. If this is set to None, no reduction is applied. Use \"auto\" to automatically reduce topics using HDBSCAN.
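
         For example (illustrative values):

         from bertopic import BERTopic\n\n# Reduce to roughly 20 topics after training, or let the model decide with \"auto\"\ntopic_model = BERTopic(nr_topics=20)\ntopic_model_auto = BERTopic(nr_topics=\"auto\")\n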

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#low_memory","title":"low_memory","text":"

        low_memory sets UMAP's low_memory to True to make sure that less memory is used in the computation. This slows down computation but allows UMAP to be run on low-memory machines.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#calculate_probabilities","title":"calculate_probabilities","text":"

        calculate_probabilities lets you calculate the probabilities of each topic in each document. This is computationally quite expensive and is turned off by default.
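
         Both of these flags are passed directly to BERTopic, for example:

         from bertopic import BERTopic\n\n# Trade computation time for lower memory use and per-document topic probabilities\ntopic_model = BERTopic(low_memory=True, calculate_probabilities=True)\n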

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#umap","title":"UMAP","text":"

         UMAP is an amazing technique for dimensionality reduction. In BERTopic, it is used to reduce the dimensionality of the document embeddings into something that HDBSCAN can more easily use to create good clusters.

         However, it does have a significant number of parameters that you could take into account. As exposing all of them in BERTopic would be difficult to manage, we can instead instantiate our own UMAP model and pass it to BERTopic:

        from umap import UMAP\n\numap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False)\ntopic_model = BERTopic(umap_model=umap_model).fit(docs)\n
        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#n_neighbors","title":"n_neighbors","text":"

        n_neighbors is the number of neighboring sample points used when making the manifold approximation. Increasing this value typically results in a more global view of the embedding structure whilst smaller values result in a more local view. Increasing this value often results in larger clusters being created.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#n_components","title":"n_components","text":"

         n_components refers to the dimensionality of the embeddings after reducing them. This is set to 5 by default to reduce dimensionality as much as possible whilst trying to maximize the information kept in the resulting embeddings. Although lowering or increasing this value influences the quality of the embeddings, its effect is largest on the performance of HDBSCAN. Increase this value too much and HDBSCAN will have a hard time clustering the high-dimensional embeddings. Lower this value too much and too little information in the resulting embeddings is available to create proper clusters. If you want to increase this value, I would advise using a metric for HDBSCAN that works well with high-dimensional data.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#metric","title":"metric","text":"

        metric refers to the method used to compute the distances in high dimensional space. The default is cosine as we are dealing with high dimensional data. However, BERTopic is also able to use any input, even regular tabular data, to cluster the documents. Thus, you might want to change the metric to something that fits your use case.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#low_memory_1","title":"low_memory","text":"

        low_memory is used when datasets may consume a lot of memory. Using millions of documents can lead to memory issues and setting this value to True might alleviate some of the issues.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#hdbscan","title":"HDBSCAN","text":"

         After reducing the embeddings with UMAP, we use HDBSCAN to cluster our documents into clusters of similar documents. Similar to UMAP, HDBSCAN has many parameters that could be tweaked to improve the quality of the clusters.

        from hdbscan import HDBSCAN\n\nhdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model).fit(docs)\n
        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#min_cluster_size","title":"min_cluster_size","text":"

        min_cluster_size is arguably the most important parameter in HDBSCAN. It controls the minimum size of a cluster and thereby the number of clusters that will be generated. It is set to 10 as a default. Increasing this value results in fewer clusters but of larger size whereas decreasing this value results in more micro clusters being generated. Typically, I would advise increasing this value rather than decreasing it.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#min_samples","title":"min_samples","text":"

        min_samples is automatically set to min_cluster_size and controls the number of outliers generated. Setting this value significantly lower than min_cluster_size might help you reduce the amount of noise you will get. Do note that outliers are to be expected and forcing the output to have no outliers may not properly represent the data.
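
         For example, a sketch with illustrative values where min_samples is set lower than min_cluster_size to reduce the number of outliers:

         from bertopic import BERTopic\nfrom hdbscan import HDBSCAN\n\n# Illustrative values: a lower min_samples relative to min_cluster_size reduces outliers\nhdbscan_model = HDBSCAN(min_cluster_size=50, min_samples=10, metric='euclidean', prediction_data=True)\ntopic_model = BERTopic(hdbscan_model=hdbscan_model)\n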

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#metric_1","title":"metric","text":"

         metric, like with UMAP, is used to calculate the distances. Here, we went with euclidean as, after reducing the dimensionality, we have low-dimensional data and not much optimization is necessary. However, if you increase n_components in UMAP, then it would be advised to look into metrics that work with high-dimensional data.

        "},{"location":"getting_started/parameter%20tuning/parametertuning.html#prediction_data","title":"prediction_data","text":"

        Make sure you always set this value to True as it is needed to predict new points later on. You can set this to False if you do not wish to predict any unseen data points.

        "},{"location":"getting_started/quickstart/quickstart.html","title":"Quick Start","text":""},{"location":"getting_started/quickstart/quickstart.html#installation","title":"Installation","text":"

         Installation, with sentence-transformers, can be done using PyPI:

        pip install bertopic\n

        You may want to install more depending on the transformers and language backends that you will be using. The possible installations are:

        # Choose an embedding backend\npip install bertopic[flair, gensim, spacy, use]\n\n# Topic modeling with images\npip install bertopic[vision]\n
        "},{"location":"getting_started/quickstart/quickstart.html#quick-start","title":"Quick Start","text":"

        We start by extracting topics from the well-known 20 newsgroups dataset which is comprised of English documents:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        After generating topics, we can access the frequent topics that were generated:

        >>> topic_model.get_topic_info()\n\nTopic   Count   Name\n-1      4630    -1_can_your_will_any\n0       693     49_windows_drive_dos_file\n1       466     32_jesus_bible_christian_faith\n2       441     2_space_launch_orbit_lunar\n3       381     22_key_encryption_keys_encrypted\n

        -1 refers to all outliers and should typically be ignored. Next, let's take a look at the most frequent topic that was generated, topic 0:

        >>> topic_model.get_topic(0)\n\n[('windows', 0.006152228076250982),\n ('drive', 0.004982897610645755),\n ('dos', 0.004845038866360651),\n ('file', 0.004140142872194834),\n ('disk', 0.004131678774810884),\n ('mac', 0.003624848635985097),\n ('memory', 0.0034840976976789903),\n ('software', 0.0034415334250699077),\n ('email', 0.0034239554442333257),\n ('pc', 0.003047105930670237)]\n

        Using .get_document_info, we can also extract information on a document level, such as their corresponding topics, probabilities, whether they are representative documents for a topic, etc.:

        >>> topic_model.get_document_info(docs)\n\nDocument                               Topic    Name                        Top_n_words                     Probability    ...\nI am sure some bashers of Pens...       0       0_game_team_games_season    game - team - games...          0.200010       ...\nMy brother is in the market for...      -1     -1_can_your_will_any         can - your - will...            0.420668       ...\nFinally you said what you dream...      -1     -1_can_your_will_any         can - your - will...            0.807259       ...\nThink! It is the SCSI card doing...     49     49_windows_drive_dos_file    windows - drive - docs...       0.071746       ...\n1) I have an old Jasmine drive...       49     49_windows_drive_dos_file    windows - drive - docs...       0.038983       ...\n

        Multilingual

        Use BERTopic(language=\"multilingual\") to select a model that supports 50+ languages.

        "},{"location":"getting_started/quickstart/quickstart.html#fine-tune-topic-representations","title":"Fine-tune Topic Representations","text":"

         In BERTopic, there are a number of different topic representations that we can choose from. They are all quite different from one another and give interesting perspectives and variations of topic representations. A great start is KeyBERTInspired, which for many users increases the coherence and reduces stopwords in the resulting topic representations:

        from bertopic.representation import KeyBERTInspired\n\n# Fine-tune your topic representations\nrepresentation_model = KeyBERTInspired()\ntopic_model = BERTopic(representation_model=representation_model)\n

        However, you might want to use something more powerful to describe your clusters. You can even use ChatGPT or other models from OpenAI to generate labels, summaries, phrases, keywords, and more:

        import openai\nfrom bertopic.representation import OpenAI\n\n# Fine-tune topic representations with GPT\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True)\ntopic_model = BERTopic(representation_model=representation_model)\n

        Multi-aspect Topic Modeling

        Instead of iterating over all of these different topic representations, you can model them simultaneously with multi-aspect topic representations in BERTopic.

        "},{"location":"getting_started/quickstart/quickstart.html#visualizations","title":"Visualizations","text":"

        After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can use one of the many visualization options in BERTopic. For example, we can visualize the topics that were generated in a way very similar to LDAvis:

        topic_model.visualize_topics()\n
        "},{"location":"getting_started/quickstart/quickstart.html#saveload-bertopic-model","title":"Save/Load BERTopic model","text":"

        There are three methods for saving BERTopic:

        1. A light model with .safetensors and config files
        2. A light model with pytorch .bin and config files
        3. A full model with .pickle

        Method 3 allows for saving the entire topic model but has several drawbacks:

        • Arbitrary code can be run from .pickle files
        • The resulting model is rather large (often > 500MB) since all sub-models need to be saved
        • Explicit and specific version control is needed as they typically only run if the environment is exactly the same

        It is advised to use methods 1 or 2 for saving.

        These methods have a number of advantages:

        • .safetensors is a relatively safe format
        • The resulting model can be very small (often < 20MB) since no sub-models need to be saved
        • Although version control is important, there is a bit more flexibility with respect to specific versions of packages
        • More easily used in production
        • Share models with the HuggingFace Hub

        Tip

        For more detail about how to load in a custom vectorizer, representation model, and more, it is highly advised to check out the serialization page. It contains more examples, details, and some tips and tricks for loading and saving your environment.

        The methods are used as follows:

        topic_model = BERTopic().fit(my_docs)\n\n# Method 1 - safetensors\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 2 - pytorch\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"pytorch\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 3 - pickle\ntopic_model.save(\"my_model\", serialization=\"pickle\")\n

        To load a model:

        # Load from directory\nloaded_model = BERTopic.load(\"path/to/my/model_dir\")\n\n# Load from file\nloaded_model = BERTopic.load(\"my_model\")\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_Wikipedia\")\n

        Warning

        When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic should not be loaded in other versions.

        "},{"location":"getting_started/representation/llm.html","title":"6B. LLM & Generative AI","text":"

        As we have seen in the previous section, the topics that you get from BERTopic can be fine-tuned using a number of approaches. Here, we are going to focus on text generation Large Language Models such as ChatGPT, GPT-4, and open-source solutions.

        Using these techniques, we can further fine-tune topics to generate labels, summaries, poems of topics, and more. To do so, we first generate a set of keywords and documents that best describe a topic using BERTopic's c-TF-IDF calculation. Then, these candidate keywords and documents are passed to the text generation model, which is asked to generate output that fits the topic best.

        A huge benefit of this is that we can describe a topic with only a few documents, so we do not need to pass all documents to the text generation model. Not only does this speed up the generation of topic labels significantly, you also do not need a massive amount of credits when using an external API, such as Cohere or OpenAI.

        "},{"location":"getting_started/representation/llm.html#prompt-engineering","title":"Prompt Engineering","text":"

        In most of the examples below, we use certain tags to customize our prompts. There are currently two tags, namely \"[KEYWORDS]\" and \"[DOCUMENTS]\". These tags indicate where in the prompt they are to be replaced with a topic's keywords and its top 4 most representative documents, respectively. For example, if we have the following prompt:

        prompt = \"\"\"\nI have topic that contains the following documents: \\n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the above information, can you give a short label of the topic?\n\"\"\"\n

        then that will be rendered as follows:

        \"\"\"\nI have a topic that contains the following documents: \n- Our videos are also made possible by your support on patreon.co.\n- If you want to help us make more videos, you can do so on patreon.com or get one of our posters from our shop.\n- If you want to help us make more videos, you can do so there.\n- And if you want to support us in our endeavor to survive in the world of online video, and make more videos, you can do so on patreon.com.\n\nThe topic is described by the following keywords: videos video you our support want this us channel patreon make on we if facebook to patreoncom can for and more watch \n\nBased on the above information, can you give a short label of the topic?\n\"\"\"\n

        Tip 1

        You can access the default prompts of these models with representation_model.default_prompt_. The prompts that were generated after training can be accessed with topic_model.representation_model.prompts_.
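
        As a minimal sketch, assuming the OpenAI-based representation model shown earlier, these attributes can be inspected as follows:

        # Default prompt of the representation model\nprint(representation_model.default_prompt_)\n\n# Prompts that were generated after training\nprint(topic_model.representation_model.prompts_)\n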

        "},{"location":"getting_started/representation/llm.html#selecting-documents","title":"Selecting Documents","text":"

        By default, four of the most representative documents will be passed to [DOCUMENTS]. These documents are selected by calculating their similarity (through c-TF-IDF representations) with the main c-TF-IDF representation of the topics. The four best matching documents per topic are selected.

        To increase the number of documents passed to [DOCUMENTS], we can use the nr_docs parameter which is accessible in all LLMs on this page. Using this value allows you to select the top n most representative documents instead. If you have a long enough context length, then you could even give the LLM dozens of documents.

        However, some of these documents might be very similar to one another and might be near duplicates. They will not provide much additional information about the content of the topic. Instead, we can use the diversity parameter in each LLM to only select documents that are sufficiently diverse. It takes values between 0 and 1 but a value of 0.1 already does wonders!
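
        As a minimal sketch, assuming the OpenAI client from earlier, passing more, but sufficiently diverse, documents could look like this:

        from bertopic.representation import OpenAI\n\n# Pass the 10 most representative documents, filtered for diversity\nrepresentation_model = OpenAI(\n    client,\n    model=\"gpt-3.5-turbo\",\n    chat=True,\n    nr_docs=10,\n    diversity=0.1\n)\n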

        "},{"location":"getting_started/representation/llm.html#truncating-documents","title":"Truncating Documents","text":"

        We can truncate the input documents in [DOCUMENTS] in order to reduce the number of tokens that we have in our input prompt. To do so, all text generation modules have two parameters that we can tweak:

        • doc_length
          • The maximum length of each document. If a document is longer, it will be truncated. If None, the entire document is passed.
        • tokenizer
          • The tokenizer used to split the document into segments, which are counted to determine the length of the document.
            • If tokenizer is 'char', then the document is split up into characters which are counted to adhere to doc_length
            • If tokenizer is 'whitespace', the document is split up into words separated by whitespaces. These words are counted and truncated depending on doc_length
            • If tokenizer is 'vectorizer', then the internal CountVectorizer is used to tokenize the document. These tokens are counted and truncated depending on doc_length
            • If tokenizer is a callable, then that callable is used to tokenize the document. These tokens are counted and truncated depending on doc_length

        This means that the definition of doc_length changes depending on what constitutes a token in the tokenizer parameter. If a token is a character, then doc_length refers to max length in characters. If a token is a word, then doc_length refers to the max length in words.
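
        As a minimal sketch, assuming the OpenAI client from earlier, limiting each document to 50 whitespace-separated words could look like this; a more complete example with a tiktoken tokenizer follows below:

        from bertopic.representation import OpenAI\n\n# Truncate each document to at most 50 words before it is placed in the prompt\nrepresentation_model = OpenAI(\n    client,\n    model=\"gpt-3.5-turbo\",\n    chat=True,\n    doc_length=50,\n    tokenizer=\"whitespace\"\n)\n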

        Let's illustrate this with an example. In the code below, we will use tiktoken to count the number of tokens in each document and limit them to 100 tokens. All documents that have more than 100 tokens will be truncated.

        We start by installing the relevant packages:

        pip install tiktoken openai\n

        Then, we use bertopic.representation.OpenAI to represent our topics with nicely written labels. We specify that documents that we put in the prompt cannot exceed 100 tokens each. Since we will put 4 documents in the prompt, they will total roughly 400 tokens:

        import openai\nimport tiktoken\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Tokenizer\ntokenizer= tiktoken.encoding_for_model(\"gpt-3.5-turbo\")\n\n# Create your representation model\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(\n    client,\n    model=\"gpt-3.5-turbo\", \n    delay_in_seconds=2, \n    chat=True,\n    nr_docs=4,\n    doc_length=100,\n    tokenizer=tokenizer\n)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"getting_started/representation/llm.html#transformers","title":"\ud83e\udd17 Transformers","text":"

        Nearly every week, there are new and improved models released on the \ud83e\udd17 Model Hub that, with some creativity, allow for further fine-tuning of our c-TF-IDF based topics. These models range from text generation to zero-shot classification. In BERTopic, wrappers around these methods are created as a way to support whatever might be released in the future.

        Using a GPT-like model from the HuggingFace Hub is rather straightforward:

        from bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = TextGeneration('gpt2')\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        GPT-2, however, is not the most accurate model on the HuggingFace Hub. You can get much better results with a flan-T5-like model:

        from transformers import pipeline\nfrom bertopic.representation import TextGeneration\n\nprompt = \"I have a topic described by the following keywords: [KEYWORDS]. Based on the previous keywords, what is this topic about?\"\n\n# Create your representation model\ngenerator = pipeline('text2text-generation', model='google/flan-t5-base')\nrepresentation_model = TextGeneration(generator)\n

        (Figure: the default c-TF-IDF keyword representations compared with the \ud83e\udd17 Transformers labels \"beef\", \"volcanoes\", \"immune system\", \"earth\", \"european union\", and \"cotton\".)

        As can be seen from the example above, if you would like to use a text2text-generation model, you will need to pass a transformers.pipeline with the \"text2text-generation\" task. Moreover, you can use a custom prompt and decide where the keywords should be inserted with the [KEYWORDS] tag, or the documents with the [DOCUMENTS] tag, as sketched below.
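
        A minimal sketch of passing the custom prompt defined above:

        from bertopic import BERTopic\n\n# Pass the custom prompt so that [KEYWORDS] is replaced with each topic's keywords\nrepresentation_model = TextGeneration(generator, prompt=prompt)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n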

        "},{"location":"getting_started/representation/llm.html#zephyr-mistral-7b","title":"Zephyr (Mistral 7B)","text":"

        We can go a step further with open-source Large Language Models (LLMs) that have been shown to match the performance of closed-source LLMs like ChatGPT.

        In this example, we will show you how to use Zephyr, a fine-tuned version of Mistral 7B. Mistral 7B outperforms other open-source LLMs at a much smaller scale and is a worthwhile solution for use cases such as topic modeling, where we want to keep inference as fast as possible and a relatively small model helps with that. Zephyr was trained on a mix of publicly available and synthetic datasets using Direct Preference Optimization (DPO).

        To use Zephyr in BERTopic, we will first need to install and update a couple of packages that can handle quantized versions of Zephyr:

        pip install ctransformers[cuda]\npip install --upgrade git+https://github.com/huggingface/transformers\n

        Instead of loading in the full model, we can load a quantized model, which is a compressed version of the original:

        from ctransformers import AutoModelForCausalLM\nfrom transformers import AutoTokenizer, pipeline\n\n# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.\nmodel = AutoModelForCausalLM.from_pretrained(\n    \"TheBloke/zephyr-7B-alpha-GGUF\",\n    model_file=\"zephyr-7b-alpha.Q4_K_M.gguf\",\n    model_type=\"mistral\",\n    gpu_layers=50,\n    hf=True\n)\ntokenizer = AutoTokenizer.from_pretrained(\"HuggingFaceH4/zephyr-7b-alpha\")\n\n# Pipeline\ngenerator = pipeline(\n    model=model, tokenizer=tokenizer,\n    task='text-generation',\n    max_new_tokens=50,\n    repetition_penalty=1.1\n)\n

        This Zephyr model requires a specific prompt template in order to work:

        prompt = \"\"\"<|system|>You are a helpful, respectful and honest assistant for labeling topics..</s>\n<|user|>\nI have a topic that contains the following documents:\n[DOCUMENTS]\n\nThe topic is described by the following keywords: '[KEYWORDS]'.\n\nBased on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.</s>\n<|assistant|>\"\"\"\n

        After creating this prompt template, we can create our representation model to be used in BERTopic:

        from bertopic.representation import TextGeneration\n\n# Text generation with Zephyr\nzephyr = TextGeneration(generator, prompt=prompt)\nrepresentation_model = {\"Zephyr\": zephyr}\n\n# Topic Modeling\ntopic_model = BERTopic(representation_model=representation_model, verbose=True)\n
        "},{"location":"getting_started/representation/llm.html#llama-2","title":"Llama 2","text":"

        Full Llama 2 Tutorial:

        Open-source LLMs are starting to become more and more popular. Here, we will go through a minimal example of using Llama 2 together with BERTopic.

        First, we need to load in our Llama 2 model:

        from torch import bfloat16\nimport transformers\n\n# The Llama 2 checkpoint to load; any chat variant from the HuggingFace Hub works, e.g.:\nmodel_id = 'meta-llama/Llama-2-7b-chat-hf'\n\n# set quantization configuration to load large model with less GPU memory\n# this requires the `bitsandbytes` library\nbnb_config = transformers.BitsAndBytesConfig(\n    load_in_4bit=True,  # 4-bit quantization\n    bnb_4bit_quant_type='nf4',  # Normalized float 4\n    bnb_4bit_use_double_quant=True,  # Second quantization after the first\n    bnb_4bit_compute_dtype=bfloat16  # Computation type\n)\n\n# Llama 2 Tokenizer\ntokenizer = transformers.AutoTokenizer.from_pretrained(model_id)\n\n# Llama 2 Model\nmodel = transformers.AutoModelForCausalLM.from_pretrained(\n    model_id,\n    trust_remote_code=True,\n    quantization_config=bnb_config,\n    device_map='auto',\n)\nmodel.eval()\n\n# Our text generator\ngenerator = transformers.pipeline(\n    model=model, tokenizer=tokenizer,\n    task='text-generation',\n    temperature=0.1,\n    max_new_tokens=500,\n    repetition_penalty=1.1\n)\n

        After doing so, we will need to define a prompt that works with both Llama 2 as well as BERTopic:

        # System prompt describes information given to all conversations\nsystem_prompt = \"\"\"\n<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant for labeling topics.\n<</SYS>>\n\"\"\"\n\n# Example prompt demonstrating the output we are looking for\nexample_prompt = \"\"\"\nI have a topic that contains the following documents:\n- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.\n- Meat, but especially beef, is the word food in terms of emissions.\n- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.\n\nThe topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.\n\nBased on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.\n\n[/INST] Environmental impacts of eating meat\n\"\"\"\n\n# Our main prompt with documents ([DOCUMENTS]) and keywords ([KEYWORDS]) tags\nmain_prompt = \"\"\"\n[INST]\nI have a topic that contains the following documents:\n[DOCUMENTS]\n\nThe topic is described by the following keywords: '[KEYWORDS]'.\n\nBased on the information about the topic above, please create a short label of this topic. Make sure you to only return the label and nothing more.\n[/INST]\n\"\"\"\n\nprompt = system_prompt + example_prompt + main_prompt\n

        Three pieces of the prompt were created:

        • system_prompt helps us guide the model during a conversation. For example, we can say that it is a helpful assistant that is specialized in labeling topics.
        • example_prompt gives an example of a correctly labeled topic to guide Llama 2
        • main_prompt contains the main question we are going to ask it, namely to label a topic. Note that it uses the [DOCUMENTS] and [KEYWORDS] tags to provide the most relevant documents and keywords as additional context

        After having generated our prompt template, we can start running our topic model:

        from bertopic.representation import TextGeneration\nfrom bertopic import BERTopic\n\n# Text generation with Llama 2\nllama2 = TextGeneration(generator, prompt=prompt)\nrepresentation_model = {\n    \"Llama2\": llama2,\n}\n\n# Create our BERTopic model\ntopic_model = BERTopic(representation_model=representation_model,  verbose=True)\n
        "},{"location":"getting_started/representation/llm.html#llamacpp","title":"llama.cpp","text":"

        An amazing framework for using LLMs for inference is llama.cpp, which has Python bindings that we can use in BERTopic. To start, we first need to install llama-cpp-python:

        pip install llama-cpp-python\n

        or using the following for hardware acceleration:

        CMAKE_ARGS=\"-DLLAMA_CUBLAS=on\" FORCE_CMAKE=1 pip install llama-cpp-python\n

        Note

        There are a number of installation options depending on your hardware and OS. Make sure that you select the correct one to optimize your performance.

        After installation, you need to download your LLM locally before using it in BERTopic, like so:

        wget https://huggingface.co/TheBloke/zephyr-7B-alpha-GGUF/resolve/main/zephyr-7b-alpha.Q4_K_M.gguf\n

        Finally, we can now use the model with BERTopic in just a couple of lines:

        from bertopic import BERTopic\nfrom bertopic.representation import LlamaCPP\n\n# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha\nrepresentation_model = LlamaCPP(\"zephyr-7b-alpha.Q4_K_M.gguf\")\n\n# Create our BERTopic model\ntopic_model = BERTopic(representation_model=representation_model,  verbose=True)\n

        If you want to have more control over the LLM's parameters, you can run it like so:

        from bertopic import BERTopic\nfrom bertopic.representation import LlamaCPP\nfrom llama_cpp import Llama\n\n# Use llama.cpp to load in a 4-bit quantized version of Zephyr 7B Alpha\nllm = Llama(model_path=\"zephyr-7b-alpha.Q4_K_M.gguf\", n_gpu_layers=-1, n_ctx=4096, stop=\"Q:\")\nrepresentation_model = LlamaCPP(llm)\n\n# Create our BERTopic model\ntopic_model = BERTopic(representation_model=representation_model,  verbose=True)\n

        Note

        The default template uses a \"Q: ... A: ...\" type of structure, which is why stop is set to \"Q:\". The default template is:

        \"\"\"\nQ: I have a topic that contains the following documents: \n[DOCUMENTS]\n\nThe topic is described by the following keywords: '[KEYWORDS]'.\n\nBased on the above information, can you give a short label of the topic?\nA: \n\"\"\"\n

        "},{"location":"getting_started/representation/llm.html#openai","title":"OpenAI","text":"

        Instead of using a language model from \ud83e\udd17 transformers, we can use an external API that does the work for you. Here, we use OpenAI to extract our topic labels from the candidate documents and keywords. To use this, you will need to install openai first:

        pip install openai\n

        Then, get yourself an API key and use OpenAI's API as follows:

        import openai\nfrom bertopic.representation import OpenAI\nfrom bertopic import BERTopic\n\n# Create your representation model\nclient = openai.OpenAI(api_key=\"sk-...\")\nrepresentation_model = OpenAI(client)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        (Figure: the default c-TF-IDF keyword representations compared with OpenAI labels such as \"Organic vs Conventional Food: Environmental and Health Considerations\", \"Volcanic Eruptions and Impacts\", \"The Immune System: Understanding and Boosting Immunity\", \"The Moon's Tides and Orbit Phenomena\", \"Democracy in the European Union\", and \"Plastic Pollution and its environmental impact\".)

        You can also use a custom prompt:

        prompt = \"I have the following documents: [DOCUMENTS] \\nThese documents are about the following topic: '\"\nrepresentation_model = OpenAI(client, prompt=prompt)\n
        "},{"location":"getting_started/representation/llm.html#chatgpt","title":"ChatGPT","text":"

        Within OpenAI's API, the ChatGPT models use a different API structure compared to the GPT-3 models. In order to use ChatGPT with BERTopic, we need to define the model and make sure to enable chat:

        representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", delay_in_seconds=10, chat=True)\n

        Prompting with ChatGPT is very satisfying and is customizable as follows:

        prompt = \"\"\"\nI have a topic that contains the following documents: \n[DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS]\n\nBased on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n\"\"\"\n

        Note

        Whenever you create a custom prompt, it is important to add

        Based on the information above, extract a short topic label in the following format:\ntopic: <topic label>\n
        at the end of your prompt as BERTopic extracts everything that comes after topic:. Having said that, if topic: is not in the output, then it will simply extract the entire response, so feel free to experiment with the prompts.
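
        As a minimal sketch, assuming the client from before, such a custom prompt is passed through the prompt parameter:

        representation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True, prompt=prompt)\ntopic_model = BERTopic(representation_model=representation_model)\n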

        "},{"location":"getting_started/representation/llm.html#summarization","title":"Summarization","text":"

        Due to the structure of the prompts in OpenAI's chat models, we can extract different types of topic representations from their GPT models. Instead of extracting a topic label, we can ask the model to generate a short description of the topic:

        summarization_prompt = \"\"\"\nI have a topic that is described by the following keywords: [KEYWORDS]\nIn this topic, the following documents are a small but representative subset of all documents in the topic:\n[DOCUMENTS]\n\nBased on the information above, please give a description of this topic in the following format:\ntopic: <description>\n\"\"\"\n\nrepresentation_model = OpenAI(client, model=\"gpt-3.5-turbo\", chat=True, prompt=summarization_prompt, nr_docs=5, delay_in_seconds=3)\n

        The above is not constrained to just creating a short description or summary of the topic; we can extract labels, keywords, poems, example documents, extensive descriptions, and more using this method! If you want to have multiple representations of a single topic, it might be worthwhile to also check out multi-aspect topic modeling with BERTopic.

        "},{"location":"getting_started/representation/llm.html#langchain","title":"LangChain","text":"

        Langchain is a package that helps users with chaining large language models. In BERTopic, we can leverage this package in order to more efficiently combine external knowledge. Here, this external knowledge is the set of most representative documents in each topic.

        To use langchain, you will need to install the langchain package first. Additionally, you will need an underlying LLM to support langchain, like openai:

        pip install langchain openai\n

        Then, you can create your chain as follows:

        from langchain.chains.question_answering import load_qa_chain\nfrom langchain.llms import OpenAI\nchain = load_qa_chain(OpenAI(temperature=0, openai_api_key=my_openai_api_key), chain_type=\"stuff\")\n

        Finally, you can pass the chain to BERTopic as follows:

        from bertopic.representation import LangChain\n\n# Create your representation model\nrepresentation_model = LangChain(chain)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        You can also use a custom prompt:

        prompt = \"What are these documents about? Please give a single label.\"\nrepresentation_model = LangChain(chain, prompt=prompt)\n

        Note

        The prompt does not make use of [KEYWORDS] and [DOCUMENTS] tags as the documents are already used within langchain's load_qa_chain.

        "},{"location":"getting_started/representation/llm.html#cohere","title":"Cohere","text":"

        Instead of using a language model from \ud83e\udd17 transformers, we can use an external API that does the work for you. Here, we use Cohere to extract our topic labels from the candidate documents and keywords. To use this, you will need to install cohere first:

        pip install cohere\n

        Then, get yourself an API key and use Cohere's API as follows:

        import cohere\nfrom bertopic.representation import Cohere\nfrom bertopic import BERTopic\n\n# Create your representation model\nco = cohere.Client(my_api_key)\nrepresentation_model = Cohere(co)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        (Figure: the default c-TF-IDF keyword representations compared with Cohere labels such as \"Organic food\", \"Exploding planets\", \"How your immune system works\", \"How tides work\", \"How democratic is the European Union?\", and \"Plastic pollution\".)

        You can also use a custom prompt:

        prompt = \"\"\"\nI have topic that contains the following documents: [DOCUMENTS]\nThe topic is described by the following keywords: [KEYWORDS].\nBased on the above information, can you give a short label of the topic?\n\"\"\"\nrepresentation_model = Cohere(co, prompt=prompt)\n
        "},{"location":"getting_started/representation/representation.html","title":"6A. Representation Models","text":"

        One of the core components of BERTopic is its Bag-of-Words representation and weighting with c-TF-IDF. This method is fast and can quickly generate a number of keywords for a topic without depending on the clustering task. As a result, topics can easily and quickly be updated after training the model without the need to re-train it. Although these give good topic representations, we may want to further fine-tune the topic representations.

        As such, there are a number of representation models implemented in BERTopic that allow for further fine-tuning of the topic representations. These are optional and are not used by default. You are not restricted in how the representations can be fine-tuned, from GPT-like models to fast keyword extraction with KeyBERT-like models:

        For each model below, an example will be shown on how it may change or improve upon the default topic keywords that are generated. The dataset used in these examples can be found here.

        If you want to have multiple representations of a single topic, it might be worthwhile to also check out multi-aspect topic modeling with BERTopic.

        "},{"location":"getting_started/representation/representation.html#keybertinspired","title":"KeyBERTInspired","text":"

        After having generated our topics with c-TF-IDF, we might want to do some fine-tuning based on the semantic relationship between keywords/keyphrases and the set of documents in each topic. Although we can use a centroid-based technique for this, it can be costly and does not take the structure of a cluster into account. Instead, we leverage c-TF-IDF to create a set of representative documents per topic and use those as our updated topic embedding. Then, we calculate the similarity between candidate keywords and the topic embedding using the same embedding model that embedded the documents.

        (Diagram: per topic, compare the c-TF-IDF of sampled documents with the topic c-TF-IDF to extract representative documents, embed and average those documents, extract the top n words per topic based on their c-TF-IDF scores as candidate keywords, embed those keywords, and compare the embedded keywords with the embedded documents.)

        Thus, the algorithm follows some principles of KeyBERT but does some optimization in order to speed up inference. Usage is straightforward:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        (Figure: the default c-TF-IDF keyword representations compared with the KeyBERT-Inspired keywords, e.g., \"organic | meat | foods | crops | beef | produce | food | diet | cows | eating\" instead of \"meat | organic | food | beef | emissions | eat | of | eating | is the\" for the first topic.)

        "},{"location":"getting_started/representation/representation.html#partofspeech","title":"PartOfSpeech","text":"

        Our candidate topics, as extracted with c-TF-IDF, do not take into account a keyword's part of speech, as extracting noun phrases from all documents can be computationally quite expensive. Instead, we can leverage c-TF-IDF to perform part-of-speech tagging on a subset of keywords and documents that best represent a topic.

        (Diagram: per topic, extract candidate keywords, extract documents that contain at least one keyword, use the POS matcher on those documents to generate new candidate keywords, and sort the keywords by their c-TF-IDF value.)

        More specifically, we find documents that contain the keywords from our candidate topics as calculated with c-TF-IDF. These documents serve as the representative set of documents from which the Spacy model can extract a set of candidate keywords for each topic. These candidate keywords are first put through Spacy's POS module to see whether they match with the DEFAULT_PATTERNS:

        DEFAULT_PATTERNS = [\n            [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n            [{'POS': 'NOUN'}],\n            [{'POS': 'ADJ'}]\n]\n

        These patterns follow Spacy's Rule-Based Matching. Then, the resulting keywords are sorted by their respective c-TF-IDF values.

        from bertopic.representation import PartOfSpeech\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        (Figure: the default c-TF-IDF keyword representations compared with the PartOfSpeech keywords, e.g., \"meat | organic | food | beef | emissions | most | health | pesticides | production\" for the first topic.)

        You can define custom POS patterns to be extracted:

        pos_patterns = [\n            [{'POS': 'ADJ'}, {'POS': 'NOUN'}],\n            [{'POS': 'NOUN'}], [{'POS': 'ADJ'}]\n]\nrepresentation_model = PartOfSpeech(\"en_core_web_sm\", pos_patterns=pos_patterns)\n
        "},{"location":"getting_started/representation/representation.html#maximalmarginalrelevance","title":"MaximalMarginalRelevance","text":"

        When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like \"car\" and \"cars\" essentially represent the same information and are often redundant.

        To decrease this redundancy and improve the diversity of keywords, we can use an algorithm called Maximal Marginal Relevance (MMR). MMR considers the similarity of keywords/keyphrases with the document, along with the similarity of already selected keywords and keyphrases. This results in a selection of keywords that are relevant to the document while remaining diverse from one another.

        from bertopic.representation import MaximalMarginalRelevance\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = MaximalMarginalRelevance(diversity=0.3)\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        (Figure: the default c-TF-IDF keyword representations compared with the MaximalMarginalRelevance keywords, e.g., \"meat | organic | beef | emissions | health | pesticides | foods | farming | conventional\" for the first topic.)

        "},{"location":"getting_started/representation/representation.html#zero-shot-classification","title":"Zero-Shot Classification","text":"

        For some use cases, you might already have a set of candidate labels that you would like to automatically assign to some of the topics. Although we can use guided or supervised BERTopic for that, we can also use zero-shot classification to assign labels to our topics. For that, we can make use of \ud83e\udd17 transformers and the models on its model hub.

        To perform this classification, we feed the model with the keywords as generated through c-TF-IDF and a set of candidate labels. If, for a certain topic, we find a similar enough label, then it is assigned. If not, then we keep the original c-TF-IDF keywords.

        We use it in BERTopic as follows:

        from bertopic.representation import ZeroShotClassification\nfrom bertopic import BERTopic\n\n# Create your representation model\ncandidate_topics = [\"space and nasa\", \"bicycles\", \"sports\"]\nrepresentation_model = ZeroShotClassification(candidate_topics, model=\"facebook/bart-large-mnli\")\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n

        (Figure: the default c-TF-IDF keyword representations compared with ZeroShotClassification output; topics that match a candidate label receive labels such as \"Organic food\" and \"Your immune system\", while the remaining topics keep their original c-TF-IDF keywords.)

        "},{"location":"getting_started/representation/representation.html#chain-models","title":"Chain Models","text":"

        All of the above models can make use of the candidate topics, as generated by c-TF-IDF, to further fine-tune the topic representations. For example, MaximalMarginalRelevance takes the keywords in the candidate topics and re-ranks them. Similarly, the keywords in the candidate topic can be used as the input for GPT-prompts in OpenAI.

        Although the default candidate topics are generated by c-TF-IDF, what if we were to chain these models? For example, we can use MaximalMarginalRelevance to improve upon the keywords in each topic before passing them to OpenAI.

        This is supported in BERTopic by simply passing a list of representation models when instantiating the topic model:

        from bertopic.representation import MaximalMarginalRelevance, OpenAI\nfrom bertopic import BERTopic\nimport openai\n\n# Create your representation models\nclient = openai.OpenAI(api_key=\"sk-...\")\nopenai_generator = OpenAI(client)\nmmr = MaximalMarginalRelevance(diversity=0.3)\nrepresentation_models = [mmr, openai_generator]\n\n# Use the chained models\ntopic_model = BERTopic(representation_model=representation_models)\n
        "},{"location":"getting_started/representation/representation.html#custom-model","title":"Custom Model","text":"

        Although several representation models have been implemented in BERTopic, new technologies get released often and we should not have to wait until they get implemented in BERTopic. Therefore, you can create your own representation model and use that to fine-tune the topics.

        The following is the basic structure for creating your custom model. Note that it returns the same topics as those calculated with c-TF-IDF:

        from typing import List, Mapping, Tuple\n\nfrom bertopic.representation._base import BaseRepresentation\n\n\nclass CustomRepresentationModel(BaseRepresentation):\n    def extract_topics(self, topic_model, documents, c_tf_idf, topics\n                      ) -> Mapping[str, List[Tuple[str, float]]]:\n        \"\"\" Extract topics\n\n        Arguments:\n            topic_model: The BERTopic model\n            documents: A dataframe of documents with their related topics\n            c_tf_idf: The c-TF-IDF matrix\n            topics: The candidate topics as calculated with c-TF-IDF\n\n        Returns:\n            updated_topics: Updated topic representations\n        \"\"\"\n        updated_topics = topics.copy()\n        return updated_topics\n

        Then, we can use that model as follows:

        from bertopic import BERTopic\n\n# Create our custom representation model\nrepresentation_model = CustomRepresentationModel()\n\n# Pass our custom representation model to BERTopic\ntopic_model = BERTopic(representation_model=representation_model)\n

        There are a few things to take into account when creating your custom model:

        • It needs to have the exact same parameter input: topic_model, documents, c_tf_idf, topics.
        • Make sure that updated_topics has the exact same structure as topics:
        updated_topics = {\n    \"1\": [(\"space\", 0.9), (\"nasa\", 0.7)], \n    \"2\": [(\"science\", 0.66), (\"article\", 0.6)]\n}\n

        Tip

        You can change the __init__ however you want; it does not influence the underlying structure. This also means that you can save data/embeddings/representations/sentiment in your custom representation model.
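
        For example, a minimal sketch of a custom __init__ that stores extra data (the my_embeddings attribute is purely illustrative):

        class CustomRepresentationModel(BaseRepresentation):\n    def __init__(self, my_embeddings=None):\n        # Anything stored here is for your own bookkeeping and does not\n        # influence how extract_topics is called\n        self.my_embeddings = my_embeddings\n\n    def extract_topics(self, topic_model, documents, c_tf_idf, topics):\n        return topics.copy()\n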

        "},{"location":"getting_started/search/search.html","title":"Search Topics","text":"

        After having created a BERTopic model, you might end up with over a hundred topics. Searching through those can be quite cumbersome especially if you are searching for a specific topic. Fortunately, BERTopic allows you to search for topics using search terms. First, let's create and train a BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Create topics\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        After having trained our model, we can use find_topics to search for topics that are similar to an input search_term. Here, we are going to be searching for topics that closely relate to the search term \"motor\". Then, we extract the most similar topic and check the results:

        >>> similar_topics, similarity = topic_model.find_topics(\"motor\", top_n=5)\n>>> topic_model.get_topic(similar_topics[0])\n[('bike', 0.02275997701645559),\n ('motorcycle', 0.011391202866080292),\n ('bikes', 0.00981187573649205),\n ('dod', 0.009614623748226669),\n ('honda', 0.008247663662558535),\n ('ride', 0.0064683227888861945),\n ('harley', 0.006355502638631013),\n ('riding', 0.005766601561614182),\n ('motorcycles', 0.005596372493714447),\n ('advice', 0.005534544418830091)]\n

        It definitely seems that a topic was found that closely matches \"motor\". The topic seems to be motorcycle related and therefore matches our \"motor\" input. You can use the similarity variable to see how similar the extracted topics are to the search term.

        Note

        You can only use this method if an embedding model was supplied to BERTopic using embedding_model.
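
        A minimal sketch of supplying such an embedding model (using the sentence-transformers model referenced elsewhere in these docs):

        from bertopic import BERTopic\nfrom sentence_transformers import SentenceTransformer\n\n# Supply an embedding model so that search terms can be embedded as well\nembedding_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\ntopic_model = BERTopic(embedding_model=embedding_model)\n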

        "},{"location":"getting_started/seed_words/seed_words.html","title":"Seed Words","text":"

        When performing Topic Modeling, you are often faced with data that you are familiar with to a certain extent or that speaks a very specific language. In those cases, topic modeling techniques might have difficulties capturing and representing the semantic nature of domain-specific abbreviations, slang, short forms, acronyms, etc. For example, the \"TNM\" classification is a method for identifying the stage of most cancers. The word \"TNM\" is an abbreviation and might not be correctly captured in generic embedding models.

        To make sure that certain domain-specific words are weighted higher and are more often used in topic representations, you can set any number of seed_words in the bertopic.vectorizers.ClassTfidfTransformer. The ClassTfidfTransformer is the base representation of BERTopic and essentially represents each topic as a bag of words. As such, we can choose to increase the importance of certain words, such as \"TNM\".

        To do so, let's take a look at an example. We have a dataset of article abstracts and want to perform some topic modeling. Since we might be familiar with the data, there are certain words that we know should be generally important. Let's assume that we have in-depth knowledge about reinforcement learning and know that words like \"agent\" and \"robot\" should be important in such a topic were it to be found. Using the ClassTfidfTransformer, we can define those seed_words and also choose by how much their values are multiplied.

        The full example is then as follows:

        from umap import UMAP\nfrom datasets import load_dataset\nfrom bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\n# Let's take a subset of ArXiv abstracts as the training data\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\nabstracts = dataset[\"abstract\"][:5_000]\n\n# For illustration purposes, we make sure the output is fixed when running this code multiple times\numap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)\n\n# We can choose any number of seed words for which we want their representation\n# to be strengthen. We increase the importance of these words as we want them to be more\n# likely to end up in the topic representations.\nctfidf_model = ClassTfidfTransformer(\n    seed_words=[\"agent\", \"robot\", \"behavior\", \"policies\", \"environment\"], \n    seed_multiplier=2\n)\n\n# We run the topic model with the seeded words\ntopic_model = BERTopic(\n    umap_model=umap_model,\n    min_topic_size=15,\n    ctfidf_model=ctfidf_model,\n).fit(abstracts)\n

        Then, when we run topic_model.get_topic(0), we get the following output:

        [('policy', 0.023413102511982354),\n ('reinforcement', 0.021796126795834238),\n ('agent', 0.021131601305431902),\n ('policies', 0.01888385271486409),\n ('environment', 0.017819874593917057),\n ('learning', 0.015321710504308708),\n ('robot', 0.013881115279230468),\n ('control', 0.013297705894983875),\n ('the', 0.013247933839985382),\n ('to', 0.013058208312484141)]\n

        As we can see, the output includes some of the seed words that we assigned. However, if a word is not found to be important in a topic, then multiplying its importance will still leave it relatively low. This is a great feature as it allows you to boost the importance of relevant words with little risk of making words important in topics where they really should not be.

        A benefit of this method is that this often influences all other representation methods, like KeyBERTInspired and OpenAI. The reason for this is that each representation model uses the words generated by the ClassTfidfTransformer as candidate words to be further optimized. In many cases, words like \"TNM\" might not end up in the candidate words. By increasing their importance, they are more likely to end up as candidate words in representation models.

        Another benefit of using this method is that it artificially increases the interpretability of topics. Sure, some words might be more important than others, but they might not mean much to a domain expert. For them, certain words, like \"TNM\", are highly descriptive, and that is something difficult to capture using any method (embedding model, large language model, etc.).

        Moreover, these seed_words can be defined together with the domain expert as they can decide what type of words are generally important and might need a nudge from you, the algorithmic developer.

        "},{"location":"getting_started/semisupervised/semisupervised.html","title":"Semi-supervised Topic Modeling","text":"

        In BERTopic, you have several options to nudge the creation of topics toward certain pre-specified topics. Here, we will be looking at semi-supervised topic modeling with BERTopic.

        Semi-supervised modeling allows us to steer the dimensionality reduction of the embeddings into a space that closely follows any labels you might already have.

        (Pipeline: SBERT embeddings -> UMAP dimensionality reduction guided by labels -> HDBSCAN clustering -> c-TF-IDF topic representation)

        In other words, we use a semi-supervised UMAP instance to reduce the dimensionality of embeddings before clustering the documents with HDBSCAN.

        First, let us prepare the data needed for our topic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\ncategories = data[\"target\"]\ncategory_names = data[\"target_names\"]\n

        We are using the popular 20 Newsgroups dataset, which contains roughly 18,000 newsgroup posts, each assigned to one of 20 categories. Using this dataset, we can try to extract its corresponding topic model whilst taking its underlying categories into account. Here, these categories are stored in the categories variable.

        Each document can be put into one of the following categories:

        >>> category_names\n\n['alt.atheism',\n 'comp.graphics',\n 'comp.os.ms-windows.misc',\n 'comp.sys.ibm.pc.hardware',\n 'comp.sys.mac.hardware',\n 'comp.windows.x',\n 'misc.forsale',\n 'rec.autos',\n 'rec.motorcycles',\n 'rec.sport.baseball',\n 'rec.sport.hockey',\n 'sci.crypt',\n 'sci.electronics',\n 'sci.med',\n 'sci.space',\n 'soc.religion.christian',\n 'talk.politics.guns',\n 'talk.politics.mideast',\n 'talk.politics.misc',\n 'talk.religion.misc'] \n

        To perform this semi-supervised approach, we can take in some pre-defined topics and simply pass those to the y parameter when fitting BERTopic. These labels can be pre-defined topics or simply documents that you feel belong together regardless of their content. BERTopic will nudge the creation of topics toward these categories using the pre-defined labels.

        To perform supervised topic modeling, we simply use all categories:

        topic_model = BERTopic(verbose=True).fit(docs, y=categories)\n

        The topic model will be much more attuned to the categories that were defined previously. However, this does not mean that only topics for these categories will be found. BERTopic is likely to find more specific topics in those you have already defined. This allows you to discover previously unknown topics!

        "},{"location":"getting_started/semisupervised/semisupervised.html#partial-labels","title":"Partial labels","text":"

        At times, you might only have labels for a subset of documents. Fortunately, we can still use those labels to at least nudge the documents for which those labels exist. The documents for which we do not have labels are assigned a -1. For this example, imagine we only have the labels of categories that are related to computers and we want to create a topic model using semi-supervised modeling:

        labels_to_add = ['comp.graphics', 'comp.os.ms-windows.misc',\n              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',\n              'comp.windows.x',]\nindices = [category_names.index(label) for label in labels_to_add]\ny = [label if label in indices else -1 for label in categories]\n

        The y variable contains many -1 values since we do not know all the categories.

        Next, we use those newly constructed labels to again fit BERTopic semi-supervised:

        topic_model = BERTopic(verbose=True).fit(docs, y=y)\n

        And that is it! By defining certain classes for our documents, we can steer the topic modeling towards modeling the pre-defined categories.

        "},{"location":"getting_started/serialization/serialization.html","title":"Serialization","text":"

        Saving, loading, and sharing a BERTopic model can be done in several ways. It is generally advised to go with .safetensors as that allows for a small, safe, and fast method for saving your BERTopic model. However, other formats, such as .pickle and pytorch .bin are also possible.

        "},{"location":"getting_started/serialization/serialization.html#saving","title":"Saving","text":"

        There are three methods for saving BERTopic:

        1. A light model with .safetensors and config files
        2. A light model with pytorch .bin and config files
        3. A full model with .pickle

        Tip

        It is advised to use methods 1 or 2 for saving as they generate very small models. Method 1 (safetensors) in particular allows for a relatively safe format compared to the other methods.

        The methods are used as follows:

        topic_model = BERTopic().fit(my_docs)\n\n# Method 1 - safetensors\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"safetensors\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 2 - pytorch\nembedding_model = \"sentence-transformers/all-MiniLM-L6-v2\"\ntopic_model.save(\"path/to/my/model_dir\", serialization=\"pytorch\", save_ctfidf=True, save_embedding_model=embedding_model)\n\n# Method 3 - pickle\ntopic_model.save(\"my_model\", serialization=\"pickle\")\n

        Warning

        When saving the model, make sure to also keep track of the versions of dependencies and Python used. Loading and saving the model should be done using the same dependencies and Python. Moreover, models saved in one version of BERTopic are not guaranteed to load in other versions.

        "},{"location":"getting_started/serialization/serialization.html#pickle-drawbacks","title":"Pickle Drawbacks","text":"

        Saving the model with pickle allows for saving the entire topic model, including dimensionality reduction and clustering algorithms, but has several drawbacks:

        • Arbitrary code can be run from .pickle files
        • The resulting model is rather large (often > 500MB) since all sub-models need to be saved
        • Explicit and specific version control is needed as they typically only run if the environment is exactly the same
        "},{"location":"getting_started/serialization/serialization.html#safetensors-and-pytorch-advantages","title":"Safetensors and Pytorch Advantages","text":"

        Saving the topic model with .safetensors or pytorch has a number of advantages:

        • .safetensors is a relatively safe format
        • The resulting model can be very small (often < 20MB) since no sub-models need to be saved
        • Although version control is important, there is a bit more flexibility with respect to specific versions of packages
        • More easily used in production
        • Share models with the HuggingFace Hub

        The above image, of a model trained on 100,000 documents, demonstrates the differences in size between safetensors, pytorch, and pickle. The difference in sizes can mostly be explained by the efficient saving procedure and the fact that the clustering and dimensionality reduction models are not saved in safetensors/pytorch, since inference can be done based on the topic embeddings.

        "},{"location":"getting_started/serialization/serialization.html#huggingface-hub","title":"HuggingFace Hub","text":"

        When you have created a BERTopic model, you can easily share it with others through the HuggingFace Hub. First, you need to log in to your HuggingFace account, which you can do in a number of ways:

        • Log in to your Hugging Face account with the command below
        huggingface-cli login\n\n# or using an environment variable\nhuggingface-cli login --token $HUGGINGFACE_TOKEN\n
        • Alternatively, you can programmatically log in using login() in a notebook or a script
        from huggingface_hub import login\nlogin()\n
        • Or you can give a token with the token variable, as sketched below
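
        For example, a minimal sketch of logging in with a token directly (the token value is a placeholder):

        from huggingface_hub import login\n\n# Pass a personal access token instead of logging in interactively\nlogin(token=\"hf_...\")\n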

        When you have logged in to your HuggingFace account, you can save and upload the model as follows:

        from bertopic import BERTopic\n\n# Train model\ntopic_model = BERTopic().fit(my_docs)\n\n# Push to HuggingFace Hub\ntopic_model.push_to_hf_hub(\n    repo_id=\"MaartenGr/BERTopic_ArXiv\",\n    save_ctfidf=True\n)\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_ArXiv\")\n
        "},{"location":"getting_started/serialization/serialization.html#parameters","title":"Parameters","text":"

        There are a number of parameters that may be worthwhile to know (see the sketch after this list):

        • private
          • Whether to create a private repository
        • serialization
          • The type of serialization. Either safetensors or pytorch. Make sure to run pip install safetensors for safetensors.
        • save_embedding_model
          • A pointer towards a HuggingFace model to be loaded in with SentenceTransformers. E.g., sentence-transformers/all-MiniLM-L6-v2
        • save_ctfidf
          • Whether to save c-TF-IDF information
        "},{"location":"getting_started/serialization/serialization.html#loading","title":"Loading","text":"

        To load a model:

        # Load from directory\nloaded_model = BERTopic.load(\"path/to/my/model_dir\")\n\n# Load from file\nloaded_model = BERTopic.load(\"my_model\")\n\n# Load from HuggingFace\nloaded_model = BERTopic.load(\"MaartenGr/BERTopic_Wikipedia\")\n

        The embedding model cannot always be saved using a non-pickle method if, for example, you are using OpenAI embeddings. Instead, you can load them in as follows:

        # Define embedding model\nimport openai\nfrom bertopic.backend import OpenAIBackend\n\nclient = openai.OpenAI(api_key=\"sk-...\")\nembedding_model = OpenAIBackend(client, \"text-embedding-ada-002\")\n\n# Load model and add embedding model\nloaded_model = BERTopic.load(\"path/to/my/model_dir\", embedding_model=embedding_model)\n
        "},{"location":"getting_started/supervised/supervised.html","title":"Supervised Topic Modeling","text":"

        Although topic modeling is typically done by discovering topics in an unsupervised manner, there might be times when you already have a bunch of clusters or classes from which you want to model the topics. For example, the often used 20 NewsGroups dataset is already split up into 20 classes. Similarly, you might already have created some labels yourself through packages like human-learn, bulk, thisnotthat or something entirely different.

        Instead of using BERTopic to discover previously unknown topics, we are now going to manually pass them to BERTopic and try to learn the relationship between those topics and the input documents.

        In other words, we are going to be performing classification instead!

        We can view this as a supervised topic modeling approach. Instead of using a clustering algorithm, we are going to be using a classification algorithm instead.

        Generally, we have the following pipeline:

        SBERT (Embeddings) → UMAP (Dimensionality reduction) → HDBSCAN (Clustering) → c-TF-IDF (Topic representation)

        Instead, we are now going to skip over the dimensionality reduction step and replace the clustering step with a classification model:

        SBERT (Embeddings) → Logistic Regression (Classifier) → c-TF-IDF (Topic representation)

        In other words, we can pass our labels to BERTopic and it will not only learn how to predict labels for new instances, but it also transforms those labels into topics by running the c-TF-IDF representations on the set of documents within each label. This process allows us to model the topics themselves and similarly gives us the option to use everything BERTopic has to offer.

        To do so, we need to skip over the dimensionality reduction step and replace the clustering step with a classification algorithm. We can use the documents and labels from the 20 NewsGroups dataset to create topics from those 20 labels:

        from sklearn.datasets import fetch_20newsgroups\n\n# Get labeled data\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n

        Then, we make sure to create empty instances of the dimensionality reduction and clustering steps. We pass those to BERTopic to simply skip over them and go to the topic representation process:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\nfrom bertopic.dimensionality import BaseDimensionalityReduction\nfrom sklearn.linear_model import LogisticRegression\n\n# Get labeled data\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data['data']\ny = data['target']\n\n# Skip over dimensionality reduction, replace cluster model with classifier,\n# and reduce frequent words while we are at it.\nempty_dimensionality_model = BaseDimensionalityReduction()\nclf = LogisticRegression()\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\n\n# Create a fully supervised BERTopic instance\ntopic_model= BERTopic(\n        umap_model=empty_dimensionality_model,\n        hdbscan_model=clf,\n        ctfidf_model=ctfidf_model\n)\ntopics, probs = topic_model.fit_transform(docs, y=y)\n

        Let's take a look at a few topics that we get out of training this way by running topic_model.get_topic_info():

        Topic   Count   Name
        0       999     0_game_hockey_team_25
        1       997     1_god_church_jesus_christ
        2       996     2_bike_dod_ride_bikes
        3       994     3_baseball_game_he_year
        4       991     4_key_encryption_db_clipper
        5       990     5_car_cars_engine_ford
        6       990     6_medical_patients_cancer_disease
        7       988     7_window_server_widget_motif
        8       988     8_space_launch_nasa_orbit

        We can see several interesting topics appearing here. They seem to relate to the 20 classes we had as input. Now, let's map those topics to our original classes to view their relationship:

        # Map input `y` to topics\nmappings = topic_model.topic_mapper_.get_mappings()\nmappings = {value: data[\"target_names\"][key] for key, value in mappings.items()}\n\n# Assign original classes to our topics\ndf = topic_model.get_topic_info()\ndf[\"Class\"] = df.Topic.map(mappings)\ndf\n
        Topic   Count   Name                                Class
        0       999     0_game_hockey_team_25               rec.sport.hockey
        1       997     1_god_church_jesus_christ           soc.religion.christian
        2       996     2_bike_dod_ride_bikes               rec.motorcycles
        3       994     3_baseball_game_he_year             rec.sport.baseball
        4       991     4_key_encryption_db_clipper         sci.crypt
        5       990     5_car_cars_engine_ford              rec.autos
        6       990     6_medical_patients_cancer_disease   sci.med
        7       988     7_window_server_widget_motif        comp.windows.x
        8       988     8_space_launch_nasa_orbit           sci.space

        We can see that the c-TF-IDF representations extract the words that give a good representation of our input classes. This is all done directly from the labeling. A welcome side-effect is that we now have a classification algorithm that allows us to predict the topics of unseen data:

        >>> topic, _ = topic_model.transform(\"this is a document about cars\")\n>>> topic_model.get_topic(topic)\n[('car', 0.4407600315538472),\n ('cars', 0.32348015696446325),\n ('engine', 0.28032518444946686),\n ('ford', 0.2500224508115155),\n ('oil', 0.2325984913598611),\n ('dealer', 0.2310723968585826),\n ('my', 0.22045777551991935),\n ('it', 0.21327993649430219),\n ('tires', 0.20420842634292657),\n ('brake', 0.20246902481367085)]\n

        Moreover, we can still use BERTopic-specific features like dynamic topic modeling, topics per class, hierarchical topic modeling, modeling topic distributions, etc.

        Note

        The resulting topics may be a different mapping from the y labels. To map y to topics, we can run the following:

        mappings = topic_model.topic_mapper_.get_mappings()\ny_mapped = [mappings[val] for val in y]\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html","title":"Tips & Tricks","text":""},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#document-length","title":"Document length","text":"

        By default, we are using sentence-transformers to embed our documents. However, as the name implies, the embedding model works best for either sentences or paragraphs. This means that whenever you have a set of documents where each document contains several paragraphs, the documents are truncated and the topic model is only trained on a small part of the data.

        One way to solve this issue is by splitting up longer documents into either sentences or paragraphs before embedding them. Another solution is to approximate the topic distributions after having trained your topic model.
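
        As an illustrative sketch of the first option, assuming docs is your list of long documents and that NLTK's punkt sentence tokenizer is available:

        from bertopic import BERTopic\nfrom nltk.tokenize import sent_tokenize\n\n# Split each long document into sentences before embedding (hypothetical pre-processing step)\nsentences = [sentence for doc in docs for sentence in sent_tokenize(doc)]\n\n# Train the topic model on the shorter units instead of the full documents\ntopic_model = BERTopic().fit(sentences)\n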

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#removing-stop-words","title":"Removing stop words","text":"

        At times, stop words might end up in our topic representations. This is something we typically want to avoid as they contribute little to the interpretation of the topics. However, removing stop words as a preprocessing step is not advised as the transformer-based embedding models that we use need the full context in order to create accurate embeddings.

        Instead, we can use the CountVectorizer to preprocess our documents after having generated embeddings and clustered our documents. Personally, I have found almost no disadvantages to using the CountVectorizer to remove stop words, and it is something I would strongly advise trying out:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n

        We can also use the ClassTfidfTransformer to reduce the impact of frequent words. The end result is very similar to explicitly removing stop words, but this approach does so automatically:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import ClassTfidfTransformer\n\nctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)\ntopic_model = BERTopic(ctfidf_model=ctfidf_model)\n

        Lastly, we can use a KeyBERT-Inspired model to reduce the appearance of stop words. This also often improves the topic representation:

        from bertopic.representation import KeyBERTInspired\nfrom bertopic import BERTopic\n\n# Create your representation model\nrepresentation_model = KeyBERTInspired()\n\n# Use the representation model in BERTopic on top of the default pipeline\ntopic_model = BERTopic(representation_model=representation_model)\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#diversify-topic-representation","title":"Diversify topic representation","text":"

        After having calculated our top n words per topic there might be many words that essentially mean the same thing. As a little bonus, we can use bertopic.representation.MaximalMarginalRelevance in BERTopic to diversify words in each topic such that we limit the number of duplicate words we find in each topic. This is done using an algorithm called Maximal Marginal Relevance which compares word embeddings with the topic embedding.

        We do this by specifying a value between 0 and 1, with 0 being not at all diverse and 1 being completely diverse:

        from bertopic import BERTopic\nfrom bertopic.representation import MaximalMarginalRelevance\n\nrepresentation_model = MaximalMarginalRelevance(diversity=0.2)\ntopic_model = BERTopic(representation_model=representation_model)\n

        Since MMR is using word embeddings to diversify the topic representations, it is necessary to pass the embedding model to BERTopic if you are using pre-computed embeddings:

        from bertopic import BERTopic\nfrom bertopic.representation import MaximalMarginalRelevance\nfrom sentence_transformers import SentenceTransformer\n\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\nrepresentation_model = MaximalMarginalRelevance(diversity=0.2)\ntopic_model = BERTopic(embedding_model=sentence_model, representation_model=representation_model)\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#topic-term-matrix","title":"Topic-term matrix","text":"

        Although BERTopic focuses on clustering our documents, the end result does contain a topic-term matrix. This topic-term matrix is calculated using c-TF-IDF, a TF-IDF procedure optimized for class-based analyses.

        To extract the topic-term matrix (or c-TF-IDF matrix) with the corresponding words, we can simply do the following:

        topic_term_matrix = topic_model.c_tf_idf_\nwords = topic_model.vectorizer_model.get_feature_names()\n
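
        As a small, optional sketch, the matrix can be turned into a labeled pandas DataFrame for inspection; densifying only makes sense when the vocabulary is reasonably small:

        import pandas as pd\n\n# Rows correspond to the topics in sorted order (including the outlier topic -1 when present),\n# columns correspond to the vocabulary extracted by the CountVectorizer\nctfidf_df = pd.DataFrame(topic_model.c_tf_idf_.toarray(), columns=words)\n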
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#pre-compute-embeddings","title":"Pre-compute embeddings","text":"

        Typically, we want to iterate fast over different versions of our BERTopic model whilst we are trying to optimize it to a specific use case. To speed up this process, we can pre-compute the embeddings, save them, and pass them to BERTopic so it does not need to calculate the embeddings each time:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train our topic model using our pre-trained sentence-transformers embeddings\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n
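
        To avoid recomputing the embeddings in every session, a minimal sketch for saving and re-loading them (the file name is arbitrary):

        import numpy as np\nfrom bertopic import BERTopic\n\n# Save the pre-computed embeddings once\nnp.save(\"embeddings.npy\", embeddings)\n\n# In later sessions, load them and pass them to BERTopic again\nembeddings = np.load(\"embeddings.npy\")\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs, embeddings)\n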
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#speed-up-umap","title":"Speed up UMAP","text":"

        At times, UMAP may take a while to fit on the embeddings that you have. This often happens when you have embeddings of millions of documents that you want to reduce in dimensionality. There is a trick that can speed up this process somewhat: initializing UMAP with rescaled PCA embeddings.

        Without going into too much detail (look here for more information), you can reduce the embeddings using PCA and use that as a starting point. This can speed up the dimensionality reduction a bit:

        import numpy as np\nfrom umap import UMAP\nfrom bertopic import BERTopic\nfrom sklearn.decomposition import PCA\n\n\ndef rescale(x, inplace=False):\n    \"\"\" Rescale an embedding so optimization will not have convergence issues.\n    \"\"\"\n    if not inplace:\n        x = np.array(x, copy=True)\n\n    x /= np.std(x[:, 0]) * 10000\n\n    return x\n\n\n# Initialize and rescale PCA embeddings\npca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))\n\n# Start UMAP from PCA embeddings\numap_model = UMAP(\n    n_neighbors=15,\n    n_components=5,\n    min_dist=0.0,\n    metric=\"cosine\",\n    init=pca_embeddings,\n)\n\n# Pass the model to BERTopic:\ntopic_model = BERTopic(umap_model=umap_model)\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#gpu-acceleration","title":"GPU acceleration","text":"

        You can use cuML to speed up both UMAP and HDBSCAN through GPU acceleration:

        from bertopic import BERTopic\nfrom cuml.cluster import HDBSCAN\nfrom cuml.manifold import UMAP\n\n# Create instances of GPU-accelerated UMAP and HDBSCAN\numap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)\nhdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)\n\n# Pass the above models to be used in BERTopic\ntopic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Depending on the embeddings you are using, you might want to normalize them first in order to force a cosine-related distance metric in UMAP:

        from cuml.preprocessing import normalize\nembeddings = normalize(embeddings)\n

        Note

        As of the v0.13 release, it is not yet possible to calculate the topic-document probability matrix for unseen data (i.e., .transform) using cuML's HDBSCAN. However, it is still possible to calculate the topic-document probability matrix for the data on which the model was trained (i.e., .fit and .fit_transform).

        Note

        If you want to install cuML together with BERTopic using Google Colab, you can run the following code:

        !pip install bertopic\n!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com\n!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64\n
        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#lightweight-installation","title":"Lightweight installation","text":"

        The default embedding model in BERTopic is one of the amazing sentence-transformers models, namely \"all-MiniLM-L6-v2\". Although this model performs well out of the box, it typically needs a GPU to transform the documents into embeddings in a reasonable time. Moreover, the installation requires pytorch which often results in a rather large environment, memory-wise.

        Fortunately, it is possible to install BERTopic without sentence-transformers and use it as a lightweight solution instead. The installation can be done as follows:

        pip install --no-deps bertopic\npip install --upgrade numpy hdbscan umap-learn pandas scikit-learn tqdm plotly pyyaml\n

        Then, we can use BERTopic without sentence-transformers by using a CPU-based embedding technique:

        from bertopic import BERTopic\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\n# A CPU-friendly embedding pipeline: TF-IDF followed by SVD\npipe = make_pipeline(\n    TfidfVectorizer(),\n    TruncatedSVD(100)\n)\n\ntopic_model = BERTopic(embedding_model=pipe)\n

        As a result, the entire package and resulting model can be run quickly on the CPU and no GPU is necessary!

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#wordcloud","title":"WordCloud","text":"

        To minimize the number of dependencies in BERTopic, it is not possible to generate wordclouds out-of-the-box. However, there is a minimal script that you can use to generate wordclouds in BERTopic. First, you will need to install the wordcloud package with pip install wordcloud. Then, run the following code to generate the wordcloud for a specific topic:

        from wordcloud import WordCloud\nimport matplotlib.pyplot as plt\n\ndef create_wordcloud(model, topic):\n    text = {word: value for word, value in model.get_topic(topic)}\n    wc = WordCloud(background_color=\"white\", max_words=1000)\n    wc.generate_from_frequencies(text)\n    plt.imshow(wc, interpolation=\"bilinear\")\n    plt.axis(\"off\")\n    plt.show()\n\n# Show wordcloud\ncreate_wordcloud(topic_model, topic=1)\n

        Tip

        To increase the number of words shown in the wordcloud, you can increase the top_n_words parameter when instantiating BERTopic. You can also increase the number of words in a topic after training the model using .update_topics().
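
        For instance, a small sketch of both options, where the value of 20 is arbitrary and docs refers to your training documents:

        from bertopic import BERTopic\n\n# Option 1: ask for more words per topic when instantiating BERTopic\ntopic_model = BERTopic(top_n_words=20)\n\n# Option 2: increase the number of words per topic after training\ntopic_model.update_topics(docs, top_n_words=20)\n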

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#finding-similar-topics-between-models","title":"Finding similar topics between models","text":"

        Whenever you have trained separate BERTopic models on different datasets, it might be worthwhile to find the similarities among these models. Is there overlap between the topics in model A and those in model B? In other words, can we find topics in model A that are similar to those in model B?

        We can compare the topic representations of several models in two ways. First, by comparing the topic embeddings that are created when using the same embedding model across both fitted BERTopic instances. Second, we can compare the c-TF-IDF representations instead, assuming we have fixed the vocabulary in both instances (a sketch of this second approach is shown at the end of this section).

        This example will go into the former, using the same embedding model across two BERTopic instances. To do this comparison, let's first create an example where I trained two models, one on an English dataset and one on a Dutch dataset:

        from datasets import load_dataset\nfrom bertopic import BERTopic\nfrom sentence_transformers import SentenceTransformer\nfrom umap import UMAP\n\n# The same embedding model needs to be used for both topic models\n# and since we are dealing with multiple languages, the model needs to be multi-lingual\nsentence_model = SentenceTransformer(\"paraphrase-multilingual-MiniLM-L12-v2\")\n\n# To make this example reproducible\numap_model = UMAP(n_neighbors=15, n_components=5, \n                  min_dist=0.0, metric='cosine', random_state=42)\n\n# English\nen_dataset = load_dataset(\"stsb_multi_mt\", name=\"en\", split=\"train\").to_pandas().sentence1.tolist()\nen_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model)\nen_model.fit(en_dataset)\n\n# Dutch\nnl_dataset = load_dataset(\"stsb_multi_mt\", name=\"nl\", split=\"train\").to_pandas().sentence1.tolist()\nnl_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model)\nnl_model.fit(nl_dataset)\n

        In the code above, there is one important thing to note and that is the sentence_model. This model needs to be exactly the same in all BERTopic models, otherwise, it is not possible to compare topic models.

        Next, we can calculate the similarity between topics in the English topic model en_model and the Dutch model nl_model. To do so, we can simply calculate the cosine similarity between the topic_embeddings_ of both models:

        from sklearn.metrics.pairwise import cosine_similarity\nsim_matrix = cosine_similarity(en_model.topic_embeddings_, nl_model.topic_embeddings_)\n

        Now that we know which topics are similar to each other, we can extract the most similar topics. Let's say that we have topic 10 in the en_model which represents a topic related to trains:

        >>> topic = 10\n>>> en_model.get_topic(topic)\n[('train', 0.2588080580844999),\n ('tracks', 0.1392140438801078),\n ('station', 0.12126454635946024),\n ('passenger', 0.058057876475695866),\n ('engine', 0.05123717127783682),\n ('railroad', 0.048142847325312044),\n ('waiting', 0.04098973702226946),\n ('track', 0.03978248702913929),\n ('subway', 0.03834661195748458),\n ('steam', 0.03834661195748458)]\n

        To find the matching topic, we extract the most similar topic in the sim_matrix:

        >>> # The outlier topic -1 sits at index 0 of the topic embeddings, hence the +1/-1 offset\n>>> most_similar_topic = np.argmax(sim_matrix[topic + 1])-1\n>>> nl_model.get_topic(most_similar_topic)\n[('trein', 0.24186603209316418),\n ('spoor', 0.1338118418551581),\n ('sporen', 0.07683661859111401),\n ('station', 0.056990389779394225),\n ('stoommachine', 0.04905829711711234),\n ('zilveren', 0.04083879598477808),\n ('treinen', 0.03534099197032758),\n ('treinsporen', 0.03534099197032758),\n ('staat', 0.03481332997324445),\n ('zwarte', 0.03179591746822408)]\n

        It seems to be working as, for example, trein is a translation of train and sporen a translation of tracks! You can do this for every single topic to find out which topic in the en_model corresponds to which topic in the nl_model.
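
        For completeness, a rough sketch of the second approach mentioned earlier, comparing c-TF-IDF representations instead. Here model_a, model_b, docs_a, docs_b, and my_vocabulary are hypothetical names; both models must be fitted with the same fixed vocabulary, and this works best when both datasets are in the same language:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.metrics.pairwise import cosine_similarity\n\n# Fix the vocabulary so both c-TF-IDF matrices share the same columns\nvectorizer_model = CountVectorizer(vocabulary=my_vocabulary)\n\nmodel_a = BERTopic(vectorizer_model=vectorizer_model).fit(docs_a)\nmodel_b = BERTopic(vectorizer_model=vectorizer_model).fit(docs_b)\n\n# Compare topics through their c-TF-IDF representations instead of topic embeddings\nsim_matrix = cosine_similarity(model_a.c_tf_idf_, model_b.c_tf_idf_)\n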

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#multimodal-data","title":"Multimodal data","text":"

        Concept is a variation of BERTopic for multimodal data, such as images with captions. Although we can use that package for multimodal data, we can perform a small trick with BERTopic to have a similar feature.

        BERTopic is a relatively modular approach that attempts to isolate steps from one another. This means, for example, that you can use k-Means instead of HDBSCAN or PCA instead of UMAP as it does not make any assumptions with respect to the nature of the clustering.

        Similarly, you can pass pre-calculated embeddings to BERTopic that represent the documents that you have. However, it does not make any assumption with respect to the relationship between those embeddings and the documents. This means that we could pass any metadata to BERTopic to cluster on instead of document embeddings. In this example, we can separate our embeddings from our documents so that the embeddings are generated from the images instead of their corresponding captions. Thus, we will cluster image embeddings but create the topic representation from the related captions.

        In this example, we first need to fetch our data, namely the Flickr 8k dataset that contains images with captions:

        import os\nimport glob\nimport zipfile\nimport numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nfrom PIL import Image\nfrom sentence_transformers import SentenceTransformer, util\n\n# Flickr 8k images\nimg_folder = 'photos/'\ncaps_folder = 'captions/'\nif not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:\n    os.makedirs(img_folder, exist_ok=True)\n\n    if not os.path.exists('Flickr8k_Dataset.zip'):   #Download dataset if does not exist\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip', 'Flickr8k_Dataset.zip')\n        util.http_get('https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip', 'Flickr8k_text.zip')\n\n    for folder, file in [(img_folder, 'Flickr8k_Dataset.zip'), (caps_folder, 'Flickr8k_text.zip')]:\n        with zipfile.ZipFile(file, 'r') as zf:\n            for member in tqdm(zf.infolist(), desc='Extracting'):\n                zf.extract(member, folder)\nimages = list(glob.glob('photos/Flicker8k_Dataset/*.jpg'))\n\n# Prepare dataframe\ncaptions = pd.read_csv(\"captions/Flickr8k.lemma.token.txt\",sep='\\t',names=[\"img_id\",\"img_caption\"])\ncaptions.img_id = captions.apply(lambda row: \"photos/Flicker8k_Dataset/\" + row.img_id.split(\".jpg\")[0] + \".jpg\", 1)\ncaptions = captions.groupby([\"img_id\"])[\"img_caption\"].apply(','.join).reset_index()\ncaptions = pd.merge(captions, pd.Series(images, name=\"img_id\"), on=\"img_id\")\n\n# Extract images together with their documents/captions\nimages = captions.img_id.to_list()\ndocs = captions.img_caption.to_list()\n

        Now that we have our images and captions, we need to generate our image embeddings:

        model = SentenceTransformer('clip-ViT-B-32')\n\n# Prepare images\nbatch_size = 32\nnr_iterations = int(np.ceil(len(images) / batch_size))\n\n# Embed images per batch\nembeddings = []\nfor i in tqdm(range(nr_iterations)):\n    start_index = i * batch_size\n    end_index = (i * batch_size) + batch_size\n\n    images_to_embed = [Image.open(filepath) for filepath in images[start_index:end_index]]\n    img_emb = model.encode(images_to_embed, show_progress_bar=False)\n    embeddings.extend(img_emb.tolist())\n\n    # Close images\n    for image in images_to_embed:\n        image.close()\nembeddings = np.array(embeddings)\n

        Finally, we can fit BERTopic the way we are used to, with documents and embeddings:

        from bertopic import BERTopic\nfrom sklearn.cluster import KMeans\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\ntopics, probs = topic_model.fit_transform(docs, embeddings)\ncaptions[\"Topic\"] = topics\n

        After fitting our model, let's inspect a topic about skateboarders:

        >>> topic_model.get_topic(2)\n[('skateboard', 0.09592033177340711),\n ('skateboarder', 0.07792520092546491),\n ('trick', 0.07481578896400298),\n ('ramp', 0.056952605147927216),\n ('skate', 0.03745127816149923),\n ('perform', 0.036546213623432654),\n ('bicycle', 0.03453483070441857),\n ('bike', 0.033233021253898994),\n ('jump', 0.026709362981948037),\n ('air', 0.025422798170830936)]\n

        Based on the above output, we can take an image to see if the representation makes sense:

        image = captions.loc[captions.Topic == 2, \"img_id\"].values.tolist()[0]\nImage.open(image)\n

        "},{"location":"getting_started/tips_and_tricks/tips_and_tricks.html#keybert-bertopic","title":"KeyBERT & BERTopic","text":"

        Although BERTopic focuses on topic extraction methods that do not assume specific structures for the generated clusters, it is possible to do this on a more local level. More specifically, we can use KeyBERT to generate a number of keywords for each document and then build a vocabulary on top of that as the input for BERTopic. This way, we can select words that we know have meaning to a topic, without focusing on the centroid of that cluster. This also allows more frequent words to pop up regardless of the structure and density of a cluster.

        To do this, we first need to run KeyBERT on our data and create our vocabulary:

        from sklearn.datasets import fetch_20newsgroups\nfrom keybert import KeyBERT\n\n# Prepare documents \ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# Extract keywords\nkw_model = KeyBERT()\nkeywords = kw_model.extract_keywords(docs)\n\n# Create our vocabulary\nvocabulary = [k[0] for keyword in keywords for k in keyword]\nvocabulary = list(set(vocabulary))\n

        Then, we pass our vocabulary to BERTopic and train the model:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\nvectorizer_model = CountVectorizer(vocabulary=vocabulary)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\ntopics, probs = topic_model.fit_transform(docs)\n
        "},{"location":"getting_started/topicreduction/topicreduction.html","title":"Topic Reduction","text":"

        BERTopic uses HDBSCAN for clustering the data, which does not allow you to specify the number of clusters you would want. To a certain extent, this is an advantage, as we can trust HDBSCAN to be better at finding the number of clusters than we are. Instead, we can try to reduce the number of topics that have been created. Below, you will find three methods of doing so.

        "},{"location":"getting_started/topicreduction/topicreduction.html#manual-topic-reduction","title":"Manual Topic Reduction","text":"

        Each resulting topic has its feature vector constructed from c-TF-IDF. Using those feature vectors, we can find the most similar topics and merge them. If we do this iteratively, starting from the least frequent topic, we can reduce the number of topics quite easily. We do this until we reach the value of nr_topics:

        from bertopic import BERTopic\ntopic_model = BERTopic(nr_topics=20)\n

        It is also possible to manually select certain topics that you believe should be merged. For example, if topic 1 is 1_space_launch_moon_nasa and topic 2 is 2_spacecraft_solar_space_orbit it might make sense to merge those two topics:

        topics_to_merge = [1, 2]\ntopic_model.merge_topics(docs, topics_to_merge)\n

        If you have several groups of topics you want to merge, create a list of lists instead:

        topics_to_merge = [[1, 2],\n                   [3, 4]]\ntopic_model.merge_topics(docs, topics_to_merge)\n
        "},{"location":"getting_started/topicreduction/topicreduction.html#automatic-topic-reduction","title":"Automatic Topic Reduction","text":"

        One issue with the approach above is that it will merge topics regardless of whether they are very similar. They are simply the most similar out of all options. This can be resolved by reducing the number of topics automatically. To do this, we can use HDBSCAN to cluster our topics using each c-TF-IDF representation. Then, we merge topics that are clustered together. Another benefit of HDBSCAN is that it generates outliers. These outliers prevent topics from being merged if no other topics are similar.

        To use this option, we simply set nr_topics to \"auto\":

        from bertopic import BERTopic\ntopic_model = BERTopic(nr_topics=\"auto\")\n
        "},{"location":"getting_started/topicreduction/topicreduction.html#topic-reduction-after-training","title":"Topic Reduction after Training","text":"

        Finally, we can also reduce the number of topics after having trained a BERTopic model. The advantage of doing so is that you can decide the number of topics after knowing how many are created. It is difficult to predict, before training your model, how many topics there are in your documents and how many will be extracted. Instead, we can decide afterward how many topics seem realistic:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Create topics -> Typically over 50 topics\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Further reduce topics\ntopic_model.reduce_topics(docs, nr_topics=30)\n\n# Access updated topics\ntopics = topic_model.topics_\n

        The reasoning for putting docs as a parameter is that the documents are not saved within BERTopic on purpose. If you were to have a million documents, it is very inefficient to save those in BERTopic instead of a dedicated database.

        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html","title":"Update Topic Representations","text":"

        The topics that are extracted from BERTopic are represented by words. These words are extracted from the documents occupying their topics using a class-based TF-IDF. This allows us to extract words that are interesting to a topic but less so to another.

        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html#update-topic-representation-after-training","title":"Update Topic Representation after Training","text":"

        When you have trained a model and viewed the topics and the words that represent them, you might not be satisfied with the representation. Perhaps you forgot to remove stop_words or you want to try out a different n_gram_range. We can use the function update_topics to update the topic representation with new parameters for c-TF-IDF:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Create topics\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic(n_gram_range=(2, 3))\ntopics, probs = topic_model.fit_transform(docs)\n

        From the model created above, one of the most frequent topics is the following:

        >>> topic_model.get_topic(31)[:10]\n[('clipper chip', 0.007240771542316232),\n ('key escrow', 0.004601603973377443),\n ('law enforcement', 0.004277247929596332),\n ('intercon com', 0.0035961920238955824),\n ('amanda walker', 0.003474856425297157),\n ('serial number', 0.0029876119137150358),\n ('com amanda', 0.002789303096817983),\n ('intercon com amanda', 0.0027386688593327084),\n ('amanda intercon', 0.002585262048515583),\n ('amanda intercon com', 0.002585262048515583)]\n

        Although there does seem to be some relation between the words, it is difficult, at least for me, to intuitively understand what the topic is about. Instead, let's simplify the topic representation by setting n_gram_range to (1, 3) to also allow for single words.

        >>> topic_model.update_topics(docs, n_gram_range=(1, 3))\n>>> topic_model.get_topic(31)[:10]\n[('encryption', 0.008021846079148017),\n ('clipper', 0.00789642647602742),\n ('chip', 0.00637127942464045),\n ('key', 0.006363124787175884),\n ('escrow', 0.005030980365244285),\n ('clipper chip', 0.0048271268437973395),\n ('keys', 0.0043245812747907545),\n ('crypto', 0.004311198708675516),\n ('intercon', 0.0038772934659295076),\n ('amanda', 0.003516026493904586)]\n

        To me, the combination of the words above seems a bit more intuitive than the words we previously had! You can play around with n_gram_range or use your own custom sklearn.feature_extraction.text.CountVectorizer and pass that instead:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(stop_words=\"english\", ngram_range=(1, 5))\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        Tip!

        If you want to change the topics to something else, whether that is merging them or removing outliers, you can pass a custom list of topics to update them: topic_model.update_topics(docs, topics=my_updated_topics)

        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html#custom-labels","title":"Custom labels","text":"

        The topic labels are currently automatically generated by taking the top 3 words and combining them using the _ separator. Although this is an informative label, in practice, this is definitely not the prettiest nor necessarily the most accurate label. For example, although the topic label 1_space_nasa_orbit is informative, we would prefer a somewhat more intuitive label, such as space travel. The difficulty with creating such topic labels is that much of the interpretation is left to the user. Would space travel be more accurate or perhaps space exploration? To truly understand which labels are most suited, going through some of the documents in each topic is especially helpful.

        Although we can go through every single topic ourselves and try to label them, we can start by creating an overview of labels that have the length and number of words that we are looking for. To do so, we can generate our list of topic labels with .generate_topic_labels and define the number of words, the separator, word length, etc:

        topic_labels = topic_model.generate_topic_labels(nr_words=3,\n                                                 topic_prefix=False,\n                                                 word_length=10,\n                                                 separator=\", \")\n

        Tip

        If you created multiple topic representations or aspects, you can choose one of these aspects with aspect=\"Aspect1\" or whatever you named the aspect.
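
        For instance, a minimal sketch, assuming you defined an aspect named \"Aspect1\" when training the model:

        topic_labels = topic_model.generate_topic_labels(nr_words=3,\n                                                 topic_prefix=False,\n                                                 word_length=10,\n                                                 separator=\", \",\n                                                 aspect=\"Aspect1\")\n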

        In the above example, 1_space_nasa_orbit would turn into space, nasa, orbit since we selected 3 words, no topic prefix, and the , separator. We can then either change our topic_labels to whatever we want or directly pass them to .set_topic_labels so that they can be used across most visualization functions:

        topic_model.set_topic_labels(topic_labels)\n

        It is also possible to only change a few topic labels at a time by passing a dictionary where the key represents the topic ID and the value is the topic label:

        topic_model.set_topic_labels({1: \"Space Travel\", 7: \"Religion\"})\n

        Then, to make use of those custom topic labels across visualizations, such as .visualize_hierarchy(), we can use the custom_labels=True parameter that is found in most visualizations.

        fig = topic_model.visualize_barchart(custom_labels=True)\n
        "},{"location":"getting_started/topicrepresentation/topicrepresentation.html#optimize-labels","title":"Optimize labels","text":"

        The great advantage of passing custom labels to BERTopic is that when more accurate zero-shot classification models are released, we can simply use those on top of BERTopic to further fine-tune the labeling. For example, let's say you have a set of potential topic labels that you want to use instead of the ones generated by BERTopic. You could use the bart-large-mnli model to find which user-defined labels best represent the BERTopic-generated labels:

        from transformers import pipeline\nclassifier = pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")\n\n# A selected topic representation\n# 'god jesus atheists atheism belief atheist believe exist beliefs existence'\nsequence_to_classify =  \" \".join([word for word, _ in topic_model.get_topic(1)])\n\n# Our set of potential topic labels\ncandidate_labels = ['cooking', 'dancing', 'religion']\nclassifier(sequence_to_classify, candidate_labels)\n\n#{'labels': ['cooking', 'dancing', 'religion'],\n# 'scores': [0.086, 0.063, 0.850],\n# 'sequence': 'god jesus atheists atheism belief atheist believe exist beliefs existence'}\n
        "},{"location":"getting_started/topicsovertime/topicsovertime.html","title":"Dynamic Topic Modeling","text":"

        Dynamic topic modeling (DTM) is a collection of techniques aimed at analyzing the evolution of topics over time. These methods allow you to understand how a topic is represented across different times. For example, in 1995 people may talk differently about environmental awareness than those in 2015. Although the topic itself remains the same, environmental awareness, the exact representation of that topic might differ.

        BERTopic allows for DTM by calculating the topic representation at each timestep without the need to run the entire model several times. To do this, we first need to fit BERTopic as if there were no temporal aspect in the data. Thus, a general topic model will be created. We use the global representation as the main topics, which can most likely be found across different timesteps. For each topic and timestep, we calculate the c-TF-IDF representation. This will result in a specific topic representation at each timestep without the need to create clusters from embeddings as they were already created.

        [Diagram: documents are split by topic, then by topic and timestep, and a pre-fitted c-TF-IDF is applied to each subset of documents. The c-TF-IDF at each timestep t can optionally be tuned by averaging it with the global representation (global tuning) or with the representation at t-1 (evolutionary tuning).]

        Next, there are two main ways to further fine-tune these specific topic representations, namely globally and evolutionary.

        A topic representation at timestep t can be fine-tuned globally by averaging its c-TF-IDF representation with that of the global representation. This allows each topic representation to move slightly towards the global representation whilst still keeping some of its specific words.

        A topic representation at timestep t can be fine-tuned evolutionary by averaging its c-TF-IDF representation with that of the c-TF-IDF representation at timestep t-1. This is done for each topic representation allowing for the representations to evolve over time.

        Both fine-tuning methods are set to True as a default and allow for interesting representations to be created.
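
        Conceptually, both tuning steps are simple averages of c-TF-IDF vectors. A rough numpy sketch, where ctfidf_at_t, ctfidf_at_t_minus_1, and global_ctfidf are hypothetical arrays for a single topic:

        import numpy as np\n\n# Global tuning: average the representation at timestep t with the global topic representation\nglobally_tuned = np.mean([ctfidf_at_t, global_ctfidf], axis=0)\n\n# Evolutionary tuning: average the representation at timestep t with the one at t-1\nevolution_tuned = np.mean([ctfidf_at_t, ctfidf_at_t_minus_1], axis=0)\n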

        "},{"location":"getting_started/topicsovertime/topicsovertime.html#example","title":"Example","text":"

        To demonstrate DTM in BERTopic, we first need to prepare our data. A good example of where DTM is useful is topic modeling on Twitter data. We can analyze how certain people have talked about certain topics in the years they have been on Twitter. Due to the controversial nature of his tweets, we are going to be using all tweets by Donald Trump.

        First, we need to load the data and do some very basic cleaning. For example, I am not interested in his re-tweets for this use-case:

        import re\nimport pandas as pd\n\n# Prepare data\ntrump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')\ntrump.text = trump.apply(lambda row: re.sub(r\"http\\S+\", \"\", row.text).lower(), 1)\ntrump.text = trump.apply(lambda row: \" \".join(filter(lambda x:x[0]!=\"@\", row.text.split())), 1)\ntrump.text = trump.apply(lambda row: \" \".join(re.sub(\"[^a-zA-Z]+\", \" \", row.text).split()), 1)\ntrump = trump.loc[(trump.isRetweet == \"f\") & (trump.text != \"\"), :]\ntimestamps = trump.date.to_list()\ntweets = trump.text.to_list()\n

        Then, we need to extract the global topic representations by simply creating and training a BERTopic model:

        from bertopic import BERTopic\n\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(tweets)\n

        From these topics, we are going to generate the topic representations at each timestamp for each topic. We do this by simply calling topics_over_time and passing the tweets, the corresponding timestamps, and the related topics:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20)\n

        And that is it! Aside from what you always need for BERTopic, you now only need to add timestamps to quickly calculate the topics over time.

        "},{"location":"getting_started/topicsovertime/topicsovertime.html#parameters","title":"Parameters","text":"

        There are a few parameters that are of interest which will be discussed below.

        "},{"location":"getting_started/topicsovertime/topicsovertime.html#tuning","title":"Tuning","text":"

        Both global_tuning and evolution_tuning are set to True by default, but can easily be changed. Perhaps you do not want the representations to be influenced by the global representation and merely want to see how they evolved over time:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, \n                                                global_tuning=False, evolution_tuning=True, nr_bins=20)\n
        "},{"location":"getting_started/topicsovertime/topicsovertime.html#bins","title":"Bins","text":"

        If you have more than 100 unique timestamps, then there will be topic representations created for each of those timestamps which can negatively affect the topic representations. It is advised to keep the number of unique timestamps below 50. To do this, you can simply set the number of bins that are created when calculating the topic representations. The timestamps will be taken and put into equal-sized bins:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=20)\n
        "},{"location":"getting_started/topicsovertime/topicsovertime.html#datetime-format","title":"Datetime format","text":"

        If you are passing strings (dates) instead of integers, then BERTopic will try to automatically detect which datetime format your strings have. Unfortunately, this will not always work if they are in an unexpected format. We can use datetime_format to pass the format the timestamps have:

        topics_over_time = topic_model.topics_over_time(tweets, timestamps, datetime_format=\"%b%M\", nr_bins=20)\n
        "},{"location":"getting_started/topicsovertime/topicsovertime.html#visualization","title":"Visualization","text":"

        To me, DTM becomes truly interesting when you have a good way of visualizing how topics have changed over time. A nice way of doing so is by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call visualize_topics_over_time with the newly created topics over time:

        topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)\n

        I used top_n_topics to only show the top 20 most frequent topics. If I were to visualize all topics, which is possible by leaving top_n_topics empty, there is a chance that hundreds of lines will fill the plot.

        You can also use topics to show specific topics:

        topic_model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])\n
        "},{"location":"getting_started/topicsperclass/topicsperclass.html","title":"Topics per Class","text":"

        In some cases, you might be interested in how certain topics are represented over certain categories. Perhaps there are specific groups of users for which you want to see how they talk about certain topics.

        Instead of running the topic model per class, we can simply create a topic model and then extract, for each topic, its representation per class. This allows you to see how certain topics, calculated over all documents, are represented for certain subgroups.

        [Diagram: documents are split by topic, then by topic and class, and a pre-fitted c-TF-IDF is applied to each subset of documents.]

        To do so, we use the 20 Newsgroups dataset to see how the topics that we uncover are represented in the 20 categories of documents.

        First, let's prepare the data:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\ntargets = data[\"target\"]\ntarget_names = data[\"target_names\"]\nclasses = [data[\"target_names\"][i] for i in data[\"target\"]]\n

        Next, we want to extract the topics across all documents without taking the categories into account:

        topic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\n

        Now that we have created our global topic model, let us calculate the topic representations across each category:

        topics_per_class = topic_model.topics_per_class(docs, classes=classes)\n

        The classes variable contains the class for each document. Then, we simply visualize these topics per class:

        topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)\n

        You can hover over the bars to see the topic representation per class.

        As you can see in the visualization above, the topics 93_homosexual_homosexuality_sex and 58_bike_bikes_motorcycle are somewhat distributed over all classes.

        You can see that the topic representation between rec.motorcycles and rec.autos in 58_bike_bikes_motorcycle clearly differs from one another. It seems that BERTopic has tried to combine those two categories into a single topic. However, since they do contain two separate topics, the topic representation in those two categories differs.

        We see something similar for 93_homosexual_homosexuality_sex, where the topic is distributed among several categories and is represented slightly differently.

        Thus, you can see that although in certain categories the topic is similar, the way the topic is represented can differ.

        "},{"location":"getting_started/vectorizers/vectorizers.html","title":"4. Vectorizers","text":"

        In topic modeling, the quality of the topic representations is key for interpreting the topics, communicating results, and understanding patterns. It is of utmost importance to make sure that the topic representations fit with your use case.

        In practice, there is not one correct way of creating topic representations. Some use cases might opt for higher n-grams, whereas others might focus more on single words without any stop words. The diversity in use cases also means that we need to have some flexibility in BERTopic to make sure it can be used across most use cases. The image below illustrates this modularity:

        In this section, we will go through several examples of vectorization algorithms and how they can be implemented.

        "},{"location":"getting_started/vectorizers/vectorizers.html#countvectorizer","title":"CountVectorizer","text":"

        One often underestimated component of BERTopic is the CountVectorizer and c-TF-IDF calculation. Together, they are responsible for creating the topic representations and luckily can be quite flexible in parameter tuning. Here, we will go through tips and tricks for tuning your CountVectorizer and see how they might affect the topic representations.

        Before starting, it should be noted that you can pass the CountVectorizer before and after training your topic model. Passing it before training allows you to minimize the size of the resulting c-TF-IDF matrix:

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Train BERTopic with a custom CountVectorizer\nvectorizer_model = CountVectorizer(min_df=10)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\ntopics, probs = topic_model.fit_transform(docs)\n

        Passing it after training allows you to fine-tune the topic representations by using .update_topics():

        from bertopic import BERTopic\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Train a BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n\n# Fine-tune topic representations after training BERTopic\nvectorizer_model = CountVectorizer(stop_words=\"english\", ngram_range=(1, 3), min_df=10)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        The great thing about using .update_topics() is that it allows you to tweak the topic representations without re-training your model! Thus, here we will be focusing on fine-tuning our topic representations after training our model.

        Note

        The great thing about processing our topic representations with the CountVectorizer is that it does not influence the quality of the clusters, as clustering is performed before generating the topic representations.

        "},{"location":"getting_started/vectorizers/vectorizers.html#basic-usage","title":"Basic Usage","text":"

        First, let's start with defining our documents and training our topic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Prepare documents\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\n\n# Train a BERTopic model\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\n

        Now, let's see the top 10 most frequent topics that have been generated:

        >>> topic_model.get_topic_info()[1:11]\nTopic   Count   Name\n1   0   1822    0_game_team_games_he\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_ites_hello_cheek_hi\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_you_your_post_jim\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_fire_gas\n10  9   174 9_amp_audio_condition_asking\n

        The topic representations generated already seem quite interpretable! However, I am quite sure we can do much better without having to re-train our model. Next, we will go through common parameters in CountVectorizer and focus on the effects that they might have. As a baseline, we will be comparing them to the topic representation above.

        "},{"location":"getting_started/vectorizers/vectorizers.html#parameters","title":"Parameters","text":"

        There are several basic parameters in the CountVectorizer that we can use to improve upon the quality of the resulting topic representations.

        "},{"location":"getting_started/vectorizers/vectorizers.html#ngram_range","title":"ngram_range","text":"

        The ngram_range parameter allows us to decide how many tokens each entry in a topic representation may consist of. For example, we have words like game and team with a length of 1 in a topic, but it would also make sense to have phrases like hockey league with a length of 2. To allow for these phrases to be generated, we can set the ngram_range parameter:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words=\"english\")\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        As you might have noticed, I also added stop_words=\"english\". This is necessary as longer words tend to have many stop words and removing them allows for nicer topic representations:

        >>> topic_model.get_topic_info()[1:11]\n    Topic   Count   Name\n1   0   1822    0_game_team_games_players\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_hello ites_forget hello_ites 15_huh hi\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_post_jim_context_forged\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_gas_compound\n10  9   174 9_amp_audio_condition_asking\n

        Although they look very similar, if we zoom in on topic 8, we can see longer words in our representation:

        >>> topic_model.get_topic(8)\n[('fbi', 0.019637149205975653),\n ('koresh', 0.019054514637064403),\n ('gas', 0.014156057632897179),\n ('compound', 0.012381224868591681),\n ('batf', 0.010349992314076047),\n ('children', 0.009336408916322387),\n ('tear gas', 0.008941747802855279),\n ('tear', 0.008446786597564537),\n ('davidians', 0.007911119583253022),\n ('started', 0.007398687505638955)]\n

        tear and gas have now been combined into a single representation. This helps us understand what those individual words might have been representing.

        "},{"location":"getting_started/vectorizers/vectorizers.html#stop_words","title":"stop_words","text":"

        In some of the topics, we can see stop words appearing like he or the. Stop words are something we typically want to prevent in our topic representations as they do not give additional information to the topic. To prevent those stop words, we can use the stop_words parameter in the CountVectorizer to remove them from the representations:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(stop_words=\"english\")\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        After running the above, we get the following output:

        >>> topic_model.get_topic_info()[1:11]\n    Topic   Count   Name\n1   0   1822    0_game_team_games_players\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_ites_cheek_hello_hi\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_monitor_card_video_vga\n6   5   438 5_post_jim_context_forged\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_tobacco\n9   8   218 8_fbi_koresh_gas_compound\n10  9   174 9_amp_audio_condition_stereo\n

        As you can see, the topic representations already look much better! Stop words are removed and the representations are more interpretable. You can also pass in a list of stop words if you need to take multiple languages into account.
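
        As a minimal sketch, you could combine stop word lists from several languages into a single list before passing it to the CountVectorizer (the word lists below are purely illustrative and far from complete):

        from sklearn.feature_extraction.text import CountVectorizer\n\n# Illustrative (and incomplete) stop word lists for two languages\nenglish_stop_words = [\"the\", \"and\", \"of\", \"is\", \"are\"]\ndutch_stop_words = [\"de\", \"het\", \"een\", \"en\", \"van\"]\n\n# Combine them into a single list and pass it to the CountVectorizer\nvectorizer_model = CountVectorizer(stop_words=english_stop_words + dutch_stop_words)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n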

        "},{"location":"getting_started/vectorizers/vectorizers.html#min_df","title":"min_df","text":"

        One important parameter to keep in mind is min_df. This is typically an integer representing how frequently a word must occur before it is added to our representation. You can imagine that if we have a million documents and a certain word only appears a single time across all of them, then it is highly unlikely to be representative of a topic. Typically, the c-TF-IDF calculation will already push such a word out of the topic representation, but with millions of documents the word still contributes to a very large topic-term matrix. To prevent a huge vocabulary, we can set min_df to only accept words that have a minimum document frequency.

        When you have millions of documents or run into memory issues, I would advise increasing the value of min_df as long as the topic representations still make sense:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(min_df=10)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        With the following topic representation:

        >>> topic_model.get_topic_info()[1:11]\n    Topic   Count   Name\n1   0   1822    0_game_team_games_he\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_hello_hi_yep_huh\n4   3   493 3_israel_jews_jewish_peace\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_you_your_post_jim\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_fire_gas\n10  9   174 9_audio_condition_stereo_asking\n

        As you can see, the output is nearly the same, which is what we would like to achieve. All words that appear fewer than 10 times are now removed from our topic-term matrix (i.e., the c-TF-IDF matrix), which drastically reduces the size of the matrix.
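
        If you want to verify this effect, a quick sketch for inspecting the resulting vocabulary and matrix size (this assumes a recent BERTopic version that exposes the fitted c-TF-IDF matrix as c_tf_idf_):

        # Inspect the vocabulary and topic-term matrix after updating the topics\n# (c_tf_idf_ is assumed to be available on recent BERTopic versions)\nvocab_size = len(topic_model.vectorizer_model.get_feature_names_out())\nprint(vocab_size)\nprint(topic_model.c_tf_idf_.shape)  # (number of topics, number of terms)\n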

        "},{"location":"getting_started/vectorizers/vectorizers.html#max_features","title":"max_features","text":"

        A parameter similar to min_df is max_features, which allows you to select the top n most frequent words to be used in the topic representation. Setting this to, for example, 10_000 creates a topic-term matrix with at most 10_000 terms. This helps you control the size of the topic-term matrix directly, without having to fiddle around with the min_df parameter:

        from sklearn.feature_extraction.text import CountVectorizer\nvectorizer_model = CountVectorizer(max_features=10_000)\ntopic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n

        With the following representation:

        >>> topic_model.get_topic_info()[1:11]\nTopic   Count   Name\n1   0   1822    0_game_team_games_he\n2   1   580 1_key_clipper_chip_encryption\n3   2   532 2_hello_hi_yep_huh\n4   3   493 3_israel_israeli_jews_arab\n5   4   453 4_card_monitor_video_drivers\n6   5   438 5_you_your_post_jim\n7   6   314 6_car_cars_engine_ford\n8   7   279 7_health_newsgroup_cancer_1993\n9   8   218 8_fbi_koresh_fire_gas\n10  9   174 9_amp_audio_condition_asking\n

        As with min_df, we want the topic representations to stay very similar to the baseline, and they do.

        "},{"location":"getting_started/vectorizers/vectorizers.html#tokenizer","title":"tokenizer","text":"

        The default tokenizer in the CountVectorizer works well for western languages but fails to tokenize some non-western languages, like Chinese. Fortunately, we can use the tokenizer variable in the CountVectorizer to use jieba, which is a package for Chinese text segmentation. Using it is straightforward:

        from sklearn.feature_extraction.text import CountVectorizer\nimport jieba\n\ndef tokenize_zh(text):\n    words = jieba.lcut(text)\n    return words\n\n# Name it vectorizer_model so it matches the update_topics call below\nvectorizer_model = CountVectorizer(tokenizer=tokenize_zh)\n

        Then, we can simply pass the vectorizer to update our topic representations:

        topic_model.update_topics(docs, vectorizer_model=vectorizer_model)\n
        "},{"location":"getting_started/vectorizers/vectorizers.html#onlinecountvectorizer","title":"OnlineCountVectorizer","text":"

        When using the online/incremental variant of BERTopic, we need a CountVectorizer that can incrementally update its representation. For that purpose, OnlineCountVectorizer was created, which not only updates out-of-vocabulary words but also implements decay and cleaning functions to prevent the sparse bag-of-words matrix from becoming too large. It is a class that can be found in bertopic.vectorizers and extends sklearn.feature_extraction.text.CountVectorizer. In other words, you can use the exact same parameters in OnlineCountVectorizer as found in Scikit-Learn's CountVectorizer. We can use it as follows:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\n# Train BERTopic with a custom OnlineCountVectorizer\nvectorizer_model = OnlineCountVectorizer()\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n
        "},{"location":"getting_started/vectorizers/vectorizers.html#parameters_1","title":"Parameters","text":"

        Besides the parameters found in CountVectorizer, such as stop_words and ngram_range, we can use two parameters in OnlineCountVectorizer to adjust the way old data is processed and kept.

        "},{"location":"getting_started/vectorizers/vectorizers.html#decay","title":"decay","text":"

        At each iteration, we sum the bag-of-words representation of the new documents with the bag-of-words representation of all documents processed thus far. In other words, the bag-of-words matrix keeps increasing with each iteration. However, especially in a streaming setting, older documents may become less and less relevant as time goes on. Therefore, a decay parameter was implemented that decays the bag-of-words frequencies at each iteration before adding the document frequencies of the new documents. The decay parameter is a value between 0 and 1 and indicates the percentage by which the frequencies in the previous bag-of-words matrix should be reduced. For example, a value of .1 will decrease the frequencies in the bag-of-words matrix by 10% at each iteration before adding the new bag-of-words matrix. This makes sure that recent data has more weight than previous iterations.
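
        As an illustration of the arithmetic: with decay=.1, a word that currently has a frequency of 100 in the bag-of-words matrix is reduced to 90 before the counts of the new batch are added. A minimal sketch of using the parameter looks as follows:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\n# Reduce the existing bag-of-words frequencies by 10% at each iteration\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", decay=.1)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n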

        "},{"location":"getting_started/vectorizers/vectorizers.html#delete_min_df","title":"delete_min_df","text":"

        In BERTopic, we might want to remove words from the topic representation that appear infrequently. The min_df in the CountVectorizer works quite well for that. However, in a streaming setting min_df does not work as well, since a word's frequency might start below min_df but end up higher than that over time. Setting that value too high is therefore not always advisable.

        As a result, the vocabulary of the resulting bag-of-words matrix can become quite large. Similarly, if we use the decay parameter, then some values will decrease over time until they fall below min_df. For these reasons, the delete_min_df parameter was implemented. The parameter takes a positive integer and determines, at each iteration, which words will be removed. If the value is set to 5, it will check after each iteration whether the total frequency of a word is below that value. If so, the word will be removed in its entirety from the bag-of-words matrix. This helps to keep the bag-of-words matrix at a manageable size.
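
        A minimal sketch of setting this parameter, using the threshold of 5 mentioned above:

        from bertopic import BERTopic\nfrom bertopic.vectorizers import OnlineCountVectorizer\n\n# After each iteration, remove words whose total frequency falls below 5\nvectorizer_model = OnlineCountVectorizer(stop_words=\"english\", delete_min_df=5)\ntopic_model = BERTopic(vectorizer_model=vectorizer_model)\n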

        Note

        Although the delete_min_df parameter removes words from the bag-of-words matrix, the removal is not permanent. If new documents come in that use those previously deleted words frequently, they are added back to the matrix.

        "},{"location":"getting_started/visualization/visualization.html","title":"Visualization","text":"

        Visualizing BERTopic and its derivatives is important in understanding the model, how it works, and, more importantly, where it works. Since topic modeling can be quite a subjective field, it is difficult for users to validate their models. Looking at the topics and seeing if they make sense is an important factor in alleviating this issue.

        "},{"location":"getting_started/visualization/visualization.html#visualize-topics","title":"Visualize Topics","text":"

        After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can visualize the topics that were generated in a way very similar to LDAvis.

        We embed our c-TF-IDF representation of the topics in 2D using UMAP and then visualize the two dimensions using Plotly such that we can create an interactive view.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs) \n

        Then, we can call .visualize_topics to create a 2D representation of our topics. The resulting graph is an interactive Plotly graph which can be saved to HTML:

        topic_model.visualize_topics()\n
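
        Since the returned object is a Plotly figure, you can also save it to an HTML file (the file path below is illustrative):

        fig = topic_model.visualize_topics()\nfig.write_html(\"path/to/file.html\")\n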

        You can use the slider to select the topic which then lights up red. If you hover over a topic, then general information is given about the topic, including the size of the topic and its corresponding words.

        "},{"location":"getting_started/visualization/visualization.html#visualize-documents","title":"Visualize Documents","text":"

        Using the previous method, we can visualize the topics and get insight into their relationships. However, you might want a more fine-grained approach where we can visualize the documents inside the topics to see if they were assigned correctly or whether they make sense. To do so, we can use the topic_model.visualize_documents() function. This function recalculates the document embeddings and reduces them to 2-dimensional space for easier visualization purposes. This process can be quite expensive, so it is advised to adhere to the following pipeline:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic\ntopic_model = BERTopic().fit(docs, embeddings)\n\n# Run the visualization with the original embeddings\ntopic_model.visualize_documents(docs, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True which disables the option to hover over the individual points and see the content of the documents. This was done for demonstration purposes as saving all those documents in the visualization can be quite expensive and result in large files. However, it might be interesting to set hide_document_hover=False in order to hover over the points and see the content of the documents.

        "},{"location":"getting_started/visualization/visualization.html#custom-hover","title":"Custom Hover","text":"

        When you visualize the documents, you might not always want to see the complete document on hover. Many documents contain shorter pieces of information that might be more interesting to visualize, such as their titles. To create the hover based on a document's title instead of its content, you can simply pass a variable (titles) containing the title for each document:

        topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)\n
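
        Here, titles is simply a list with one short string per document. As a minimal sketch, assuming no separate titles are available, you could use the first 100 characters of each document instead (purely illustrative):

        # Illustrative stand-in titles: the first 100 characters of each document\ntitles = [doc[:100] for doc in docs]\n\ntopic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)\n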
        "},{"location":"getting_started/visualization/visualization.html#visualize-topic-hierarchy","title":"Visualize Topic Hierarchy","text":"

        The topics that were created can be hierarchically reduced. In order to understand the potential hierarchical structure of the topics, we can use scipy.cluster.hierarchy to create clusters and visualize how they relate to one another. This might help to select an appropriate nr_topics when reducing the number of topics that you have created. To visualize this hierarchy, run the following:

        topic_model.visualize_hierarchy()\n

        Note

        Do note that this is not the actual procedure of .reduce_topics() when nr_topics is set to auto since HDBSCAN is used to automatically extract topics. The visualization above closely resembles the actual procedure of .reduce_topics() when any number of nr_topics is selected.

        "},{"location":"getting_started/visualization/visualization.html#hierarchical-labels","title":"Hierarchical labels","text":"

        Although visualizing this hierarchy gives us information about the structure, it would be helpful to see what happens to the topic representations when merging topics. To do so, we first need to calculate the representations of the hierarchical topics:

        First, we train a basic BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n

        To visualize these results, we simply need to pass the resulting hierarchical_topics to our .visualize_hierarchy function:

        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you hover over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes.

        "},{"location":"getting_started/visualization/visualization.html#text-based-topic-tree","title":"Text-based topic tree","text":"

        Although this gives a nice overview of the potential hierarchy, hovering over all black circles can be tiresome. Instead, we can use topic_model.get_topic_tree to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can better see which topics could be logically merged:

        >>> tree = topic_model.get_topic_tree(hierarchical_topics)\n>>> print(tree)\n.\n\u2514\u2500atheists_atheism_god_moral_atheist\n     \u251c\u2500atheists_atheism_god_atheist_argument\n     \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n     \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n     \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 Topic: 29\n
        Click here to view the full tree.
          .\n  \u251c\u2500people_armenian_said_god_armenians\n  \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u2502    \u251c\u2500jehovah_lord_mormon_mcconkie_god\n  \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500ra_satan_thou_god_lucifer \u2500\u2500 Topic: 94\n  \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500jehovah_lord_mormon_mcconkie_unto \u2500\u2500 Topic: 78\n  \u2502    \u2502    \u2502    \u2514\u2500jesus_mary_god_hell_sin\n  \u2502    \u2502    \u2502         \u251c\u2500jesus_hell_god_eternal_heaven\n  \u2502    \u2502    \u2502         \u2502    \u251c\u2500hell_jesus_eternal_god_heaven\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500jesus_tomb_disciples_resurrection_john \u2500\u2500 Topic: 69\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500hell_eternal_god_jesus_heaven \u2500\u2500 Topic: 53\n  \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500aaron_baptism_sin_law_god \u2500\u2500 Topic: 89\n  \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500mary_sin_maria_priest_conception \u2500\u2500 Topic: 56\n  \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500marriage_married_marry_ceremony_marriages \u2500\u2500 Topic: 110\n  \u2502    \u2514\u2500people_armenian_armenians_said_mr\n  \u2502         \u251c\u2500people_armenian_armenians_said_israel\n  \u2502         \u2502    \u251c\u2500god_homosexual_homosexuality_atheists_sex\n  \u2502         \u2502    \u2502    \u251c\u2500homosexual_homosexuality_sex_gay_homosexuals\n  \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500kinsey_sex_gay_men_sexual \u2500\u2500 Topic: 44\n  \u2502         \u2502    \u2502    \u2502    \u2514\u2500homosexuality_homosexual_sin_homosexuals_gay\n  \u2502         \u2502    \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500gay_homosexual_homosexuals_sexual_cramer \u2500\u2500 Topic: 50\n  \u2502         \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500homosexuality_homosexual_sin_paul_sex \u2500\u2500 Topic: 27\n  \u2502         \u2502    \u2502    \u2514\u2500god_atheists_atheism_moral_atheist\n  \u2502         \u2502    \u2502         \u251c\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500jim_context_challenges_articles_quote \u2500\u2500 Topic: 36\n  \u2502         \u2502    \u2502         \u2502    \u2514\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500islam_quran_islamic_rushdie_muslims \u2500\u2500 Topic: 31\n  \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500judas_scripture_bible_books_greek \u2500\u2500 Topic: 33\n  \u2502         \u2502    \u2502         \u2514\u2500atheists_atheism_god_moral_atheist\n  \u2502         \u2502    \u2502              \u251c\u2500atheists_atheism_god_atheist_argument\n  \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n  \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n  \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 
Topic: 29\n  \u2502         \u2502    \u2514\u2500armenian_armenians_people_israel_said\n  \u2502         \u2502         \u251c\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502    \u251c\u2500tax_rights_government_income_taxes\n  \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500rights_right_slavery_slaves_residence \u2500\u2500 Topic: 106\n  \u2502         \u2502         \u2502    \u2502    \u2514\u2500tax_government_taxes_income_libertarians\n  \u2502         \u2502         \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500government_libertarians_libertarian_regulation_party \u2500\u2500 Topic: 58\n  \u2502         \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500tax_taxes_income_billion_deficit \u2500\u2500 Topic: 41\n  \u2502         \u2502         \u2502    \u2514\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502         \u251c\u2500gun_guns_militia_firearms_amendment\n  \u2502         \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500blacks_penalty_death_cruel_punishment \u2500\u2500 Topic: 55\n  \u2502         \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500gun_guns_militia_firearms_amendment \u2500\u2500 Topic: 7\n  \u2502         \u2502         \u2502         \u2514\u2500armenian_armenians_israel_jews_turkish\n  \u2502         \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500israel_israeli_jews_arab_jewish \u2500\u2500 Topic: 4\n  \u2502         \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500armenian_armenians_turkish_armenia_azerbaijan \u2500\u2500 Topic: 15\n  \u2502         \u2502         \u2514\u2500stephanopoulos_president_mr_myers_ms\n  \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500serbs_muslims_stephanopoulos_mr_bosnia \u2500\u2500 Topic: 35\n  \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500myers_stephanopoulos_president_ms_mr \u2500\u2500 Topic: 87\n  \u2502         \u2514\u2500batf_fbi_koresh_compound_gas\n  \u2502              \u251c\u2500\u25a0\u2500\u2500reno_workers_janet_clinton_waco \u2500\u2500 Topic: 77\n  \u2502              \u2514\u2500batf_fbi_koresh_gas_compound\n  \u2502                   \u251c\u2500batf_koresh_fbi_warrant_compound\n  \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500batf_warrant_raid_compound_fbi \u2500\u2500 Topic: 42\n  \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500koresh_batf_fbi_children_compound \u2500\u2500 Topic: 61\n  \u2502                   \u2514\u2500\u25a0\u2500\u2500fbi_gas_tear_bds_building \u2500\u2500 Topic: 23\n  \u2514\u2500use_like_just_dont_new\n      \u251c\u2500game_team_year_games_like\n      \u2502    \u251c\u2500game_team_games_25_year\n      \u2502    \u2502    \u251c\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502    \u251c\u2500window_printer_use_problem_mhz\n      \u2502    \u2502    \u2502    \u2502    \u251c\u2500mhz_wire_simms_wiring_battery\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_mhz_battery_cpu_heat\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_pds_simm_vram_lc\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500pds_nubus_lc_slot_card \u2500\u2500 Topic: 119\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500simms_simm_vram_meg_dram 
\u2500\u2500 Topic: 32\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500mhz_battery_cpu_heat_speed\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500fan_cpu_heat_sink_fans \u2500\u2500 Topic: 92\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500mhz_speed_cpu_fpu_clock \u2500\u2500 Topic: 22\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500monitor_turn_power_computer_electricity \u2500\u2500 Topic: 91\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500battery_batteries_concrete_duo_discharge\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500duo_battery_apple_230_problem \u2500\u2500 Topic: 121\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500battery_batteries_concrete_discharge_temperature \u2500\u2500 Topic: 75\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500leds_uv_blue_light_boards \u2500\u2500 Topic: 66\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500wire_wiring_ground_neutral_outlets \u2500\u2500 Topic: 120\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500scope_scopes_phone_dial_number\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500dial_number_phone_line_output \u2500\u2500 Topic: 93\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500scope_scopes_motorola_generator_oscilloscope \u2500\u2500 Topic: 113\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500celp_dsp_sampling_antenna_digital\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500antenna_antennas_receiver_cable_transmitter \u2500\u2500 Topic: 70\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500celp_dsp_sampling_speech_voice \u2500\u2500 Topic: 52\n      \u2502    \u2502    \u2502    \u2502    \u2514\u2500window_printer_xv_mouse_windows\n      \u2502    \u2502    \u2502    \u2502         \u251c\u2500window_xv_error_widget_problem\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500error_symbol_undefined_xterm_rx\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500symbol_error_undefined_doug_parse \u2500\u2500 Topic: 63\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500rx_remote_server_xdm_xterm \u2500\u2500 Topic: 45\n      \u2502    \u2502    \u2502    \u2502         \u2502    
\u2514\u2500window_xv_widget_application_expose\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500window_widget_expose_application_event\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500gc_mydisplay_draw_gxxor_drawing \u2500\u2500 Topic: 103\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500window_widget_application_expose_event \u2500\u2500 Topic: 25\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500xv_den_polygon_points_algorithm\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500den_polygon_points_algorithm_polygons \u2500\u2500 Topic: 28\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500xv_24bit_image_bit_images \u2500\u2500 Topic: 57\n      \u2502    \u2502    \u2502    \u2502         \u2514\u2500printer_fonts_print_mouse_postscript\n      \u2502    \u2502    \u2502    \u2502              \u251c\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500scanner_logitech_grayscale_ocr_scanman \u2500\u2500 Topic: 108\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u2514\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u251c\u2500\u25a0\u2500\u2500printer_print_deskjet_hp_ink \u2500\u2500 Topic: 18\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u2514\u2500\u25a0\u2500\u2500fonts_font_truetype_tt_atm \u2500\u2500 Topic: 49\n      \u2502    \u2502    \u2502    \u2502              \u2514\u2500mouse_ghostscript_midi_driver_postscript\n      \u2502    \u2502    \u2502    \u2502                   \u251c\u2500ghostscript_midi_postscript_files_file\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500ghostscript_postscript_pageview_ghostview_dsc \u2500\u2500 Topic: 104\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u2514\u2500midi_sound_file_windows_driver\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u251c\u2500\u25a0\u2500\u2500location_mar_file_host_rwrr \u2500\u2500 Topic: 83\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u2514\u2500\u25a0\u2500\u2500midi_sound_driver_blaster_soundblaster \u2500\u2500 Topic: 98\n      \u2502    \u2502    \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500mouse_driver_mice_ball_problem \u2500\u2500 Topic: 68\n      \u2502    \u2502    \u2502    \u2514\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502         \u251c\u25001st_sale_condition_comics_hulk\n      \u2502    \u2502    \u2502         \u2502    \u251c\u2500sale_condition_offer_asking_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500condition_stereo_amp_speakers_asking\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500miles_car_amfm_toyota_cassette \u2500\u2500 Topic: 62\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500amp_speakers_condition_stereo_audio \u2500\u2500 Topic: 24\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500games_sale_pom_cds_shipping\n      \u2502    \u2502    \u2502         \u2502    \u2502         
\u251c\u2500pom_cds_sale_shipping_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500size_shipping_sale_condition_mattress \u2500\u2500 Topic: 100\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500pom_cds_cd_sale_picture \u2500\u2500 Topic: 37\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500games_game_snes_sega_genesis \u2500\u2500 Topic: 40\n      \u2502    \u2502    \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u251c\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500lens_tape_camera_backup_lenses\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500tape_backup_tapes_drive_4mm \u2500\u2500 Topic: 107\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500lens_camera_lenses_zoom_pouch \u2500\u2500 Topic: 114\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u25001st_hulk_comics_art_appears \u2500\u2500 Topic: 105\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500books_book_cover_trek_chemistry \u2500\u2500 Topic: 125\n      \u2502    \u2502    \u2502         \u2502         \u2514\u2500tickets_hotel_ticket_voucher_package\n      \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500hotel_voucher_package_vacation_room \u2500\u2500 Topic: 74\n      \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500tickets_ticket_june_airlines_july \u2500\u2500 Topic: 84\n      \u2502    \u2502    \u2502         \u2514\u2500game_team_games_season_hockey\n      \u2502    \u2502    \u2502              \u251c\u2500game_hockey_team_25_550\n      \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500espn_pt_pts_game_la \u2500\u2500 Topic: 17\n      \u2502    \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500team_25_game_hockey_550 \u2500\u2500 Topic: 2\n      \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500year_game_hit_baseball_players \u2500\u2500 Topic: 0\n      \u2502    \u2502    \u2514\u2500bike_car_greek_insurance_msg\n      \u2502    \u2502         \u251c\u2500car_bike_insurance_cars_engine\n      \u2502    \u2502         \u2502    \u251c\u2500car_insurance_cars_radar_engine\n      \u2502    \u2502         \u2502    \u2502    \u251c\u2500insurance_health_private_care_canada\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500insurance_health_private_care_canada \u2500\u2500 Topic: 99\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500insurance_car_accident_rates_sue \u2500\u2500 Topic: 82\n      \u2502    \u2502         \u2502    \u2502    \u2514\u2500car_cars_radar_engine_detector\n      \u2502    \u2502         \u2502    \u2502         \u251c\u2500car_radar_cars_detector_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500radar_detector_detectors_ka_alarm \u2500\u2500 Topic: 39\n      \u2502    \u2502         \u2502    \u2502         
\u2502    \u2514\u2500car_cars_mustang_ford_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500clutch_shift_shifting_transmission_gear \u2500\u2500 Topic: 88\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500car_cars_mustang_ford_v8 \u2500\u2500 Topic: 14\n      \u2502    \u2502         \u2502    \u2502         \u2514\u2500oil_diesel_odometer_diesels_car\n      \u2502    \u2502         \u2502    \u2502              \u251c\u2500odometer_oil_sensor_car_drain\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500odometer_sensor_speedo_gauge_mileage \u2500\u2500 Topic: 96\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500oil_drain_car_leaks_taillights \u2500\u2500 Topic: 102\n      \u2502    \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500diesel_diesels_emissions_fuel_oil \u2500\u2500 Topic: 79\n      \u2502    \u2502         \u2502    \u2514\u2500bike_riding_ride_bikes_motorcycle\n      \u2502    \u2502         \u2502         \u251c\u2500bike_ride_riding_bikes_lane\n      \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500bike_ride_riding_lane_car \u2500\u2500 Topic: 11\n      \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500bike_bikes_miles_honda_motorcycle \u2500\u2500 Topic: 19\n      \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500countersteering_bike_motorcycle_rear_shaft \u2500\u2500 Topic: 46\n      \u2502    \u2502         \u2514\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u251c\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u251c\u2500greek_kuwait_greece_turkish_greeks\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500greek_greece_turkish_greeks_cyprus \u2500\u2500 Topic: 71\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kuwait_iraq_iran_gulf_arabia \u2500\u2500 Topic: 76\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2514\u2500msg_dog_drugs_drug_food\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u251c\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500clinton_bush_quayle_reagan_panicking \u2500\u2500 Topic: 101\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u2514\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500cooper_trial_weaver_spence_witnesses \u2500\u2500 Topic: 90\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500dog_dogs_bike_trained_springer \u2500\u2500 Topic: 67\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2514\u2500msg_drugs_drug_food_chinese\n      \u2502    \u2502              \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500msg_food_chinese_foods_taste \u2500\u2500 Topic: 30\n      
\u2502    \u2502              \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500drugs_drug_marijuana_cocaine_alcohol \u2500\u2500 Topic: 72\n      \u2502    \u2502              \u2502    \u2502    \u2514\u2500water_theory_universe_science_larsons\n      \u2502    \u2502              \u2502    \u2502         \u251c\u2500water_nuclear_cooling_steam_dept\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500rocketry_rockets_engines_nuclear_plutonium \u2500\u2500 Topic: 115\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u2514\u2500water_cooling_steam_dept_plants\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500water_dept_phd_environmental_atmospheric \u2500\u2500 Topic: 97\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500cooling_water_steam_towers_plants \u2500\u2500 Topic: 109\n      \u2502    \u2502              \u2502    \u2502         \u2514\u2500theory_universe_larsons_larson_science\n      \u2502    \u2502              \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500theory_universe_larsons_larson_science \u2500\u2500 Topic: 54\n      \u2502    \u2502              \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500oort_cloud_grbs_gamma_burst \u2500\u2500 Topic: 80\n      \u2502    \u2502              \u2502    \u2514\u2500helmet_kirlian_photography_lock_wax\n      \u2502    \u2502              \u2502         \u251c\u2500helmet_kirlian_photography_leaf_mask\n      \u2502    \u2502              \u2502         \u2502    \u251c\u2500kirlian_photography_leaf_pictures_deleted\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u251c\u2500deleted_joke_stuff_maddi_nickname\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500joke_maddi_nickname_nicknames_frank \u2500\u2500 Topic: 43\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500deleted_stuff_bookstore_joke_motto \u2500\u2500 Topic: 81\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kirlian_photography_leaf_pictures_aura \u2500\u2500 Topic: 85\n      \u2502    \u2502              \u2502         \u2502    \u2514\u2500helmet_mask_liner_foam_cb\n      \u2502    \u2502              \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500helmet_liner_foam_cb_helmets \u2500\u2500 Topic: 112\n      \u2502    \u2502              \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500mask_goalies_77_santore_tl \u2500\u2500 Topic: 123\n      \u2502    \u2502              \u2502         \u2514\u2500lock_wax_paint_plastic_ear\n      \u2502    \u2502              \u2502              \u251c\u2500\u25a0\u2500\u2500lock_cable_locks_bike_600 \u2500\u2500 Topic: 117\n      \u2502    \u2502              \u2502              \u2514\u2500wax_paint_ear_plastic_skin\n      \u2502    \u2502              \u2502                   \u251c\u2500\u25a0\u2500\u2500wax_paint_plastic_scratches_solvent \u2500\u2500 Topic: 65\n      \u2502    \u2502              \u2502                   \u2514\u2500\u25a0\u2500\u2500ear_wax_skin_greasy_acne \u2500\u2500 Topic: 116\n      \u2502    \u2502              \u2514\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u251c\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u2502    
\u251c\u2500\u25a0\u2500\u2500m4_mp_14_mw_mo \u2500\u2500 Topic: 111\n      \u2502    \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500test_ensign_nameless_deane_deanebinahccbrandeisedu \u2500\u2500 Topic: 118\n      \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500ites_cheek_hello_hi_ken \u2500\u2500 Topic: 3\n      \u2502    \u2514\u2500space_medical_health_disease_cancer\n      \u2502         \u251c\u2500medical_health_disease_cancer_patients\n      \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500cancer_centers_center_medical_research \u2500\u2500 Topic: 122\n      \u2502         \u2502    \u2514\u2500health_medical_disease_patients_hiv\n      \u2502         \u2502         \u251c\u2500patients_medical_disease_candida_health\n      \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n      \u2502         \u2502         \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n      \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n      \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n      \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n      \u2502         \u2514\u2500space_launch_nasa_shuttle_orbit\n      \u2502              \u251c\u2500space_moon_station_nasa_launch\n      \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500sky_advertising_billboard_billboards_space \u2500\u2500 Topic: 59\n      \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500space_station_moon_redesign_nasa \u2500\u2500 Topic: 16\n      \u2502              \u2514\u2500space_mission_hst_launch_orbit\n      \u2502                   \u251c\u2500space_launch_nasa_orbit_propulsion\n      \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500space_launch_nasa_propulsion_astronaut \u2500\u2500 Topic: 47\n      \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500orbit_km_jupiter_probe_earth \u2500\u2500 Topic: 86\n      \u2502                   \u2514\u2500\u25a0\u2500\u2500hst_mission_shuttle_orbit_arrays \u2500\u2500 Topic: 60\n      \u2514\u2500drive_file_key_windows_use\n          \u251c\u2500key_file_jpeg_encryption_image\n          \u2502    \u251c\u2500key_encryption_clipper_chip_keys\n          \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500key_clipper_encryption_chip_keys \u2500\u2500 Topic: 1\n          \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500entry_file_ripem_entries_key \u2500\u2500 Topic: 73\n          \u2502    \u2514\u2500jpeg_image_file_gif_images\n          \u2502         \u251c\u2500motif_graphics_ftp_available_3d\n          \u2502         \u2502    \u251c\u2500motif_graphics_openwindows_ftp_available\n          \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500openwindows_motif_xview_windows_mouse \u2500\u2500 Topic: 20\n          \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500graphics_widget_ray_3d_available \u2500\u2500 Topic: 95\n          \u2502         \u2502    \u2514\u2500\u25a0\u2500\u25003d_machines_version_comments_contact \u2500\u2500 Topic: 38\n          \u2502         \u2514\u2500jpeg_image_gif_images_format\n          \u2502              \u251c\u2500\u25a0\u2500\u2500gopher_ftp_files_stuffit_images \u2500\u2500 Topic: 51\n          \u2502        
      \u2514\u2500\u25a0\u2500\u2500jpeg_image_gif_format_images \u2500\u2500 Topic: 13\n          \u2514\u2500drive_db_card_scsi_windows\n              \u251c\u2500db_windows_dos_mov_os2\n              \u2502    \u251c\u2500\u25a0\u2500\u2500copy_protection_program_software_disk \u2500\u2500 Topic: 64\n              \u2502    \u2514\u2500\u25a0\u2500\u2500db_windows_dos_mov_os2 \u2500\u2500 Topic: 8\n              \u2514\u2500drive_card_scsi_drives_ide\n                      \u251c\u2500drive_scsi_drives_ide_disk\n                      \u2502    \u251c\u2500\u25a0\u2500\u2500drive_scsi_drives_ide_disk \u2500\u2500 Topic: 6\n                      \u2502    \u2514\u2500\u25a0\u2500\u2500meg_sale_ram_drive_shipping \u2500\u2500 Topic: 12\n                      \u2514\u2500card_modem_monitor_video_drivers\n                          \u251c\u2500\u25a0\u2500\u2500card_monitor_video_drivers_vga \u2500\u2500 Topic: 5\n                          \u2514\u2500\u25a0\u2500\u2500modem_port_serial_irq_com \u2500\u2500 Topic: 10\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-hierarchical-documents","title":"Visualize Hierarchical Documents","text":"

        We can extend the previous method by calculating the topic representation at different levels of the hierarchy and plotting them on a 2D plane. To do so, we first need to calculate the hierarchical topics:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic and extract hierarchical topics\ntopic_model = BERTopic().fit(docs, embeddings)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n
        Then, we can visualize the hierarchical documents by either supplying the function with our original embeddings or by reducing their dimensionality ourselves:

        # Run the visualization with the original embeddings\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True which disables the option to hover over the individual points and see the content of the documents. This makes the resulting visualization smaller and helps it fit into your RAM. However, it might be interesting to set hide_document_hover=False to hover over the points and see the content of the documents.

        "},{"location":"getting_started/visualization/visualization.html#visualize-terms","title":"Visualize Terms","text":"

        We can visualize the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores for each topic representation. Insights can be gained from the relative c-TF-IDF scores between and within topics. Moreover, you can easily compare topic representations to each other. To create the bar charts, run the following:

        topic_model.visualize_barchart()\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-topic-similarity","title":"Visualize Topic Similarity","text":"

        Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply computing the cosine similarities between those topic embeddings. The result is a matrix indicating how similar certain topics are to each other. To visualize the heatmap, run the following:

        topic_model.visualize_heatmap()\n

        Note

        You can set n_clusters in visualize_heatmap to order the topics by their similarity. This will result in blocks being formed in the heatmap indicating which clusters of topics are similar to each other. This step is very much recommended as it will make reading the heatmap easier.
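
        For example, a minimal call that orders the heatmap into a fixed number of clusters (the number of clusters is illustrative):

        topic_model.visualize_heatmap(n_clusters=20)\n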

        "},{"location":"getting_started/visualization/visualization.html#visualize-term-score-decline","title":"Visualize Term Score Decline","text":"

        Topics are represented by a number of words, starting with the best representative word. Each word is represented by a c-TF-IDF score. The higher the score, the more representative a word is of the topic. Since the topic words are sorted by their c-TF-IDF score, the scores slowly decline with each word that is added. At some point, adding words to the topic representation only marginally increases the total c-TF-IDF score and is no longer beneficial for its representation.

        To visualize this effect, we can plot the c-TF-IDF scores for each topic against the term rank of each word. In other words, the position of the words (term rank), where the word with the highest c-TF-IDF score has a rank of 1, is put on the x-axis, while the y-axis shows the c-TF-IDF scores. The result is a visualization that shows the decline of the c-TF-IDF score when adding words to the topic representation. It allows you, using the elbow method, to select the best number of words in a topic.

        To visualize the c-TF-IDF score decline, run the following:

        topic_model.visualize_term_rank()\n

        To enable the log scale on the y-axis for a better view of individual topics, run the following:

        topic_model.visualize_term_rank(log_scale=True)\n

        This visualization was heavily inspired by the \"Term Probability Decline\" visualization found in an analysis by the amazing tmtoolkit. Reference to that specific analysis can be found here.

        "},{"location":"getting_started/visualization/visualization.html#visualize-topics-over-time","title":"Visualize Topics over Time","text":"

        After creating topics over time with Dynamic Topic Modeling, we can visualize these topics by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call .visualize_topics_over_time with the newly created topics over time:

        import re\nimport pandas as pd\nfrom bertopic import BERTopic\n\n# Prepare data\ntrump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')\ntrump.text = trump.apply(lambda row: re.sub(r\"http\\S+\", \"\", row.text).lower(), 1)\ntrump.text = trump.apply(lambda row: \" \".join(filter(lambda x:x[0]!=\"@\", row.text.split())), 1)\ntrump.text = trump.apply(lambda row: \" \".join(re.sub(\"[^a-zA-Z]+\", \" \", row.text).split()), 1)\ntrump = trump.loc[(trump.isRetweet == \"f\") & (trump.text != \"\"), :]\ntimestamps = trump.date.to_list()\ntweets = trump.text.to_list()\n\n# Create topics over time\nmodel = BERTopic(verbose=True)\ntopics, probs = model.fit_transform(tweets)\ntopics_over_time = model.topics_over_time(tweets, timestamps)\n

        Then, we visualize some interesting topics:

        model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-topics-per-class","title":"Visualize Topics per Class","text":"

        You might want to extract and visualize the topic representation per class. For example, if you have specific groups of users that might approach topics differently, extracting those representations would help you understand how these users talk about certain topics. In other words, this simply creates a topic representation for the classes that you might have in your data.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Prepare data and classes\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\nclasses = [data[\"target_names\"][i] for i in data[\"target\"]]\n\n# Create topic model and calculate topics per class\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_per_class = topic_model.topics_per_class(docs, classes=classes)\n

        Then, we visualize the topic representation of major topics per class:

        topic_model.visualize_topics_per_class(topics_per_class)\n
        "},{"location":"getting_started/visualization/visualization.html#visualize-probabilities-or-distribution","title":"Visualize Probabilities or Distribution","text":"

        We can generate the topic-document probability matrix by simply setting calculate_probabilities=True if an HDBSCAN model is used:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs) \n

        The resulting probs variable contains the soft-clustering as done through HDBSCAN.

        If a non-HDBSCAN model is used, we can estimate the topic distributions after training our model:

        from bertopic import BERTopic\n\ntopic_model = BERTopic()\ntopics, _ = topic_model.fit_transform(docs) \ntopic_distr, _ = topic_model.approximate_distribution(docs, min_similarity=0)\n

        Then, we either pass the probs or topic_distr variable to .visualize_distribution to visualize either the probability distributions or the topic distributions:

        # To visualize the probabilities of topic assignment\ntopic_model.visualize_distribution(probs[0])\n\n# To visualize the topic distributions in a document\ntopic_model.visualize_distribution(topic_distr[0])\n

        Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])\ndf\n

        Note

        To get the stylized dataframe for .visualize_approximate_distribution you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via pip install jinja2.

        Note

        The distribution of the probabilities does not give an indication of the distribution of the frequencies of topics across a document. It merely shows how confident BERTopic is that certain topics can be found in a document.

        "},{"location":"getting_started/visualization/visualize_documents.html","title":"Documents","text":""},{"location":"getting_started/visualization/visualize_documents.html#visualize-documents-with-plotly","title":"Visualize documents with Plotly","text":"

        Using the .visualize_topics, we can visualize the topics and get insight into their relationships. However, you might want a more fine-grained approach where we can visualize the documents inside the topics to see if they were assigned correctly or whether they make sense. To do so, we can use the topic_model.visualize_documents() function. This function recalculates the document embeddings and reduces them to 2-dimensional space for easier visualization purposes. This process can be quite expensive, so it is advised to adhere to the following pipeline:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic\ntopic_model = BERTopic().fit(docs, embeddings)\n\n# Run the visualization with the original embeddings\ntopic_model.visualize_documents(docs, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True which disables the option to hover over the individual points and see the content of the documents. This was done for demonstration purposes as saving all those documents in the visualization can be quite expensive and result in large files. However, it might be interesting to set hide_document_hover=False in order to hover over the points and see the content of the documents.

        "},{"location":"getting_started/visualization/visualize_documents.html#custom-hover","title":"Custom Hover","text":"

        When you visualize the documents, you might not always want to see the complete document on hover. Many documents have a shorter piece of information that might be more interesting to visualize, such as their title. To create the hover based on a document's title instead of its content, you can simply pass a variable (titles) containing the title for each document:

        topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)\n
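
        The documentation does not prescribe how the titles variable should be built; as a minimal sketch, assuming the newsgroup documents from above have no explicit titles, you could derive one from the first line of each document (the truncation length is arbitrary):

        # Hypothetical titles: the first line of each document, truncated to 80 characters\ntitles = [doc.strip().splitlines()[0][:80] if doc.strip() else 'Empty document' for doc in docs]\n\n# The hover now shows the title instead of the full document\ntopic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings)\n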
        "},{"location":"getting_started/visualization/visualize_documents.html#visualize-documents-with-datamapplot","title":"Visualize documents with DataMapPlot","text":"

        .visualize_document_datamap provides an alternative way to visualize the documents inside the topics as a static DataMapPlot. Using the same pipeline as above, you can generate a DataMapPlot by running:

        # with the original embeddings\ntopic_model.visualize_document_datamap(docs, embeddings=embeddings)\n\n# with the reduced embeddings\ntopic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\n

        Or if you want to save the resulting figure:

        fig = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)\nfig.savefig(\"path/to/file.png\", bbox_inches=\"tight\")\n
        "},{"location":"getting_started/visualization/visualize_documents.html#visualize-probabilities-or-distribution","title":"Visualize Probabilities or Distribution","text":"

        We can generate the topic-document probability matrix by simply setting calculate_probabilities=True if an HDBSCAN model is used:

        from bertopic import BERTopic\ntopic_model = BERTopic(calculate_probabilities=True)\ntopics, probs = topic_model.fit_transform(docs) \n

        The resulting probs variable contains the soft-clustering as done through HDBSCAN.

        If a non-HDBSCAN model is used, we can estimate the topic distributions after training our model:

        from bertopic import BERTopic\n\ntopic_model = BERTopic()\ntopics, _ = topic_model.fit_transform(docs) \ntopic_distr, _ = topic_model.approximate_distribution(docs, min_similarity=0)\n

        Then, we either pass the probs or topic_distr variable to .visualize_distribution to visualize either the probability distributions or the topic distributions:

        # To visualize the probabilities of topic assignment\ntopic_model.visualize_distribution(probs[0])\n\n# To visualize the topic distributions in a document\ntopic_model.visualize_distribution(topic_distr[0])\n

        Although a topic distribution is nice, we may want to see how each token contributes to a specific topic. To do so, we need to first calculate topic distributions on a token level and then visualize the results:

        # Calculate the topic distributions on a token-level\ntopic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)\n\n# Visualize the token-level distributions\ndf = topic_model.visualize_approximate_distribution(docs[1], topic_token_distr[1])\ndf\n

        Note

        To get the stylized dataframe for .visualize_approximate_distribution you will need to have Jinja installed. If you do not have this installed, an unstylized dataframe will be returned instead. You can install Jinja via pip install jinja2

        Note

        The distribution of the probabilities does not give an indication of the distribution of topic frequencies across a document. It merely shows how confident BERTopic is that certain topics can be found in a document.

        "},{"location":"getting_started/visualization/visualize_hierarchy.html","title":"Hierarchy","text":"

        The topics that you create can be hierarchically reduced. In order to understand the potential hierarchical structure of the topics, we can use scipy.cluster.hierarchy to create clusters and visualize how they relate to one another. This might help to select an appropriate nr_topics when reducing the number of topics that you have created. To visualize this hierarchy, run the following:

        topic_model.visualize_hierarchy()\n

        Note

        Do note that this is not the actual procedure of .reduce_topics() when nr_topics is set to auto, since in that case HDBSCAN is used to automatically extract topics. The visualization above does, however, closely resemble the actual procedure of .reduce_topics() when a specific number of nr_topics is selected.

        "},{"location":"getting_started/visualization/visualize_hierarchy.html#hierarchical-labels","title":"Hierarchical labels","text":"

        Although visualizing this hierarchy gives us information about the structure, it would be helpful to see what happens to the topic representations when merging topics. To do so, we first need to calculate the representations of the hierarchical topics:

        First, we train a basic BERTopic model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))[\"data\"]\ntopic_model = BERTopic(verbose=True)\ntopics, probs = topic_model.fit_transform(docs)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n

        To visualize these results, we simply need to pass the resulting hierarchical_topics to our .visualize_hierarchy function:

        topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)\n

        If you hover over the black circles, you will see the topic representation at that level of the hierarchy. These representations help you understand the effect of merging certain topics. Some might be logical to merge whilst others might not. Moreover, we can now see which sub-topics can be found within certain larger themes.

        "},{"location":"getting_started/visualization/visualize_hierarchy.html#text-based-topic-tree","title":"Text-based topic tree","text":"

        Although this gives a nice overview of the potential hierarchy, hovering over all the black circles can be tiresome. Instead, we can use topic_model.get_topic_tree to create a text-based representation of this hierarchy. Although the general structure is more difficult to view, we can more easily see which topics could logically be merged:

        >>> tree = topic_model.get_topic_tree(hierarchical_topics)\n>>> print(tree)\n.\n\u2514\u2500atheists_atheism_god_moral_atheist\n     \u251c\u2500atheists_atheism_god_atheist_argument\n     \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n     \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n     \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 Topic: 29\n
        Click here to view the full tree.
          .\n  \u251c\u2500people_armenian_said_god_armenians\n  \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u251c\u2500god_jesus_jehovah_lord_christ\n  \u2502    \u2502    \u2502    \u251c\u2500jehovah_lord_mormon_mcconkie_god\n  \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500ra_satan_thou_god_lucifer \u2500\u2500 Topic: 94\n  \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500jehovah_lord_mormon_mcconkie_unto \u2500\u2500 Topic: 78\n  \u2502    \u2502    \u2502    \u2514\u2500jesus_mary_god_hell_sin\n  \u2502    \u2502    \u2502         \u251c\u2500jesus_hell_god_eternal_heaven\n  \u2502    \u2502    \u2502         \u2502    \u251c\u2500hell_jesus_eternal_god_heaven\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500jesus_tomb_disciples_resurrection_john \u2500\u2500 Topic: 69\n  \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500hell_eternal_god_jesus_heaven \u2500\u2500 Topic: 53\n  \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500aaron_baptism_sin_law_god \u2500\u2500 Topic: 89\n  \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500mary_sin_maria_priest_conception \u2500\u2500 Topic: 56\n  \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500marriage_married_marry_ceremony_marriages \u2500\u2500 Topic: 110\n  \u2502    \u2514\u2500people_armenian_armenians_said_mr\n  \u2502         \u251c\u2500people_armenian_armenians_said_israel\n  \u2502         \u2502    \u251c\u2500god_homosexual_homosexuality_atheists_sex\n  \u2502         \u2502    \u2502    \u251c\u2500homosexual_homosexuality_sex_gay_homosexuals\n  \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500kinsey_sex_gay_men_sexual \u2500\u2500 Topic: 44\n  \u2502         \u2502    \u2502    \u2502    \u2514\u2500homosexuality_homosexual_sin_homosexuals_gay\n  \u2502         \u2502    \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500gay_homosexual_homosexuals_sexual_cramer \u2500\u2500 Topic: 50\n  \u2502         \u2502    \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500homosexuality_homosexual_sin_paul_sex \u2500\u2500 Topic: 27\n  \u2502         \u2502    \u2502    \u2514\u2500god_atheists_atheism_moral_atheist\n  \u2502         \u2502    \u2502         \u251c\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500jim_context_challenges_articles_quote \u2500\u2500 Topic: 36\n  \u2502         \u2502    \u2502         \u2502    \u2514\u2500islam_quran_judas_islamic_book\n  \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500islam_quran_islamic_rushdie_muslims \u2500\u2500 Topic: 31\n  \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500judas_scripture_bible_books_greek \u2500\u2500 Topic: 33\n  \u2502         \u2502    \u2502         \u2514\u2500atheists_atheism_god_moral_atheist\n  \u2502         \u2502    \u2502              \u251c\u2500atheists_atheism_god_atheist_argument\n  \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500atheists_atheism_god_atheist_argument \u2500\u2500 Topic: 21\n  \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500br_god_exist_genetic_existence \u2500\u2500 Topic: 124\n  \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500moral_morality_objective_immoral_morals \u2500\u2500 
Topic: 29\n  \u2502         \u2502    \u2514\u2500armenian_armenians_people_israel_said\n  \u2502         \u2502         \u251c\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502    \u251c\u2500tax_rights_government_income_taxes\n  \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500rights_right_slavery_slaves_residence \u2500\u2500 Topic: 106\n  \u2502         \u2502         \u2502    \u2502    \u2514\u2500tax_government_taxes_income_libertarians\n  \u2502         \u2502         \u2502    \u2502         \u251c\u2500\u25a0\u2500\u2500government_libertarians_libertarian_regulation_party \u2500\u2500 Topic: 58\n  \u2502         \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500tax_taxes_income_billion_deficit \u2500\u2500 Topic: 41\n  \u2502         \u2502         \u2502    \u2514\u2500armenian_armenians_israel_people_jews\n  \u2502         \u2502         \u2502         \u251c\u2500gun_guns_militia_firearms_amendment\n  \u2502         \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500blacks_penalty_death_cruel_punishment \u2500\u2500 Topic: 55\n  \u2502         \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500gun_guns_militia_firearms_amendment \u2500\u2500 Topic: 7\n  \u2502         \u2502         \u2502         \u2514\u2500armenian_armenians_israel_jews_turkish\n  \u2502         \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500israel_israeli_jews_arab_jewish \u2500\u2500 Topic: 4\n  \u2502         \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500armenian_armenians_turkish_armenia_azerbaijan \u2500\u2500 Topic: 15\n  \u2502         \u2502         \u2514\u2500stephanopoulos_president_mr_myers_ms\n  \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500serbs_muslims_stephanopoulos_mr_bosnia \u2500\u2500 Topic: 35\n  \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500myers_stephanopoulos_president_ms_mr \u2500\u2500 Topic: 87\n  \u2502         \u2514\u2500batf_fbi_koresh_compound_gas\n  \u2502              \u251c\u2500\u25a0\u2500\u2500reno_workers_janet_clinton_waco \u2500\u2500 Topic: 77\n  \u2502              \u2514\u2500batf_fbi_koresh_gas_compound\n  \u2502                   \u251c\u2500batf_koresh_fbi_warrant_compound\n  \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500batf_warrant_raid_compound_fbi \u2500\u2500 Topic: 42\n  \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500koresh_batf_fbi_children_compound \u2500\u2500 Topic: 61\n  \u2502                   \u2514\u2500\u25a0\u2500\u2500fbi_gas_tear_bds_building \u2500\u2500 Topic: 23\n  \u2514\u2500use_like_just_dont_new\n      \u251c\u2500game_team_year_games_like\n      \u2502    \u251c\u2500game_team_games_25_year\n      \u2502    \u2502    \u251c\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502    \u251c\u2500window_printer_use_problem_mhz\n      \u2502    \u2502    \u2502    \u2502    \u251c\u2500mhz_wire_simms_wiring_battery\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_mhz_battery_cpu_heat\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500simms_pds_simm_vram_lc\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500pds_nubus_lc_slot_card \u2500\u2500 Topic: 119\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500simms_simm_vram_meg_dram 
\u2500\u2500 Topic: 32\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500mhz_battery_cpu_heat_speed\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500mhz_cpu_speed_heat_fan\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500fan_cpu_heat_sink_fans \u2500\u2500 Topic: 92\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500mhz_speed_cpu_fpu_clock \u2500\u2500 Topic: 22\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500monitor_turn_power_computer_electricity \u2500\u2500 Topic: 91\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500battery_batteries_concrete_duo_discharge\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500duo_battery_apple_230_problem \u2500\u2500 Topic: 121\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500battery_batteries_concrete_discharge_temperature \u2500\u2500 Topic: 75\n      \u2502    \u2502    \u2502    \u2502    \u2502    \u2514\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500wire_wiring_ground_neutral_outlets\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500leds_uv_blue_light_boards \u2500\u2500 Topic: 66\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500wire_wiring_ground_neutral_outlets \u2500\u2500 Topic: 120\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502    \u2514\u2500scope_scopes_phone_dial_number\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500dial_number_phone_line_output \u2500\u2500 Topic: 93\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500scope_scopes_motorola_generator_oscilloscope \u2500\u2500 Topic: 113\n      \u2502    \u2502    \u2502    \u2502    \u2502         \u2514\u2500celp_dsp_sampling_antenna_digital\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500antenna_antennas_receiver_cable_transmitter \u2500\u2500 Topic: 70\n      \u2502    \u2502    \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500celp_dsp_sampling_speech_voice \u2500\u2500 Topic: 52\n      \u2502    \u2502    \u2502    \u2502    \u2514\u2500window_printer_xv_mouse_windows\n      \u2502    \u2502    \u2502    \u2502         \u251c\u2500window_xv_error_widget_problem\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u251c\u2500error_symbol_undefined_xterm_rx\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500symbol_error_undefined_doug_parse \u2500\u2500 Topic: 63\n      \u2502    \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500rx_remote_server_xdm_xterm \u2500\u2500 Topic: 45\n      \u2502    \u2502    \u2502    \u2502         \u2502    
\u2514\u2500window_xv_widget_application_expose\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u251c\u2500window_widget_expose_application_event\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500gc_mydisplay_draw_gxxor_drawing \u2500\u2500 Topic: 103\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500window_widget_application_expose_event \u2500\u2500 Topic: 25\n      \u2502    \u2502    \u2502    \u2502         \u2502         \u2514\u2500xv_den_polygon_points_algorithm\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500den_polygon_points_algorithm_polygons \u2500\u2500 Topic: 28\n      \u2502    \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500xv_24bit_image_bit_images \u2500\u2500 Topic: 57\n      \u2502    \u2502    \u2502    \u2502         \u2514\u2500printer_fonts_print_mouse_postscript\n      \u2502    \u2502    \u2502    \u2502              \u251c\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500scanner_logitech_grayscale_ocr_scanman \u2500\u2500 Topic: 108\n      \u2502    \u2502    \u2502    \u2502              \u2502    \u2514\u2500printer_fonts_print_font_deskjet\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u251c\u2500\u25a0\u2500\u2500printer_print_deskjet_hp_ink \u2500\u2500 Topic: 18\n      \u2502    \u2502    \u2502    \u2502              \u2502         \u2514\u2500\u25a0\u2500\u2500fonts_font_truetype_tt_atm \u2500\u2500 Topic: 49\n      \u2502    \u2502    \u2502    \u2502              \u2514\u2500mouse_ghostscript_midi_driver_postscript\n      \u2502    \u2502    \u2502    \u2502                   \u251c\u2500ghostscript_midi_postscript_files_file\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500ghostscript_postscript_pageview_ghostview_dsc \u2500\u2500 Topic: 104\n      \u2502    \u2502    \u2502    \u2502                   \u2502    \u2514\u2500midi_sound_file_windows_driver\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u251c\u2500\u25a0\u2500\u2500location_mar_file_host_rwrr \u2500\u2500 Topic: 83\n      \u2502    \u2502    \u2502    \u2502                   \u2502         \u2514\u2500\u25a0\u2500\u2500midi_sound_driver_blaster_soundblaster \u2500\u2500 Topic: 98\n      \u2502    \u2502    \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500mouse_driver_mice_ball_problem \u2500\u2500 Topic: 68\n      \u2502    \u2502    \u2502    \u2514\u2500game_team_games_25_season\n      \u2502    \u2502    \u2502         \u251c\u25001st_sale_condition_comics_hulk\n      \u2502    \u2502    \u2502         \u2502    \u251c\u2500sale_condition_offer_asking_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u251c\u2500condition_stereo_amp_speakers_asking\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500miles_car_amfm_toyota_cassette \u2500\u2500 Topic: 62\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500amp_speakers_condition_stereo_audio \u2500\u2500 Topic: 24\n      \u2502    \u2502    \u2502         \u2502    \u2502    \u2514\u2500games_sale_pom_cds_shipping\n      \u2502    \u2502    \u2502         \u2502    \u2502         
\u251c\u2500pom_cds_sale_shipping_cd\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500size_shipping_sale_condition_mattress \u2500\u2500 Topic: 100\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500pom_cds_cd_sale_picture \u2500\u2500 Topic: 37\n      \u2502    \u2502    \u2502         \u2502    \u2502         \u2514\u2500\u25a0\u2500\u2500games_game_snes_sega_genesis \u2500\u2500 Topic: 40\n      \u2502    \u2502    \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u251c\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u251c\u2500lens_tape_camera_backup_lenses\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500tape_backup_tapes_drive_4mm \u2500\u2500 Topic: 107\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500lens_camera_lenses_zoom_pouch \u2500\u2500 Topic: 114\n      \u2502    \u2502    \u2502         \u2502         \u2502    \u2514\u25001st_hulk_comics_art_appears\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u25001st_hulk_comics_art_appears \u2500\u2500 Topic: 105\n      \u2502    \u2502    \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500books_book_cover_trek_chemistry \u2500\u2500 Topic: 125\n      \u2502    \u2502    \u2502         \u2502         \u2514\u2500tickets_hotel_ticket_voucher_package\n      \u2502    \u2502    \u2502         \u2502              \u251c\u2500\u25a0\u2500\u2500hotel_voucher_package_vacation_room \u2500\u2500 Topic: 74\n      \u2502    \u2502    \u2502         \u2502              \u2514\u2500\u25a0\u2500\u2500tickets_ticket_june_airlines_july \u2500\u2500 Topic: 84\n      \u2502    \u2502    \u2502         \u2514\u2500game_team_games_season_hockey\n      \u2502    \u2502    \u2502              \u251c\u2500game_hockey_team_25_550\n      \u2502    \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500espn_pt_pts_game_la \u2500\u2500 Topic: 17\n      \u2502    \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500team_25_game_hockey_550 \u2500\u2500 Topic: 2\n      \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500year_game_hit_baseball_players \u2500\u2500 Topic: 0\n      \u2502    \u2502    \u2514\u2500bike_car_greek_insurance_msg\n      \u2502    \u2502         \u251c\u2500car_bike_insurance_cars_engine\n      \u2502    \u2502         \u2502    \u251c\u2500car_insurance_cars_radar_engine\n      \u2502    \u2502         \u2502    \u2502    \u251c\u2500insurance_health_private_care_canada\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500insurance_health_private_care_canada \u2500\u2500 Topic: 99\n      \u2502    \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500insurance_car_accident_rates_sue \u2500\u2500 Topic: 82\n      \u2502    \u2502         \u2502    \u2502    \u2514\u2500car_cars_radar_engine_detector\n      \u2502    \u2502         \u2502    \u2502         \u251c\u2500car_radar_cars_detector_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500radar_detector_detectors_ka_alarm \u2500\u2500 Topic: 39\n      \u2502    \u2502         \u2502    \u2502         
\u2502    \u2514\u2500car_cars_mustang_ford_engine\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500clutch_shift_shifting_transmission_gear \u2500\u2500 Topic: 88\n      \u2502    \u2502         \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500car_cars_mustang_ford_v8 \u2500\u2500 Topic: 14\n      \u2502    \u2502         \u2502    \u2502         \u2514\u2500oil_diesel_odometer_diesels_car\n      \u2502    \u2502         \u2502    \u2502              \u251c\u2500odometer_oil_sensor_car_drain\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500odometer_sensor_speedo_gauge_mileage \u2500\u2500 Topic: 96\n      \u2502    \u2502         \u2502    \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500oil_drain_car_leaks_taillights \u2500\u2500 Topic: 102\n      \u2502    \u2502         \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500diesel_diesels_emissions_fuel_oil \u2500\u2500 Topic: 79\n      \u2502    \u2502         \u2502    \u2514\u2500bike_riding_ride_bikes_motorcycle\n      \u2502    \u2502         \u2502         \u251c\u2500bike_ride_riding_bikes_lane\n      \u2502    \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500bike_ride_riding_lane_car \u2500\u2500 Topic: 11\n      \u2502    \u2502         \u2502         \u2502    \u2514\u2500\u25a0\u2500\u2500bike_bikes_miles_honda_motorcycle \u2500\u2500 Topic: 19\n      \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500countersteering_bike_motorcycle_rear_shaft \u2500\u2500 Topic: 46\n      \u2502    \u2502         \u2514\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u251c\u2500greek_msg_kuwait_greece_water\n      \u2502    \u2502              \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u251c\u2500greek_msg_kuwait_greece_dog\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u251c\u2500greek_kuwait_greece_turkish_greeks\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500greek_greece_turkish_greeks_cyprus \u2500\u2500 Topic: 71\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kuwait_iraq_iran_gulf_arabia \u2500\u2500 Topic: 76\n      \u2502    \u2502              \u2502    \u2502    \u2502    \u2514\u2500msg_dog_drugs_drug_food\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u251c\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500clinton_bush_quayle_reagan_panicking \u2500\u2500 Topic: 101\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502    \u2514\u2500dog_dogs_cooper_trial_weaver\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500cooper_trial_weaver_spence_witnesses \u2500\u2500 Topic: 90\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500dog_dogs_bike_trained_springer \u2500\u2500 Topic: 67\n      \u2502    \u2502              \u2502    \u2502    \u2502         \u2514\u2500msg_drugs_drug_food_chinese\n      \u2502    \u2502              \u2502    \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500msg_food_chinese_foods_taste \u2500\u2500 Topic: 30\n      
\u2502    \u2502              \u2502    \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500drugs_drug_marijuana_cocaine_alcohol \u2500\u2500 Topic: 72\n      \u2502    \u2502              \u2502    \u2502    \u2514\u2500water_theory_universe_science_larsons\n      \u2502    \u2502              \u2502    \u2502         \u251c\u2500water_nuclear_cooling_steam_dept\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500rocketry_rockets_engines_nuclear_plutonium \u2500\u2500 Topic: 115\n      \u2502    \u2502              \u2502    \u2502         \u2502    \u2514\u2500water_cooling_steam_dept_plants\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500water_dept_phd_environmental_atmospheric \u2500\u2500 Topic: 97\n      \u2502    \u2502              \u2502    \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500cooling_water_steam_towers_plants \u2500\u2500 Topic: 109\n      \u2502    \u2502              \u2502    \u2502         \u2514\u2500theory_universe_larsons_larson_science\n      \u2502    \u2502              \u2502    \u2502              \u251c\u2500\u25a0\u2500\u2500theory_universe_larsons_larson_science \u2500\u2500 Topic: 54\n      \u2502    \u2502              \u2502    \u2502              \u2514\u2500\u25a0\u2500\u2500oort_cloud_grbs_gamma_burst \u2500\u2500 Topic: 80\n      \u2502    \u2502              \u2502    \u2514\u2500helmet_kirlian_photography_lock_wax\n      \u2502    \u2502              \u2502         \u251c\u2500helmet_kirlian_photography_leaf_mask\n      \u2502    \u2502              \u2502         \u2502    \u251c\u2500kirlian_photography_leaf_pictures_deleted\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u251c\u2500deleted_joke_stuff_maddi_nickname\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500joke_maddi_nickname_nicknames_frank \u2500\u2500 Topic: 43\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500deleted_stuff_bookstore_joke_motto \u2500\u2500 Topic: 81\n      \u2502    \u2502              \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500kirlian_photography_leaf_pictures_aura \u2500\u2500 Topic: 85\n      \u2502    \u2502              \u2502         \u2502    \u2514\u2500helmet_mask_liner_foam_cb\n      \u2502    \u2502              \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500helmet_liner_foam_cb_helmets \u2500\u2500 Topic: 112\n      \u2502    \u2502              \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500mask_goalies_77_santore_tl \u2500\u2500 Topic: 123\n      \u2502    \u2502              \u2502         \u2514\u2500lock_wax_paint_plastic_ear\n      \u2502    \u2502              \u2502              \u251c\u2500\u25a0\u2500\u2500lock_cable_locks_bike_600 \u2500\u2500 Topic: 117\n      \u2502    \u2502              \u2502              \u2514\u2500wax_paint_ear_plastic_skin\n      \u2502    \u2502              \u2502                   \u251c\u2500\u25a0\u2500\u2500wax_paint_plastic_scratches_solvent \u2500\u2500 Topic: 65\n      \u2502    \u2502              \u2502                   \u2514\u2500\u25a0\u2500\u2500ear_wax_skin_greasy_acne \u2500\u2500 Topic: 116\n      \u2502    \u2502              \u2514\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u251c\u2500m4_mp_14_mw_mo\n      \u2502    \u2502                   \u2502    
\u251c\u2500\u25a0\u2500\u2500m4_mp_14_mw_mo \u2500\u2500 Topic: 111\n      \u2502    \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500test_ensign_nameless_deane_deanebinahccbrandeisedu \u2500\u2500 Topic: 118\n      \u2502    \u2502                   \u2514\u2500\u25a0\u2500\u2500ites_cheek_hello_hi_ken \u2500\u2500 Topic: 3\n      \u2502    \u2514\u2500space_medical_health_disease_cancer\n      \u2502         \u251c\u2500medical_health_disease_cancer_patients\n      \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500cancer_centers_center_medical_research \u2500\u2500 Topic: 122\n      \u2502         \u2502    \u2514\u2500health_medical_disease_patients_hiv\n      \u2502         \u2502         \u251c\u2500patients_medical_disease_candida_health\n      \u2502         \u2502         \u2502    \u251c\u2500\u25a0\u2500\u2500candida_yeast_infection_gonorrhea_infections \u2500\u2500 Topic: 48\n      \u2502         \u2502         \u2502    \u2514\u2500patients_disease_cancer_medical_doctor\n      \u2502         \u2502         \u2502         \u251c\u2500\u25a0\u2500\u2500hiv_medical_cancer_patients_doctor \u2500\u2500 Topic: 34\n      \u2502         \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500pain_drug_patients_disease_diet \u2500\u2500 Topic: 26\n      \u2502         \u2502         \u2514\u2500\u25a0\u2500\u2500health_newsgroup_tobacco_vote_votes \u2500\u2500 Topic: 9\n      \u2502         \u2514\u2500space_launch_nasa_shuttle_orbit\n      \u2502              \u251c\u2500space_moon_station_nasa_launch\n      \u2502              \u2502    \u251c\u2500\u25a0\u2500\u2500sky_advertising_billboard_billboards_space \u2500\u2500 Topic: 59\n      \u2502              \u2502    \u2514\u2500\u25a0\u2500\u2500space_station_moon_redesign_nasa \u2500\u2500 Topic: 16\n      \u2502              \u2514\u2500space_mission_hst_launch_orbit\n      \u2502                   \u251c\u2500space_launch_nasa_orbit_propulsion\n      \u2502                   \u2502    \u251c\u2500\u25a0\u2500\u2500space_launch_nasa_propulsion_astronaut \u2500\u2500 Topic: 47\n      \u2502                   \u2502    \u2514\u2500\u25a0\u2500\u2500orbit_km_jupiter_probe_earth \u2500\u2500 Topic: 86\n      \u2502                   \u2514\u2500\u25a0\u2500\u2500hst_mission_shuttle_orbit_arrays \u2500\u2500 Topic: 60\n      \u2514\u2500drive_file_key_windows_use\n          \u251c\u2500key_file_jpeg_encryption_image\n          \u2502    \u251c\u2500key_encryption_clipper_chip_keys\n          \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500key_clipper_encryption_chip_keys \u2500\u2500 Topic: 1\n          \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500entry_file_ripem_entries_key \u2500\u2500 Topic: 73\n          \u2502    \u2514\u2500jpeg_image_file_gif_images\n          \u2502         \u251c\u2500motif_graphics_ftp_available_3d\n          \u2502         \u2502    \u251c\u2500motif_graphics_openwindows_ftp_available\n          \u2502         \u2502    \u2502    \u251c\u2500\u25a0\u2500\u2500openwindows_motif_xview_windows_mouse \u2500\u2500 Topic: 20\n          \u2502         \u2502    \u2502    \u2514\u2500\u25a0\u2500\u2500graphics_widget_ray_3d_available \u2500\u2500 Topic: 95\n          \u2502         \u2502    \u2514\u2500\u25a0\u2500\u25003d_machines_version_comments_contact \u2500\u2500 Topic: 38\n          \u2502         \u2514\u2500jpeg_image_gif_images_format\n          \u2502              \u251c\u2500\u25a0\u2500\u2500gopher_ftp_files_stuffit_images \u2500\u2500 Topic: 51\n          \u2502        
      \u2514\u2500\u25a0\u2500\u2500jpeg_image_gif_format_images \u2500\u2500 Topic: 13\n          \u2514\u2500drive_db_card_scsi_windows\n              \u251c\u2500db_windows_dos_mov_os2\n              \u2502    \u251c\u2500\u25a0\u2500\u2500copy_protection_program_software_disk \u2500\u2500 Topic: 64\n              \u2502    \u2514\u2500\u25a0\u2500\u2500db_windows_dos_mov_os2 \u2500\u2500 Topic: 8\n              \u2514\u2500drive_card_scsi_drives_ide\n                      \u251c\u2500drive_scsi_drives_ide_disk\n                      \u2502    \u251c\u2500\u25a0\u2500\u2500drive_scsi_drives_ide_disk \u2500\u2500 Topic: 6\n                      \u2502    \u2514\u2500\u25a0\u2500\u2500meg_sale_ram_drive_shipping \u2500\u2500 Topic: 12\n                      \u2514\u2500card_modem_monitor_video_drivers\n                          \u251c\u2500\u25a0\u2500\u2500card_monitor_video_drivers_vga \u2500\u2500 Topic: 5\n                          \u2514\u2500\u25a0\u2500\u2500modem_port_serial_irq_com \u2500\u2500 Topic: 10\n
        "},{"location":"getting_started/visualization/visualize_hierarchy.html#visualize-hierarchical-documents","title":"Visualize Hierarchical Documents","text":"

        We can extend the previous method by calculating the topic representation at different levels of the hierarchy and plotting them on a 2D plane. To do so, we first need to calculate the hierarchical topics:

        from sklearn.datasets import fetch_20newsgroups\nfrom sentence_transformers import SentenceTransformer\nfrom bertopic import BERTopic\nfrom umap import UMAP\n\n# Prepare embeddings\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\nsentence_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\nembeddings = sentence_model.encode(docs, show_progress_bar=False)\n\n# Train BERTopic and extract hierarchical topics\ntopic_model = BERTopic().fit(docs, embeddings)\nhierarchical_topics = topic_model.hierarchical_topics(docs)\n
        Then, we can visualize the hierarchical documents by either supplying it with our embeddings or by reducing their dimensionality ourselves:

        # Run the visualization with the original embeddings\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings)\n\n# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:\nreduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)\ntopic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)\n

        Note

        The visualization above was generated with the additional parameter hide_document_hover=True, which disables the option to hover over the individual points and see the content of the documents. This keeps the resulting visualization smaller and helps it fit into your RAM. However, it might be interesting to set hide_document_hover=False to hover over the points and see the content of the documents.

        "},{"location":"getting_started/visualization/visualize_terms.html","title":"Terms","text":"

        We can visualize the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores for each topic representation. Insights can be gained from the relative c-TF-IDF scores between and within topics. Moreover, you can easily compare topic representations to each other. To create this visualization, run the following:

        topic_model.visualize_barchart()\n
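
        If you want more control over the chart, .visualize_barchart also accepts parameters such as top_n_topics and n_words; the values below are merely illustrative:

        # Show the top 10 words for each of the 8 largest topics\ntopic_model.visualize_barchart(top_n_topics=8, n_words=10)\n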
        "},{"location":"getting_started/visualization/visualize_terms.html#visualize-term-score-decline","title":"Visualize Term Score Decline","text":"

        Topics are represented by a number of words, starting with the most representative word. Each word is represented by a c-TF-IDF score; the higher the score, the more representative a word is of the topic. Since the topic words are sorted by their c-TF-IDF score, the scores slowly decline with each word that is added. At some point, adding words to the topic representation only marginally increases the total c-TF-IDF score and is no longer beneficial for the representation.

        To visualize this effect, we can plot the c-TF-IDF scores for each topic against the term rank of each word. In other words, the x-axis shows the position of the words (term rank), where the word with the highest c-TF-IDF score has a rank of 1, and the y-axis shows the corresponding c-TF-IDF scores. The result is a visualization that shows you the decline of the c-TF-IDF score as words are added to the topic representation. It allows you, using the elbow method, to select the best number of words for a topic.

        To visualize the c-TF-IDF score decline, run the following:

        topic_model.visualize_term_rank()\n

        To enable the log scale on the y-axis for a better view of individual topics, run the following:

        topic_model.visualize_term_rank(log_scale=True)\n
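
        If you prefer to inspect the decline numerically rather than visually, a minimal sketch (assuming a fitted topic_model; the topic id 0 is only an example) pulls the word/score pairs straight from the model:

        # .get_topic returns (word, c-TF-IDF score) tuples, sorted by descending score\nwords_and_scores = topic_model.get_topic(0)\nfor rank, (word, score) in enumerate(words_and_scores, start=1):\n    print(rank, word, round(score, 4))\n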

        This visualization was heavily inspired by the \"Term Probability Decline\" visualization found in an analysis by the amazing tmtoolkit. Reference to that specific analysis can be found here.

        "},{"location":"getting_started/visualization/visualize_topics.html","title":"Topics","text":"

        Visualizing BERTopic and its derivatives is important in understanding the model, how it works, and more importantly, where it works. Since topic modeling can be quite a subjective field, it is difficult for users to validate their models. Looking at the topics and seeing whether they make sense is an important factor in alleviating this issue.

        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topics","title":"Visualize Topics","text":"

        After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can visualize the topics that were generated in a way very similar to LDAvis.

        We embed our c-TF-IDF representation of the topics in 2D using UMAP and then visualize the two dimensions using Plotly so that we can create an interactive view.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\ndocs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs) \n

        Then, we can call .visualize_topics to create a 2D representation of the topics. The result is an interactive Plotly graph, which can be converted to HTML:

        topic_model.visualize_topics()\n
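
        Since the result is a regular Plotly figure, one way to convert it to HTML is through Plotly's own write_html; the file name below is just an example:

        # Save the interactive figure as a standalone HTML file\nfig = topic_model.visualize_topics()\nfig.write_html('my_topics.html')\n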

        You can use the slider to select the topic which then lights up red. If you hover over a topic, then general information is given about the topic, including the size of the topic and its corresponding words.

        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topic-similarity","title":"Visualize Topic Similarity","text":"

        Having generated topic embeddings, through both c-TF-IDF and document embeddings, we can create a similarity matrix by simply applying cosine similarity to those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other. To visualize the heatmap, run the following:

        topic_model.visualize_heatmap()\n

        Note

        You can set n_clusters in visualize_heatmap to order the topics by their similarity. This will result in blocks being formed in the heatmap indicating which clusters of topics are similar to each other. This step is very much recommended as it will make reading the heatmap easier.
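
        As a short sketch of that recommendation (the number of clusters is illustrative and should be smaller than the number of topics):

        # Order the topics by similarity so that blocks of related topics appear in the heatmap\ntopic_model.visualize_heatmap(n_clusters=20)\n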

        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topics-over-time","title":"Visualize Topics over Time","text":"

        After creating topics over time with Dynamic Topic Modeling, we can visualize these topics by leveraging the interactive abilities of Plotly. Plotly allows us to show the frequency of topics over time whilst giving the option of hovering over the points to show the time-specific topic representations. Simply call .visualize_topics_over_time with the newly created topics over time:

        import re\nimport pandas as pd\nfrom bertopic import BERTopic\n\n# Prepare data\ntrump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')\ntrump.text = trump.apply(lambda row: re.sub(r\"http\\S+\", \"\", row.text).lower(), 1)\ntrump.text = trump.apply(lambda row: \" \".join(filter(lambda x:x[0]!=\"@\", row.text.split())), 1)\ntrump.text = trump.apply(lambda row: \" \".join(re.sub(\"[^a-zA-Z]+\", \" \", row.text).split()), 1)\ntrump = trump.loc[(trump.isRetweet == \"f\") & (trump.text != \"\"), :]\ntimestamps = trump.date.to_list()\ntweets = trump.text.to_list()\n\n# Create topics over time\nmodel = BERTopic(verbose=True)\ntopics, probs = model.fit_transform(tweets)\ntopics_over_time = model.topics_over_time(tweets, timestamps)\n

        Then, we visualize some interesting topics:

        model.visualize_topics_over_time(topics_over_time, topics=[9, 10, 72, 83, 87, 91])\n
        "},{"location":"getting_started/visualization/visualize_topics.html#visualize-topics-per-class","title":"Visualize Topics per Class","text":"

        You might want to extract and visualize the topic representation per class. For example, if you have specific groups of users that might approach topics differently, then extracting those representations would help you understand how these users talk about certain topics. In other words, this simply creates a topic representation for certain classes that you might have in your data.

        First, we need to train our model:

        from bertopic import BERTopic\nfrom sklearn.datasets import fetch_20newsgroups\n\n# Prepare data and classes\ndata = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))\ndocs = data[\"data\"]\nclasses = [data[\"target_names\"][i] for i in data[\"target\"]]\n\n# Create topic model and calculate topics per class\ntopic_model = BERTopic()\ntopics, probs = topic_model.fit_transform(docs)\ntopics_per_class = topic_model.topics_per_class(docs, classes=classes)\n

        Then, we visualize the topic representation of major topics per class:

        topic_model.visualize_topics_per_class(topics_per_class)\n
        "},{"location":"getting_started/zeroshot/zeroshot.html","title":"Zero-shot Topic Modeling","text":"

        Zero-shot Topic Modeling is a technique that allows you to find predefined topics in large amounts of documents. When faced with many documents, you often have an idea of which topics will definitely be in there, whether that is a result of simply knowing your data or because a domain expert is involved in defining those topics.

        This method allows you to not only find those specific topics but also create new topics for documents that do not fit with your predefined topics. This allows for extensive flexibility, as there are three scenarios to explore:

        • First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found.
        • Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics.
        • Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
        \"Religion\" the labels Embed cosine similaritydocumentzeroshot For each document, assign topics based on between and embeddings that could not be assigned to a zero-shot topic Cluster documents the to the to create a single list of topics Appendclusteringtopicszero-shot topics zeroshot topicslabels Define through . \"Clustering\" Zeroshot topic 1 \"Topic Modeling\" Zeroshot topic 2 \"Large Language Models (LLM)\" Zeroshot topic 3 \"Topic Modeling\" \"Large Language Models\" Topic Modeling Cluster non-assigned docs + Topic X Topic Y Topic Z Zero-shot Topics Topic Modeling LLM Clustering Merged BERTopic Topic Modeling LLM Clustering Topic X Topic Y Topic Z LLM No match found Clustering

        This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with those of the predefined labels using cosine similarity. If a document's similarity passes a user-defined threshold, the corresponding zero-shot topic is assigned to it. If it does not, that document, along with others, will follow the regular BERTopic pipeline and attempt to find clusters that do not fit with the zero-shot topics.
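
        To make the assignment step concrete, below is a minimal sketch of the underlying idea (not BERTopic's internal implementation) using sentence-transformers; the model name and the 0.85 threshold are only examples:

        from sentence_transformers import SentenceTransformer, util\n\n# Embed both the predefined labels and the documents\nembedding_model = SentenceTransformer('thenlper/gte-small')\nzeroshot_topic_list = ['Clustering', 'Topic Modeling', 'Large Language Models']\nlabel_embeddings = embedding_model.encode(zeroshot_topic_list)\ndoc_embeddings = embedding_model.encode(docs)\n\n# Cosine similarity between every document and every label\nsimilarities = util.cos_sim(doc_embeddings, label_embeddings)\n\n# Documents whose best match exceeds the threshold get that zero-shot topic;\n# the remaining documents would go through the regular clustering pipeline\nbest_scores, best_labels = similarities.max(dim=1)\nassigned_to_zeroshot = best_scores >= 0.85\n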

        "},{"location":"getting_started/zeroshot/zeroshot.html#example","title":"Example","text":"

        In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, there may be several other topics that we know should be in the documents. The dataset that we use is a small subset of ArXiv papers. We know the data and believe there to be at least the following topics: clustering, topic modeling, and large language models. However, we are not sure whether other topics exist and want to explore those.

        Zero-shot BERTopic needs two parameters:
        • zeroshot_topic_list - The names of the topics to assign documents to. Making these as descriptive as possible helps improve the assignment, since assignments are based on cosine similarities between embeddings.
        • zeroshot_min_similarity - The minimum cosine similarity needed to match a document to a zero-shot topic. It is a value between 0 and 1.

        Using this feature is straightforward:

        from datasets import load_dataset\n\nfrom bertopic import BERTopic\nfrom bertopic.representation import KeyBERTInspired\n\n# We select a subsample of 5000 abstracts from ArXiv\ndataset = load_dataset(\"CShorten/ML-ArXiv-Papers\")[\"train\"]\ndocs = dataset[\"abstract\"][:5_000]\n\n# We define a number of topics that we know are in the documents\nzeroshot_topic_list = [\"Clustering\", \"Topic Modeling\", \"Large Language Models\"]\n\n# We fit our model using the zero-shot topics\n# and we define a minimum similarity. For each document,\n# if the similarity does not exceed that value, it will be used\n# for clustering instead.\ntopic_model = BERTopic(\n    embedding_model=\"thenlper/gte-small\", \n    min_topic_size=15,\n    zeroshot_topic_list=zeroshot_topic_list,\n    zeroshot_min_similarity=.85,\n    representation_model=KeyBERTInspired()\n)\ntopics, _ = topic_model.fit_transform(docs)\n

        When you run topic_model.get_topic_info(), you will see something like this:

        The zeroshot_min_similarity parameter controls how many of the documents are assigned to the predefined zero-shot topics. Lower this value and more documents will be assigned to zero-shot topics while fewer documents will be clustered. Increase this value and fewer documents will be assigned to zero-shot topics while more documents will be clustered.

        Note

        Setting the zeroshot_min_similarity parameter requires a bit of experimentation. Some embedding models have different similarity distributions, so trying out the values manually and exploring the results is highly advised.
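
        One rough way to perform that experimentation, as a sketch reusing the docs and zeroshot_topic_list from the example above (the threshold values are illustrative and refitting can be expensive on large corpora), is to refit with a few candidate values and compare the resulting topic overviews:

        for threshold in [0.75, 0.80, 0.85, 0.90]:\n    model = BERTopic(\n        embedding_model='thenlper/gte-small',\n        zeroshot_topic_list=zeroshot_topic_list,\n        zeroshot_min_similarity=threshold,\n    )\n    model.fit(docs)\n    print(threshold)\n    print(model.get_topic_info().head())\n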

        Tip

        Because zero-shot topic modeling is essentially merging two different topic models, the probs will be empty initially. If you want to have the probabilities of topics across documents, you can run topic_model.transform on your documents to extract the updated probs.
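
        A short sketch of that tip, reusing the fitted model from the example above:

        # Re-run inference on the same documents to obtain the topic-document probabilities\ntopics, probs = topic_model.transform(docs)\n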

        "}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index aff64e8f..99c4c6f5 100755 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,347 +2,347 @@ https://maartengr.github.io/BERTopic/index.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/changelog.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/faq.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/usecases.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/algorithm/algorithm.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/bertopic.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/ctfidf.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/onlinecv.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/backends/base.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/backends/cohere.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/backends/openai.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/backends/word_doc.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/cluster/base.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/dimensionality/base.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/barchart.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/distribution.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/document_datamap.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/documents.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/dtm.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/heatmap.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/hierarchical_documents.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/hierarchy.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/term.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/topics.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/plotting/topics_per_class.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/base.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/cohere.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/generation.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/keybert.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/langchain.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/mmr.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/openai.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/pos.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/api/representation/zeroshot.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html - 2024-06-14 + 2024-07-22 daily 
https://maartengr.github.io/BERTopic/getting_started/clustering/clustering.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/embeddings/embeddings.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/guided/guided.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/manual/manual.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/merge/merge.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/multiaspect/multiaspect.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/multimodal/multimodal.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/online/online.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/representation/llm.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/representation/representation.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/search/search.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/seed_words/seed_words.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/semisupervised/semisupervised.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/serialization/serialization.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/supervised/supervised.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/topicreduction/topicreduction.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/topicrepresentation/topicrepresentation.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/topicsovertime/topicsovertime.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/visualization/visualization.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_documents.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_hierarchy.html - 
2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_terms.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_topics.html - 2024-06-14 + 2024-07-22 daily https://maartengr.github.io/BERTopic/getting_started/zeroshot/zeroshot.html - 2024-06-14 + 2024-07-22 daily \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 479081f2..3ff41e8f 100755 Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ