From 2353f4c21d74e33e34e30dbae938304bff094792 Mon Sep 17 00:00:00 2001 From: Maarten Grootendorst Date: Mon, 22 Jul 2024 10:18:46 +0200 Subject: [PATCH] v0.16.3 (#2093) --- bertopic/_bertopic.py | 3 + docs/changelog.md | 27 +++ docs/getting_started/zeroshot/zeroshot.md | 14 +- docs/getting_started/zeroshot/zeroshot.svg | 186 ++++++++++----------- pyproject.toml | 2 +- 5 files changed, 129 insertions(+), 103 deletions(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 4dff5c1e..0c732338 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -2387,6 +2387,7 @@ def visualize_topics( self, topics: List[int] = None, top_n_topics: int = None, + use_ctfidf: bool = False, custom_labels: bool = False, title: str = "Intertopic Distance Map", width: int = 650, @@ -2403,6 +2404,7 @@ def visualize_topics( For example, if you want to visualize only topics 1 through 5: `topics = [1, 2, 3, 4, 5]`. top_n_topics: Only select the top n most frequent topics + use_ctfidf: Whether to use c-TF-IDF representations instead of the embeddings from the embedding model. custom_labels: Whether to use custom topic labels that were defined using `topic_model.set_topic_labels`. title: Title of the plot. @@ -2428,6 +2430,7 @@ def visualize_topics( self, topics=topics, top_n_topics=top_n_topics, + use_ctfidf=use_ctfidf, custom_labels=custom_labels, title=title, width=width, diff --git a/docs/changelog.md b/docs/changelog.md index c9b57246..68a4239f 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,33 @@ hide: # Changelog + +## **Version 0.16.3** +*Release date: 22 July, 2024* + +

Highlights:

+ +* Simplify zero-shot topic modeling by [@ianrandman](https://github.com/ianrandman) in [#2060](https://github.com/MaartenGr/BERTopic/pull/2060) +* Option to choose between c-TF-IDF and Topic Embeddings in many functions by [@azikoss](https://github.com/azikoss) in [#1894](https://github.com/MaartenGr/BERTopic/pull/1894) + * Use the `use_ctfidf` parameter in the following functions to choose between c-TF-IDF and topic embeddings: + * `hierarchical_topics`, `reduce_topics`, `visualize_hierarchy`, `visualize_heatmap`, `visualize_topics` +* Linting with Ruff by [@afuetterer](https://github.com/afuetterer) in [#2033](https://github.com/MaartenGr/BERTopic/pull/2033) +* Switch from setup.py to pyproject.toml by [@afuetterer](https://github.com/afuetterer) in [#1978](https://github.com/MaartenGr/BERTopic/pull/1978) +* In multi-aspect context, allow Main model to be chained by [@ddicato](https://github.com/ddicato) in [#2002](https://github.com/MaartenGr/BERTopic/pull/2002) + +

Fixes:

+ +* Added templates for [issues](https://github.com/MaartenGr/BERTopic/tree/master/.github/ISSUE_TEMPLATE) and [pull requests](https://github.com/MaartenGr/BERTopic/blob/master/.github/PULL_REQUEST_TEMPLATE.md) +* Update River documentation example by [@Proteusiq](https://github.com/Proteusiq) in [#2004](https://github.com/MaartenGr/BERTopic/pull/2004) +* Fix PartOfSpeech reproducibility by [@Greenpp](https://github.com/Greenpp) in [#1996](https://github.com/MaartenGr/BERTopic/pull/1996) +* Fix PartOfSpeech ignoring first word by [@Greenpp](https://github.com/Greenpp) in [#2024](https://github.com/MaartenGr/BERTopic/pull/2024) +* Make sklearn embedding backend auto-select more cautious by [@freddyheppell](https://github.com/freddyheppell) in [#1984](https://github.com/MaartenGr/BERTopic/pull/1984) +* Fix typos by [@afuetterer](https://github.com/afuetterer) in [#1974](https://github.com/MaartenGr/BERTopic/pull/1974) +* Fix hierarchical_topics(...) when the distances between three clusters are the same by [@azikoss](https://github.com/azikoss) in [#1929](https://github.com/MaartenGr/BERTopic/pull/1929) +* Fixes to chain strategy example in outlier_reduction.md by [@reuning](https://github.com/reuning) in [#2065](https://github.com/MaartenGr/BERTopic/pull/2065) +* Remove obsolete flake8 config and update line length by [@afuetterer](https://github.com/afuetterer) in [#2066](https://github.com/MaartenGr/BERTopic/pull/2066) + + ## **Version 0.16.2** *Release date: 12 May, 2024* diff --git a/docs/getting_started/zeroshot/zeroshot.md b/docs/getting_started/zeroshot/zeroshot.md index 81916da9..951f6f0c 100644 --- a/docs/getting_started/zeroshot/zeroshot.md +++ b/docs/getting_started/zeroshot/zeroshot.md @@ -1,21 +1,17 @@ Zero-shot Topic Modeling is a technique that allows you to find topics in large amounts of documents that were predefined. When faced with many documents, you often have an idea of which topics will definitely be in there. 
Whether that is a result of simply knowing your data or if a domain expert is involved in defining those topics. This method allows you to not only find those specific topics but also create new topics for documents that would not fit with your predefined topics. -This allows for extensive flexibility as there are three scenario's to explore. +This allows for extensive flexibility as there are three scenarios to explore: -First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found. - -Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics. - -Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run. +* First, both zero-shot topics and clustered topics were detected. This means that some documents would fit with the predefined topics where others would not. For the latter, new topics were found. +* Second, only zero-shot topics were detected. Here, we would not need to find additional topics since all original documents were assigned to one of the predefined topics. +* Third, no zero-shot topics were detected. This means that none of the documents would fit with the predefined topics and a regular BERTopic would be run.
--8<-- "docs/getting_started/zeroshot/zeroshot.svg"
-This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with the predefined labels using cosine similarity. If they pass a user-defined threshold, the zero-shot topic is assigned to a document. If it does not, then that document, along with others, will be put through a regular BERTopic model. - -This creates two models. One for the zero-shot topics and one for the non-zero-shot topics. We combine these two BERTopic models to create a single model that contains both zero-shot and non-zero-shot topics. +This method works as follows. First, we create a number of labels for our predefined topics and embed them using any embedding model. Then, we compare the embeddings of the documents with the predefined labels using cosine similarity. If they pass a user-defined threshold, the zero-shot topic is assigned to a document. If it does not, then that document, along with others, will follow the regular BERTopic pipeline and attempt to find clusters that do not fit with the zero-shot topics. ### **Example** In order to use zero-shot BERTopic, we create a list of topics that we want to assign to our documents. However, diff --git a/docs/getting_started/zeroshot/zeroshot.svg b/docs/getting_started/zeroshot/zeroshot.svg index 27702068..5b41d1f3 100644 --- a/docs/getting_started/zeroshot/zeroshot.svg +++ b/docs/getting_started/zeroshot/zeroshot.svg @@ -4,16 +4,16 @@ -"Clustering" +"Religion" the labels Embed cosine similaritydocumentzeroshot For each document, assign topics based on between and embeddings -ManualBERTopicBERTopic -Create two models: (zeroshot documents) (non-zeroshot documents) - the models into one -Merge + that could not be assigned to a zero-shot topic +Cluster documents + the to the to create a single list of topics +Appendclusteringtopicszero-shot topics zeroshot topicslabels Define through . 
"Clustering" @@ -46,105 +46,105 @@ - - - - - - + + + + + + - - - - - + + + + + Topic Modeling -BERTopic +Cluster non-assigned docs + Topic X Topic Y Topic Z - + - - + + - + - + - - + + - + - + - - + + - + -Manual BERTopic +Zero-shot Topics Topic Modeling LLM Clustering - - - - - - + + + + + + - - - - - + + + + + - + - - + + - + - + - - + + - + @@ -154,124 +154,124 @@ LLM Clustering - - - - - - + + + + + + - - - - - + + + + + Topic X Topic Y Topic Z - + - - + + - + - + - - + + - + - + - - + + - + - + - - + + - + - + - - + + - + - + - - + + - + -Large Language Models +LLM - + - - + + - + No match found - + - - + + - + diff --git a/pyproject.toml b/pyproject.toml index 2dce9bc3..d8fe13e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "bertopic" -version = "0.16.2" +version = "0.16.3" description = "BERTopic performs topic Modeling with state-of-the-art transformer models." readme = "README.md" license = {file = "LICENSE"}