Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed issue #2144 #2180

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 38 additions & 18 deletions bertopic/_bertopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,21 +478,20 @@ def fit_transform(
if documents.Document.values[0] is None:
custom_documents = self._images_to_text(documents, embeddings)

# Extract topics by calculating c-TF-IDF
self._extract_topics(custom_documents, embeddings=embeddings)
self._create_topic_vectors(documents=documents, embeddings=embeddings)

# Reduce topics
# Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations.
self._extract_topics(custom_documents, embeddings=embeddings, calculate_representation=not self.nr_topics)
if self.nr_topics:
custom_documents = self._reduce_topics(custom_documents)
self._create_topic_vectors(documents=documents, embeddings=embeddings)

# Save the top 3 most representative documents per topic
self._save_representative_docs(custom_documents)
else:
# Extract topics by calculating c-TF-IDF
self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)

# Reduce topics
else:
# Extract topics by calculating c-TF-IDF, reduce topics if needed, and get representations.
self._extract_topics(
documents, embeddings=embeddings, verbose=self.verbose, calculate_representation=not self.nr_topics
)
if self.nr_topics:
documents = self._reduce_topics(documents)

Expand Down Expand Up @@ -3972,6 +3971,7 @@ def _extract_topics(
embeddings: np.ndarray = None,
mappings=None,
verbose: bool = False,
calculate_representation: bool = True,
):
"""Extract topics from the clusters using a class-based TF-IDF.

Expand All @@ -3980,18 +3980,29 @@ def _extract_topics(
embeddings: The document embeddings
mappings: The mappings from topic to word
verbose: Whether to log the process of extracting topics
calculate_representation: Whether to extract the topic representations
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be the "additional" topic representations instead since c-TF-IDF is already calculated? I'm struggling a bit with having calculated c-TF-IDF before or not. Now it seems like c-TF-IDF is not calculated at all.

Copy link
Contributor Author

@PipaFlores PipaFlores Oct 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it should be the "additional" representation

The c-TF-IDF is always calculated, since is not conditional to that boolean argument.
Is calculated in line 3995 self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic) and then used to calculate the default representation in def _extract_words_per_topic
if not self.representation_model or not calculate_representation: # Default representation: c_tf_idf + top_n_words topics = {label: values[: self.top_n_words] for label, values in topics.items()}


Returns:
c_tf_idf: The resulting matrix giving a value (importance score) for each word per topic
"""
if verbose:
logger.info("Representation - Extracting topics from clusters using representation models.")
action = "Representation" if calculate_representation else "Topics"
logger.info(
f"{action} - Extracting topics from clusters{' using representation models' if calculate_representation else ''}."
)

Comment on lines +3989 to +3993
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be sure I understood this correctly, are the two options as follows:

"Representation - Extracting topics from clusters using representation models"

Does this refer to the case where all representations are calculated (and not just c-TF-IDF)?

"Topics - Extracting topics from clusters"

Does this then refer to the case where only c-TF-IDF is calculate?

Also, perhaps it would be nice to create a variable for the second if/else statement the same way you did for the first.

method = ' using representation models' if calculate_representation else ''
logger.info(f"{action} - Extracting topics from clusters{method}."

Copy link
Contributor Author

@PipaFlores PipaFlores Oct 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, the "Topics - Extracting topics from clusters" refers to the case where only the c-TF-IDF is calculated.
And I would suggest changing it to "Extracting base representation from bag of words"

I can make the change suggested for the if else statement, but please check my comment below.

documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
self.topic_representations_ = self._extract_words_per_topic(words, documents)
self.topic_representations_ = self._extract_words_per_topic(
words,
documents,
calculate_representation=calculate_representation,
calculate_aspects=calculate_representation,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be sure I understand this correctly. You added this here because when you only calculate the base representation (calculate_representation=False) you do not want the additional representations (calculate_aspects=False), right?

If so, we might need to think about the term calculate_representation because it should refer to fine-tuning representations instead right? Or did you name it did way because technically, it might also be the default representation when you set it as "Main"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thats right.

Indeed is about fine tuning. But yea, the nomenclature here is confusing for me as well.

It could be changed to "fine_tune_representations"

I will gather my thoughts on one single comment bellow, since this touches on the other comments.

)
self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)

if verbose:
logger.info("Representation - Completed \u2713")
logger.info(f"{action} - Completed \u2713")

def _save_representative_docs(self, documents: pd.DataFrame):
"""Save the 3 most representative docs per topic.
Expand Down Expand Up @@ -4245,6 +4256,7 @@ def _extract_words_per_topic(
words: List[str],
documents: pd.DataFrame,
c_tf_idf: csr_matrix = None,
calculate_representation: bool = True,
calculate_aspects: bool = True,
) -> Mapping[str, List[Tuple[str, float]]]:
"""Based on tf_idf scores per topic, extract the top n words per topic.
Expand All @@ -4258,6 +4270,7 @@ def _extract_words_per_topic(
words: List of all words (sorted according to tf_idf matrix position)
documents: DataFrame with documents and their topic IDs
c_tf_idf: A c-TF-IDF matrix from which to calculate the top words
calculate_representation: Whether to calculate the topic representations
calculate_aspects: Whether to calculate additional topic aspects

Returns:
Expand Down Expand Up @@ -4288,15 +4301,15 @@ def _extract_words_per_topic(

# Fine-tune the topic representations
topics = base_topics.copy()
if not self.representation_model:
if not self.representation_model or not calculate_representation:
# Default representation: c_tf_idf + top_n_words
topics = {label: values[: self.top_n_words] for label, values in topics.items()}
elif isinstance(self.representation_model, list):
elif calculate_representation and isinstance(self.representation_model, list):
for tuner in self.representation_model:
topics = tuner.extract_topics(self, documents, c_tf_idf, topics)
elif isinstance(self.representation_model, BaseRepresentation):
elif calculate_representation and isinstance(self.representation_model, BaseRepresentation):
topics = self.representation_model.extract_topics(self, documents, c_tf_idf, topics)
elif isinstance(self.representation_model, dict):
elif calculate_representation and isinstance(self.representation_model, dict):
if self.representation_model.get("Main"):
main_model = self.representation_model["Main"]
if isinstance(main_model, BaseRepresentation):
Expand Down Expand Up @@ -4350,6 +4363,13 @@ def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> p
if isinstance(self.nr_topics, int):
if self.nr_topics < initial_nr_topics:
documents = self._reduce_to_n_topics(documents, use_ctfidf)
else:
logger.info(
f"Topic reduction - Number of topics ({self.nr_topics}) is equal or higher than the clustered topics({len(self.get_topics())})."
)
documents = self._sort_mappings_by_frequency(documents)
self._extract_topics(documents, verbose=self.verbose)
return documents
elif isinstance(self.nr_topics, str):
documents = self._auto_reduce_topics(documents, use_ctfidf)
else:
Expand Down Expand Up @@ -4412,7 +4432,7 @@ def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)

# Update representations
documents = self._sort_mappings_by_frequency(documents)
self._extract_topics(documents, mappings=mappings)
self._extract_topics(documents, mappings=mappings, verbose=self.verbose)

self._update_topic_size(documents)
return documents
Expand Down Expand Up @@ -4468,7 +4488,7 @@ def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False)
# Update documents and topics
self.topic_mapper_.add_mappings(mapped_topics, topic_model=self)
documents = self._sort_mappings_by_frequency(documents)
self._extract_topics(documents, mappings=mappings)
self._extract_topics(documents, mappings=mappings, verbose=self.verbose)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you added verbosity here. I purposefully left this to False to prevent too much logging but was wondering why you think it was necessary to add it.

self._update_topic_size(documents)
return documents

Expand Down
1 change: 1 addition & 0 deletions tests/test_representation/test_representations.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ def test_topic_reduction_edge_cases(model, documents, request):
topics = np.random.randint(-1, nr_topics - 1, len(documents))
old_documents = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
topic_model._update_topic_size(old_documents)
old_documents = topic_model._sort_mappings_by_frequency(old_documents)
topic_model._extract_topics(old_documents)
old_freq = topic_model.get_topic_freq()

Expand Down