Skip to content

Commit

Permalink
Fixed Issue: #1977
Browse files: browse the repository at this point in the history
  • Loading branch information
SSivakumar12 committed Oct 14, 2024
1 parent 9518035 commit 3f715d3
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion bertopic/representation/_utils.py
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
import random
import time
from typing import Union


def truncate_document(topic_model, doc_length, tokenizer, document: str):
def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str:
"""Truncate a document to a certain length.
If you want to add a custom tokenizer, then it will need to have a `decode` and
Expand Down Expand Up @@ -54,6 +55,12 @@ def decode(self, doc_chunks):
elif hasattr(tokenizer, "encode") and hasattr(tokenizer, "decode"):
encoded_document = tokenizer.encode(document)
truncated_document = tokenizer.decode(encoded_document[:doc_length])
else:
raise ValueError(
"Please select from one of the valid options for the `tokenizer` parameter: \n"
"{'char', 'whitespace', 'vectorizer'} \n"
"Alternatively if `tokenizer` is a callable ensure it has methods to encode and decode a document "
)
return truncated_document
return document

Expand Down

0 comments on commit 3f715d3

Please sign in to comment.