diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index 8ca31c8f..f45024ae 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -126,6 +126,13 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) + def extract_topics( self, topic_model, diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index df5c4839..264d1b20 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -148,6 +148,12 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) def extract_topics( self, diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py index 83b18952..321b13fd 100644 --- a/bertopic/representation/_llamacpp.py +++ b/bertopic/representation/_llamacpp.py @@ -118,6 +118,12 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) def extract_topics( self, diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 8fd25a1b..95a7b991 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -180,6 +180,13 @@ def __init__( if not self.generator_kwargs.get("stop") and not chat: self.generator_kwargs["stop"] = "\n" + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) + def extract_topics( self, topic_model, diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py index b028e575..9205f9d6 100644 --- a/bertopic/representation/_textgeneration.py +++ b/bertopic/representation/_textgeneration.py @@ -114,6 +114,12 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) def extract_topics( self, diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index 2a99fd1f..4a524045 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -1,8 +1,9 @@ import random import time +from typing import Union -def truncate_document(topic_model, doc_length, tokenizer, document: str): +def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str: """Truncate a document to a certain length. If you want to add a custom tokenizer, then it will need to have a `decode` and