Fixed Issue: #1977

Commit 2601ac3 · MaartenGr · Oct 22, 2024
1 parent: 9518035
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 1 deletion.
diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
@@ -126,6 +126,13 @@ def __init__(
         self.tokenizer = tokenizer
         self.prompts_ = []
 
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
+
     def extract_topics(
         self,
         topic_model,

diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py
@@ -148,6 +148,12 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
@@ -118,6 +118,12 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
@@ -180,6 +180,13 @@ def __init__(
         if not self.generator_kwargs.get("stop") and not chat:
             self.generator_kwargs["stop"] = "\n"
 
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
+
     def extract_topics(
         self,
         topic_model,

diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py
@@ -114,6 +114,12 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
@@ -1,8 +1,9 @@
 import random
 import time
+from typing import Union
 
 
-def truncate_document(topic_model, doc_length, tokenizer, document: str):
+def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str:
     """Truncate a document to a certain length.
 
     If you want to add a custom tokenizer, then it will need to have a `decode` and