MaartenGr · MaartenGr · Dec 9, 2024 · Oct 14, 2024 · Nov 18, 2024 · Nov 21, 2024
diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
@@ -4,7 +4,7 @@
 from scipy.sparse import csr_matrix
 from typing import Mapping, List, Tuple, Union, Callable
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 
 DEFAULT_PROMPT = """
@@ -126,12 +126,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.prompts_ = []
 
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py
@@ -4,7 +4,7 @@
 from typing import Callable, Mapping, List, Tuple, Union
 
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 DEFAULT_PROMPT = "What are these documents about? Please give a single label."
 
@@ -148,12 +148,8 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
+
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
@@ -4,7 +4,7 @@
 from llama_cpp import Llama
 from typing import Mapping, List, Tuple, Any, Union, Callable
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 
 DEFAULT_PROMPT = """
@@ -118,12 +118,7 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
@@ -8,6 +8,7 @@
 from bertopic.representation._utils import (
     retry_with_exponential_backoff,
     truncate_document,
+    validate_truncate_document_parameters
 )
 
 
@@ -180,12 +181,7 @@ def __init__(
         if not self.generator_kwargs.get("stop") and not chat:
             self.generator_kwargs["stop"] = "\n"
 
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py
@@ -5,7 +5,7 @@
 from transformers.pipelines.base import Pipeline
 from typing import Mapping, List, Tuple, Any, Union, Callable
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 
 DEFAULT_PROMPT = """
@@ -114,12 +114,7 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,

diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
@@ -58,6 +58,15 @@ def decode(self, doc_chunks):
         return truncated_document
     return document
 
+def validate_truncate_document_parameters(tokenizer, doc_length) -> None:
+    """Validate `truncate_document` parameters: raise ValueError if `doc_length` is set but `tokenizer` is None."""
+    if tokenizer is None and doc_length is not None:
+        raise ValueError(
+            "Please select from one of the valid options for the `tokenizer` parameter: \n"
+            "{'char', 'whitespace', 'vectorizer'} \n"
+            "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+        )
+
 
 def retry_with_exponential_backoff(
     func,