Skip to content

Commit

Permalink
Fixed Issue: #1977
Browse files (browse the repository at this point in the history)
  • Loading branch information
SSivakumar12 committed Oct 22, 2024
1 parent 9518035 commit 2601ac3
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 1 deletion.
7 changes: 7 additions & 0 deletions bertopic/representation/_cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ def __init__(
self.tokenizer = tokenizer
self.prompts_ = []

if self.tokenizer is None and self.doc_length is not None:
raise ValueError(
"Please select from one of the valid options for the `tokenizer` parameter: \n"
"{'char', 'whitespace', 'vectorizer'} \n"
"If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
)

def extract_topics(
self,
topic_model,
Expand Down
6 changes: 6 additions & 0 deletions bertopic/representation/_langchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ def __init__(
self.diversity = diversity
self.doc_length = doc_length
self.tokenizer = tokenizer
if self.tokenizer is None and self.doc_length is not None:
raise ValueError(
"Please select from one of the valid options for the `tokenizer` parameter: \n"
"{'char', 'whitespace', 'vectorizer'} \n"
"If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
)

def extract_topics(
self,
Expand Down
6 changes: 6 additions & 0 deletions bertopic/representation/_llamacpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ def __init__(
self.tokenizer = tokenizer

self.prompts_ = []
if self.tokenizer is None and self.doc_length is not None:
raise ValueError(
"Please select from one of the valid options for the `tokenizer` parameter: \n"
"{'char', 'whitespace', 'vectorizer'} \n"
"If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
)

def extract_topics(
self,
Expand Down
7 changes: 7 additions & 0 deletions bertopic/representation/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,13 @@ def __init__(
if not self.generator_kwargs.get("stop") and not chat:
self.generator_kwargs["stop"] = "\n"

if self.tokenizer is None and self.doc_length is not None:
raise ValueError(
"Please select from one of the valid options for the `tokenizer` parameter: \n"
"{'char', 'whitespace', 'vectorizer'} \n"
"If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
)

def extract_topics(
self,
topic_model,
Expand Down
6 changes: 6 additions & 0 deletions bertopic/representation/_textgeneration.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ def __init__(
self.tokenizer = tokenizer

self.prompts_ = []
if self.tokenizer is None and self.doc_length is not None:
raise ValueError(
"Please select from one of the valid options for the `tokenizer` parameter: \n"
"{'char', 'whitespace', 'vectorizer'} \n"
"If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
)

def extract_topics(
self,
Expand Down
3 changes: 2 additions & 1 deletion bertopic/representation/_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import random
import time
+from typing import Union


-def truncate_document(topic_model, doc_length, tokenizer, document: str):
+def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str:
"""Truncate a document to a certain length.
If you want to add a custom tokenizer, then it will need to have a `decode` and
Expand Down

0 comments on commit 2601ac3

Please sign in to comment.