Skip to content

Commit

Permalink
refactor: removed vocab_size from test_create_shuffled_dataset_chunk
Browse files: browse the repository at this point in the history
  • Loading branch information
le1nux committed Jan 28, 2025
1 parent 9ebaeca commit 7c6da5d
Showing 1 changed file with 2 additions and 7 deletions.
9 changes: 2 additions & 7 deletions tests/end2end_tests/test_create_shuffled_dataset_chunk.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -57,15 +57,13 @@ def pbin_file_path_list(files_num_documents: list[int]) -> list[Path]:
)
def test_create_shuffled_dataset_chunk(
pbin_file_path_list: list[Path],
files_num_documents: list[int],
num_chunks: int,
global_seed: int,
expect_error: bool,
):
def create_chunks(
num_chunks: int,
pbin_file_path_list: list[Path],
vocab_size: int,
) -> list[np.ndarray]:
chunks = []
parent_dir = pbin_file_path_list[0].parent
Expand All @@ -76,7 +74,6 @@ def create_chunks(
output_chunk_file_path=chunk_file_path,
chunk_id=chunk_id,
num_chunks=num_chunks,
vocab_size=vocab_size,
file_existence_policy=FileExistencePolicy.ERROR,
global_seed=global_seed,
)
Expand All @@ -85,13 +82,11 @@ def create_chunks(
chunks.append(tokenized_dataset)
return chunks

vocab_size = sum(files_num_documents)

if expect_error:
with pytest.raises(ValueError):
create_chunks(num_chunks, pbin_file_path_list, vocab_size)
create_chunks(num_chunks, pbin_file_path_list)
return
chunks = create_chunks(num_chunks, pbin_file_path_list, vocab_size)
chunks = create_chunks(num_chunks, pbin_file_path_list)

chunks_combined = []
for i in range(num_chunks):
Expand Down

0 comments on commit 7c6da5d

Please sign in to comment.