From d4cc4587319af5b10e7485c02ce70a8e3ab35395 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 7 Jan 2025 16:39:58 -0500 Subject: [PATCH] Update test_chunkers to be more modular Signed-off-by: Aakanksha Duggal --- tests/functional/test_chunkers.py | 69 +++++++++++++++---------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index d364ac48..f06db504 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -8,56 +8,53 @@ # First Party from instructlab.sdg.utils.chunkers import DocumentChunker +# Constants for Test Directory and Test Documents TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata") +TEST_DOCUMENTS = { + "pdf": "sample_documents/phoenix.pdf", + "md": "sample_documents/phoenix.md", +} + + +@pytest.fixture(scope="module") +def test_paths(): + """Fixture to return paths to test documents.""" + return { + doc_type: Path(os.path.join(TEST_DATA_DIR, path)) + for doc_type, path in TEST_DOCUMENTS.items() + } @pytest.fixture def tokenizer_model_name(): + """Fixture to return the path to the tokenizer model.""" return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -def test_chunk_pdf(tmp_path, tokenizer_model_name): - pdf_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.pdf")) - leaf_node = [ - { - "documents": ["Lorem ipsum"], - "filepaths": [pdf_path], - "taxonomy_path": "knowledge", - } - ] - chunker = DocumentChunker( - document_paths=[pdf_path], - output_dir=tmp_path, - tokenizer_model_name=tokenizer_model_name, - server_ctx_size=4096, - chunk_word_count=500, - ) - chunks = chunker.chunk_documents() - assert len(chunks) > 9 - assert "Phoenix is a minor constellation" in chunks[0] - for chunk in chunks: - # inexact sanity-checking of chunk max length - assert len(chunk) < 2500 - - -def test_chunk_md(tmp_path, tokenizer_model_name): - markdown_path = Path(os.path.join(TEST_DATA_DIR, "sample_documents", "phoenix.md")) - leaf_node = [ - { - "documents": [markdown_path.read_text(encoding="utf-8")], - "filepaths": [markdown_path], - "taxonomy_path": "knowledge", - } - ] +@pytest.mark.parametrize( + "doc_type, expected_chunks, contains_text", + [ + ("pdf", 9, "Phoenix is a minor constellation"), + ("md", 7, None), # Assuming there's no specific text to check in Markdown + ], +) +def test_chunk_documents( + tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text +): + """ + Generalized test function for chunking documents. + """ + document_path = test_paths[doc_type] chunker = DocumentChunker( - document_paths=[markdown_path], + document_paths=[document_path], output_dir=tmp_path, tokenizer_model_name=tokenizer_model_name, server_ctx_size=4096, chunk_word_count=500, ) chunks = chunker.chunk_documents() - assert len(chunks) > 7 + assert len(chunks) > expected_chunks + if contains_text: + assert contains_text in chunks[0] for chunk in chunks: - # inexact sanity-checking of chunk max length assert len(chunk) < 2500