Skip to content
This repository has been archived by the owner on Nov 11, 2024. It is now read-only.

Commit

Permalink
Add empty section cleaner
Browse files Browse the repository at this point in the history
  • Loading branch information
major committed Oct 18, 2024
1 parent 0bf8cbc commit 20f201f
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
12 changes: 12 additions & 0 deletions tests/test_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from unittest import mock

from langchain_core.documents import Document

from textprep import splitter


Expand Down Expand Up @@ -57,3 +59,13 @@ def test_parse_markdown_functional(errata_doc):
assert "security update" in result["frontmatter"]["title"]
assert "Synopsis" in result["content"][0].metadata["H2"]
assert "Moderate" in result["content"][0].page_content


def test_remove_empty_sections():
    """Test that sections holding only the "(none)" placeholder are removed.

    Checks both how many sections survive and which one, so the test fails
    if the filter ever keeps the placeholder and drops the real content.
    """
    sections = [
        Document(metadata={"H1": "header"}, page_content="Valid data"),
        Document(metadata={"H2": "header"}, page_content="(none)"),
    ]
    result = splitter.remove_empty_sections(sections)
    assert len(result) == 1
    # The survivor must be the section with real content, not the placeholder.
    assert result[0].page_content == "Valid data"
    assert result[0].metadata == {"H1": "header"}
5 changes: 5 additions & 0 deletions textprep/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,8 @@ def split_markdown_by_headers(content: str) -> list:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
doc_splits = markdown_splitter.split_text(content)
return list(doc_splits) if doc_splits else []


def remove_empty_sections(sections: list) -> list:
    """Drop sections whose content is only the "(none)" placeholder.

    Each item's ``page_content`` is stripped of surrounding whitespace and
    compared with the literal ``"(none)"``. Matching items are left out;
    all other items are returned in their original order in a new list.
    The input list is not modified.
    """
    placeholder = "(none)"
    kept = []
    for section in sections:
        body = section.page_content.strip()
        if body == placeholder:
            continue
        kept.append(section)
    return kept

0 comments on commit 20f201f

Please sign in to comment.