Skip to content

Commit

Permalink
Merge pull request #29 from scottwater/sentence_endings
Browse files Browse the repository at this point in the history
Handles text that does not end in whitespace for SentenceTextSplitter
  • Loading branch information
moekiorg authored Oct 11, 2024
2 parents 9876eda + e04c7ae commit 7934467
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 20 deletions.
2 changes: 1 addition & 1 deletion lib/baran/sentence_text_splitter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def initialize(chunk_size: 1024, chunk_overlap: 64)

# Splits +text+ into sentences.
#
# A sentence is a run of characters other than `.`, `!` or `?`, followed by
# one or more of those terminators, which must in turn be followed by
# whitespace or by the end of the string. Text that does not fit that shape
# (for example a trailing fragment with no terminator) is not matched and is
# therefore not returned.
#
# @param text [String] the text to split
# @return [Array<String>] the matched sentences with surrounding whitespace stripped
def splitted(text)
  # `\z` lets the final sentence match even when nothing follows its terminator.
  sentence_pattern = /[^.!?]+[.!?]+(?:\s+|\z)/
  text.scan(sentence_pattern).map(&:strip)
end
end
end
48 changes: 29 additions & 19 deletions test/test_sentence_text_spliter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
class TestSentenceTextSplitter < MiniTest::Unit::TestCase
def setup
@splitter = Baran::SentenceTextSplitter.new(chunk_size: 10, chunk_overlap: 5)
end

def test_chunks
story = <<~TEXT
@story = <<~TEXT
Hack and jill
went up the hill to fetch
a pail of water. Jack fell
Expand All @@ -19,23 +16,36 @@ def test_chunks
No, the water was splashed on Bo Peep.
TEXT

chunks = @splitter.chunks(story)

sentences = chunks
.map { |chunk|
chunk[:text]
.gsub(/\s+/, ' ')
.strip
}

expected = [
"Hack and jill went up the hill to fetch a pail of water.",
"Jack fell down and broke his crown and Jill came tumbling after.",
"The pail went flying!",
"Was the water spilled?",
@expected =[
"Hack and jill went up the hill to fetch a pail of water.",
"Jack fell down and broke his crown and Jill came tumbling after.",
"The pail went flying!",
"Was the water spilled?",
"No, the water was splashed on Bo Peep."
]
end

# Chunking the fixture story (which ends with a newline from the heredoc)
# should yield exactly the expected sentences.
def test_chunks
  sentences = format_chunks(@splitter.chunks(@story))
  # assert_equal takes (expected, actual); keeping that order makes the
  # failure message label the two values correctly.
  assert_equal(@expected, sentences)
end

# Same expectation as the plain chunking test, but with the story's leading
# and trailing whitespace stripped, so the last sentence is followed directly
# by the end of the string.
def test_chunks_without_trailing_whitespace
  sentences = format_chunks(@splitter.chunks(@story.strip))
  # assert_equal takes (expected, actual); keeping that order makes the
  # failure message label the two values correctly.
  assert_equal(@expected, sentences)
end


private

assert_equal(sentences, expected)
# Normalizes chunk hashes into plain strings for comparison: takes each
# chunk's :text, collapses every run of whitespace (including newlines) into
# a single space, and strips the ends.
def format_chunks(chunks)
  chunks.map do |entry|
    single_spaced = entry[:text].gsub(/\s+/, ' ')
    single_spaced.strip
  end
end
end

0 comments on commit 7934467

Please sign in to comment.