/**
 * Remove a prefix shared by the first text line of every cue.
 *
 * The shared prefix is found by comparing the first lines byte by byte and
 * is then shortened, if necessary, so that it never ends inside a multi-byte
 * UTF-8 character. The input is returned untouched when:
 *  - there are two cues or fewer (too little evidence of a repeated prefix),
 *  - the cues share no prefix, or a cue has no first line at all,
 *  - removing the prefix would leave some cue's first line empty
 *    (for example when every cue carries exactly the same text).
 *
 * NOTE(review): any shared prefix is removed, including ordinary words such as
 * a common leading "The " - confirm that this is the intended scope.
 *
 * @param array $internal_format Cues; only $cue['lines'][0] is read and rewritten.
 * @return array The cues with the shared prefix stripped from each first line.
 */
public static function removeRepeatingTextStarts($internal_format)
{
    if (count($internal_format) <= 2) {
        return $internal_format; // don't try to filter if there are almost no lines
    }

    $first_lines = [];
    foreach ($internal_format as $subtitle) {
        // A cue without text counts as an empty line, which forces an empty prefix.
        $first_lines[] = (string) ($subtitle['lines'][0] ?? '');
    }

    $reference = $first_lines[0];
    $prefix_length = strlen($reference);
    $shortest_length = $prefix_length;

    foreach ($first_lines as $line) {
        $line_length = strlen($line);
        $shortest_length = min($shortest_length, $line_length);

        // Shrink the candidate prefix to the part this line also starts with.
        $limit = min($prefix_length, $line_length);
        $matched = 0;
        while ($matched < $limit && $line[$matched] === $reference[$matched]) {
            $matched++;
        }
        $prefix_length = $matched;

        if ($prefix_length === 0) {
            return $internal_format; // nothing in common
        }
    }

    // Bytes of the form 10xxxxxx continue a UTF-8 character. If the byte right
    // after the prefix is one of them, the prefix ends mid-character, so back
    // off to the start of that character. Assumes the text is UTF-8.
    while (
        $prefix_length > 0
        && isset($reference[$prefix_length])
        && (ord($reference[$prefix_length]) & 0xC0) === 0x80
    ) {
        $prefix_length--;
    }

    // Never strip a prefix that makes up the whole first line of some cue:
    // that would erase the cue's text instead of cleaning it.
    if ($prefix_length === 0 || $prefix_length >= $shortest_length) {
        return $internal_format;
    }

    foreach ($internal_format as &$subtitle) {
        $subtitle['lines'][0] = substr($subtitle['lines'][0], $prefix_length);
    }
    unset($subtitle);

    return $internal_format;
}
/**
 * Loads three rows of the form "<timestamp>:<text>" and expects the parsed
 * cues to carry only the texts a, b and c, timed 0-1, 1-2 and 2-3.
 */
public function testRemoveRepeatingTextFromBeginningOfText()
{
    $expected = (new Subtitles())
        ->add(0, 1, 'a')
        ->add(1, 2, 'b')
        ->add(2, 3, 'c')
        ->getInternalFormat();

    $txt = '
00:00:00:a
00:00:01:b
00:00:02:c
        ';
    $actual = Subtitles::loadFromString($txt)->getInternalFormat();

    $this->assertInternalFormatsEqual($expected, $actual);
}