Fix multiple numbers with the same decimal place accumulating

Now "one hundred two hundred" is correctly interpreted as "100 200", instead of "10,200".
ideasman42 · Jan 9, 2022 · 569fdae · 569fdae
1 parent bf89cfd
commit 569fdae
Show file tree

Hide file tree

Showing 3 changed files with 198 additions and 56 deletions.
diff --git a/changelog.rst b/changelog.rst
@@ -3,7 +3,9 @@
 Changelog
 #########
 
-- 2022/01/05: Fix bug adding where quoted numbers would accumulate,
+- 2022/01/09: Fix ``--numbers-as-digits`` bug where numbers with the same decimal place value were accumulated,
+  e.g. "one hundred two hundred" is now interpreted as "100 200" instead of "10,200".
+- 2022/01/05: Fix ``--numbers-as-digits`` bug where quoted numbers would accumulate,
   e.g. "one hundred and fifty twelve" would be interpreted as "162" instead of "150 12".
 - 2022/01/05: Fix bug interpreting a series of numbers over 9, as well as stripping leading zeros.
 - 2021/10/05: Fix bug where quickly running begin/end would leave dictation enabled.

diff --git a/nerd-dictation b/nerd-dictation
@@ -311,55 +311,10 @@ class from_words_to_digits:
         valid_zero_words,
     ) = from_words_to_digits_setup_once()
 
-    @staticmethod
-    def _parse_number_as_series_of_units(
-        word_list: List[str],
-        word_index: int,
-    ) -> Tuple[str, str, int, bool]:
-        """
-        Convert a series of unit numbers into a single int.
-        `one two three` -> `123`.
-        """
-        number_words = from_words_to_digits._number_words
-
-        # Never re-format, as interpreting this as a whole number removes trailing leading.
-        allow_reformat = False
-
-        # First detect a series of numbers, e.g:
-        # two four six eight
-        # Should be 2468 not the result of (2+4+6+8).
-        word_index_init = word_index
-        unit_numbers: List[int] = []
-        increment_prev = -1
-        while word_index < len(word_list):
-            word_data = number_words.get(word_list[word_index])
-            if word_data is None:
-                break
-
-            scale, increment, suffix, is_final = word_data
-            # Never accumulate numbers with "and" this can stay as a literal.
-            if not is_final:
-                break
-            if suffix == "" and scale == 1 and increment < 100:
-                # This allows speaking: "fifty fifty one"
-                # To be interpreted as 5051.
-                if unit_numbers and (increment_prev >= 20) and (increment < 10) and (increment != 0):
-                    unit_numbers[-1] += increment
-                else:
-                    unit_numbers.append(increment)
-                increment_prev = increment
-                word_index += 1
-            else:
-                break
-
-        if len(unit_numbers) > 1:
-            return "".join([str(value) for value in unit_numbers]), "", word_index, allow_reformat
-
-        return "", "", word_index_init, False
-
     @staticmethod
     def _parse_number_as_whole_value(
         word_list: List[str],
+        word_list_len: int,
         word_index: int,
         imply_single_unit: bool = False,
     ) -> Tuple[str, str, int, bool]:
@@ -386,8 +341,9 @@ class from_words_to_digits:
         result_final = ("", "", word_index, allow_reformat)
 
         # Loop while splitting to break into individual words.
-        while word_index < len(word_list):
+        while word_index < word_list_len:
             word_data = number_words.get(word_list[word_index])
+
             if word_data is None:
                 # raise Exception('Illegal word: ' + word)
                 break
@@ -452,21 +408,142 @@ class from_words_to_digits:
         # Return the result plus the current.
         return "{:d}".format(result + current), suffix, word_index, allow_reformat
 
+    @staticmethod
+    def _allow_follow_on_word(w_prev: str, w: str) -> bool:
+        valid_unit_words = from_words_to_digits.valid_unit_words
+        number_words = from_words_to_digits._number_words
+
+        if not w_prev in valid_unit_words:
+            return False
+        if not w in valid_unit_words:
+            return False
+        increment_prev = number_words[w_prev][1]
+        increment = number_words[w][1]
+        if (increment_prev >= 20) and (increment < 10) and (increment != 0):
+            return True
+        return False
+
+    @staticmethod
+    def parse_number_calc_delimiter_from_series(
+        word_list: List[str],
+        word_index: int,
+        word_index_len: int,
+    ) -> int:
+
+        valid_unit_words = from_words_to_digits.valid_unit_words
+        number_words = from_words_to_digits._number_words
+
+        i = word_index
+        i_span_beg = word_index
+        w_prev = ""
+        result_prev = None
+        result_test = None
+        while i < word_index_len:
+            w = word_list[i]
+            if w not in number_words:
+                break
+
+            if (i != word_index) and from_words_to_digits._allow_follow_on_word(word_list[i - 1], w):
+                # Don't set `w_prev` so we can detect "thirteen and fifty five" without the last "five" delimiting.
+                pass
+            else:
+                if (w_prev not in {"", "and"}) and w in valid_unit_words:
+                    # Exception ... allow "thirty three", two words...
+                    result_prev = result_test
+                    result_test = from_words_to_digits._parse_number_as_whole_value(
+                        word_list,
+                        i,  # Limit.
+                        i_span_beg,  # Split start.
+                    )
+                    assert result_test[2] == i
+
+                    if result_prev:
+                        if len(result_prev[0]) == len(result_test[0]):
+                            return result_prev[2]
+
+                    i_span_beg = i
+                w_prev = w
+            i += 1
+
+        result_prev = result_test
+        result_test = from_words_to_digits._parse_number_as_whole_value(
+            word_list,
+            i,  # Limit.
+            i_span_beg,  # Split start.
+        )
+
+        if result_prev:
+            if len(result_prev[0]) == len(result_test[0]):
+                return result_prev[2]
+
+        return word_index_len
+
+    @staticmethod
+    def parse_number_calc_delimiter_from_slide(
+        word_list: List[str],
+        word_index: int,
+        word_index_len: int,
+    ) -> int:
+
+        valid_unit_words = from_words_to_digits.valid_unit_words
+        number_words = from_words_to_digits._number_words
+        i = word_index
+        w_prev = ""
+        while i < word_index_len:
+            w = word_list[i]
+            if w not in number_words:
+                break
+            if (i != word_index) and from_words_to_digits._allow_follow_on_word(word_list[i - 1], w):
+                # Don't set `w_prev` so we can detect "thirteen and fifty five" without the last "five" delimiting.
+                pass
+            else:
+                if (w_prev not in {"", "and"}) and w in valid_unit_words:
+                    result_test_lhs = from_words_to_digits._parse_number_as_whole_value(
+                        word_list,
+                        i,  # Limit.
+                        word_index,  # Split start.
+                    )
+                    result_test_rhs = from_words_to_digits._parse_number_as_whole_value(
+                        word_list,
+                        word_index_len,  # Limit.
+                        i,  # Split start.
+                    )
+
+                    # If the number on the right has at least as many digits as the left, split here.
+                    if len(result_test_lhs[0]) <= len(result_test_rhs[0]):
+                        return result_test_lhs[2]
+
+                w_prev = w
+            i += 1
+
+        return word_index_len
+
     @staticmethod
     def parse_number(
         word_list: List[str],
         word_index: int,
         imply_single_unit: bool = False,
     ) -> Tuple[str, str, int, bool]:
 
-        # Check if this is a series of unit values, in this case it makes most sense to put the number in a series
-        # (think reciting a phone number).
-        ret = from_words_to_digits._parse_number_as_series_of_units(word_list, word_index)
-        if ret[2] != word_index:
-            return ret
+        word_list_len = len(word_list)
+
+        # Delimit, prevent accumulating "one hundred two hundred" -> "10,200" for example.
+        word_list_len = from_words_to_digits.parse_number_calc_delimiter_from_series(
+            word_list,
+            word_index,
+            word_list_len,
+        )
+        word_list_len = from_words_to_digits.parse_number_calc_delimiter_from_slide(
+            word_list,
+            word_index,
+            word_list_len,
+        )
 
         return from_words_to_digits._parse_number_as_whole_value(
-            word_list, word_index, imply_single_unit=imply_single_unit
+            word_list,
+            word_list_len,
+            word_index,
+            imply_single_unit=imply_single_unit,
         )
 
     @staticmethod
@@ -513,6 +590,24 @@ class from_words_to_digits:
                     i -= 1
             i += 1
 
+        # Group numbers - recite single digit phone numbers for example.
+        # This could be optional, but generally seems handy (good default behavior),
+        # e.g. "twenty twenty" -> "2020".
+        i = 0
+        while i < len(word_list):
+            if word_list[i].isdigit() and len(word_list[i]) <= 2:
+                j = i + 1
+                while j < len(word_list):
+                    if word_list[j].isdigit() and len(word_list[j]) <= 2:
+                        j += 1
+                    else:
+                        break
+                if i + 1 != j:
+                    word_list[i:j] = ["".join(word_list[i:j])]
+                i = j
+            else:
+                i += 1
+
 
 # -----------------------------------------------------------------------------
 # Process Text

diff --git a/tests/from_words_to_digits.py b/tests/from_words_to_digits.py
@@ -90,7 +90,7 @@ def name_of_caller(frame=1):
 # -----------------------------------------------------------------------------
 # Tests
 
-VERBOSE = True
+VERBOSE = False
 
 import unittest
 
@@ -105,7 +105,7 @@ def assertNumberFromTextEqual(self, words_input, expected_output):
         actual_output = tuple(words)
         expected_output = tuple(expected_output.split())
         if VERBOSE:
-            print("{:>38}: {!r} -> {!r}".format(name_of_caller(frame=2), words_input, " ".join(actual_output)))
+            print("{:>54}: {!r} -> {!r}".format(name_of_caller(frame=2), words_input, " ".join(actual_output)))
         self.assertEqual(actual_output, expected_output)
 
 
@@ -160,6 +160,51 @@ def test_zero(self):
         self.assertNumberFromTextEqual("two hundred and zero", "200 and 0")
         self.assertNumberFromTextEqual("two hundred and zero and one", "200 and 0 and 1")
 
+    def test_hundreds(self):
+        self.assertNumberFromTextEqual("one hundred two hundred", "100 200")
+        self.assertNumberFromTextEqual("one hundred two hundred three hundred", "100 200 300")
+
+    def test_hundreds_and_units(self):
+        self.assertNumberFromTextEqual("one hundred two hundred and one", "100 201")
+        self.assertNumberFromTextEqual("one hundred two hundred three hundred and one", "100 200 301")
+
+    def test_hundreds_complex_1(self):
+        self.assertNumberFromTextEqual("one hundred two hundred thousand and one", "100 200,001")
+
+    def test_hundreds_complex_2(self):
+        self.assertNumberFromTextEqual(
+            "one hundred and three two hundred and thirteen thirteen thousand three hundred and two",
+            ("103 " "213 " "13,302"),
+        )
+
+    def test_hundreds_complex_3(self):
+        self.assertNumberFromTextEqual(
+            (
+                "ninety two "
+                "three hundred and three "
+                "two hundred and thirteen "
+                "thirteen thousand three hundred "
+                "four hundred"
+            ),
+            ("92 " "303 " "213 " "13,300 " "400"),
+        )
+
+    def test_mixed_complex_pairs_1(self):
+        self.assertNumberFromTextEqual(
+            (
+                "fifteen million and two "
+                "three thousand and forty four hundred and three "
+                "five million and thirty three"
+            ),
+            ("15,000,002 " "7,403 " "5,000,033"),
+        )
+
+    def test_mixed_complex_pairs_2(self):
+        self.assertNumberFromTextEqual(
+            ("sixty hundred " "fifty thousand"),
+            ("6,000 " "50,000"),
+        )
+
 
 if __name__ == "__main__":
     nerd_dictation = execfile_as_module(