Skip to content

Commit

Permalink
Fix multiple numbers with the same decimal place accumulating
Browse files Browse the repository at this point in the history
Now "one hundred two hundred" is correctly interpreted as "100 200",
instead of "10,200".
  • Loading branch information
ideasman42 committed Jan 9, 2022
1 parent bf89cfd commit 569fdae
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 56 deletions.
4 changes: 3 additions & 1 deletion changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
Changelog
#########

- 2022/01/05: Fix bug adding where quoted numbers would accumulate,
- 2022/01/09: Fix ``--numbers-as-digits`` bug where numbers with the same decimal place value were accumulated,
e.g. "one hundred two hundred" is now interpreted as "100 200" instead of "10,200".
- 2022/01/05: Fix ``--numbers-as-digits`` bug where quoted numbers would accumulate,
e.g. "one hundred and fifty twelve" would be interpreted as "162" instead of "150 12".
- 2022/01/05: Fix bug interpreting a series of numbers over 9, as well as stripping leading zeros.
- 2021/10/05: Fix bug where quickly running begin/end would leave dictation enabled.
Expand Down
201 changes: 148 additions & 53 deletions nerd-dictation
Original file line number Diff line number Diff line change
Expand Up @@ -311,55 +311,10 @@ class from_words_to_digits:
valid_zero_words,
) = from_words_to_digits_setup_once()

@staticmethod
def _parse_number_as_series_of_units(
    word_list: List[str],
    word_index: int,
) -> Tuple[str, str, int, bool]:
    """
    Join a run of spoken unit numbers into a single digit string.
    `one two three` -> `123`.

    The result is ``(digits, suffix, next_word_index, allow_reformat)``.
    When fewer than two values are collected nothing is consumed and
    ``("", "", word_index, False)`` is returned, so the caller can compare
    the returned index against the one it passed in.
    """
    lookup = from_words_to_digits._number_words

    first_index = word_index
    collected: List[int] = []
    last_increment = -1

    # A series such as "two four six eight" must read as 2468,
    # not as the sum (2+4+6+8).
    while word_index < len(word_list):
        entry = lookup.get(word_list[word_index])
        if entry is None:
            break

        scale, increment, suffix, is_final = entry

        # Words that are not final (such as "and") end the series and stay literal.
        if not is_final:
            break

        # Only plain values below one hundred, without a suffix, take part in a series.
        if not (suffix == "" and scale == 1 and increment < 100):
            break

        # Speaking "fifty fifty one" should give 5051: a non-zero single digit
        # directly after a value of twenty or more is added onto that value.
        if collected and (last_increment >= 20) and (increment < 10) and (increment != 0):
            collected[-1] += increment
        else:
            collected.append(increment)

        last_increment = increment
        word_index += 1

    if len(collected) <= 1:
        return "", "", first_index, False

    # Re-formatting is never allowed for a series
    # (presumably so that leading zeros are kept as spoken).
    return "".join(str(value) for value in collected), "", word_index, False

@staticmethod
def _parse_number_as_whole_value(
word_list: List[str],
word_list_len: int,
word_index: int,
imply_single_unit: bool = False,
) -> Tuple[str, str, int, bool]:
Expand All @@ -386,8 +341,9 @@ class from_words_to_digits:
result_final = ("", "", word_index, allow_reformat)

# Loop while splitting to break into individual words.
while word_index < len(word_list):
while word_index < word_list_len:
word_data = number_words.get(word_list[word_index])

if word_data is None:
# raise Exception('Illegal word: ' + word)
break
Expand Down Expand Up @@ -452,21 +408,142 @@ class from_words_to_digits:
# Return the result plus the current.
return "{:d}".format(result + current), suffix, word_index, allow_reformat

@staticmethod
def _allow_follow_on_word(w_prev: str, w: str) -> bool:
    """
    Tell whether ``w`` continues the number begun by ``w_prev``.

    This holds only when both are valid unit words, the previous word's
    increment is twenty or more and the current word's increment is a
    non-zero value below ten (as in "thirty three").
    """
    unit_words = from_words_to_digits.valid_unit_words
    table = from_words_to_digits._number_words

    if w_prev not in unit_words or w not in unit_words:
        return False

    # The increment is the second field of each number-word entry.
    tens = table[w_prev][1]
    ones = table[w][1]
    return (tens >= 20) and (ones < 10) and (ones != 0)

@staticmethod
def parse_number_calc_delimiter_from_series(
word_list: List[str],
word_index: int,
word_index_len: int,
) -> int:
# Find where a run of number words should be cut so that neighbouring
# numbers with the same count of digits are not accumulated into one value.
#
# Scanning starts at `word_index` and stops at `word_index_len` or at the
# first word that is not a known number word. The run is parsed span by span
# with `_parse_number_as_whole_value`; once two successive spans give digit
# strings of equal length, the end index reported for the earlier span is
# returned. When no such pair is found `word_index_len` is returned unchanged.

valid_unit_words = from_words_to_digits.valid_unit_words
number_words = from_words_to_digits._number_words

# `i` walks the words, `i_span_beg` marks where the span being measured begins.
i = word_index
i_span_beg = word_index
w_prev = ""
# Parse results for the previous and the most recent span (None until parsed).
result_prev = None
result_test = None
while i < word_index_len:
w = word_list[i]
if w not in number_words:
break

if (i != word_index) and from_words_to_digits._allow_follow_on_word(word_list[i - 1], w):
# Don't set `w_prev` so we can detect "thirteen and fifty five" without the last "five" delimiting.
pass
else:
# A unit word that does not directly follow the start or an "and" is a candidate span boundary.
if (w_prev not in {"", "and"}) and w in valid_unit_words:
# Exception ... allow "thirty three", two words...
result_prev = result_test
result_test = from_words_to_digits._parse_number_as_whole_value(
word_list,
i, # Limit.
i_span_beg, # Split start.
)
# The parser is expected to consume every word up to the limit.
assert result_test[2] == i

# Same number of digits in both spans: delimit at the end of the earlier one.
if result_prev:
if len(result_prev[0]) == len(result_test[0]):
return result_prev[2]

i_span_beg = i
w_prev = w
i += 1

# The scan has ended, measure the final span and compare it in the same way.
result_prev = result_test
result_test = from_words_to_digits._parse_number_as_whole_value(
word_list,
i, # Limit.
i_span_beg, # Split start.
)

if result_prev:
if len(result_prev[0]) == len(result_test[0]):
return result_prev[2]

# No delimiter found, keep the limit that was passed in.
return word_index_len

@staticmethod
def parse_number_calc_delimiter_from_slide(
    word_list: List[str],
    word_index: int,
    word_index_len: int,
) -> int:
    """
    Look for a split point by sliding a boundary across the number words.

    For every candidate boundary the words before it (from ``word_index``)
    and the words after it (up to ``word_index_len``) are parsed as whole
    values. The first boundary whose right-hand digit string is at least as
    long as the left-hand one ends the number, and the end index reported for
    the left-hand parse is returned. Without such a boundary
    ``word_index_len`` is returned unchanged.
    """
    cls = from_words_to_digits
    unit_words = cls.valid_unit_words
    known_words = cls._number_words

    last_word = ""
    for pos in range(word_index, word_index_len):
        word = word_list[pos]
        if word not in known_words:
            break

        # A follow-on word ("fifty" then "five") never delimits and is not remembered,
        # so "thirteen and fifty five" is detected without the last "five" delimiting.
        if (pos != word_index) and cls._allow_follow_on_word(word_list[pos - 1], word):
            continue

        if (last_word not in {"", "and"}) and word in unit_words:
            lhs = cls._parse_number_as_whole_value(
                word_list,
                pos,  # Limit.
                word_index,  # Split start.
            )
            rhs = cls._parse_number_as_whole_value(
                word_list,
                word_index_len,  # Limit.
                pos,  # Split start.
            )

            # The right-hand number has at least as many digits, split here.
            if len(lhs[0]) <= len(rhs[0]):
                return lhs[2]

        last_word = word

    return word_index_len

@staticmethod
def parse_number(
word_list: List[str],
word_index: int,
imply_single_unit: bool = False,
) -> Tuple[str, str, int, bool]:
# Parse the number that begins at `word_list[word_index]`.
#
# The upper limit for parsing starts as `len(word_list)` and is narrowed by
# the two `parse_number_calc_delimiter_from_*` helpers before the words are
# handed to `_parse_number_as_whole_value`, whose result tuple is returned.
#
# NOTE(review): this span looks like a diff rendering that holds both the
# previous and the updated body (two different argument lists follow the final
# `return`) - confirm which lines are current before relying on it.

# Check if this is a series of unit values, in this case it makes most sense to put the number in a series
# (think reciting a phone number).
ret = from_words_to_digits._parse_number_as_series_of_units(word_list, word_index)
# A changed index means the series parser consumed words, so its result is used.
if ret[2] != word_index:
return ret
word_list_len = len(word_list)

# Delimit, prevent accumulating "one hundred two hundred" -> "300" for example.
word_list_len = from_words_to_digits.parse_number_calc_delimiter_from_series(
word_list,
word_index,
word_list_len,
)
# The second pass receives the limit already narrowed by the first.
word_list_len = from_words_to_digits.parse_number_calc_delimiter_from_slide(
word_list,
word_index,
word_list_len,
)

return from_words_to_digits._parse_number_as_whole_value(
word_list, word_index, imply_single_unit=imply_single_unit
word_list,
word_list_len,
word_index,
imply_single_unit=imply_single_unit,
)

@staticmethod
Expand Down Expand Up @@ -513,6 +590,24 @@ class from_words_to_digits:
i -= 1
i += 1

# Group numbers - recite single digit phone numbers for example.
# This could be optional, but generally seems handy (good default behavior),
# e.g. "twenty twenty" -> "2020".
i = 0
while i < len(word_list):
if word_list[i].isdigit() and len(word_list[i]) <= 2:
j = i + 1
while j < len(word_list):
if word_list[j].isdigit() and len(word_list[j]) <= 2:
j += 1
else:
break
if i + 1 != j:
word_list[i:j] = ["".join(word_list[i:j])]
i = j
else:
i += 1


# -----------------------------------------------------------------------------
# Process Text
Expand Down
49 changes: 47 additions & 2 deletions tests/from_words_to_digits.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def name_of_caller(frame=1):
# -----------------------------------------------------------------------------
# Tests

VERBOSE = True
VERBOSE = False

import unittest

Expand All @@ -105,7 +105,7 @@ def assertNumberFromTextEqual(self, words_input, expected_output):
actual_output = tuple(words)
expected_output = tuple(expected_output.split())
if VERBOSE:
print("{:>38}: {!r} -> {!r}".format(name_of_caller(frame=2), words_input, " ".join(actual_output)))
print("{:>54}: {!r} -> {!r}".format(name_of_caller(frame=2), words_input, " ".join(actual_output)))
self.assertEqual(actual_output, expected_output)


Expand Down Expand Up @@ -160,6 +160,51 @@ def test_zero(self):
self.assertNumberFromTextEqual("two hundred and zero", "200 and 0")
self.assertNumberFromTextEqual("two hundred and zero and one", "200 and 0 and 1")

def test_hundreds(self):
    # Consecutive round hundreds stay separate numbers.
    cases = (
        ("one hundred two hundred", "100 200"),
        ("one hundred two hundred three hundred", "100 200 300"),
    )
    for spoken, digits in cases:
        self.assertNumberFromTextEqual(spoken, digits)

def test_hundreds_and_units(self):
    # A trailing "and one" attaches only to the last of the hundreds.
    cases = (
        ("one hundred two hundred and one", "100 201"),
        ("one hundred two hundred three hundred and one", "100 200 301"),
    )
    for spoken, digits in cases:
        self.assertNumberFromTextEqual(spoken, digits)

def test_hundreds_complex_1(self):
    # A hundred followed by a larger "hundred thousand" value is split in two.
    spoken = "one hundred two hundred thousand and one"
    digits = "100 200,001"
    self.assertNumberFromTextEqual(spoken, digits)

def test_hundreds_complex_2(self):
    # Three numbers spoken back to back, the last one in the thousands.
    spoken = (
        "one hundred and three "
        "two hundred and thirteen "
        "thirteen thousand three hundred and two"
    )
    digits = "103 213 13,302"
    self.assertNumberFromTextEqual(spoken, digits)

def test_hundreds_complex_3(self):
    # Five numbers of mixed magnitude spoken as one phrase.
    spoken_parts = (
        "ninety two",
        "three hundred and three",
        "two hundred and thirteen",
        "thirteen thousand three hundred",
        "four hundred",
    )
    digit_parts = ("92", "303", "213", "13,300", "400")
    self.assertNumberFromTextEqual(" ".join(spoken_parts), " ".join(digit_parts))

def test_mixed_complex_pairs_1(self):
    # Millions, thousands and millions again, each written with "and".
    spoken_parts = (
        "fifteen million and two",
        "three thousand and forty four hundred and three",
        "five million and thirty three",
    )
    digit_parts = ("15,000,002", "7,403", "5,000,033")
    self.assertNumberFromTextEqual(" ".join(spoken_parts), " ".join(digit_parts))

def test_mixed_complex_pairs_2(self):
    # "sixty hundred" reads as 6,000 and stays apart from the following 50,000.
    spoken = "sixty hundred fifty thousand"
    digits = "6,000 50,000"
    self.assertNumberFromTextEqual(spoken, digits)


if __name__ == "__main__":
nerd_dictation = execfile_as_module(
Expand Down

0 comments on commit 569fdae

Please sign in to comment.