From 6541806a8460bf3548583eb2a94fc1631cdcea8d Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Fri, 4 Jun 2021 09:55:15 +1000 Subject: [PATCH 01/21] Cleanup: move sample rate to a variable --- nerd-dictation | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nerd-dictation b/nerd-dictation index 7a4ae87..492799c 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -64,6 +64,9 @@ RECORD_OVERTIME = 0.3 # The amount of time to sleep (in seconds) when idle and 'progressive_continuous' mode is enabled. PROGRESSIVE_CONTINUOUS_SLEEP_WHEN_IDLE = 0.2 +# The sample right to use for recording. +RECORD_SAMPLE_RATE = 44100 + # ----------------------------------------------------------------------------- # General Utilities # @@ -542,7 +545,7 @@ def text_from_vosk_pipe( cmd = ( "parec", "--record", - "--rate=44100", + "--rate=%d" % RECORD_SAMPLE_RATE, "--channels=1", *(("--device=%s" % pulse_device_name,) if pulse_device_name else ()), "--format=s16ne", @@ -562,7 +565,7 @@ def text_from_vosk_pipe( vosk.SetLogLevel(-1) model = vosk.Model(vosk_model_dir) - rec = vosk.KaldiRecognizer(model, 44100) + rec = vosk.KaldiRecognizer(model, RECORD_SAMPLE_RATE) # 1mb (allow for loading the model to take some time). block_size = 104_8576 From b5f29a944adbf65c7f3bf4fedade4f8b3f554cd5 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Fri, 4 Jun 2021 10:01:32 +1000 Subject: [PATCH 02/21] Add '--delay-exit' option to delay exiting N seconds Useful when a key is released a little early in a push-to-talk setup. --- changelog.rst | 1 + nerd-dictation | 32 +++++++++++++++++++++----------- readme.rst | 4 ++++ 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/changelog.rst b/changelog.rst index a35d3cc..f1898ee 100644 --- a/changelog.rst +++ b/changelog.rst @@ -3,6 +3,7 @@ Changelog ######### +- 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. - 2021/05/30: Fix error with ``xdotool`` mistaking text as arguments. 
- 2021/05/30: Fix adding numbers with "and", "one and two" now resolve to "1 and 2" not "3". - 2021/05/30: Add numeric scales up to 'centillion' (10**303). diff --git a/nerd-dictation b/nerd-dictation index 492799c..a6e7102 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -54,12 +54,6 @@ USER_CONFIG_DIR = "nerd-dictation" USER_CONFIG = "nerd-dictation.py" -# Extra time to record (in seconds). -# NOTE: ideally this would not be necessary, however buffering data from the pipe means -# some extra delay is needed. -# -# TODO: make this an option or use a more sophisticated method of recording. -RECORD_OVERTIME = 0.3 # The amount of time to sleep (in seconds) when idle and 'progressive_continuous' mode is enabled. PROGRESSIVE_CONTINUOUS_SLEEP_WHEN_IDLE = 0.2 @@ -708,6 +702,7 @@ def main_begin( numbers_as_digits: bool = False, numbers_use_separator: bool = False, timeout: float = 0.0, + delay_exit: float = 0.0, punctuate_from_previous_timeout: float = 0.0, output: str = "TYPE", ) -> None: @@ -752,7 +747,7 @@ def main_begin( # touch_mtime = None - use_overtime = RECORD_OVERTIME > 0.0 and timeout == 0.0 + use_overtime = delay_exit > 0.0 and timeout == 0.0 # Lazy loaded so recording can start 1st. user_config = None @@ -763,14 +758,14 @@ def main_begin( return -1 # Cancel. if file_mtime_or_none(path_to_cookie) != cookie_timestamp: - # Implement `RECORD_OVERTIME` workaround. + # Implement `delay_exit` workaround. if use_overtime: if touch_mtime is None: touch_mtime = time.time() - if time.time() - touch_mtime < RECORD_OVERTIME: - # Continue until `RECORD_OVERTIME` is reached. + if time.time() - touch_mtime < delay_exit: + # Continue until `delay_exit` is reached. return 0 - # End `RECORD_OVERTIME`. + # End `delay_exit`. return 1 # End. return 0 # Continue. 
@@ -982,6 +977,20 @@ This creates the directory used to store internal data, so other commands such a required=False, ) + subparse.add_argument( + "--delay-exit", + dest="delay_exit", + default=0.0, + type=float, + metavar="SECONDS", + help=( + "The time to continue running after an exit request.\n" + 'this can be useful so "push to talk" setups can be released while you finish speaking\n' + "(zero disables)." + ), + required=False, + ) + subparse.add_argument( "--punctuate-from-previous-timeout", dest="punctuate_from_previous_timeout", @@ -1066,6 +1075,7 @@ This creates the directory used to store internal data, so other commands such a numbers_as_digits=args.numbers_as_digits, numbers_use_separator=args.numbers_use_separator, timeout=args.timeout, + delay_exit=args.delay_exit, punctuate_from_previous_timeout=args.punctuate_from_previous_timeout, output=args.output, ), diff --git a/readme.rst b/readme.rst index 052ac3b..55203d6 100644 --- a/readme.rst +++ b/readme.rst @@ -173,6 +173,7 @@ usage:: nerd-dictation begin [-h] [--cookie FILE_PATH] [--vosk-model-dir DIR] [--pulse-device-name IDENTIFIER] [--defer-output] [--continuous] [--timeout SECONDS] + [--delay-exit SECONDS] [--punctuate-from-previous-timeout SECONDS] [--full-sentence] [--numbers-as-digits] [--numbers-use-separator] [--output OUTPUT_METHOD] @@ -196,6 +197,9 @@ optional arguments: Only used when ``--defer-output`` is disabled. --timeout SECONDS Time out recording when no speech is processed for the time in seconds. This can be used to avoid having to explicitly exit (zero disables). + --delay-exit SECONDS The time to continue running after an exit request. + this can be useful so "push to talk" setups can be released while you finish speaking + (zero disables). --punctuate-from-previous-timeout SECONDS The time-out in seconds for detecting the state of dictation from the previous recording, this can be useful so punctuation it is added before entering the dictation(zero disables). 
--full-sentence Capitalize the first character. From 01cf0ef63d9911bb44bb6207d46e200fd44833e1 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 7 Jun 2021 22:49:30 +1000 Subject: [PATCH 03/21] Improve logic for capturing the recording output Capture more of the end of the recording before exiting. --- changelog.rst | 1 + nerd-dictation | 92 ++++++++++++++++++++++++++------------------------ 2 files changed, 48 insertions(+), 45 deletions(-) diff --git a/changelog.rst b/changelog.rst index f1898ee..22f3ad6 100644 --- a/changelog.rst +++ b/changelog.rst @@ -4,6 +4,7 @@ Changelog ######### - 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. +- 2021/06/07: Improve recording logic to capture more of the end of the recording before exiting. - 2021/05/30: Fix error with ``xdotool`` mistaking text as arguments. - 2021/05/30: Fix adding numbers with "and", "one and two" now resolve to "1 and 2" not "3". - 2021/05/30: Add numeric scales up to 'centillion' (10**303). diff --git a/nerd-dictation b/nerd-dictation index a6e7102..4320102 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -618,60 +618,62 @@ def text_from_vosk_pipe( handled_any = True - while True: + # Use code to delay exiting, allowing reading the recording buffer to catch-up. + code = 0 + + while code == 0: # -1=cancel, 0=continue, 1=finish. code = exit_fn() - if code != -1: - # Mostly the data read is quite small (under 1k). - # Only the 1st entry in the loop reads a lot of data due to the time it takes to initialize the VOSK module. - data = stdout.read(block_size) - - if data: - ok = rec.AcceptWaveform(data) + # Mostly the data read is quite small (under 1k). + # Only the 1st entry in the loop reads a lot of data due to the time it takes to initialize the VOSK module. 
+ data = stdout.read(block_size) + + if data: + ok = rec.AcceptWaveform(data) + + if ok: + json_text = rec.Result() + json_text_partial_prev = "" + json_data = json.loads(json_text) + text = json_data["text"] + assert isinstance(text, str) + if text: + handle_fn_wrapper(text, False) + else: + # Only for comparison, to detect change. + # if use_timeout: + json_text = rec.PartialResult() + # Without this, there are *many* calls with the same partial text. + if json_text_partial_prev != json_text: + json_text_partial_prev = json_text - if ok: - json_text = rec.Result() - json_text_partial_prev = "" json_data = json.loads(json_text) - text = json_data["text"] - assert isinstance(text, str) + text = json_data["partial"] if text: - handle_fn_wrapper(text, False) - else: - # Only for comparison, to detect change. - # if use_timeout: - json_text = rec.PartialResult() - # Without this, there are *many* calls with the same partial text. - if json_text_partial_prev != json_text: - json_text_partial_prev = json_text - - json_data = json.loads(json_text) - text = json_data["partial"] - if text: - handle_fn_wrapper(text, True) - else: - if progressive and progressive_continuous: - # Continuous' mode is intended to support being left running in the background. - # As there was nothing to do sleep for 1/5th of a second, - # prevents excessive processor load. - # This is fairly arbitrary but small enough for users not to notice a delay. - time.sleep(PROGRESSIVE_CONTINUOUS_SLEEP_WHEN_IDLE) - - # Monitor the partial output. - # Finish if no changes are made for `timeout` seconds. - if use_timeout: - if json_text != timeout_text_prev: - timeout_text_prev = json_text - timeout_time_prev = time.time() - elif time.time() - timeout_time_prev > timeout: + handle_fn_wrapper(text, True) + else: + if progressive and progressive_continuous: + # Continuous' mode is intended to support being left running in the background. 
+ # As there was nothing to do sleep for 1/5th of a second, + # prevents excessive processor load. + # This is fairly arbitrary but small enough for users not to notice a delay. + time.sleep(PROGRESSIVE_CONTINUOUS_SLEEP_WHEN_IDLE) + + # Monitor the partial output. + # Finish if no changes are made for `timeout` seconds. + if use_timeout: + if json_text != timeout_text_prev: + timeout_text_prev = json_text + timeout_time_prev = time.time() + elif time.time() - timeout_time_prev > timeout: + if code == 0: code = 1 # The time was exceeded, exit! - if code != 0: - import signal + # Close the recording process. + import signal - os.kill(ps.pid, signal.SIGINT) - break + os.kill(ps.pid, signal.SIGINT) if code == -1: sys.stderr.write("Text input canceled!\n") From d7d946e0dc2fc8e666fef14936ad67b3853af57d Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 14 Jun 2021 02:16:51 +1000 Subject: [PATCH 04/21] Cleanup: correct comment --- nerd-dictation | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nerd-dictation b/nerd-dictation index 4320102..348d919 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -428,7 +428,7 @@ class from_words_to_digits: if (i_number_prev != -1) and (i_number_prev + 1 != i): words_between = tuple(word_list[i_number_prev + 1 : i]) found = True - # While other more here for now this is enough. + # While more could be added here, for now this is enough. if words_between == ("point",): word_list[i_number_prev : i + 1] = [word_list[i_number_prev] + "." + word_list[i]] elif words_between == ("minus",): From 2bc13c6d553479575caf1ef12264dfb6d07f13e3 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Fri, 25 Jun 2021 14:12:09 +1000 Subject: [PATCH 05/21] Add exception for unlikely but possible error loading the module-spec Quiets mypy warning. 
--- nerd-dictation | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nerd-dictation b/nerd-dictation index 348d919..07c943a 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -111,6 +111,8 @@ def execfile(filepath: str, mod: Optional[ModuleType] = None) -> Optional[Module mod_name = "__main__" mod_spec = importlib.util.spec_from_file_location(mod_name, filepath) + if mod_spec is None: + raise Exception("Unable to retrieve the module-spec from %r" % filepath) if mod is None: mod = importlib.util.module_from_spec(mod_spec) From 1f286f9cf865f41d66122565a5b46463185f0fa4 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Fri, 25 Jun 2021 14:14:26 +1000 Subject: [PATCH 06/21] Fix #6 high usage of CPU Add `--idle-time` argument, to prevent high CPU usage while recording. The default is 0.1 seconds (reading data 10 times a second). Reading & processing small amounts of data at a time doesn't have an especially big advantage compared with the cost of high CPU usage. --- changelog.rst | 2 ++ nerd-dictation | 45 +++++++++++++++++++++++++++++++++++---------- readme.rst | 6 +++++- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/changelog.rst b/changelog.rst index 22f3ad6..93f6024 100644 --- a/changelog.rst +++ b/changelog.rst @@ -3,6 +3,8 @@ Changelog ######### +- 2021/06/25: Add ``--idle-time``, optionally idle to avoid high CPU usage for no perceptual gain (fixes #6). +- 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. - 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. - 2021/06/07: Improve recording logic to capture more of the end of the recording before exiting. - 2021/05/30: Fix error with ``xdotool`` mistaking text as arguments. 
diff --git a/nerd-dictation b/nerd-dictation index 07c943a..3488618 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -55,9 +55,6 @@ USER_CONFIG_DIR = "nerd-dictation" USER_CONFIG = "nerd-dictation.py" -# The amount of time to sleep (in seconds) when idle and 'progressive_continuous' mode is enabled. -PROGRESSIVE_CONTINUOUS_SLEEP_WHEN_IDLE = 0.2 - # The sample right to use for recording. RECORD_SAMPLE_RATE = 44100 @@ -524,6 +521,7 @@ def text_from_vosk_pipe( process_fn: Callable[[str], str], handle_fn: Callable[[str], None], timeout: float, + idle_time: float, progressive: bool, progressive_continuous: bool, pulse_device_name: str = "", @@ -623,10 +621,26 @@ def text_from_vosk_pipe( # Use code to delay exiting, allowing reading the recording buffer to catch-up. code = 0 + if idle_time > 0.0: + idle_time_prev = time.time() + while code == 0: # -1=cancel, 0=continue, 1=finish. code = exit_fn() + if idle_time > 0.0: + # Subtract processing time from the previous loop. + # Skip idling in the event dictation can't keep up with the recording. + idle_time_curr = time.time() + idle_time_test = idle_time - (idle_time_curr - idle_time_prev) + if idle_time_test > 0.0: + # Prevents excessive processor load. + print(idle_time_test) + time.sleep(idle_time_test) + idle_time_prev = time.time() + else: + idle_time_prev = idle_time_curr + # Mostly the data read is quite small (under 1k). # Only the 1st entry in the loop reads a lot of data due to the time it takes to initialize the VOSK module. data = stdout.read(block_size) @@ -654,13 +668,6 @@ def text_from_vosk_pipe( text = json_data["partial"] if text: handle_fn_wrapper(text, True) - else: - if progressive and progressive_continuous: - # Continuous' mode is intended to support being left running in the background. - # As there was nothing to do sleep for 1/5th of a second, - # prevents excessive processor load. - # This is fairly arbitrary but small enough for users not to notice a delay. 
- time.sleep(PROGRESSIVE_CONTINUOUS_SLEEP_WHEN_IDLE) # Monitor the partial output. # Finish if no changes are made for `timeout` seconds. @@ -706,6 +713,7 @@ def main_begin( numbers_as_digits: bool = False, numbers_use_separator: bool = False, timeout: float = 0.0, + idle_time: float = 0.0, delay_exit: float = 0.0, punctuate_from_previous_timeout: float = 0.0, output: str = "TYPE", @@ -861,6 +869,7 @@ def main_begin( vosk_model_dir=vosk_model_dir, pulse_device_name=pulse_device_name, timeout=timeout, + idle_time=idle_time, progressive=progressive, progressive_continuous=progressive_continuous, exit_fn=exit_fn, @@ -981,6 +990,21 @@ This creates the directory used to store internal data, so other commands such a required=False, ) + subparse.add_argument( + "--idle-time", + dest="idle_time", + default=0.1, + type=float, + metavar="SECONDS", + help=( + "Time to idle between processing audio from the recording.\n" + "Setting to zero is the most responsive at the cost of high CPU usage.\n" + "The default value is 0.1 (processing 10 times a second), which is quite responsive in practice\n" + "(the maximum value is clamped to 0.5)" + ), + required=False, + ) + subparse.add_argument( "--delay-exit", dest="delay_exit", @@ -1079,6 +1103,7 @@ This creates the directory used to store internal data, so other commands such a numbers_as_digits=args.numbers_as_digits, numbers_use_separator=args.numbers_use_separator, timeout=args.timeout, + idle_time=min(args.idle_time, 0.5), delay_exit=args.delay_exit, punctuate_from_previous_timeout=args.punctuate_from_previous_timeout, output=args.output, diff --git a/readme.rst b/readme.rst index 55203d6..48de9b8 100644 --- a/readme.rst +++ b/readme.rst @@ -173,7 +173,7 @@ usage:: nerd-dictation begin [-h] [--cookie FILE_PATH] [--vosk-model-dir DIR] [--pulse-device-name IDENTIFIER] [--defer-output] [--continuous] [--timeout SECONDS] - [--delay-exit SECONDS] + [--idle-time SECONDS] [--delay-exit SECONDS] [--punctuate-from-previous-timeout 
SECONDS] [--full-sentence] [--numbers-as-digits] [--numbers-use-separator] [--output OUTPUT_METHOD] @@ -197,6 +197,10 @@ optional arguments: Only used when ``--defer-output`` is disabled. --timeout SECONDS Time out recording when no speech is processed for the time in seconds. This can be used to avoid having to explicitly exit (zero disables). + --idle-time SECONDS Time to idle between processing audio from the recording. + Setting to zero is the most responsive at the cost of high CPU usage. + The default value is 0.1 (processing 10 times a second), which is quite responsive in practice + (the maximum value is clamped to 0.5) --delay-exit SECONDS The time to continue running after an exit request. this can be useful so "push to talk" setups can be released while you finish speaking (zero disables). From 8079ed585fb81ed2a5cad6c3e36bf76b5f0b25f6 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 28 Jun 2021 05:49:49 +1000 Subject: [PATCH 07/21] Correct duplicate change log entry --- changelog.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/changelog.rst b/changelog.rst index 93f6024..a9248aa 100644 --- a/changelog.rst +++ b/changelog.rst @@ -5,7 +5,6 @@ Changelog - 2021/06/25: Add ``--idle-time``, optionally idle to avoid high CPU usage for no perceptual gain (fixes #6). - 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. -- 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. - 2021/06/07: Improve recording logic to capture more of the end of the recording before exiting. - 2021/05/30: Fix error with ``xdotool`` mistaking text as arguments. - 2021/05/30: Fix adding numbers with "and", "one and two" now resolve to "1 and 2" not "3". From c94376fe2fc729baf9367106f2809ee17e63983a Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Wed, 7 Jul 2021 15:37:13 +1000 Subject: [PATCH 08/21] Fix #10 lots of numbers being spit out Accidentally left in a print. 
--- nerd-dictation | 1 - 1 file changed, 1 deletion(-) diff --git a/nerd-dictation b/nerd-dictation index 3488618..f476c90 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -635,7 +635,6 @@ def text_from_vosk_pipe( idle_time_test = idle_time - (idle_time_curr - idle_time_prev) if idle_time_test > 0.0: # Prevents excessive processor load. - print(idle_time_test) time.sleep(idle_time_test) idle_time_prev = time.time() else: From 439cffbfd60376364cb1f8e022cf974aa2383dd0 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Thu, 8 Jul 2021 12:25:42 +1000 Subject: [PATCH 09/21] Cleanup: remove unused assignment --- nerd-dictation | 1 - 1 file changed, 1 deletion(-) diff --git a/nerd-dictation b/nerd-dictation index f476c90..f83a0e1 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -398,7 +398,6 @@ class from_words_to_digits: if is_final: result_final = (result + current, suffix, word_index) word_index_final = word_index - has_final = True # Once there is a suffix, don't attempt to parse extra numbers. if suffix: From dd99ef17d83c7585b8e10a69fea2c1957d52818d Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Thu, 8 Jul 2021 12:51:07 +1000 Subject: [PATCH 10/21] Add optional `--sample-rate` argument Support setting the sample rate for recording. --- changelog.rst | 1 + nerd-dictation | 24 +++++++++++++++++++----- readme.rst | 9 ++++++--- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/changelog.rst b/changelog.rst index a9248aa..8cafe1f 100644 --- a/changelog.rst +++ b/changelog.rst @@ -3,6 +3,7 @@ Changelog ######### +- 2021/07/08: Add ``--sample-rate``, optionally set the sample rate used for recording. - 2021/06/25: Add ``--idle-time``, optionally idle to avoid high CPU usage for no perceptual gain (fixes #6). - 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. - 2021/06/07: Improve recording logic to capture more of the end of the recording before exiting. 
diff --git a/nerd-dictation b/nerd-dictation index f83a0e1..ea0511e 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -55,9 +55,6 @@ USER_CONFIG_DIR = "nerd-dictation" USER_CONFIG = "nerd-dictation.py" -# The sample right to use for recording. -RECORD_SAMPLE_RATE = 44100 - # ----------------------------------------------------------------------------- # General Utilities # @@ -523,6 +520,7 @@ def text_from_vosk_pipe( idle_time: float, progressive: bool, progressive_continuous: bool, + sample_rate: int, pulse_device_name: str = "", ) -> bool: # Delay some imports until recording has started to avoid minor delays. @@ -538,7 +536,7 @@ def text_from_vosk_pipe( cmd = ( "parec", "--record", - "--rate=%d" % RECORD_SAMPLE_RATE, + "--rate=%d" % sample_rate, "--channels=1", *(("--device=%s" % pulse_device_name,) if pulse_device_name else ()), "--format=s16ne", @@ -558,7 +556,7 @@ def text_from_vosk_pipe( vosk.SetLogLevel(-1) model = vosk.Model(vosk_model_dir) - rec = vosk.KaldiRecognizer(model, RECORD_SAMPLE_RATE) + rec = vosk.KaldiRecognizer(model, sample_rate) # 1mb (allow for loading the model to take some time). block_size = 104_8576 @@ -705,6 +703,7 @@ def main_begin( vosk_model_dir: str, path_to_cookie: str = "", pulse_device_name: str = "", + sample_rate: int = 44100, progressive: bool = False, progressive_continuous: bool = False, full_sentence: bool = False, @@ -866,6 +865,7 @@ def main_begin( found_any = text_from_vosk_pipe( vosk_model_dir=vosk_model_dir, pulse_device_name=pulse_device_name, + sample_rate=sample_rate, timeout=timeout, idle_time=idle_time, progressive=progressive, @@ -949,6 +949,19 @@ This creates the directory used to store internal data, so other commands such a required=False, ) + subparse.add_argument( + "--sample-rate", + dest="sample_rate", + default=44100, + type=int, + metavar="HZ", + help=( + "The sample rate to use for recording (in Hz).\n" + "Defaults to 44100." 
+ ), + required=False, + ) + subparse.add_argument( "--defer-output", dest="defer_output", @@ -1095,6 +1108,7 @@ This creates the directory used to store internal data, so other commands such a path_to_cookie=args.path_to_cookie, vosk_model_dir=args.vosk_model_dir, pulse_device_name=args.pulse_device_name, + sample_rate=args.sample_rate, progressive=not (args.defer_output or args.output == "STDOUT"), progressive_continuous=args.progressive_continuous, full_sentence=args.full_sentence, diff --git a/readme.rst b/readme.rst index 48de9b8..7ba6dee 100644 --- a/readme.rst +++ b/readme.rst @@ -171,9 +171,10 @@ Subcommand: ``begin`` usage:: nerd-dictation begin [-h] [--cookie FILE_PATH] [--vosk-model-dir DIR] - [--pulse-device-name IDENTIFIER] [--defer-output] - [--continuous] [--timeout SECONDS] - [--idle-time SECONDS] [--delay-exit SECONDS] + [--pulse-device-name IDENTIFIER] + [--sample-rate HZ] [--defer-output] [--continuous] + [--timeout SECONDS] [--idle-time SECONDS] + [--delay-exit SECONDS] [--punctuate-from-previous-timeout SECONDS] [--full-sentence] [--numbers-as-digits] [--numbers-use-separator] [--output OUTPUT_METHOD] @@ -189,6 +190,8 @@ optional arguments: --pulse-device-name IDENTIFIER The name of the pulse-audio device to use for recording. See the output of "pactl list" to find device names. + --sample-rate HZ The sample rate to use for recording (in Hz). + Defaults to 44100. --defer-output When enabled, output is deferred until exiting. This prevents text being typed during speech (implied with ``--output=STDOUT``) From 5fe024aca6f91c77f37d0f5afa8dbf77f38d215f Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Tue, 27 Jul 2021 09:13:34 +1000 Subject: [PATCH 11/21] Example Configuration: strip empty strings Allow replacements to remove words entirely without adding extra spaces. Noted in #13. 
--- examples/nerd-dictation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/nerd-dictation.py b/examples/nerd-dictation.py index d984384..ad1b352 100644 --- a/examples/nerd-dictation.py +++ b/examples/nerd-dictation.py @@ -61,4 +61,7 @@ def nerd_dictation_process(text): words[i] = w + # Strip any words that were replaced with empty strings. + words[:] = [w for w in words if w] + return " ".join(words) From 18e25eceb75965923b5f0d219bfb99aff5add706 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 30 Aug 2021 23:02:40 +1000 Subject: [PATCH 12/21] hacking: correct typos --- hacking.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hacking.rst b/hacking.rst index 0068718..27ffd9a 100644 --- a/hacking.rst +++ b/hacking.rst @@ -27,9 +27,9 @@ Style ----- - Auto formatting is handled with black by running: - ``black nerd-dictator`` + ``black nerd-dictation`` - Ensure correct type annotations by running: - ``mypy --strict nerd-dictator``. + ``mypy --strict nerd-dictation``. - Check for errors with: ``pylint nerd-dictation --disable=C0103,C0111,C0301,C0302,C0415,E0401,E0611,I1101,R0801,R0902,R0903,R0912,R0913,R0914,R0915,R1705,W0212,W0703`` From 629847499336d0ad72e2135b2197e088d6b2eb36 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 30 Aug 2021 23:25:20 +1000 Subject: [PATCH 13/21] Cleanup: minor formatting changes --- nerd-dictation | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/nerd-dictation b/nerd-dictation index ea0511e..85ce92a 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -955,10 +955,7 @@ This creates the directory used to store internal data, so other commands such a default=44100, type=int, metavar="HZ", - help=( - "The sample rate to use for recording (in Hz).\n" - "Defaults to 44100." 
- ), + help=("The sample rate to use for recording (in Hz).\n" "Defaults to 44100."), required=False, ) @@ -968,7 +965,8 @@ This creates the directory used to store internal data, so other commands such a default=False, action="store_true", help=( - "When enabled, output is deferred until exiting.\n\n" + "When enabled, output is deferred until exiting.\n" + "\n" "This prevents text being typed during speech (implied with ``--output=STDOUT``)" ), required=False, From edd75d1ce36b8fcfb261749d7143167cac8c1e20 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Sat, 4 Sep 2021 13:01:05 +1000 Subject: [PATCH 14/21] Clarify which identifier --pulse-device-name uses Address ambiguity raised by #20. --- nerd-dictation | 2 +- readme.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nerd-dictation b/nerd-dictation index 85ce92a..17e456e 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -944,7 +944,7 @@ This creates the directory used to store internal data, so other commands such a metavar="IDENTIFIER", help=( "The name of the pulse-audio device to use for recording.\n" - 'See the output of "pactl list" to find device names.' + "See the output of \"pactl list\" to find device names (using the identifier following \"Name:\")." ), required=False, ) diff --git a/readme.rst b/readme.rst index 7ba6dee..72c1ae6 100644 --- a/readme.rst +++ b/readme.rst @@ -189,7 +189,7 @@ optional arguments: --vosk-model-dir DIR Path to the VOSK model, see: https://alphacephei.com/vosk/models --pulse-device-name IDENTIFIER The name of the pulse-audio device to use for recording. - See the output of "pactl list" to find device names. + See the output of "pactl list" to find device names (using the identifier following "Name:"). --sample-rate HZ The sample rate to use for recording (in Hz). Defaults to 44100. --defer-output When enabled, output is deferred until exiting. 
From 363a1924d4adff3a85496394f312e1cb21ee0958 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Fri, 17 Sep 2021 17:21:28 +1000 Subject: [PATCH 15/21] Docs: suggest "pactl list sources" in help text This addresses confusion from #20 where it wasn't clear that the name of the "Source" was needed instead of the "Card" name for example. --- nerd-dictation | 2 +- readme.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nerd-dictation b/nerd-dictation index 17e456e..0d3945d 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -944,7 +944,7 @@ This creates the directory used to store internal data, so other commands such a metavar="IDENTIFIER", help=( "The name of the pulse-audio device to use for recording.\n" - "See the output of \"pactl list\" to find device names (using the identifier following \"Name:\")." + "See the output of \"pactl list sources\" to find device names (using the identifier following \"Name:\")." ), required=False, ) diff --git a/readme.rst b/readme.rst index 72c1ae6..2a28e66 100644 --- a/readme.rst +++ b/readme.rst @@ -189,7 +189,7 @@ optional arguments: --vosk-model-dir DIR Path to the VOSK model, see: https://alphacephei.com/vosk/models --pulse-device-name IDENTIFIER The name of the pulse-audio device to use for recording. - See the output of "pactl list" to find device names (using the identifier following "Name:"). + See the output of "pactl list sources" to find device names (using the identifier following "Name:"). --sample-rate HZ The sample rate to use for recording (in Hz). Defaults to 44100. --defer-output When enabled, output is deferred until exiting. 
From 3799b8d9e5ca084dde884d36d4a5b527990a0196 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 20 Sep 2021 10:37:27 +1000 Subject: [PATCH 16/21] readme: add link to elograf nerd-dictation front-end --- readme.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/readme.rst b/readme.rst index 2a28e66..3d12541 100644 --- a/readme.rst +++ b/readme.rst @@ -275,6 +275,12 @@ Store the result of speech to text as a variable in the shell: SPEECH="$(nerd-dictation begin --timeout=1.0 --output=STDOUT)" +Other Software +============== + +- `Elograf `__ - nerd-dictation GUI front-end that runs as a tray icon. + + Limitations =========== From 4ce8abe803fc024dd409354f7d8e272d79d69f1a Mon Sep 17 00:00:00 2001 From: Ryan Pavlik Date: Tue, 21 Sep 2021 14:00:20 -0500 Subject: [PATCH 17/21] Fix typo. --- nerd-dictation | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nerd-dictation b/nerd-dictation index 17e456e..1e79f92 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -490,7 +490,7 @@ def process_text( text = text.replace("\n", " ") words = text.split(" ") - # Fist parse numbers. + # First parse numbers. if numbers_as_digits: from_words_to_digits.parse_numbers_in_word_list( words, From c94f8c4835743cd6920b7552645c721e8d54fc89 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Mon, 4 Oct 2021 22:46:30 +1100 Subject: [PATCH 18/21] Add example configuration for begin/end commands See #24 for suggestion. 
--- examples/begin_end_commands/nerd-dictation.py | 73 +++++++++++++++++++ examples/{ => default}/nerd-dictation.py | 0 2 files changed, 73 insertions(+) create mode 100644 examples/begin_end_commands/nerd-dictation.py rename examples/{ => default}/nerd-dictation.py (100%) diff --git a/examples/begin_end_commands/nerd-dictation.py b/examples/begin_end_commands/nerd-dictation.py new file mode 100644 index 0000000..26473c2 --- /dev/null +++ b/examples/begin_end_commands/nerd-dictation.py @@ -0,0 +1,73 @@ +# User configuration file typically located at `~/.config/nerd-dictation/nerd-dictation.py` + +# This examples shows how explicit start/end commands can be implemented. +# +# This assumes dictation is always running in the background, +# special commands are spoken to start/end dictation which are excluded + +# Global, track when dictation is active. +is_active = False + +# ----------------------------------------------------------------------------- +# Constants + +# Commands to use. +START_COMMAND = ("start", "dictation") +FINISH_COMMAND = ("finish", "dictation") + + +# ----------------------------------------------------------------------------- +# Utility Functions + +def match_words_at_index(haystack_words, haystack_index, needle_words): + """ + Check needle_words is in haystack_words at haystack_index. + """ + return ( + (needle_words[0] == haystack_words[haystack_index]) and + (haystack_index + len(needle_words) <= len(haystack_words)) and + (needle_words[1:] == haystack_words[haystack_index + 1 : haystack_index + len(needle_words)]) + ) + + +# ----------------------------------------------------------------------------- +# Main Processing Function + +def nerd_dictation_process(text): + global is_active + + words_input = tuple(text.split(" ")) + words = [] + + i = 0 + + # First check if there is text prior to having begun/ended dictation. + # The part should always be ignored. 
+ if is_active: + while i < len(words_input): + if match_words_at_index(words_input, i, START_COMMAND): + i += len(START_COMMAND) + break + i += 1 + if i == len(words_input): + i = 0 + # Else keep the advance of 'i', since it skips text before dictation started. + + while i < len(words_input): + word = words_input[i] + if is_active: + if match_words_at_index(words_input, i, FINISH_COMMAND): + is_active = False + i += len(FINISH_COMMAND) + continue + else: + if match_words_at_index(words_input, i, START_COMMAND): + is_active = True + i += len(START_COMMAND) + continue + + if is_active: + words.append(word) + i += 1 + + return " ".join(words) diff --git a/examples/nerd-dictation.py b/examples/default/nerd-dictation.py similarity index 100% rename from examples/nerd-dictation.py rename to examples/default/nerd-dictation.py From 3018c6633268a641d2d889daa2719a008eea1205 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Tue, 5 Oct 2021 13:20:05 +1100 Subject: [PATCH 19/21] readme: link to examples --- readme.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/readme.rst b/readme.rst index 3d12541..b353410 100644 --- a/readme.rst +++ b/readme.rst @@ -275,6 +275,17 @@ Store the result of speech to text as a variable in the shell: SPEECH="$(nerd-dictation begin --timeout=1.0 --output=STDOUT)" +Example Configurations +---------------------- + +These are example configurations you may use as a reference. + +- `Word Replacement + <https://github.com/ideasman42/nerd-dictation/blob/HEAD/examples/default/nerd-dictation.py>`__. +- `Start/Finish Commands + <https://github.com/ideasman42/nerd-dictation/blob/HEAD/examples/begin_end_commands/nerd-dictation.py>`__. + + Other Software ============== From fc79b67fb04d3f5c77f5ff32970772ec68caf46d Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Tue, 5 Oct 2021 17:45:27 +1100 Subject: [PATCH 20/21] Fix fast begin/end commands leaving dictation running Tapping a push-to-talk key could leave dictation active, even running more than one dictation at a time. Also only use --delay-exit when some text has been processed so tapping push-to-talk doesn't keep dictation running when nothing was done. 
--- changelog.rst | 1 + nerd-dictation | 39 ++++++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/changelog.rst b/changelog.rst index 8cafe1f..f62c2c5 100644 --- a/changelog.rst +++ b/changelog.rst @@ -3,6 +3,7 @@ Changelog ######### +- 2021/10/05: Fix bug where quickly running begin/end would leave dictation enabled. - 2021/07/08: Add ``--sample-rate``, optionally set the sample rate used for recording. - 2021/06/25: Add ``--idle-time``, optionally idle to avoid high CPU usage for no perceptual gain (fixes #6). - 2021/06/07: Add ``--delay-exit``, convenient when pushed to talk is used. diff --git a/nerd-dictation b/nerd-dictation index aa66066..c37a18f 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -60,11 +60,16 @@ USER_CONFIG = "nerd-dictation.py" # -def touch(filepath: str) -> None: +def touch(filepath: str, time: Optional[float] = None) -> None: if os.path.exists(filepath): - os.utime(filepath, None) + os.utime(filepath, None if time is None else (time, time)) else: open(filepath, "a").close() + if time is not None: + try: + os.utime(filepath, (time, time)) + except FileNotFoundError: + pass def file_mtime_or_none(filepath: str) -> Optional[float]: @@ -623,7 +628,7 @@ def text_from_vosk_pipe( while code == 0: # -1=cancel, 0=continue, 1=finish. - code = exit_fn() + code = exit_fn(handled_any) if idle_time > 0.0: # Subtract processing time from the previous loop. @@ -744,10 +749,11 @@ def main_begin( is_run_on = age_in_seconds is not None and (age_in_seconds < punctuate_from_previous_timeout) del age_in_seconds - touch(path_to_cookie) - + # Force zero time-stamp so a fast begin/end (tap) action + # doesn't leave dictation running. 
+ touch(path_to_cookie, time=0) cookie_timestamp = file_mtime_or_none(path_to_cookie) - if cookie_timestamp is None: + if cookie_timestamp != 0: sys.stderr.write("Cookie removed after right after creation (unlikely but respect the request)\n") return @@ -761,20 +767,23 @@ def main_begin( # Lazy loaded so recording can start 1st. user_config = None - def exit_fn() -> int: + def exit_fn(handled_any: bool) -> int: nonlocal touch_mtime if not os.path.exists(path_to_cookie): return -1 # Cancel. if file_mtime_or_none(path_to_cookie) != cookie_timestamp: - # Implement `delay_exit` workaround. - if use_overtime: - if touch_mtime is None: - touch_mtime = time.time() - if time.time() - touch_mtime < delay_exit: - # Continue until `delay_exit` is reached. - return 0 - # End `delay_exit`. + # Only delay exit if some text has been handled, + # this prevents accidental tapping of push to talk from running. + if handled_any: + # Implement `delay_exit` workaround. + if use_overtime: + if touch_mtime is None: + touch_mtime = time.time() + if time.time() - touch_mtime < delay_exit: + # Continue until `delay_exit` is reached. + return 0 + # End `delay_exit`. return 1 # End. return 0 # Continue. From f97ab5845a28f8ac721c94d1f14fc2b37613acb0 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Tue, 19 Oct 2021 14:11:16 +1100 Subject: [PATCH 21/21] Cleanup: format the code with black --- nerd-dictation | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nerd-dictation b/nerd-dictation index c37a18f..2b8c7fc 100755 --- a/nerd-dictation +++ b/nerd-dictation @@ -953,7 +953,7 @@ This creates the directory used to store internal data, so other commands such a metavar="IDENTIFIER", help=( "The name of the pulse-audio device to use for recording.\n" - "See the output of \"pactl list sources\" to find device names (using the identifier following \"Name:\")." + 'See the output of "pactl list sources" to find device names (using the identifier following "Name:").' 
), required=False, )