diff --git a/.gitignore b/.gitignore
index 646de6fd..7734e2a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ ext/.depend.mk
 *.pyc
 *~
 webdata/
+.cache/
diff --git a/gentle/__init__.py b/gentle/__init__.py
index 77a74e05..cfe17060 100644
--- a/gentle/__init__.py
+++ b/gentle/__init__.py
@@ -3,3 +3,4 @@
 from forced_aligner import ForcedAligner
 from full_transcriber import FullTranscriber
 from resample import resample, resampled
+from transcription import Transcription
diff --git a/gentle/diff_align.py b/gentle/diff_align.py
index a40e1269..70cc6dd0 100644
--- a/gentle/diff_align.py
+++ b/gentle/diff_align.py
@@ -3,10 +3,11 @@
 import os
 import sys

-import metasentence
-import language_model
-import standard_kaldi
-from resources import Resources
+from gentle import metasentence
+from gentle import language_model
+from gentle import standard_kaldi
+from gentle import transcription
+from gentle.resources import Resources

 # TODO(maxhawkins): try using the (apparently-superior) time-mediated dynamic
@@ -23,7 +24,7 @@ def align(alignment, ms, **kwargs):
     disfluency = kwargs['disfluency'] if 'disfluency' in kwargs else False
     disfluencies = kwargs['disfluencies'] if 'disfluencies' in kwargs else []

-    hypothesis = [X["word"] for X in alignment]
+    hypothesis = [X.word for X in alignment]
     reference = ms.get_kaldi_sequence()
     display_seq = ms.get_display_sequence()
@@ -36,17 +37,16 @@ def align(alignment, ms, **kwargs):
         word = hypothesis[a]
         if disfluency and word in disfluencies:
             hyp_token = alignment[a]
-            phones = hyp_token.get("phones", [])
-            start = hyp_token["start"]
-            end = hyp_token["start"] + hyp_token["duration"]
-
-            out.append({
-                "case": "not-found-in-transcript",
-                "phones": phones,
-                "start": start,
-                "end": end,
-                "word": word
-            })
+            phones = hyp_token.phones or []
+            start = hyp_token.start
+            end = hyp_token.start + hyp_token.duration
+
+            out.append(transcription.Word(
+                case="not-found-in-transcript",
+                phones=phones,
+                start=start,
+                end=end,
+                word=word))
             continue

         display_word = display_seq[b]
@@ -55,28 +55,26 @@
         if op == 'equal':
             hyp_word = hypothesis[a]
             hyp_token = alignment[a]
-            phones = hyp_token.get("phones", [])
-            start = hyp_token["start"]
-            end = hyp_token["start"] + hyp_token["duration"]
-
-            out.append({
-                "case": "success",
-                "startOffset": start_offset,
-                "endOffset": end_offset,
-                "word": display_word,
-                "alignedWord": hyp_word,
-                "phones": phones,
-                "start": start,
-                "end": end,
-            })
+            phones = hyp_token.phones or []
+            start = hyp_token.start
+            end = hyp_token.start + hyp_token.duration
+
+            out.append(transcription.Word(
+                case="success",
+                startOffset=start_offset,
+                endOffset=end_offset,
+                word=display_word,
+                alignedWord=hyp_word,
+                phones=phones,
+                start=start,
+                end=end))
         elif op in ['insert', 'replace']:
-            out.append({
-                "case": "not-found-in-audio",
-                "startOffset": start_offset,
-                "endOffset": end_offset,
-                "word": display_word,
-            })
+            out.append(transcription.Word(
+                case="not-found-in-audio",
+                startOffset=start_offset,
+                endOffset=end_offset,
+                word=display_word))

     return out

 def word_diff(a, b):
diff --git a/gentle/forced_aligner.py b/gentle/forced_aligner.py
index b6c6658f..bff1c948 100644
--- a/gentle/forced_aligner.py
+++ b/gentle/forced_aligner.py
@@ -3,7 +3,8 @@
 from gentle import language_model
 from gentle import metasentence
 from gentle import multipass
-from gentle.transcription import MultiThreadedTranscriber, Transcription
+from gentle.transcriber import MultiThreadedTranscriber
+from gentle.transcription import Transcription


 class ForcedAligner():
@@ -31,7 +32,7 @@ def transcribe(self, wavfile, progress_cb=None, logging=None):

         # Perform a second-pass with unaligned words
         if logging is not None:
-            logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.get("case") == "not-found-in-audio"]), len(words)))
+            logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.case == "not-found-in-audio"]), len(words)))

         if progress_cb is not None:
             progress_cb({'status': 'ALIGNING'})
@@ -39,6 +40,6 @@
         words = multipass.realign(wavfile, words, self.ms, resources=self.resources, nthreads=self.nthreads, progress_cb=progress_cb)

         if logging is not None:
-            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.get("case") == "not-found-in-audio"]), len(words)))
+            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.case == "not-found-in-audio"]), len(words)))

         return Transcription(words=words, transcript=self.transcript)
diff --git a/gentle/full_transcriber.py b/gentle/full_transcriber.py
index 025e7017..8d2bdfe8 100644
--- a/gentle/full_transcriber.py
+++ b/gentle/full_transcriber.py
@@ -1,7 +1,9 @@
 import os

 from gentle import kaldi_queue
-from gentle.transcription import MultiThreadedTranscriber, Transcription
+from gentle import transcription
+from gentle.transcriber import MultiThreadedTranscriber
+from gentle.transcription import Transcription


 class FullTranscriber():
@@ -24,17 +26,17 @@ def make_transcription_alignment(trans):
     transcript = ""
     words = []
     for t_wd in trans:
-        word = {
-            "case": "success",
-            "startOffset": len(transcript),
-            "endOffset": len(transcript) + len(t_wd["word"]),
-            "word": t_wd["word"],
-            "alignedWord": t_wd["word"],
-            "phones": t_wd["phones"],
-            "start": t_wd["start"],
-            "end": t_wd["start"] + t_wd["duration"]}
+        word = transcription.Word(
+            case="success",
+            startOffset=len(transcript),
+            endOffset=len(transcript) + len(t_wd.word),
+            word=t_wd.word,
+            alignedWord=t_wd.word,
+            phones=t_wd.phones,
+            start=t_wd.start,
+            end=t_wd.start + t_wd.duration)
         words.append(word)
-        transcript += word["word"] + " "
+        transcript += word.word + " "

     return Transcription(words=words, transcript=transcript)
diff --git a/gentle/multipass.py b/gentle/multipass.py
index 01f7d33f..052036eb 100644
--- a/gentle/multipass.py
+++ b/gentle/multipass.py
@@ -7,6 +7,7 @@
 from gentle import metasentence
 from gentle import language_model
 from gentle import diff_align
+from gentle import transcription

 def prepare_multipass(alignment):
     to_realign = []
@@ -14,9 +15,9 @@ def prepare_multipass(alignment):
     cur_unaligned_words = []

     for wd_idx,wd in enumerate(alignment):
-        if wd['case'] == 'not-found-in-audio':
+        if wd.case == 'not-found-in-audio':
             cur_unaligned_words.append(wd)
-        elif wd['case'] == 'success':
+        elif wd.case == 'success':
             if len(cur_unaligned_words) > 0:
                 to_realign.append({
                     "start": last_aligned_word,
@@ -41,12 +42,15 @@ def realign(wavfile, alignment, ms, resources, nthreads=4, progress_cb=None):
     def realign(chunk):
         wav_obj = wave.open(wavfile, 'r')

-        start_t = (chunk["start"] or {"end": 0})["end"]
-        end_t = chunk["end"]
-        if end_t is None:
+        if chunk["start"] is None:
+            start_t = 0
+        else:
+            start_t = chunk["start"].end
+
+        if chunk["end"] is None:
             end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
         else:
-            end_t = end_t["start"]
+            end_t = chunk["end"].start

         duration = end_t - start_t
         if duration < 0.01 or duration > 60:
@@ -54,8 +58,8 @@ def realign(chunk):
             return

         # Create a language model
-        offset_offset = chunk['words'][0]['startOffset']
-        chunk_len = chunk['words'][-1]['endOffset'] - offset_offset
+        offset_offset = chunk['words'][0].startOffset
+        chunk_len = chunk['words'][-1].endOffset - offset_offset
         chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8")
         chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
         chunk_ks = chunk_ms.get_kaldi_sequence()
@@ -71,21 +75,21 @@ def realign(chunk):
         buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
         k.push_chunk(buf)
-        ret = k.get_final()
+        ret = [transcription.Word(**wd) for wd in k.get_final()]
         k.stop()

         word_alignment = diff_align.align(ret, chunk_ms)

         # Adjust startOffset, endOffset, and timing to match originals
         for wd in word_alignment:
-            if wd.get("end"):
+            if wd.end is not None:
                 # Apply timing offset
-                wd['start'] += start_t
-                wd['end'] += start_t
+                wd.start += start_t
+                wd.end += start_t

-            if wd.get("endOffset"):
-                wd['startOffset'] += offset_offset
-                wd['endOffset'] += offset_offset
+            if wd.endOffset is not None:
+                wd.startOffset += offset_offset
+                wd.endOffset += offset_offset

         # "chunk" should be replaced by "words"
         realignments.append({"chunk": chunk, "words": word_alignment})
diff --git a/gentle/standard_kaldi.py b/gentle/standard_kaldi.py
index 9ad91258..59bbd5cf 100644
--- a/gentle/standard_kaldi.py
+++ b/gentle/standard_kaldi.py
@@ -6,7 +6,6 @@
 import tempfile
 import wave

-from gentle import ffmpeg
 from util.paths import get_binary
 from gentle.rpc import RPCProtocol
 from gentle.resources import Resources
diff --git a/gentle/transcriber.py b/gentle/transcriber.py
new file mode 100644
index 00000000..a5fa52d7
--- /dev/null
+++ b/gentle/transcriber.py
@@ -0,0 +1,84 @@
+import math
+import logging
+import wave
+
+from gentle import transcription
+
+from multiprocessing.pool import ThreadPool as Pool
+
+class MultiThreadedTranscriber:
+    def __init__(self, kaldi_queue, chunk_len=20, overlap_t=2, nthreads=4):
+        self.chunk_len = chunk_len
+        self.overlap_t = overlap_t
+        self.nthreads = nthreads
+
+        self.kaldi_queue = kaldi_queue
+
+    def transcribe(self, wavfile, progress_cb=None):
+        wav_obj = wave.open(wavfile, 'r')
+        duration = wav_obj.getnframes() / float(wav_obj.getframerate())
+        n_chunks = int(math.ceil(duration / float(self.chunk_len - self.overlap_t)))
+
+        chunks = []
+
+        def transcribe_chunk(idx):
+            wav_obj = wave.open(wavfile, 'r')
+            start_t = idx * (self.chunk_len - self.overlap_t)
+            # Seek
+            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
+            # Read frames
+            buf = wav_obj.readframes(int(self.chunk_len * wav_obj.getframerate()))
+
+            k = self.kaldi_queue.get()
+            k.push_chunk(buf)
+            ret = k.get_final()
+            k.reset()
+            self.kaldi_queue.put(k)
+
+            chunks.append({"start": start_t, "words": ret})
+            logging.info('%d/%d' % (len(chunks), n_chunks))
+            if progress_cb is not None:
+                progress_cb({"message": ' '.join([X['word'] for X in ret]),
+                             "percent": len(chunks) / float(n_chunks)})
+
+
+        pool = Pool(min(n_chunks, self.nthreads))
+        pool.map(transcribe_chunk, range(n_chunks))
+        pool.close()
+
+        chunks.sort(key=lambda x: x['start'])
+
+        # Combine chunks
+        # TODO: remove overlap? ...or just let the sequence aligner deal with it.
+        words = []
+        for c in chunks:
+            chunk_start = c['start']
+            for wd in c['words']:
+                wd['start'] += chunk_start
+                words.append(transcription.Word(**wd))
+
+        return words
+
+
+if __name__=='__main__':
+    # full transcription
+    from Queue import Queue
+    from util import ffmpeg
+    from gentle import standard_kaldi
+
+    import sys
+
+    import logging
+    logging.getLogger().setLevel('INFO')
+
+    k_queue = Queue()
+    for i in range(3):
+        k_queue.put(standard_kaldi.Kaldi())
+
+    trans = MultiThreadedTranscriber(k_queue)
+
+    with gentle.resampled(sys.argv[1]) as filename:
+        out = trans.transcribe(filename)
+
+    open(sys.argv[2], 'w').write(out.to_json())
+
diff --git a/gentle/transcription.py b/gentle/transcription.py
index a130c9f1..ebf13648 100644
--- a/gentle/transcription.py
+++ b/gentle/transcription.py
@@ -1,64 +1,33 @@
 import csv
 import io
 import json
-import math
-import logging
-import wave
-
-from multiprocessing.pool import ThreadPool as Pool
-
-class MultiThreadedTranscriber:
-    def __init__(self, kaldi_queue, chunk_len=20, overlap_t=2, nthreads=4):
-        self.chunk_len = chunk_len
-        self.overlap_t = overlap_t
-        self.nthreads = nthreads
-
-        self.kaldi_queue = kaldi_queue
-
-    def transcribe(self, wavfile, progress_cb=None):
-        wav_obj = wave.open(wavfile, 'r')
-        duration = wav_obj.getnframes() / float(wav_obj.getframerate())
-        n_chunks = int(math.ceil(duration / float(self.chunk_len - self.overlap_t)))
-
-        chunks = []
-
-        def transcribe_chunk(idx):
-            wav_obj = wave.open(wavfile, 'r')
-            start_t = idx * (self.chunk_len - self.overlap_t)
-            # Seek
-            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
-            # Read frames
-            buf = wav_obj.readframes(int(self.chunk_len * wav_obj.getframerate()))
-
-            k = self.kaldi_queue.get()
-            k.push_chunk(buf)
-            ret = k.get_final()
-            k.reset()
-            self.kaldi_queue.put(k)
-
-            chunks.append({"start": start_t, "words": ret})
-            logging.info('%d/%d' % (len(chunks), n_chunks))
-            if progress_cb is not None:
-                progress_cb({"message": ' '.join([X['word'] for X in ret]),
-                             "percent": len(chunks) / float(n_chunks)})
-
-
-        pool = Pool(min(n_chunks, self.nthreads))
-        pool.map(transcribe_chunk, range(n_chunks))
-        pool.close()
-
-        chunks.sort(key=lambda x: x['start'])
-
-        # Combine chunks
-        # TODO: remove overlap? ...or just let the sequence aligner deal with it.
-        words = []
-        for c in chunks:
-            chunk_start = c['start']
-            for wd in c['words']:
-                wd['start'] += chunk_start
-                words.append(wd)
-
-        return words
+
+from collections import defaultdict
+
+class Word:
+
+    def __init__(self, case=None, startOffset=None, endOffset=None, word=None, alignedWord=None, phones=None, start=None, end=None, duration=None):
+        self.case = case
+        self.startOffset = startOffset
+        self.endOffset = endOffset
+        self.word = word
+        self.alignedWord = alignedWord
+        self.phones = phones
+        self.start = start
+        self.end = end
+        self.duration = duration
+
+    def as_dict(self):
+        return { key:val for key, val in self.__dict__.iteritems() if val is not None }
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __repr__(self):
+        return "Word(" + " ".join(sorted([key + "=" + str(val) for key, val in self.as_dict().iteritems()])) + ")"

 class Transcription:

@@ -66,14 +35,37 @@ def __init__(self, transcript=None, words=None):
         self.transcript = transcript
         self.words = words

+    def __eq__(self, other):
+        return self.transcript == other.transcript and self.words == other.words
+
     def to_json(self, **kwargs):
         '''Return a JSON representation of the aligned transcript'''
+        options = {
+            'sort_keys': True,
+            'indent': 4,
+            'separators': (',', ': '),
+        }
+        options.update(kwargs)
+
         container = {}
         if self.transcript:
             container['transcript'] = self.transcript
         if self.words:
-            container['words'] = self.words
-        return json.dumps(container, **kwargs)
+            container['words'] = [word.as_dict() for word in self.words]
+        return json.dumps(container, **options)
+
+    @classmethod
+    def from_json(cls, json_str):
+        return cls._from_jsondata(json.loads(json_str))
+
+    @classmethod
+    def from_jsonfile(cls, filename):
+        with open(filename) as fh:
+            return cls._from_jsondata(json.load(fh))
+
+    @classmethod
+    def _from_jsondata(cls, data):
+        return cls(transcript = data['transcript'], words = [Word(**wd) for wd in data['words']])

     def to_csv(self):
         '''Return a CSV representation of the aligned transcript.
Format: @@ -84,34 +76,24 @@ def to_csv(self): buf = io.BytesIO() w = csv.writer(buf) for X in self.words: - if X.get("case") not in ("success", "not-found-in-audio"): + if X.case not in ("success", "not-found-in-audio"): continue - row = [X["word"], - X.get("alignedWord"), - X.get("start"), - X.get("end") + row = [X.word, + X.alignedWord, + X.start, + X.end ] w.writerow(row) return buf.getvalue() -if __name__=='__main__': - # full transcription - from Queue import Queue - from util import ffmpeg - from gentle import standard_kaldi - - import sys - - import logging - logging.getLogger().setLevel('INFO') - - k_queue = Queue() - for i in range(3): - k_queue.put(standard_kaldi.Kaldi()) - - trans = MultiThreadedTranscriber(k_queue) - - with gentle.resampled(sys.argv[1]) as filename: - out = trans.transcribe(filename) - - open(sys.argv[2], 'w').write(out.to_json()) + def stats(self): + counts = defaultdict(int) + for word in self.words: + counts[word.case] += 1 + stats = {} + stats['total'] = len(self.words) + for key, val in counts.iteritems(): + stats[key] = val + return stats + +Transcription.Word = Word diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..21ffa1d4 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --disable-pytest-warnings diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000..b5f1fe66 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,2 @@ +!data/ +data/tmp diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..56764ff1 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# nothing here right now diff --git a/tests/data/expected/forced-harvard-sentences-list30.json b/tests/data/expected/forced-harvard-sentences-list30.json new file mode 100644 index 00000000..eb87a521 --- /dev/null +++ b/tests/data/expected/forced-harvard-sentences-list30.json @@ -0,0 +1,2015 @@ +{ + "transcript": "The mute muffled the high tones of the horn.\nThe gold ring fits only a pierced ear.\nThe old pan was covered with hard fudge.\nWatch the log float in the wide river.\nThe node on the stalk of wheat grew daily.\nThe heap of fallen leaves was set on fire.\nWrite fast, if you want to finish early.\nHis shirt was clean but one button was gone.\nThe barrel of beer was a brew of malt and hops.\nTin cans are absent from store shelves.\n", + "words": [ + { + "alignedWord": "the", + "case": "success", + "end": 1.29, + "endOffset": 3, + "phones": [ + { + "duration": 0.18, + "phone": "dh_B" + }, + { + "duration": 0.08, + "phone": "iy_E" + } + ], + "start": 1.03, + "startOffset": 0, + "word": "The" + }, + { + "alignedWord": "mute", + "case": "success", + "end": 1.6400000000000001, + "endOffset": 8, + "phones": [ + { + "duration": 0.08, + "phone": "m_B" + }, + { + "duration": 0.12, + "phone": "y_I" + }, + { + "duration": 0.11, + "phone": "uw_I" + }, + { + "duration": 0.04, + "phone": "t_E" + } + ], + "start": 1.29, + "startOffset": 4, + "word": "mute" + }, + { + "alignedWord": "muffled", + "case": "success", + "end": 2.42, + "endOffset": 16, + "phones": [ + { + "duration": 0.23, + "phone": "m_B" + }, + { + "duration": 0.1, + "phone": "ah_I" + }, + { + "duration": 0.14, + "phone": "f_I" + }, + { + "duration": 0.04, + "phone": "ah_I" + }, + { + "duration": 0.13, + "phone": "l_I" + }, + { + "duration": 0.07, + "phone": "d_E" + } + ], + "start": 1.71, + "startOffset": 9, + "word": "muffled" + }, + { + "alignedWord": "the", + "case": "success", + "end": 2.6700000000000004, + "endOffset": 20, + "phones": [ + { + 
"duration": 0.13, + "phone": "dh_B" + }, + { + "duration": 0.09, + "phone": "iy_E" + } + ], + "start": 2.45, + "startOffset": 17, + "word": "the" + }, + { + "alignedWord": "high", + "case": "success", + "end": 2.83, + "endOffset": 25, + "phones": [ + { + "duration": 0.11, + "phone": "hh_B" + }, + { + "duration": 0.05, + "phone": "ay_E" + } + ], + "start": 2.67, + "startOffset": 21, + "word": "high" + }, + { + "alignedWord": "tones", + "case": "success", + "end": 3.55, + "endOffset": 31, + "phones": [ + { + "duration": 0.14, + "phone": "t_B" + }, + { + "duration": 0.25, + "phone": "ow_I" + }, + { + "duration": 0.08, + "phone": "n_I" + }, + { + "duration": 0.07, + "phone": "z_E" + } + ], + "start": 3.01, + "startOffset": 26, + "word": "tones" + }, + { + "alignedWord": "of", + "case": "success", + "end": 3.74, + "endOffset": 34, + "phones": [ + { + "duration": 0.09, + "phone": "ah_B" + }, + { + "duration": 0.07, + "phone": "v_E" + } + ], + "start": 3.58, + "startOffset": 32, + "word": "of" + }, + { + "alignedWord": "the", + "case": "success", + "end": 3.8400000000000003, + "endOffset": 38, + "phones": [ + { + "duration": 0.03, + "phone": "dh_B" + }, + { + "duration": 0.07, + "phone": "iy_E" + } + ], + "start": 3.74, + "startOffset": 35, + "word": "the" + }, + { + "alignedWord": "horn", + "case": "success", + "end": 4.34, + "endOffset": 43, + "phones": [ + { + "duration": 0.08, + "phone": "hh_B" + }, + { + "duration": 0.18, + "phone": "ao_I" + }, + { + "duration": 0.12, + "phone": "r_I" + }, + { + "duration": 0.12, + "phone": "n_E" + } + ], + "start": 3.84, + "startOffset": 39, + "word": "horn" + }, + { + "alignedWord": "the", + "case": "success", + "end": 6.36, + "endOffset": 48, + "phones": [ + { + "duration": 0.11, + "phone": "dh_B" + }, + { + "duration": 0.08, + "phone": "iy_E" + } + ], + "start": 6.17, + "startOffset": 45, + "word": "The" + }, + { + "alignedWord": "gold", + "case": "success", + "end": 6.8500000000000005, + "endOffset": 53, + "phones": [ + { + "duration": 0.08, + "phone": "g_B" + }, + { + "duration": 0.11, + "phone": "ow_I" + }, + { + "duration": 0.18, + "phone": "l_I" + }, + { + "duration": 0.12, + "phone": "d_E" + } + ], + "start": 6.36, + "startOffset": 49, + "word": "gold" + }, + { + "alignedWord": "ring", + "case": "success", + "end": 7.27, + "endOffset": 58, + "phones": [ + { + "duration": 0.11, + "phone": "r_B" + }, + { + "duration": 0.09, + "phone": "ih_I" + }, + { + "duration": 0.22, + "phone": "ng_E" + } + ], + "start": 6.85, + "startOffset": 54, + "word": "ring" + }, + { + "alignedWord": "fits", + "case": "success", + "end": 7.92, + "endOffset": 63, + "phones": [ + { + "duration": 0.22, + "phone": "f_B" + }, + { + "duration": 0.12, + "phone": "ih_I" + }, + { + "duration": 0.1, + "phone": "t_I" + }, + { + "duration": 0.18, + "phone": "s_E" + } + ], + "start": 7.3, + "startOffset": 59, + "word": "fits" + }, + { + "alignedWord": "only", + "case": "success", + "end": 8.63, + "endOffset": 68, + "phones": [ + { + "duration": 0.11, + "phone": "ow_B" + }, + { + "duration": 0.25, + "phone": "n_I" + }, + { + "duration": 0.09, + "phone": "l_I" + }, + { + "duration": 0.12, + "phone": "iy_E" + } + ], + "start": 8.06, + "startOffset": 64, + "word": "only" + }, + { + "alignedWord": "a", + "case": "success", + "end": 8.73, + "endOffset": 70, + "phones": [ + { + "duration": 0.1, + "phone": "ah_S" + } + ], + "start": 8.63, + "startOffset": 69, + "word": "a" + }, + { + "alignedWord": "pierced", + "case": "success", + "end": 9.27, + "endOffset": 78, + "phones": [ + { + "duration": 
0.09, + "phone": "p_B" + }, + { + "duration": 0.12, + "phone": "ih_I" + }, + { + "duration": 0.1, + "phone": "r_I" + }, + { + "duration": 0.04, + "phone": "s_I" + }, + { + "duration": 0.14, + "phone": "t_E" + } + ], + "start": 8.78, + "startOffset": 71, + "word": "pierced" + }, + { + "alignedWord": "ear", + "case": "success", + "end": 9.74, + "endOffset": 82, + "phones": [ + { + "duration": 0.23, + "phone": "ih_B" + }, + { + "duration": 0.21, + "phone": "r_E" + } + ], + "start": 9.3, + "startOffset": 79, + "word": "ear" + }, + { + "alignedWord": "the", + "case": "success", + "end": 12.11, + "endOffset": 87, + "phones": [ + { + "duration": 0.16, + "phone": "dh_B" + }, + { + "duration": 0.11, + "phone": "iy_E" + } + ], + "start": 11.84, + "startOffset": 84, + "word": "The" + }, + { + "alignedWord": "old", + "case": "success", + "end": 12.450000000000001, + "endOffset": 91, + "phones": [ + { + "duration": 0.19, + "phone": "ow_B" + }, + { + "duration": 0.06, + "phone": "l_I" + }, + { + "duration": 0.06, + "phone": "d_E" + } + ], + "start": 12.14, + "startOffset": 88, + "word": "old" + }, + { + "alignedWord": "pan", + "case": "success", + "end": 13.11, + "endOffset": 95, + "phones": [ + { + "duration": 0.17, + "phone": "p_B" + }, + { + "duration": 0.26, + "phone": "ae_I" + }, + { + "duration": 0.17, + "phone": "n_E" + } + ], + "start": 12.51, + "startOffset": 92, + "word": "pan" + }, + { + "alignedWord": "was", + "case": "success", + "end": 13.299999999999999, + "endOffset": 99, + "phones": [ + { + "duration": 0.06, + "phone": "w_B" + }, + { + "duration": 0.05, + "phone": "ao_I" + }, + { + "duration": 0.08, + "phone": "z_E" + } + ], + "start": 13.11, + "startOffset": 96, + "word": "was" + }, + { + "alignedWord": "covered", + "case": "success", + "end": 13.82, + "endOffset": 107, + "phones": [ + { + "duration": 0.17, + "phone": "k_B" + }, + { + "duration": 0.09, + "phone": "ah_I" + }, + { + "duration": 0.08, + "phone": "v_I" + }, + { + "duration": 0.09, + "phone": "er_I" + }, + { + "duration": 0.09, + "phone": "d_E" + } + ], + "start": 13.3, + "startOffset": 100, + "word": "covered" + }, + { + "alignedWord": "with", + "case": "success", + "end": 14.030000000000001, + "endOffset": 112, + "phones": [ + { + "duration": 0.09, + "phone": "w_B" + }, + { + "duration": 0.06, + "phone": "ih_I" + }, + { + "duration": 0.06, + "phone": "th_E" + } + ], + "start": 13.82, + "startOffset": 108, + "word": "with" + }, + { + "alignedWord": "hard", + "case": "success", + "end": 14.58, + "endOffset": 117, + "phones": [ + { + "duration": 0.22, + "phone": "hh_B" + }, + { + "duration": 0.13, + "phone": "aa_I" + }, + { + "duration": 0.09, + "phone": "r_I" + }, + { + "duration": 0.08, + "phone": "d_E" + } + ], + "start": 14.06, + "startOffset": 113, + "word": "hard" + }, + { + "alignedWord": "fudge", + "case": "success", + "end": 15.42, + "endOffset": 123, + "phones": [ + { + "duration": 0.03, + "phone": "f_B" + }, + { + "duration": 0.21, + "phone": "ah_I" + }, + { + "duration": 0.38, + "phone": "jh_E" + } + ], + "start": 14.8, + "startOffset": 118, + "word": "fudge" + }, + { + "alignedWord": "watch", + "case": "success", + "end": 17.98, + "endOffset": 130, + "phones": [ + { + "duration": 0.18, + "phone": "w_B" + }, + { + "duration": 0.19, + "phone": "ao_I" + }, + { + "duration": 0.17, + "phone": "ch_E" + } + ], + "start": 17.44, + "startOffset": 125, + "word": "Watch" + }, + { + "alignedWord": "the", + "case": "success", + "end": 18.19, + "endOffset": 134, + "phones": [ + { + "duration": 0.08, + "phone": "dh_B" + }, + { 
+ "duration": 0.11, + "phone": "iy_E" + } + ], + "start": 18, + "startOffset": 131, + "word": "the" + }, + { + "alignedWord": "log", + "case": "success", + "end": 18.64, + "endOffset": 138, + "phones": [ + { + "duration": 0.08, + "phone": "l_B" + }, + { + "duration": 0.24, + "phone": "ao_I" + }, + { + "duration": 0.13, + "phone": "g_E" + } + ], + "start": 18.19, + "startOffset": 135, + "word": "log" + }, + { + "alignedWord": "float", + "case": "success", + "end": 19.21, + "endOffset": 144, + "phones": [ + { + "duration": 0.09, + "phone": "f_B" + }, + { + "duration": 0.07, + "phone": "l_I" + }, + { + "duration": 0.17, + "phone": "ow_I" + }, + { + "duration": 0.11, + "phone": "t_E" + } + ], + "start": 18.77, + "startOffset": 139, + "word": "float" + }, + { + "alignedWord": "in", + "case": "success", + "end": 19.48, + "endOffset": 147, + "phones": [ + { + "duration": 0.1, + "phone": "ih_B" + }, + { + "duration": 0.1, + "phone": "n_E" + } + ], + "start": 19.28, + "startOffset": 145, + "word": "in" + }, + { + "alignedWord": "the", + "case": "success", + "end": 19.6, + "endOffset": 151, + "phones": [ + { + "duration": 0.04, + "phone": "dh_B" + }, + { + "duration": 0.08, + "phone": "iy_E" + } + ], + "start": 19.48, + "startOffset": 148, + "word": "the" + }, + { + "alignedWord": "wide", + "case": "success", + "end": 20.1, + "endOffset": 156, + "phones": [ + { + "duration": 0.15, + "phone": "w_B" + }, + { + "duration": 0.29, + "phone": "ay_I" + }, + { + "duration": 0.06, + "phone": "d_E" + } + ], + "start": 19.6, + "startOffset": 152, + "word": "wide" + }, + { + "alignedWord": "river", + "case": "success", + "end": 20.779999999999998, + "endOffset": 162, + "phones": [ + { + "duration": 0.23, + "phone": "r_B" + }, + { + "duration": 0.05, + "phone": "ih_I" + }, + { + "duration": 0.1, + "phone": "v_I" + }, + { + "duration": 0.27, + "phone": "er_E" + } + ], + "start": 20.13, + "startOffset": 157, + "word": "river" + }, + { + "alignedWord": "the", + "case": "success", + "end": 22.9, + "endOffset": 167, + "phones": [ + { + "duration": 0.18, + "phone": "dh_B" + }, + { + "duration": 0.09, + "phone": "iy_E" + } + ], + "start": 22.63, + "startOffset": 164, + "word": "The" + }, + { + "alignedWord": "node", + "case": "success", + "end": 23.389999999999997, + "endOffset": 172, + "phones": [ + { + "duration": 0.11, + "phone": "n_B" + }, + { + "duration": 0.3, + "phone": "ow_I" + }, + { + "duration": 0.08, + "phone": "d_E" + } + ], + "start": 22.9, + "startOffset": 168, + "word": "node" + }, + { + "alignedWord": "on", + "case": "success", + "end": 23.61, + "endOffset": 175, + "phones": [ + { + "duration": 0.13, + "phone": "ao_B" + }, + { + "duration": 0.09, + "phone": "n_E" + } + ], + "start": 23.39, + "startOffset": 173, + "word": "on" + }, + { + "alignedWord": "the", + "case": "success", + "end": 23.72, + "endOffset": 179, + "phones": [ + { + "duration": 0.04, + "phone": "dh_B" + }, + { + "duration": 0.07, + "phone": "iy_E" + } + ], + "start": 23.61, + "startOffset": 176, + "word": "the" + }, + { + "alignedWord": "stalk", + "case": "success", + "end": 24.369999999999997, + "endOffset": 185, + "phones": [ + { + "duration": 0.17, + "phone": "s_B" + }, + { + "duration": 0.07, + "phone": "t_I" + }, + { + "duration": 0.27, + "phone": "ao_I" + }, + { + "duration": 0.14, + "phone": "k_E" + } + ], + "start": 23.72, + "startOffset": 180, + "word": "stalk" + }, + { + "alignedWord": "of", + "case": "success", + "end": 24.549999999999997, + "endOffset": 188, + "phones": [ + { + "duration": 0.09, + "phone": "ah_B" + }, + { 
+ "duration": 0.06, + "phone": "v_E" + } + ], + "start": 24.4, + "startOffset": 186, + "word": "of" + }, + { + "alignedWord": "wheat", + "case": "success", + "end": 24.990000000000002, + "endOffset": 194, + "phones": [ + { + "duration": 0.03, + "phone": "hh_B" + }, + { + "duration": 0.15, + "phone": "w_I" + }, + { + "duration": 0.15, + "phone": "iy_I" + }, + { + "duration": 0.11, + "phone": "t_E" + } + ], + "start": 24.55, + "startOffset": 189, + "word": "wheat" + }, + { + "alignedWord": "grew", + "case": "success", + "end": 25.479999999999997, + "endOffset": 199, + "phones": [ + { + "duration": 0.16, + "phone": "g_B" + }, + { + "duration": 0.07, + "phone": "r_I" + }, + { + "duration": 0.17, + "phone": "uw_E" + } + ], + "start": 25.08, + "startOffset": 195, + "word": "grew" + }, + { + "alignedWord": "daily", + "case": "success", + "end": 26.14, + "endOffset": 205, + "phones": [ + { + "duration": 0.11, + "phone": "d_B" + }, + { + "duration": 0.2, + "phone": "ey_I" + }, + { + "duration": 0.07, + "phone": "l_I" + }, + { + "duration": 0.28, + "phone": "iy_E" + } + ], + "start": 25.48, + "startOffset": 200, + "word": "daily" + }, + { + "alignedWord": "the", + "case": "success", + "end": 27.81, + "endOffset": 210, + "phones": [ + { + "duration": 0.19, + "phone": "dh_B" + }, + { + "duration": 0.03, + "phone": "ah_E" + } + ], + "start": 27.59, + "startOffset": 207, + "word": "The" + }, + { + "alignedWord": "heap", + "case": "success", + "end": 28.240000000000002, + "endOffset": 215, + "phones": [ + { + "duration": 0.22, + "phone": "hh_B" + }, + { + "duration": 0.16, + "phone": "iy_I" + }, + { + "duration": 0.05, + "phone": "p_E" + } + ], + "start": 27.810000000000002, + "startOffset": 211, + "word": "heap" + }, + { + "alignedWord": "of", + "case": "success", + "end": 28.5, + "endOffset": 218, + "phones": [ + { + "duration": 0.13, + "phone": "ah_B" + }, + { + "duration": 0.08, + "phone": "v_E" + } + ], + "start": 28.29, + "startOffset": 216, + "word": "of" + }, + { + "alignedWord": "fallen", + "case": "success", + "end": 29.05, + "endOffset": 225, + "phones": [ + { + "duration": 0.09, + "phone": "f_B" + }, + { + "duration": 0.13, + "phone": "aa_I" + }, + { + "duration": 0.08, + "phone": "l_I" + }, + { + "duration": 0.04, + "phone": "ah_I" + }, + { + "duration": 0.15, + "phone": "n_E" + } + ], + "start": 28.560000000000002, + "startOffset": 219, + "word": "fallen" + }, + { + "alignedWord": "leaves", + "case": "success", + "end": 29.580000000000002, + "endOffset": 232, + "phones": [ + { + "duration": 0.07, + "phone": "l_B" + }, + { + "duration": 0.25, + "phone": "iy_I" + }, + { + "duration": 0.04, + "phone": "v_I" + }, + { + "duration": 0.17, + "phone": "z_E" + } + ], + "start": 29.05, + "startOffset": 226, + "word": "leaves" + }, + { + "alignedWord": "was", + "case": "success", + "end": 29.89, + "endOffset": 236, + "phones": [ + { + "duration": 0.13, + "phone": "w_B" + }, + { + "duration": 0.06, + "phone": "ah_I" + }, + { + "duration": 0.06, + "phone": "z_E" + } + ], + "start": 29.64, + "startOffset": 233, + "word": "was" + }, + { + "alignedWord": "set", + "case": "success", + "end": 30.26, + "endOffset": 240, + "phones": [ + { + "duration": 0.14, + "phone": "s_B" + }, + { + "duration": 0.12, + "phone": "eh_I" + }, + { + "duration": 0.08, + "phone": "t_E" + } + ], + "start": 29.92, + "startOffset": 237, + "word": "set" + }, + { + "alignedWord": "on", + "case": "success", + "end": 30.52, + "endOffset": 243, + "phones": [ + { + "duration": 0.12, + "phone": "ao_B" + }, + { + "duration": 0.1, + "phone": 
"n_E" + } + ], + "start": 30.3, + "startOffset": 241, + "word": "on" + }, + { + "alignedWord": "fire", + "case": "success", + "end": 31.21, + "endOffset": 248, + "phones": [ + { + "duration": 0.18, + "phone": "f_B" + }, + { + "duration": 0.25, + "phone": "ay_I" + }, + { + "duration": 0.26, + "phone": "r_E" + } + ], + "start": 30.52, + "startOffset": 244, + "word": "fire" + }, + { + "alignedWord": "write", + "case": "success", + "end": 33.22, + "endOffset": 255, + "phones": [ + { + "duration": 0.19, + "phone": "r_B" + }, + { + "duration": 0.08, + "phone": "ay_I" + }, + { + "duration": 0.1, + "phone": "t_E" + } + ], + "start": 32.85, + "startOffset": 250, + "word": "Write" + }, + { + "alignedWord": "fast", + "case": "success", + "end": 33.79, + "endOffset": 260, + "phones": [ + { + "duration": 0.14, + "phone": "f_B" + }, + { + "duration": 0.24, + "phone": "ae_I" + }, + { + "duration": 0.06, + "phone": "s_I" + }, + { + "duration": 0.09, + "phone": "t_E" + } + ], + "start": 33.26, + "startOffset": 256, + "word": "fast" + }, + { + "alignedWord": "if", + "case": "success", + "end": 33.96, + "endOffset": 264, + "phones": [ + { + "duration": 0.09, + "phone": "ih_B" + }, + { + "duration": 0.08, + "phone": "f_E" + } + ], + "start": 33.79, + "startOffset": 262, + "word": "if" + }, + { + "alignedWord": "you", + "case": "success", + "end": 34.08, + "endOffset": 268, + "phones": [ + { + "duration": 0.06, + "phone": "y_B" + }, + { + "duration": 0.06, + "phone": "uw_E" + } + ], + "start": 33.96, + "startOffset": 265, + "word": "you" + }, + { + "alignedWord": "want", + "case": "success", + "end": 34.379999999999995, + "endOffset": 273, + "phones": [ + { + "duration": 0.08, + "phone": "w_B" + }, + { + "duration": 0.1, + "phone": "ao_I" + }, + { + "duration": 0.05, + "phone": "n_I" + }, + { + "duration": 0.07, + "phone": "t_E" + } + ], + "start": 34.08, + "startOffset": 269, + "word": "want" + }, + { + "alignedWord": "to", + "case": "success", + "end": 34.489999999999995, + "endOffset": 276, + "phones": [ + { + "duration": 0.04, + "phone": "t_B" + }, + { + "duration": 0.07, + "phone": "uw_E" + } + ], + "start": 34.379999999999995, + "startOffset": 274, + "word": "to" + }, + { + "alignedWord": "finish", + "case": "success", + "end": 35.019999999999996, + "endOffset": 283, + "phones": [ + { + "duration": 0.11, + "phone": "f_B" + }, + { + "duration": 0.07, + "phone": "ih_I" + }, + { + "duration": 0.06, + "phone": "n_I" + }, + { + "duration": 0.07, + "phone": "ih_I" + }, + { + "duration": 0.22, + "phone": "sh_E" + } + ], + "start": 34.489999999999995, + "startOffset": 277, + "word": "finish" + }, + { + "alignedWord": "early", + "case": "success", + "end": 35.730000000000004, + "endOffset": 289, + "phones": [ + { + "duration": 0.28, + "phone": "er_B" + }, + { + "duration": 0.13, + "phone": "l_I" + }, + { + "duration": 0.26, + "phone": "iy_E" + } + ], + "start": 35.06, + "startOffset": 284, + "word": "early" + }, + { + "alignedWord": "his", + "case": "success", + "end": 37.8, + "endOffset": 294, + "phones": [ + { + "duration": 0.14, + "phone": "hh_B" + }, + { + "duration": 0.06, + "phone": "ih_I" + }, + { + "duration": 0.08, + "phone": "z_E" + } + ], + "start": 37.519999999999996, + "startOffset": 291, + "word": "His" + }, + { + "alignedWord": "shirt", + "case": "success", + "end": 37.879999999999995, + "endOffset": 300, + "phones": [ + { + "duration": 0.08, + "phone": "sh_B" + } + ], + "start": 37.8, + "startOffset": 295, + "word": "shirt" + }, + { + "alignedWord": "was", + "case": "success", + "end": 
38.459999999999994, + "endOffset": 304, + "phones": [ + { + "duration": 0.12, + "phone": "w_B" + }, + { + "duration": 0.05, + "phone": "ao_I" + }, + { + "duration": 0.07, + "phone": "z_E" + } + ], + "start": 38.22, + "startOffset": 301, + "word": "was" + }, + { + "alignedWord": "clean", + "case": "success", + "end": 39.05, + "endOffset": 310, + "phones": [ + { + "duration": 0.17, + "phone": "k_B" + }, + { + "duration": 0.07, + "phone": "l_I" + }, + { + "duration": 0.24, + "phone": "iy_I" + }, + { + "duration": 0.11, + "phone": "n_E" + } + ], + "start": 38.459999999999994, + "startOffset": 305, + "word": "clean" + }, + { + "alignedWord": "but", + "case": "success", + "end": 39.309999999999995, + "endOffset": 314, + "phones": [ + { + "duration": 0.12, + "phone": "b_B" + }, + { + "duration": 0.03, + "phone": "ah_I" + }, + { + "duration": 0.08, + "phone": "t_E" + } + ], + "start": 39.08, + "startOffset": 311, + "word": "but" + }, + { + "alignedWord": "one", + "case": "success", + "end": 39.629999999999995, + "endOffset": 318, + "phones": [ + { + "duration": 0.14, + "phone": "w_B" + }, + { + "duration": 0.07, + "phone": "ah_I" + }, + { + "duration": 0.11, + "phone": "n_E" + } + ], + "start": 39.309999999999995, + "startOffset": 315, + "word": "one" + }, + { + "alignedWord": "button", + "case": "success", + "end": 39.879999999999995, + "endOffset": 325, + "phones": [ + { + "duration": 0.05, + "phone": "b_B" + }, + { + "duration": 0.09, + "phone": "ah_I" + }, + { + "duration": 0.06, + "phone": "t_I" + }, + { + "duration": 0.05, + "phone": "ah_I" + } + ], + "start": 39.629999999999995, + "startOffset": 319, + "word": "button" + }, + { + "alignedWord": "was", + "case": "success", + "end": 40.18, + "endOffset": 329, + "phones": [ + { + "duration": 0.06, + "phone": "w_B" + }, + { + "duration": 0.04, + "phone": "ao_I" + }, + { + "duration": 0.08, + "phone": "z_E" + } + ], + "start": 40, + "startOffset": 326, + "word": "was" + }, + { + "alignedWord": "gone", + "case": "success", + "end": 40.76, + "endOffset": 334, + "phones": [ + { + "duration": 0.1, + "phone": "g_B" + }, + { + "duration": 0.3, + "phone": "ao_I" + }, + { + "duration": 0.18, + "phone": "n_E" + } + ], + "start": 40.18, + "startOffset": 330, + "word": "gone" + }, + { + "alignedWord": "the", + "case": "success", + "end": 42.82, + "endOffset": 339, + "phones": [ + { + "duration": 0.17, + "phone": "dh_B" + }, + { + "duration": 0.08, + "phone": "iy_E" + } + ], + "start": 42.57, + "startOffset": 336, + "word": "The" + }, + { + "alignedWord": "barrel", + "case": "success", + "end": 43.34, + "endOffset": 346, + "phones": [ + { + "duration": 0.1, + "phone": "b_B" + }, + { + "duration": 0.17, + "phone": "eh_I" + }, + { + "duration": 0.07, + "phone": "r_I" + }, + { + "duration": 0.1, + "phone": "ah_I" + }, + { + "duration": 0.08, + "phone": "l_E" + } + ], + "start": 42.82, + "startOffset": 340, + "word": "barrel" + }, + { + "alignedWord": "of", + "case": "success", + "end": 43.480000000000004, + "endOffset": 349, + "phones": [ + { + "duration": 0.03, + "phone": "ah_B" + }, + { + "duration": 0.11, + "phone": "v_E" + } + ], + "start": 43.34, + "startOffset": 347, + "word": "of" + }, + { + "alignedWord": "beer", + "case": "success", + "end": 44.04, + "endOffset": 354, + "phones": [ + { + "duration": 0.08, + "phone": "b_B" + }, + { + "duration": 0.25, + "phone": "ih_I" + }, + { + "duration": 0.2, + "phone": "r_E" + } + ], + "start": 43.51, + "startOffset": 350, + "word": "beer" + }, + { + "alignedWord": "was", + "case": "success", + "end": 44.27, + 
"endOffset": 358, + "phones": [ + { + "duration": 0.08, + "phone": "w_B" + }, + { + "duration": 0.04, + "phone": "ao_I" + }, + { + "duration": 0.08, + "phone": "z_E" + } + ], + "start": 44.07, + "startOffset": 355, + "word": "was" + }, + { + "alignedWord": "a", + "case": "success", + "end": 44.339999999999996, + "endOffset": 360, + "phones": [ + { + "duration": 0.07, + "phone": "ah_S" + } + ], + "start": 44.269999999999996, + "startOffset": 359, + "word": "a" + }, + { + "alignedWord": "brew", + "case": "success", + "end": 44.78, + "endOffset": 365, + "phones": [ + { + "duration": 0.13, + "phone": "b_B" + }, + { + "duration": 0.03, + "phone": "r_I" + }, + { + "duration": 0.28, + "phone": "uw_E" + } + ], + "start": 44.34, + "startOffset": 361, + "word": "brew" + }, + { + "alignedWord": "of", + "case": "success", + "end": 44.95, + "endOffset": 368, + "phones": [ + { + "duration": 0.08, + "phone": "ah_B" + }, + { + "duration": 0.09, + "phone": "v_E" + } + ], + "start": 44.78, + "startOffset": 366, + "word": "of" + }, + { + "alignedWord": "malt", + "case": "success", + "end": 45.45, + "endOffset": 373, + "phones": [ + { + "duration": 0.14, + "phone": "m_B" + }, + { + "duration": 0.12, + "phone": "ao_I" + }, + { + "duration": 0.18, + "phone": "l_I" + }, + { + "duration": 0.03, + "phone": "t_E" + } + ], + "start": 44.980000000000004, + "startOffset": 369, + "word": "malt" + }, + { + "alignedWord": "and", + "case": "success", + "end": 45.78, + "endOffset": 377, + "phones": [ + { + "duration": 0.08, + "phone": "ah_B" + }, + { + "duration": 0.05, + "phone": "n_I" + }, + { + "duration": 0.05, + "phone": "d_E" + } + ], + "start": 45.6, + "startOffset": 374, + "word": "and" + }, + { + "alignedWord": "hops", + "case": "success", + "end": 46.32, + "endOffset": 382, + "phones": [ + { + "duration": 0.12, + "phone": "hh_B" + }, + { + "duration": 0.2, + "phone": "aa_I" + }, + { + "duration": 0.14, + "phone": "p_I" + }, + { + "duration": 0.08, + "phone": "s_E" + } + ], + "start": 45.78, + "startOffset": 378, + "word": "hops" + }, + { + "alignedWord": "tin", + "case": "success", + "end": 48.93, + "endOffset": 387, + "phones": [ + { + "duration": 0.11, + "phone": "t_B" + }, + { + "duration": 0.12, + "phone": "ih_I" + }, + { + "duration": 0.12, + "phone": "n_E" + } + ], + "start": 48.58, + "startOffset": 384, + "word": "Tin" + }, + { + "alignedWord": "cans", + "case": "success", + "end": 49.49, + "endOffset": 392, + "phones": [ + { + "duration": 0.1, + "phone": "k_B" + }, + { + "duration": 0.27, + "phone": "ae_I" + }, + { + "duration": 0.06, + "phone": "n_I" + }, + { + "duration": 0.13, + "phone": "z_E" + } + ], + "start": 48.93, + "startOffset": 388, + "word": "cans" + }, + { + "alignedWord": "are", + "case": "success", + "end": 49.63, + "endOffset": 396, + "phones": [ + { + "duration": 0.14, + "phone": "er_S" + } + ], + "start": 49.49, + "startOffset": 393, + "word": "are" + }, + { + "alignedWord": "absent", + "case": "success", + "end": 50.220000000000006, + "endOffset": 403, + "phones": [ + { + "duration": 0.24, + "phone": "ae_B" + }, + { + "duration": 0.1, + "phone": "b_I" + }, + { + "duration": 0.07, + "phone": "s_I" + }, + { + "duration": 0.03, + "phone": "ah_I" + }, + { + "duration": 0.05, + "phone": "n_I" + }, + { + "duration": 0.1, + "phone": "t_E" + } + ], + "start": 49.63, + "startOffset": 397, + "word": "absent" + }, + { + "alignedWord": "from", + "case": "success", + "end": 50.46, + "endOffset": 408, + "phones": [ + { + "duration": 0.06, + "phone": "f_B" + }, + { + "duration": 0.04, + "phone": 
"r_I" + }, + { + "duration": 0.06, + "phone": "ah_I" + }, + { + "duration": 0.08, + "phone": "m_E" + } + ], + "start": 50.22, + "startOffset": 404, + "word": "from" + }, + { + "alignedWord": "store", + "case": "success", + "end": 50.84, + "endOffset": 414, + "phones": [ + { + "duration": 0.08, + "phone": "s_B" + }, + { + "duration": 0.06, + "phone": "t_I" + }, + { + "duration": 0.13, + "phone": "ao_I" + }, + { + "duration": 0.08, + "phone": "r_E" + } + ], + "start": 50.49, + "startOffset": 409, + "word": "store" + }, + { + "alignedWord": "shelves", + "case": "success", + "end": 51.42, + "endOffset": 422, + "phones": [ + { + "duration": 0.2, + "phone": "sh_B" + }, + { + "duration": 0.14, + "phone": "eh_I" + }, + { + "duration": 0.16, + "phone": "l_I" + }, + { + "duration": 0.03, + "phone": "v_I" + }, + { + "duration": 0.05, + "phone": "z_E" + } + ], + "start": 50.84, + "startOffset": 415, + "word": "shelves" + } + ] +} \ No newline at end of file diff --git a/tests/data/expected/full-harvard-sentences-list30-s01.json b/tests/data/expected/full-harvard-sentences-list30-s01.json new file mode 100644 index 00000000..a57c9a06 --- /dev/null +++ b/tests/data/expected/full-harvard-sentences-list30-s01.json @@ -0,0 +1,223 @@ +{ + "transcript": "the mute muscle the higher tones are who are and ", + "words": [ + { + "alignedWord": "the", + "case": "success", + "end": 0.5900000000000001, + "endOffset": 3, + "phones": [ + { + "duration": 0.17, + "phone": "dh_B" + }, + { + "duration": 0.08, + "phone": "iy_E" + } + ], + "start": 0.34, + "startOffset": 0, + "word": "the" + }, + { + "alignedWord": "mute", + "case": "success", + "end": 0.98, + "endOffset": 8, + "phones": [ + { + "duration": 0.08, + "phone": "m_B" + }, + { + "duration": 0.11, + "phone": "y_I" + }, + { + "duration": 0.13, + "phone": "uw_I" + }, + { + "duration": 0.07, + "phone": "t_E" + } + ], + "start": 0.59, + "startOffset": 4, + "word": "mute" + }, + { + "alignedWord": "muscle", + "case": "success", + "end": 1.74, + "endOffset": 15, + "phones": [ + { + "duration": 0.23, + "phone": "m_B" + }, + { + "duration": 0.11, + "phone": "ah_I" + }, + { + "duration": 0.12, + "phone": "s_I" + }, + { + "duration": 0.05, + "phone": "ah_I" + }, + { + "duration": 0.22, + "phone": "l_E" + } + ], + "start": 1.01, + "startOffset": 9, + "word": "muscle" + }, + { + "alignedWord": "the", + "case": "success", + "end": 1.97, + "endOffset": 19, + "phones": [ + { + "duration": 0.11, + "phone": "dh_B" + }, + { + "duration": 0.09, + "phone": "iy_E" + } + ], + "start": 1.77, + "startOffset": 16, + "word": "the" + }, + { + "alignedWord": "higher", + "case": "success", + "end": 2.29, + "endOffset": 26, + "phones": [ + { + "duration": 0.1, + "phone": "hh_B" + }, + { + "duration": 0.18, + "phone": "ay_I" + }, + { + "duration": 0.04, + "phone": "er_E" + } + ], + "start": 1.97, + "startOffset": 20, + "word": "higher" + }, + { + "alignedWord": "tones", + "case": "success", + "end": 2.86, + "endOffset": 32, + "phones": [ + { + "duration": 0.16, + "phone": "t_B" + }, + { + "duration": 0.24, + "phone": "ow_I" + }, + { + "duration": 0.09, + "phone": "n_I" + }, + { + "duration": 0.08, + "phone": "z_E" + } + ], + "start": 2.29, + "startOffset": 27, + "word": "tones" + }, + { + "alignedWord": "are", + "case": "success", + "end": 3.12, + "endOffset": 36, + "phones": [ + { + "duration": 0.26, + "phone": "er_S" + } + ], + "start": 2.86, + "startOffset": 33, + "word": "are" + }, + { + "alignedWord": "who", + "case": "success", + "end": 3.43, + "endOffset": 40, + "phones": [ + { + 
"duration": 0.11, + "phone": "hh_B" + }, + { + "duration": 0.2, + "phone": "uw_E" + } + ], + "start": 3.12, + "startOffset": 37, + "word": "who" + }, + { + "alignedWord": "are", + "case": "success", + "end": 3.49, + "endOffset": 44, + "phones": [ + { + "duration": 0.06, + "phone": "er_S" + } + ], + "start": 3.43, + "startOffset": 41, + "word": "are" + }, + { + "alignedWord": "and", + "case": "success", + "end": 3.64, + "endOffset": 48, + "phones": [ + { + "duration": 0.03, + "phone": "ah_B" + }, + { + "duration": 0.05, + "phone": "n_I" + }, + { + "duration": 0.07, + "phone": "d_E" + } + ], + "start": 3.49, + "startOffset": 45, + "word": "and" + } + ] +} \ No newline at end of file diff --git a/tests/data/input/README.md b/tests/data/input/README.md new file mode 100644 index 00000000..1c139078 --- /dev/null +++ b/tests/data/input/README.md @@ -0,0 +1,5 @@ +## Data sources: + +Harvard Sentences audio files taken from the Open Speech Repository at http://www.voiptroubleshooter.com/open_speech/american.html + +The Harvard Sentences are listed at http://www.cs.columbia.edu/%7Ehgs/audio/harvard.html diff --git a/tests/data/input/harvard-sentences-list30-s01.mp3 b/tests/data/input/harvard-sentences-list30-s01.mp3 new file mode 100644 index 00000000..6714a2cf Binary files /dev/null and b/tests/data/input/harvard-sentences-list30-s01.mp3 differ diff --git a/tests/data/input/harvard-sentences-list30.mp3 b/tests/data/input/harvard-sentences-list30.mp3 new file mode 100644 index 00000000..5319b966 Binary files /dev/null and b/tests/data/input/harvard-sentences-list30.mp3 differ diff --git a/tests/data/input/harvard-sentences-list30.txt b/tests/data/input/harvard-sentences-list30.txt new file mode 100644 index 00000000..c6f9011b --- /dev/null +++ b/tests/data/input/harvard-sentences-list30.txt @@ -0,0 +1,10 @@ +The mute muffled the high tones of the horn. +The gold ring fits only a pierced ear. +The old pan was covered with hard fudge. +Watch the log float in the wide river. +The node on the stalk of wheat grew daily. +The heap of fallen leaves was set on fire. +Write fast, if you want to finish early. +His shirt was clean but one button was gone. +The barrel of beer was a brew of malt and hops. +Tin cans are absent from store shelves. 
diff --git a/tests/helpers.py b/tests/helpers.py
new file mode 100644
index 00000000..968e24a5
--- /dev/null
+++ b/tests/helpers.py
@@ -0,0 +1,23 @@
+import json
+import os
+import pytest
+
+_this_dir = os.path.dirname(os.path.realpath(__file__))
+_data_dir = os.path.join(_this_dir, "data")
+_input_dir = os.path.join(_data_dir, "input")
+_expected_dir = os.path.join(_data_dir, "expected")
+_result_dir = os.path.join(_data_dir, "tmp")
+
+def input_path(name):
+    return os.path.join(_input_dir, name)
+
+def expected_path(name):
+    return os.path.join(_expected_dir, name)
+
+def result_path(name):
+    if not os.path.isdir(_result_dir): os.makedirs(_result_dir)
+    return os.path.join(_result_dir, name)
+
+def input_data(filename):
+    with open(input_path(filename)) as fh:
+        return fh.read();
diff --git a/tests/test_forced_aligner.py b/tests/test_forced_aligner.py
new file mode 100644
index 00000000..a2d17a0f
--- /dev/null
+++ b/tests/test_forced_aligner.py
@@ -0,0 +1,38 @@
+import gentle
+import logging
+import logging.handlers
+import sys
+
+from helpers import *
+
+class TestForcedAligner:
+
+    def test_harvard30(self, tmpdir, request):
+        name = "harvard-sentences-list30"
+        transcript = input_data(name + ".txt")
+        audiofile = input_path(name + ".mp3")
+        expectedfile = expected_path("forced-" + name + ".json")
+        resultfile = result_path("forced-" + name + ".json")
+
+        expected = gentle.Transcription.from_jsonfile(expectedfile)
+        assert transcript == expected.transcript # test data consistency check
+
+        resources = gentle.Resources()
+        aligner = gentle.ForcedAligner(resources, transcript)
+
+        logger = logging.getLogger(request.node.name)
+        handler = logging.handlers.MemoryHandler(sys.maxint)
+        logger.addHandler(handler)
+        logger.setLevel('INFO')
+
+        with gentle.resampled(audiofile) as wavfile:
+            result = aligner.transcribe(wavfile, logging=logger)
+
+        log = [record.getMessage() for record in handler.buffer]
+
+        with open(resultfile, "w") as fh:
+            fh.write(result.to_json())
+
+        assert result == expected
+        assert "5 unaligned words (of 86)" in log
+        assert "after 2nd pass: 0 unaligned words (of 86)" in log
diff --git a/tests/test_full_transcriber.py b/tests/test_full_transcriber.py
new file mode 100644
index 00000000..d290eedd
--- /dev/null
+++ b/tests/test_full_transcriber.py
@@ -0,0 +1,30 @@
+import gentle
+import json
+import logging
+import logging.handlers
+import sys
+
+from helpers import *
+
+class TestFullTranscriber:
+
+    def test_harvard30_s01(self, request):
+        name = "harvard-sentences-list30-s01"
+        audiofile = input_path(name + ".mp3")
+        expectedfile = expected_path("full-" + name + ".json")
+        resultfile = result_path("full-" + name + ".json")
+
+        expected = gentle.Transcription.from_jsonfile(expectedfile)
+
+        resources = gentle.Resources()
+        transcriber = gentle.FullTranscriber(resources)
+
+        assert transcriber.available # verify language model is loaded
+
+        with gentle.resampled(audiofile) as wavfile:
+            result = transcriber.transcribe(wavfile)
+
+        with open(resultfile, "w") as fh:
+            fh.write(result.to_json())
+
+        assert result == expected
diff --git a/www/view_alignment.html b/www/view_alignment.html
index 7125c8c4..63e7cb37 100644
--- a/www/view_alignment.html
+++ b/www/view_alignment.html
@@ -258,7 +258,7 @@