run_exported_model_AVSR.py
import argparse
import sys
import os
import subprocess
# Make sure that we can import functions/classes from the util/ folder
sys.path.insert(1, os.path.join(sys.path[0], '..'))
import numpy as np
from util.data_preprocessing_video import extract_and_store_visual_features
from util.exported_model import DeepSpeechModel
from util.video_stream import VideoStream
# Argument parser. This script expects 4 optional args.
parser = argparse.ArgumentParser(
    description='Run a trained Speech to Text model for some inputs. \
    **NOTE: Please use a trailing slash for directory names.')
parser.add_argument('-d', '--export_dir', type=str,
                    help="Dir where the trained model's meta graph and data were exported")
parser.add_argument('-vf', '--video_file', type=str,
                    help="Video file's location.")
parser.add_argument('-n', '--model_name', type=str,
                    help='Name of the exported model')
parser.add_argument('--use_spell_check', default=False, action='store_true',
                    help='Whether to use the spell check system on the transcript decoded from the RNN')
args = parser.parse_args()
export_dir = args.export_dir or 'data/export_AVSR/00000001/'
model_name = args.model_name or 'export'
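# Example invocation (illustrative video path only; the export dir and model name
# shown are just the defaults above):
#   python run_exported_model_AVSR.py -d data/export_AVSR/00000001/ \
#       -vf /path/to/video.mp4 -n export --use_spell_check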
# Create a DeepSpeechModel object.
# This object is responsible for handling the TensorFlow model.
# For the video file, we create a temporary audio file (.wav) and a JSON file (storing visual features).
temp_dir = '/tmp/'
temp_wav_name = 'temp.wav'
temp_json_name = 'temp.json'
# Use a sample rate and bitrate similar to Red Hen Lab's sample video.
cmd = "ffmpeg -i " + args.video_file + " -ab 96k -ar 44100 -vn " + temp_dir + temp_wav_name
subprocess.call(cmd, shell=True)
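# Note: the shell-string command above breaks on paths containing spaces or shell
# metacharacters. A list-based sketch with the same flags (plus -y, added here only
# to overwrite a stale temp.wav without prompting) would be:
#   subprocess.call(['ffmpeg', '-y', '-i', args.video_file,
#                    '-ab', '96k', '-ar', '44100', '-vn',
#                    temp_dir + temp_wav_name])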
# Now extract visual features from the video and store them in a JSON file.
status = extract_and_store_visual_features(args.video_file, temp_dir, temp_json_name)
if not status:
print "[ERROR] Stopped due to incorrect video FPS.(Required FPS=30)"
os.remove(temp_dir+temp_wav_name) # delete temporary audio file
exit()
model = DeepSpeechModel(export_dir, model_name, args.use_spell_check, use_visual_features=True)
model.restore_model()
transcript = model.find_transcripts(temp_dir+temp_wav_name, temp_dir+temp_json_name)
print "\n[Generated Transcript]\t",
print transcript
# Delete temporary audio file and json file
os.remove(temp_dir+temp_wav_name)
os.remove(temp_dir+temp_json_name)
# Close tensorflow session
model.close()
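# Sketch (an assumption, not part of the original flow): wrapping inference and
# cleanup in try/finally would remove the temp files and close the session even
# if transcription raises an exception.
#   try:
#       model.restore_model()
#       transcript = model.find_transcripts(temp_dir + temp_wav_name, temp_dir + temp_json_name)
#   finally:
#       os.remove(temp_dir + temp_wav_name)
#       os.remove(temp_dir + temp_json_name)
#       model.close()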