-
Notifications
You must be signed in to change notification settings - Fork 9
/
record-prompt.py
91 lines (69 loc) · 3.21 KB
/
record-prompt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
import sys
import os
import time
from dotenv import load_dotenv
from pydub import AudioSegment
import azure.cognitiveservices.speech as speechsdk
# Load the Asterisk AGI environment file, then read the Azure Speech
# credentials from the process environment (None when a variable is unset).
load_dotenv("/var/lib/asterisk/agi-bin/.env")

AZURE_SPEECH_KEY = os.getenv('AZURE_SPEECH_KEY')
AZURE_SERVICE_REGION = os.getenv('AZURE_SERVICE_REGION')
# The format to record a prompt is as follows:
#   ./record-prompt.py file-name "Text to record" language
# file-name --> output file name without the .mp3 extension (the script appends it). Remember that in the Agent AI script the welcome audio is welcome-en (English) or welcome-es (Spanish), and the wait audio is wait-en (English) or wait-es (Spanish).
# language --> can be "en-US" or "es-ES"
# If you want to add more languages, you must modify the scripts.
# Read the three positional command-line arguments:
#   argv[1] = output file name, argv[2] = text to synthesize, argv[3] = language.
# Each index is guarded by its own length check so that a missing argument
# becomes None instead of raising IndexError.
audio_name = sys.argv[1] if len(sys.argv) > 1 else None
audio_text = sys.argv[2] if len(sys.argv) > 2 else None
language = sys.argv[3] if len(sys.argv) > 3 else None

# The file name and the text are mandatory; stop with exit status 1 otherwise.
if audio_name is None:
    print("No filename provided for the recording.")
    sys.exit(1)
if audio_text is None:
    print("No text to record audio.")
    sys.exit(1)

# Select the Azure locale and voice. Any value other than "es-ES"
# (including a missing language argument) falls back to US English.
if language == "es-ES":
    azure_language = "es-ES"
    azure_voice_name = "es-ES-ElviraNeural"
else:
    azure_language = "en-US"
    azure_voice_name = "en-US-JennyNeural"

# Destination of the generated prompt; the .mp3 extension is appended here.
audio_path = f"/var/lib/asterisk/sounds/{audio_name}.mp3"
print(audio_path)
def main():
    """Synthesize ``audio_text`` with Azure TTS and store it as a trimmed MP3.

    Uses the module-level settings ``AZURE_SPEECH_KEY``,
    ``AZURE_SERVICE_REGION``, ``azure_language``, ``azure_voice_name``,
    ``audio_text`` and ``audio_path``. The synthesized audio is written to
    ``audio_path`` and the file is then rewritten without its last 750 ms.

    Prints a message and exits with status 1 when the Azure credentials are
    missing or when the synthesis does not complete successfully.
    """
    # Fail early with a readable message when the environment did not
    # provide the credentials.
    if not AZURE_SPEECH_KEY or not AZURE_SERVICE_REGION:
        print("AZURE_SPEECH_KEY and AZURE_SERVICE_REGION must be set.")
        sys.exit(1)

    # Sets API Key and Region
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)

    # Sets the synthesis output format.
    # The full list of supported format can be found here:
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)

    # Select synthesis language and voice.
    speech_config.speech_synthesis_language = azure_language
    speech_config.speech_synthesis_voice_name = azure_voice_name

    # audio_config=None: no output file or device is attached to the
    # synthesizer; the audio is taken from the returned result below.
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    result = speech_synthesizer.speak_text_async(audio_text).get()

    # Do not write (and later trim) a file when the request failed or was
    # cancelled, e.g. because of bad credentials or a network error.
    if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Speech synthesis failed: {result.reason}")
        if result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            print(f"Cancellation reason: {details.reason}")
            if details.error_details:
                print(f"Error details: {details.error_details}")
        sys.exit(1)

    # NOTE(review): the file is read back with AudioSegment.from_mp3 below, so
    # it is expected to hold MP3 data despite the method name -- confirm.
    stream = speechsdk.AudioDataStream(result)
    stream.save_to_wav_file(audio_path)

    # Amount of audio removed from the end of the prompt, in milliseconds.
    # NOTE(review): presumably trailing silence added by the synthesizer -- confirm.
    trim_ms = 750

    # Load the synthesized file; len() of an AudioSegment is its duration in ms.
    audio = AudioSegment.from_mp3(audio_path)
    total_duration = len(audio)

    # A clip no longer than the trim amount would yield a zero or negative
    # slice bound; keep such a file exactly as synthesized.
    if total_duration <= trim_ms:
        return

    trimmed_audio = audio[:total_duration - trim_ms]

    # Write the trimmed copy next to the destination rather than to a fixed
    # /tmp name: os.replace then never crosses a filesystem boundary and
    # concurrent runs for different prompts cannot overwrite each other.
    trimmed_file = f"{audio_path}.tmp"
    try:
        trimmed_audio.export(trimmed_file, format="mp3")
        # Overwrites the original file in a single step.
        os.replace(trimmed_file, audio_path)
    finally:
        # Only still present when the export or the replace failed.
        if os.path.exists(trimmed_file):
            os.remove(trimmed_file)


if __name__ == "__main__":
    main()