# openAITTS.py
import json
import os
import tempfile

from openai import OpenAI
from pydub import AudioSegment
from pydub.generators import Sine


def speak_with_pauses(segments_json, rate=150):
    """Synthesize each segment with OpenAI TTS, insert the requested pauses,
    and export a single MP3 together with an asset timeline.

    `segments_json` is a JSON array of objects, each with a "sentence", an
    optional "asset", and a "pause" in milliseconds. `rate` is accepted for
    compatibility but is currently unused.
    """
    # Parse the segments produced by the LLM
    segments = json.loads(segments_json)
    print(segments)

    audio_segments = []
    durations = []   # per-segment durations, in seconds
    asset_list = []  # {"time": ..., "asset": ...} entries for syncing visuals

    # Lead-in silence of 100 ms (a 0 Hz sine is silence), recorded as 0.1 s
    silence_segment = Sine(0).to_audio_segment(duration=100)
    audio_segments.append(silence_segment)
    durations.append(0.1)

    client = OpenAI()
    for segment in segments:
        sentence = segment.get('sentence')
        pause_duration = segment.get('pause', 0)
        asset = segment.get('asset')
        if sentence:
            response = client.audio.speech.create(
                model="tts-1",
                voice="alloy",
                input=sentence,
            )
            # Write the MP3 bytes to a temp file so pydub can decode them
            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as temp_audio_file:
                temp_audio_file.write(response.content)
            audio_segment = AudioSegment.from_file(temp_audio_file.name)
            os.unlink(temp_audio_file.name)  # remove the temp file once decoded
            audio_segments.append(audio_segment)
            duration = len(audio_segment) / 1000  # milliseconds -> seconds
            durations.append(duration)
            if asset is not None:
                # Timestamp the asset at the end of its sentence
                asset_list.append({"time": sum(durations), "asset": asset})
        if pause_duration > 0:
            # pydub durations are in milliseconds
            silence_segment = Sine(0).to_audio_segment(duration=pause_duration)
            audio_segments.append(silence_segment)
            durations.append(pause_duration / 1000)  # milliseconds -> seconds

    # Concatenate the audio segments without crossfading
    output_audio = audio_segments[0]
    for segment in audio_segments[1:]:
        output_audio = output_audio.append(segment, crossfade=0)

    # Persist the per-segment durations for downstream use
    duration_json = {}
    for i, dur in enumerate(durations):
        duration_json[f"Segment {i+1}"] = dur
    with open('pause_durations.json', 'w') as json_file:
        json.dump(duration_json, json_file, indent=4)

    # Export the assembled audio
    output_audio.export("final_audio_with_pauses.mp3", format="mp3")
    print("Audio exported successfully.")
    return ['final_audio_with_pauses.mp3', asset_list]

# Example usage
# test_text = """[
# {"sentence": "Welcome back folks!", "asset": "welcome.mp4", "pause": 100},
# {"sentence": "Today, we're going to discuss a fundamental aspect of game development: collision detection.", "asset": "collision_detection.mp4", "pause": 100},
# {"sentence": "Now, you might be wondering, what exactly is collision detection?", "pause": 1000},
# {"sentence": "Well, it's the process of determining when two objects in a game world intersect or come into contact with each other.", "pause": 200},
# {"sentence": "This might sound straightforward, but trust me, there's a bit more to it than meets the eye.", "pause": 300},
# {"sentence": "Let's delve deeper into this topic and explore how we can implement collision detection in our game.", "pause": 300}
# ]"""
# output_ = speak_with_pauses(test_text, 150)
# print("Asset list:", output_[1])