vts_model_inference.py (forked from pladisdev/VTS-AI-Plugin)
import asyncio
import time

import keyboard
import numpy as np
import pyaudio
import torch

from ai.vtubernet import VtuberNet
from vtube_studio.vts import VTS_API

# VTube Studio API websocket endpoint and the name of the microphone to use.
URL = "ws://localhost:8001/"
MIC_INPUT = "Enter Audio Input Here"

# Load the trained model weights and switch the network to inference mode.
model = VtuberNet()
model.load_state_dict(torch.load("vtuber_model.pth"))
model.eval()


async def main():
    async with VTS_API(URL) as vtube_studio:
        pa = pyaudio.PyAudio()

        # Find the input device whose name contains MIC_INPUT; if no match is
        # found, index stays None and PyAudio falls back to the default device.
        desired_device_name = MIC_INPUT
        desired_device_index = None
        for i in range(pa.get_device_count()):
            device_info = pa.get_device_info_by_index(i)
            if desired_device_name in device_info["name"]:
                desired_device_index = i
                break

        # Open a mono float32 stream at the sample rate and window size the
        # model expects.
        sample_format = pyaudio.paFloat32
        channels = 1
        stream = pa.open(
            format=sample_format,
            channels=channels,
            rate=model.sample_rate,
            frames_per_buffer=model.segment,
            input_device_index=desired_device_index,
            input=True
        )
        stream.start_stream()

        # Fetch the current VTube Studio parameter configuration once up front.
        params_config = await vtube_studio.get_parameters()

        start = 0
        while True:
            # Throttle the loop to roughly one inference every 100 ms and log
            # the elapsed time since the previous iteration.
            print(str(time.time() - start))
            while time.time() - start < .1:
                await asyncio.sleep(.001)
            start = time.time()

            # Read one segment of audio and run it through the model.
            data = stream.read(model.segment)
            audio_data = np.frombuffer(data, dtype=np.float32)
            test_data = torch.tensor(audio_data)
            with torch.no_grad():
                output = model(test_data)

            # Send the predicted parameter values to VTube Studio.
            await vtube_studio.nn_movement(output.flatten().tolist(), params_config)

            # Press Delete to exit the loop and clean up.
            if keyboard.is_pressed('del'):
                break

        stream.stop_stream()
        stream.close()
        pa.terminate()


if __name__ == '__main__':
    asyncio.run(main())
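
For reference, the script above relies on only three things from ai/vtubernet.py: a VtuberNet nn.Module exposing sample_rate and segment attributes, and a forward pass that maps one segment of audio samples to one value per tracked VTube Studio parameter. The sketch below is a hypothetical stand-in under those assumptions; the defaults (16 kHz, 1600-sample windows, 8 output parameters) and the layer sizes are illustrative, not the repository's trained architecture.

# Minimal sketch of the VtuberNet interface the inference script assumes.
# The actual ai/vtubernet.py in the repository may differ.
import torch
import torch.nn as nn

class VtuberNet(nn.Module):
    def __init__(self, sample_rate=16000, segment=1600, num_params=8):
        super().__init__()
        # Attributes the inference script reads to configure the audio stream.
        self.sample_rate = sample_rate  # assumed microphone sample rate (Hz)
        self.segment = segment          # assumed samples per inference window

        # Placeholder network: one window of audio in, one value per
        # VTube Studio parameter out.
        self.net = nn.Sequential(
            nn.Linear(segment, 256),
            nn.ReLU(),
            nn.Linear(256, num_params),
        )

    def forward(self, x):
        # x: 1-D float tensor of length `segment`.
        return self.net(x)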