# test_audvis.py (forked from PKU-YuanGroup/Video-LLaVA)
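"""Audio-visual inference smoke test for the Video-LLaVA speech fork.

Rough summary (based on the code below): load a Video-LLaVA checkpoint that
also carries a speech tower, preprocess one MELD video clip plus its
Whisper-style audio features, build a prompt containing both image and speech
placeholder tokens, and print the model's generated answer. The paths and
checkpoint names are machine-specific and need to be adjusted before running.
"""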
import torch
from videollava.constants import IMAGE_TOKEN_INDEX, SPEECH_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_SPEECH_TOKEN
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.model.builder import load_pretrained_model
from videollava.utils import disable_torch_init
from videollava.mm_utils import tokenizer_image_token, tokenizer_image_speech_token, get_model_name_from_path, KeywordsStoppingCriteria
import pdb
import whisper


def main():
    disable_torch_init()

    # video = '/home/mok/module/Video-LLaVA/test.mp4'
    video = '/node_data/hyun/mok/data/MELD.Raw/train_splits/dia0_utt0.mp4'

    ### Laugh reasoning, relation inference from the scene, and emotion inference are all possible.
    # Several candidate prompts; only the last `inp` assignment is actually used.
    inp = ('Give me the reason why the person laughed in this clip. '
           'I also give you the transcription and the relation between the two people. '
           "Utterance: Why do you love me? Oh, I love you because you're my firstborn. "
           "You're my game changer, my life changer. Suck it, siblings.")
    inp = 'Please transcribe the given video.'
    inp = 'How is the speaker currently feeling?'
    model_base = "lmsys/vicuna-7b-v1.5"
    # model_path = "./checkpoints/videollava-7b-lora"
    # model_path = "/home/mok/module/Video-LLaVA/checkpoints/videollava-7b"
    model_path = "/home/mok/module/Video-LLaVA-Speech/checkpoints/videollava-7b-aud-debug"
    cache_dir = '/home/mok/module/Video-LLaVA/cache_dir'
    device = 'cuda'
    load_4bit, load_8bit = True, False
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, processor, _ = load_pretrained_model(model_path, model_base, model_name)  # , load_8bit, load_4bit, device=device, cache_dir=cache_dir)
    video_processor = processor['video']
    speech_processor = processor['speech']

    conv_mode = "llava_v1"
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    video_tensor = video_processor(video, return_tensors='pt')['pixel_values']
    if type(video_tensor) is list:
        tensor = [video.to(model.device, dtype=torch.float16) for video in video_tensor]
    else:
        tensor = video_tensor.to(model.device, dtype=torch.float16)
    # NOTE: the preprocessed video features are replaced with random noise here,
    # presumably to test the speech branch in isolation.
    tensor = torch.rand(tensor.shape).to(model.device, dtype=torch.float16)
    # import pdb; pdb.set_trace()

    audio = whisper.load_audio(video)
    audio = whisper.pad_or_trim(audio)
    speech_tensor = [speech_processor(audio, sampling_rate=16000, return_tensors='pt').input_features]
    if type(speech_tensor) is list:
        speech = [speech.to(model.device, dtype=torch.float16) for speech in speech_tensor]
    else:
        speech = speech_tensor.to(model.device, dtype=torch.float16)
    print(f"{roles[1]}: {inp}")
    inp = ' '.join([DEFAULT_IMAGE_TOKEN] * model.get_video_tower().config.num_frames) + '\n' + inp
    # inp = ' '.join([DEFAULT_IMAGE_TOKEN] * 1) + '\n' + inp
    inp = ' '.join([DEFAULT_SPEECH_TOKEN] * 1) + '\n' + inp  ### the speech token goes at the very front!!!
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    input_ids = tokenizer_image_speech_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, SPEECH_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            speeches=speech,
            do_sample=True,
            temperature=0.1,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    print(outputs)


if __name__ == '__main__':
    main()