demo_gradio.py
import sys
sys.path.insert(0, "/data9/shz/project/ZoomEye")
from PIL import Image
import os
import json
import warnings
import gradio as gr
from ZoomEye.zoom_model import ZoomModelGlobalLocal, ZoomModelLocal
from ZoomEye.zoom_eye import get_zoom_eye_response
warnings.filterwarnings("ignore")
# ------------------ Global Model Initialization ------------------
# Load the model globally so it's initialized only once
MODEL_PATH = "/data9/shz/ckpt/llava-onevision-qwen2-7b-ov"
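# NOTE: the sys.path entry and MODEL_PATH above are machine-specific; point them at
# your own ZoomEye checkout and checkpoint directory. A hypothetical convenience
# (ZOOMEYE_MODEL_PATH is not part of ZoomEye itself) would be an environment override:
# MODEL_PATH = os.environ.get("ZOOMEYE_MODEL_PATH", MODEL_PATH)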
def initialize_model(model_path):
"""
Initialize the ZoomEye model based on the model configuration.
"""
config_path = os.path.join(model_path, "config.json")
config = json.load(open(config_path, "r"))
if "anyres" in config['image_aspect_ratio']:
zoom_model = ZoomModelGlobalLocal(
model_path=model_path, conv_type="qwen_1_5", patch_scale=1.2, bias_value=0.6
)
else:
zoom_model = ZoomModelLocal(
model_path=model_path, conv_type="v1", patch_scale=None, bias_value=0.2
)
return zoom_model, config
# Initialize the model only once
ZOOM_MODEL, CONFIG = initialize_model(MODEL_PATH)
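# For the LLaVA-OneVision checkpoint above, config["image_aspect_ratio"] is expected
# to be an "anyres" variant, so initialize_model() takes the ZoomModelGlobalLocal branch.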
# ------------------ Main Function for Processing ------------------
def process_image_and_question(image, question):
"""
Process the input image and question to generate a response.
Returns a textual answer and an output image.
"""
def pop_limit_func(max_depth):
return max_depth * 3
search_kwargs = {
"pop_limit": pop_limit_func,
"threshold_descrease": [0.1, 0.1, 0.2],
"answering_confidence_threshold_lower": 0,
"answering_confidence_threshold_upper": 0.6,
"visual_cue_threshold": 0.6,
}
decomposed_question_template = "What is the appearance of the {}?"
ic_examples_path = "ZoomEye/ic_examples/hr-bench_4k.json"
# Save the input image as a file (for ZoomEye processing)
input_image_path = "temp_image.jpg"
image.save(input_image_path)
annotation = {
"input_image": input_image_path,
"question": question,
}
# Get the response, which includes a textual answer and an output image
response = get_zoom_eye_response(
zoom_model=ZOOM_MODEL, # Use the globally initialized model
annotation=annotation,
ic_examples=json.load(open(ic_examples_path, "r")),
decomposed_question_template=decomposed_question_template,
**search_kwargs,
)
text_response = response["text"] # Model's textual answer
output_image = response["output_image"] # Output image (PIL.Image object)
return text_response, output_image # Return text and image
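# A minimal standalone sketch (not part of the original demo) for invoking the
# pipeline without the UI, using one of the bundled example images:
#   img = Image.open("demo/demo.jpg")
#   text, zoomed = process_image_and_question(img, "What is the color of the soda can?")
#   print(text)
#   zoomed.save("zoomed_view.jpg")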
# ------------------ Gradio Interface ------------------
def gradio_interface():
"""
Define the Gradio interface with input fields, outputs, and examples.
"""
examples = [
["demo/demo.jpg", "What is the color of the soda can?"],
["demo/demo1.jpg", "What is the message written on the sign?"],
["demo/demo2.jpg", "Is the red car on the left or right side of the police car?"],
]
inputs = [
gr.Image(type="pil", label="Image"), # Image upload input
gr.Textbox(label="Question", placeholder="Please input your question here.") # Text input
]
outputs = [
gr.Textbox(label="Model Response"), # Textual response output
gr.Image(label="Zoomed View") # Zoomed view of ZoomEye
]
interface = gr.Interface(
fn=process_image_and_question,
inputs=inputs,
outputs=outputs,
title="ZoomEye Demo",
description="Upload an image and ask a question to get the response and zoomed view from ZoomEye.",
theme="compact",
examples=examples,
)
return interface
if __name__ == "__main__":
    # Launch the Gradio interface
    print("Launching Gradio interface...")
    app = gradio_interface()
    app.launch(share=True)
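# Run the demo (assuming the paths above exist on your machine):
#   python demo_gradio.py
# launch(share=True) serves the app locally and also requests a temporary public
# *.gradio.live link; pass share=False to keep it local-only.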