Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Droid] Integrate Florence-2-Large Model #309

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bin/test_truss_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def truss_push():
raise Exception(
f"Failed to push model:\n\nSTDOUT: {result.stdout.decode()}\nSTDERR: {result.stderr.decode()}"
)
model_id = match.group(1)
deployment_id = match.group(2)
model_id = str(match.group(1)) # Ensure model_id is a string
deployment_id = str(match.group(2)) # Ensure deployment_id is a string
print(
f"Model pushed successfully. model-id: {model_id}. deployment-id: {deployment_id}"
)
Expand Down
26 changes: 26 additions & 0 deletions florence-2-large/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Truss configuration for deploying microsoft/Florence-2-large.
# (Nesting restored — the pasted diff had all YAML indentation stripped,
# which makes the file invalid; values themselves are unchanged.)
description: Deploy Florence-2 Large model for multi-task computer vision
model_metadata:
  model_name: Florence-2-Large
  avatar_url: https://huggingface.co/microsoft/Florence-2-large/resolve/main/florence2_avatar.png
  cover_image_url: https://huggingface.co/microsoft/Florence-2-large/resolve/main/florence2_cover_image.png
  tags:
    - vision
    - multi-task
  example_model_input:
    prompt: "<OD>"
    # TODO(review): placeholder domain — replace with a real, reachable
    # example image before publishing; this URL will 404 at request time.
    image_url: "https://example.com/test_image.jpg"
python_version: py39
requirements:
  # NOTE(review): Florence-2's remote code generally needs a newer
  # transformers release than 4.30.0 (trust_remote_code model added later) —
  # verify the pin against the Hugging Face model card before deploy.
  - transformers==4.30.0
  - torch==2.0.1
  - torchvision==0.15.2
  - GitPython==3.1.31
resources:
  accelerator: A100
  cpu: 4
  memory: 16G
  use_gpu: true
secrets:
  hf_access_token:
    description: Access token for Hugging Face
spec_version: 1
60 changes: 60 additions & 0 deletions florence-2-large/model/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import Dict, List

import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor


class Model:
    """Truss model wrapper around microsoft/Florence-2-large.

    Exposes the standard Truss entry points: ``load()`` fetches the model and
    processor once; ``predict()`` takes ``{"prompt", "image_url", "task"?}``
    and returns ``{"result": <parsed answer>}``.
    """

    def __init__(self, **kwargs):
        # Heavy objects are loaded lazily in load() so construction stays cheap.
        self.model = None
        self.processor = None
        self.model_name = "microsoft/Florence-2-large"
        # Task token used by postprocess(); overwritten per request in predict().
        self.task = "<OD>"
        # The PIL image from the most recent preprocess() call. The processor's
        # output (a BatchFeature) contains only input_ids/pixel_values — NOT the
        # original image — so we must keep it ourselves for postprocess() sizing.
        self._original_image = None

    def load(self):
        """Load the model and processor once; subsequent calls are no-ops."""
        if self.model is None:
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name, trust_remote_code=True
            )
            self.processor = AutoProcessor.from_pretrained(
                self.model_name, trust_remote_code=True
            )

    def preprocess(self, prompt: str, image_url: str) -> Dict:
        """Download the image and encode (prompt, image) into model inputs.

        Also stashes the downloaded PIL image on the instance so predict()
        can hand its true dimensions to postprocess().
        """
        # NOTE(review): no timeout or raise_for_status() on the fetch — a bad
        # URL surfaces as a PIL decode error; consider hardening this.
        image = Image.open(requests.get(image_url, stream=True).raw)
        self._original_image = image
        inputs = self.processor(text=prompt, images=image, return_tensors="pt")
        return inputs

    def postprocess(
        self, generated_ids: List[int], original_image: Image.Image
    ) -> Dict:
        """Decode generated token ids and parse them for the current task.

        Reads ``self.task`` (set by predict()) to select the parsing mode.
        """
        generated_text = self.processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )[0]
        parsed_answer = self.processor.post_process_generation(
            generated_text,
            task=self.task,
            image_size=(original_image.width, original_image.height),
        )
        return parsed_answer

    def predict(self, model_input: Dict) -> Dict[str, List]:
        """Run one inference request end to end.

        Expects ``model_input`` with "prompt", "image_url", and optional
        "task" (defaults to object detection, "<OD>").
        """
        self.load()
        prompt = model_input["prompt"]
        image_url = model_input["image_url"]
        self.task = model_input.get("task", "<OD>")

        inputs = self.preprocess(prompt, image_url)
        # BUG FIX: the original code did `inputs.pop("images")`, but the
        # processor's BatchFeature has no "images" key (only input_ids and
        # pixel_values), so every request raised KeyError. preprocess() now
        # retains the downloaded image for us instead.
        original_image = self._original_image

        generated_ids = self.model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
            do_sample=False,
        )

        parsed_answer = self.postprocess(generated_ids, original_image)

        return {"result": parsed_answer}
53 changes: 53 additions & 0 deletions florence-2-large/tests/test_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import unittest
from typing import Dict

from model.model import Model


class TestFlorence2Large(unittest.TestCase):
    """Smoke tests for the Florence-2-Large Truss wrapper.

    NOTE(review): these tests download the real checkpoint and fetch the
    image over the network — they are integration tests, not unit tests,
    and the example.com image URL is a placeholder that will not resolve.
    """

    def setUp(self):
        # Fresh wrapper per test; load() is guarded, so repeated loading
        # only pays the download cost once per process.
        self.model = Model()
        self.model.load()

    def test_model_loading(self):
        """Test if the model and its components are loaded correctly."""
        self.assertIsNotNone(self.model.model, "Model not loaded")
        self.assertIsNotNone(self.model.processor, "Processor not loaded")

    def test_inference(self):
        """Test model inference with predefined inputs and expected outputs."""
        test_input = {
            "prompt": "<OD>",
            # TODO(review): replace with a real, reachable test image.
            "image_url": "https://example.com/test_image.jpg",
        }
        expected_output_keys = ["result"]

        output = self.model.predict(test_input)

        self.assertIsInstance(output, Dict, "Output is not a dictionary")
        self.assertCountEqual(
            output.keys(),
            expected_output_keys,
            "Output keys do not match expected keys",
        )

    def test_output_handling(self):
        """Test if the model's output is correctly formatted."""
        test_input = {
            "prompt": "<OD>",
            # TODO(review): replace with a real, reachable test image.
            "image_url": "https://example.com/test_image.jpg",
        }

        output = self.model.predict(test_input)
        result = output["result"]

        self.assertIsInstance(result, Dict, "Result is not a dictionary")
        # BUG FIX: Florence-2's post_process_generation returns a dict keyed
        # by the task token, e.g. {"<OD>": {"bboxes": [...], "labels": [...]}}.
        # The original assertions on "objects"/"panoptic_segmentation" keys
        # could never pass for the default "<OD>" task.
        self.assertIn("<OD>", result, "Task key '<OD>' not found in result")
        detection = result["<OD>"]
        self.assertIn("bboxes", detection, "'bboxes' key not found in detection")
        self.assertIn("labels", detection, "'labels' key not found in detection")


if __name__ == "__main__":
unittest.main()
Loading