
Commit: 0.12.0

matatonic committed Apr 28, 2024
1 parent adb8eee commit 1dc75c7
Showing 9 changed files with 206 additions and 112 deletions.
6 changes: 2 additions & 4 deletions Dockerfile
@@ -10,11 +10,9 @@ RUN git clone https://github.com/dvlab-research/MGM.git --single-branch /app/MGM
WORKDIR /app
COPY requirements.txt .
ARG VERSION=latest
-# transformers==4.36.2 supports most models except MGM-2B, llava-1.6, nanollava
-RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0" >> requirements.txt ; fi
+RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0\nautoawq" >> requirements.txt ; fi
# TODO: nvidia apex wheel
-RUN pip install --no-cache-dir -U -r requirements.txt \
-    https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.7/flash_attn-2.5.7+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
+RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt

WORKDIR /app/MGM
RUN pip install --no-cache-dir --no-deps -e .
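For context, the `VERSION` build-arg above selects which transformers pin (and, in the default case, autoawq) gets appended to requirements.txt. A minimal sketch of building the two variants by hand; the image tags are illustrative, and the repo's docker-compose files remain the supported path:

```shell
# default image: transformers>=4.39.0 plus autoawq (illustrative tag name)
docker build -t openedai-vision .
# "alt" image: pinned to transformers==4.36.2
docker build --build-arg VERSION=alt -t openedai-vision:alt .
```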
12 changes: 10 additions & 2 deletions README.md
@@ -20,6 +20,9 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
- - [X] [XComposer2-7b-4bit](https://huggingface.co/internlm/internlm-xcomposer2-7b-4bit) (not recommended)
- - [X] [XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] (wont gpu split)
- - [X] [XComposer2-VL-4bit](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b-4bit)
- [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4)
- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (main docker only, wont gpu split)
- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (main docker only, wont gpu split)
- [X] [LlavaNext](https://huggingface.co/llava-hf) (main docker only)
- - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) (main docker only)
- - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (main docker only)
@@ -62,6 +65,11 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le

## Recent updates

Version: 0.12.0

- new model support: HuggingFaceM4/idefics2-8b, HuggingFaceM4/idefics2-8b-AWQ
- Fix: remove prompt from output of InternVL-Chat-V1-5

Version: 0.11.0

- new model support: OpenGVLab/InternVL-Chat-V1-5, up to 4k resolution, top opensource model
@@ -117,9 +125,9 @@ docker compose -f docker-compose.alt.yml pull

```shell
# install the python dependencies
-pip install -r requirements.txt "transformers>=4.39.0"
+pip install -U -r requirements.txt "transformers>=4.39.0" autoawq
# OR install the python dependencies for the alt version
-pip install -r requirements.txt "transformers==4.36.2"
+pip install -U -r requirements.txt "transformers==4.36.2"
# run the server with your chosen model
python vision.py --model vikhyatk/moondream2
```
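As a usage sketch (not part of this commit), the new idefics2 models can then be queried through the OpenAI-compatible API. The port (5006) and the `openai` python client usage below are assumptions about a default local setup; the image URL is borrowed from `test_models.py`:

```python
from openai import OpenAI

# Assumptions: server running locally on port 5006 (adjust to your setup);
# api_key is a placeholder for a local server.
client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")

response = client.chat.completions.create(
    model="HuggingFaceM4/idefics2-8b",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg"}},
        ],
    }],
    max_tokens=256,
)
print(response.choices[0].message.content)
```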
58 changes: 58 additions & 0 deletions backend/idefics2.py
@@ -0,0 +1,58 @@
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers import AwqConfig

from vision_qna import *

# "HuggingFaceM4/idefics2-8b"
# "HuggingFaceM4/idefics2-8b-AWQ"

class VisionQnA(VisionQnABase):
    model_name: str = "idefics2"

    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
        super().__init__(model_id, device, device_map, extra_params, format)

        #do_image_splitting=False
        #size= {"longest_edge": 448, "shortest_edge": 378}
        self.processor = AutoProcessor.from_pretrained(model_id)

        if '-awq' in model_id.lower():
            """
            # This is from https://huggingface.co/HuggingFaceM4/idefics2-8b
            # It doesn't work
            quantization_config = AwqConfig(
                bits=4,
                fuse_max_seq_len=4096,
                modules_to_fuse={
                    "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
                    "mlp": ["gate_proj", "up_proj", "down_proj"],
                    "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
                    "use_alibi": False,
                    "num_attention_heads": 32,
                    "num_key_value_heads": 8,
                    "hidden_size": 4096,
                }
            )
            self.params['quantization_config'] = quantization_config
            """

            if self.params['torch_dtype'] == torch.bfloat16:
                self.params['torch_dtype'] = torch.float16

        self.model = AutoModelForVision2Seq.from_pretrained(**self.params).to(self.device)

        print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

    async def chat_with_images(self, request: ImageChatRequest) -> str:
        images, hfmessages = await images_hfmessages_from_messages(request.messages)

        prompt = self.processor.apply_chat_template(hfmessages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=images, return_tensors="pt")
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        # Generate
        params = self.get_generation_params(request)
        generated_ids = self.model.generate(**inputs, **params)
        generated_texts = self.processor.decode(generated_ids[0][inputs['input_ids'].size(1):].cpu(), skip_special_tokens=True)

        return generated_texts
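For readers unfamiliar with idefics2, a rough illustration (adapted from the HuggingFaceM4/idefics2-8b model card, not from this repo) of the kind of message structure that `images_hfmessages_from_messages` presumably produces for `apply_chat_template`; the image path is a placeholder:

```python
from transformers import AutoProcessor
from PIL import Image

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
image = Image.open("example.jpg")  # placeholder image path

# One user turn with one image and one text segment; the processor's chat
# template turns the {"type": "image"} entry into the model's image tokens.
hfmessages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do we see in this image?"},
        ],
    }
]

prompt = processor.apply_chat_template(hfmessages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt")
```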
6 changes: 4 additions & 2 deletions model_conf_tests.json
@@ -2,7 +2,8 @@
["vikhyatk/moondream2", "--use-flash-attn"],
["vikhyatk/moondream1"],
["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn"],
["HuggingFaceM4/idefics2-8b", "--use-flash-attn", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0"],
["echo840/Monkey"],
["echo840/Monkey-Chat"],
["THUDM/cogvlm-chat-hf"],
@@ -26,7 +27,8 @@

["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"],
["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit"],
["HuggingFaceM4/idefics2-8b-AWQ", "--use-flash-attn", "--device-map", "cuda:0"],
["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit", "--device-map", "cuda:0"],
["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
["THUDM/cogagent-chat-hf", "--load-in-4bit"],
["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn"],
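Each entry above maps to a `vision.py` invocation; the commands below mirror the `#CLI_COMMAND` lines that `test_models.py` prints for the new idefics2 configs:

```shell
python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0
python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0
```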
21 changes: 15 additions & 6 deletions requirements.txt
@@ -2,20 +2,26 @@ accelerate
auto_gptq
bitsandbytes
fastapi
-flash_attn
+# See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases
+# And: https://github.com/Dao-AILab/flash-attention/releases for linux.
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+flash_attn; python_version != "3.10" and python_version != "3.11"
openai
peft
protobuf
pydantic
python-datauri
requests
sentencepiece
-torch>=2.2.0
+torch==2.2.*
uvicorn
xformers

# moondream
-deepspeed==0.11.1
+deepspeed<0.14.0
einops
einops-exts
httpx
@@ -36,8 +42,11 @@ transformers_stream_generator
loguru
sse_starlette

-#latest
+# alt
+#transformers==4.36.2
+
+# latest
#transformers>=4.39.0
+# idefics2
+#autoawq

-#alt
-#transformers==4.36.2
12 changes: 6 additions & 6 deletions test_models.py
@@ -20,8 +20,8 @@
'leaf': 'https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg',
}

-green_pass = '\033[92mpass\033[0m'
-red_fail = '\033[91mfail\033[0m'
+green_pass = '\033[92mpass\033[0m'
+red_fail = '\033[91mfail\033[0m'


def data_url_from_url(img_url: str) -> str:
@@ -48,7 +48,7 @@ def record_result(cmd_args, results, t, mem, note):
'note': note
}])
result = all(results)
print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {'pass' if result else 'fail'}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}")
print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {green_pass if result else red_fail}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}")

torch_memory_baseline = 0

Expand Down Expand Up @@ -115,8 +115,8 @@ def test(cmd_args: list[str]) -> int:
mem = get_total_gpu_mem_used()

result = all(results)
-if result:
-    note = 'All tests passed.'
+if not note:
+    note = f'{results.count(True)}/{len(results)} tests passed.'

print(f"\n\n###\n\nTest complete.\nResult: {green_pass if result else red_fail}, time: {t:.1f}s")

@@ -233,4 +233,4 @@ def single_round():
for r in all_results:
cmdl = ' '.join(r['args'])
result = all(r['results'])
print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {'pass' if result else 'fail'}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}")
print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}")