diff --git a/Dockerfile b/Dockerfile
index d70d47f..1c6e382 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,11 +10,9 @@ RUN git clone https://github.com/dvlab-research/MGM.git --single-branch /app/MGM
 WORKDIR /app
 COPY requirements.txt .
 ARG VERSION=latest
-# transformers==4.36.2 supports most models except MGM-2B, llava-1.6, nanollava
-RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0" >> requirements.txt ; fi
+RUN if [ "$VERSION" = "alt" ]; then echo "transformers==4.36.2" >> requirements.txt; else echo "transformers>=4.39.0\nautoawq" >> requirements.txt ; fi
 # TODO: nvidia apex wheel
-RUN pip install --no-cache-dir -U -r requirements.txt \
-    https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.7/flash_attn-2.5.7+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
+RUN --mount=type=cache,target=/root/.cache/pip pip install -U -r requirements.txt
 
 WORKDIR /app/MGM
 RUN pip install --no-cache-dir --no-deps -e .
diff --git a/README.md b/README.md
index 04fa502..05a1ac0 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,9 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview`
 - - [X] [XComposer2-7b-4bit](https://huggingface.co/internlm/internlm-xcomposer2-7b-4bit) (not recommended)
 - - [X] [XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] (wont gpu split)
 - - [X] [XComposer2-VL-4bit](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b-4bit)
+- [X] [HuggingFaceM4/idefics2](https://huggingface.co/HuggingFaceM4)
+- - [X] [idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) (main docker only, wont gpu split)
+- - [X] [idefics2-8b-AWQ](https://huggingface.co/HuggingFaceM4/idefics2-8b-AWQ) (main docker only, wont gpu split)
 - [X] [LlavaNext](https://huggingface.co/llava-hf) (main docker only)
 - - [X] [llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) (main docker only)
 - - [X] [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (main docker only)
@@ -62,6 +65,11 @@ See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_le
 
 ## Recent updates
 
+Version: 0.12.0
+
+- new model support: HuggingFaceM4/idefics2-8b, HuggingFaceM4/idefics2-8b-AWQ
+- Fix: remove prompt from output of InternVL-Chat-V1-5
+
 Version: 0.11.0
 
 - new model support: OpenGVLab/InternVL-Chat-V1-5, up to 4k resolution, top opensource model
@@ -117,9 +125,9 @@ docker compose -f docker-compose.alt.yml pull
 
 ```shell
 # install the python dependencies
-pip install -r requirements.txt "transformers>=4.39.0"
+pip install -U -r requirements.txt "transformers>=4.39.0" autoawq
 # OR install the python dependencies for the alt version
-pip install -r requirements.txt "transformers==4.36.2"
+pip install -U -r requirements.txt "transformers==4.36.2"
 # run the server with your chosen model
 python vision.py --model vikhyatk/moondream2
 ```
diff --git a/backend/idefics2.py b/backend/idefics2.py
new file mode 100644
index 0000000..c547750
--- /dev/null
+++ b/backend/idefics2.py
@@ -0,0 +1,58 @@
+from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers import AwqConfig
+
+from vision_qna import *
+
+# "HuggingFaceM4/idefics2-8b"
+# "HuggingFaceM4/idefics2-8b-AWQ"
+
+class VisionQnA(VisionQnABase):
+    model_name: str = "idefics2"
+
+    def __init__(self, model_id: str, device: str, device_map: str = 'auto', extra_params = {}, format = None):
+        super().__init__(model_id, device, device_map, extra_params, format)
+
+        #do_image_splitting=False
+        #size= {"longest_edge": 448, "shortest_edge": 378}
+        self.processor = AutoProcessor.from_pretrained(model_id)
+
+        if '-awq' in model_id.lower():
+            """
+            # This is from https://huggingface.co/HuggingFaceM4/idefics2-8b
+            # It doesn't work
+            quantization_config = AwqConfig(
+                bits=4,
+                fuse_max_seq_len=4096,
+                modules_to_fuse={
+                    "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
+                    "mlp": ["gate_proj", "up_proj", "down_proj"],
+                    "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
+                    "use_alibi": False,
+                    "num_attention_heads": 32,
+                    "num_key_value_heads": 8,
+                    "hidden_size": 4096,
+                }
+            )
+            self.params['quantization_config'] = quantization_config
+            """
+
+            if self.params['torch_dtype'] == torch.bfloat16:
+                self.params['torch_dtype'] = torch.float16
+
+        self.model = AutoModelForVision2Seq.from_pretrained(**self.params).to(self.device)
+
+        print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")
+
+    async def chat_with_images(self, request: ImageChatRequest) -> str:
+        images, hfmessages = await images_hfmessages_from_messages(request.messages)
+
+        prompt = self.processor.apply_chat_template(hfmessages, add_generation_prompt=True)
+        inputs = self.processor(text=prompt, images=images, return_tensors="pt")
+        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+        # Generate
+        params = self.get_generation_params(request)
+        generated_ids = self.model.generate(**inputs, **params)
+        generated_texts = self.processor.decode(generated_ids[0][inputs['input_ids'].size(1):].cpu(), skip_special_tokens=True)
+
+        return generated_texts
\ No newline at end of file
diff --git a/model_conf_tests.json b/model_conf_tests.json
index 37fc754..74e7ba0 100644
--- a/model_conf_tests.json
+++ b/model_conf_tests.json
@@ -2,7 +2,8 @@
     ["vikhyatk/moondream2", "--use-flash-attn"],
     ["vikhyatk/moondream1"],
     ["OpenGVLab/InternVL-Chat-V1-5", "--device-map", "cuda:0"],
-    ["qnguyen3/nanoLLaVA", "--use-flash-attn"],
+    ["HuggingFaceM4/idefics2-8b", "--use-flash-attn", "--device-map", "cuda:0"],
+    ["qnguyen3/nanoLLaVA", "--use-flash-attn", "--device-map", "cuda:0"],
     ["echo840/Monkey"],
     ["echo840/Monkey-Chat"],
     ["THUDM/cogvlm-chat-hf"],
@@ -26,7 +27,8 @@
     ["OpenGVLab/InternVL-Chat-V1-5-Int8", "--device-map", "cuda:0"],
     ["OpenGVLab/InternVL-Chat-V1-5", "--load-in-4bit", "--device-map", "cuda:0"],
-    ["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit"],
+    ["HuggingFaceM4/idefics2-8b-AWQ", "--use-flash-attn", "--device-map", "cuda:0"],
+    ["qnguyen3/nanoLLaVA", "--use-flash-attn", "--load-in-4bit", "--device-map", "cuda:0"],
     ["THUDM/cogvlm-chat-hf", "--load-in-4bit"],
     ["THUDM/cogagent-chat-hf", "--load-in-4bit"],
     ["internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn"],
diff --git a/requirements.txt b/requirements.txt
index a0d8feb..df290e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,13 @@ accelerate
 auto_gptq
 bitsandbytes
 fastapi
-flash_attn
+# See: https://github.com/bdashore3/flash-attention/releases for other windows flash_attn releases
+# And: https://github.com/Dao-AILab/flash-attention/releases for linux.
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/bdashore3/flash-attention/releases/download/v2.5.8/flash_attn-2.5.8+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+flash_attn; python_version != "3.10" and python_version != "3.11"
 openai
 peft
 protobuf
@@ -10,12 +16,12 @@ pydantic
 python-datauri
 requests
 sentencepiece
-torch>=2.2.0
+torch==2.2.*
 uvicorn
 xformers
 
 # moondream
-deepspeed==0.11.1
+deepspeed<0.14.0
 einops
 einops-exts
 httpx
@@ -36,8 +42,11 @@ transformers_stream_generator
 loguru
 sse_starlette
 
-#latest
+# alt
+#transformers==4.36.2
+
+# latest
 #transformers>=4.39.0
 
+# idefics2
+#autoawq
-#alt
-#transformers==4.36.2
diff --git a/test_models.py b/test_models.py
index 5825861..b1864a4 100755
--- a/test_models.py
+++ b/test_models.py
@@ -20,8 +20,8 @@
     'leaf': 'https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg',
 }
 
-green_pass = '\033[92mpass\033[0m'
-red_fail = '\033[91mfail\033[0m'
+green_pass = '\033[92mpass\033[0m✅'
+red_fail = '\033[91mfail\033[0m❌'
 
 def data_url_from_url(img_url: str) -> str:
 
@@ -48,7 +48,7 @@ def record_result(cmd_args, results, t, mem, note):
         'note': note
     }])
     result = all(results)
-    print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {'pass' if result else 'fail'}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}")
+    print(f"#CLI_COMMAND=\"python vision.py -m {' '.join(cmd_args)}\" # test {green_pass if result else red_fail}, time: {t:.1f}s, mem: {mem:.1f}GB, {note}")
 
 torch_memory_baseline = 0
 
@@ -115,8 +115,8 @@ def test(cmd_args: list[str]) -> int:
 
     mem = get_total_gpu_mem_used()
     result = all(results)
-    if result:
-        note = 'All tests passed.'
+    if not note:
+        note = f'{results.count(True)}/{len(results)} tests passed.'
 
     print(f"\n\n###\n\nTest complete.\nResult: {green_pass if result else red_fail}, time: {t:.1f}s")
 
@@ -233,4 +233,4 @@ def single_round():
 for r in all_results:
     cmdl = ' '.join(r['args'])
     result = all(r['results'])
-    print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {'pass' if result else 'fail'}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}")
+    print(f"#CLI_COMMAND=\"python vision.py -m {cmdl}\" # test {green_pass if result else red_fail}, time: {r['time']:.1f}s, mem: {r['mem']:.1f}GB, {r['note']}")
diff --git a/vision-alt.sample.env b/vision-alt.sample.env
index 0cd6628..fa4aa08 100644
--- a/vision-alt.sample.env
+++ b/vision-alt.sample.env
@@ -2,45 +2,45 @@
 # Copy this file to vision.env and uncomment the model of your choice.
 HF_HOME=hf_home
 #CUDA_VISIBLE_DEVICES=1,0
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass, time: 4.8s, mem: 4.6GB, All tests passed.
-#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass, time: 4.1s, mem: 4.9GB, All tests passed.
-#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass, time: 6.1s, mem: 21.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass, time: 7.7s, mem: 21.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass, time: 13.5s, mem: 52.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass, time: 13.8s, mem: 36.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass, time: 15.4s, mem: 37.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass, time: 4.7s, mem: 19.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass, time: 5.6s, mem: 15.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass, time: 14.8s, mem: 19.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass, time: 22.1s, mem: 27.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass, time: 17.1s, mem: 31.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass, time: 11.2s, mem: 67.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass, time: 77.9s, mem: 70.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass, time: 14.9s, mem: 91.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass, time: 20.0s, mem: 96.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass, time: 15.9s, mem: 25.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass, time: 15.9s, mem: 19.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass, time: 17.6s, mem: 20.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass, time: 7.1s, mem: 11.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass, time: 6.6s, mem: 7.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail, time: 2.1s, mem: 15.9GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 6.2s, mem: 14.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 7.1s, mem: 27.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass, time: 28.2s, mem: 31.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail, time: 18.3s, mem: 18.3GB, -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass, time: 21.5s, mem: 12.2GB, All tests passed. 
-#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass, time: 20.8s, mem: 12.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass, time: 10.4s, mem: 9.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass, time: 12.9s, mem: 11.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail, time: 2.6s, mem: 6.1GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 10.0s, mem: 5.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass, time: 11.3s, mem: 8.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass, time: 12.5s, mem: 6.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass, time: 39.7s, mem: 9.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass, time: 41.2s, mem: 9.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass, time: 17.6s, mem: 13.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass, time: 18.2s, mem: 21.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass, time: 222.7s, mem: 24.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass, time: 22.6s, mem: 26.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass, time: 26.1s, mem: 29.7GB, All tests passed. \ No newline at end of file +#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.6s, mem: 4.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test pass✅, time: 4.3s, mem: 4.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.2s, mem: 21.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 7.8s, mem: 21.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.5s, mem: 52.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.4s, mem: 36.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.1s, mem: 37.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 5.0s, mem: 19.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --use-flash-attn" # test pass✅, time: 5.8s, mem: 15.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --use-flash-attn" # test pass✅, time: 16.0s, mem: 18.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --use-flash-attn" # test pass✅, time: 20.0s, mem: 27.5GB, 8/8 tests passed. 
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --use-flash-attn" # test pass✅, time: 18.4s, mem: 31.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --use-flash-attn" # test pass✅, time: 11.3s, mem: 67.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --use-flash-attn" # test pass✅, time: 96.4s, mem: 70.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --use-flash-attn" # test pass✅, time: 16.0s, mem: 91.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --use-flash-attn" # test pass✅, time: 18.6s, mem: 96.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.9s, mem: 25.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.8s, mem: 19.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 15.8s, mem: 20.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.0s, mem: 11.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.2s, mem: 7.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.1s, mem: 15.7GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.0s, mem: 14.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 26.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.9s, mem: 31.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.6s, mem: 18.0GB, 2/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 20.5s, mem: 12.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 21.2s, mem: 11.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 9.6s, mem: 9.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn --device cuda:0" # test pass✅, time: 12.4s, mem: 10.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.5s, mem: 5.8GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.0s, mem: 5.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.9s, mem: 8.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B --load-in-4bit --use-flash-attn" # test pass✅, time: 9.3s, mem: 6.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 37.7s, mem: 9.7GB, 8/8 tests passed. 
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B --load-in-4bit --use-flash-attn" # test pass✅, time: 37.2s, mem: 9.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-13B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 22.0s, mem: 13.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B --load-in-4bit --use-flash-attn" # test pass✅, time: 18.2s, mem: 21.4GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-34B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 224.7s, mem: 24.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B --load-in-4bit --use-flash-attn" # test pass✅, time: 23.8s, mem: 26.3GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-8x7B-HD --load-in-4bit --use-flash-attn" # test pass✅, time: 25.1s, mem: 29.5GB, 8/8 tests passed. \ No newline at end of file diff --git a/vision.sample.env b/vision.sample.env index cc19dc1..e87cc25 100644 --- a/vision.sample.env +++ b/vision.sample.env @@ -2,44 +2,46 @@ # Copy this file to vision.env and uncomment the model of your choice. HF_HOME=hf_home #CUDA_VISIBLE_DEVICES=1,0 -#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass, time: 7.3s, mem: 4.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test fail, time: 4.2s, mem: 4.9GB, Test failed with Exception: Internal Server Error -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass, time: 18.7s, mem: 52.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn" # test pass, time: 11.4s, mem: 8.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass, time: 10.0s, mem: 21.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass, time: 13.2s, mem: 21.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass, time: 17.0s, mem: 36.1GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass, time: 19.9s, mem: 36.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass, time: 8.2s, mem: 19.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass, time: 18.9s, mem: 24.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass, time: 23.7s, mem: 18.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass, time: 20.7s, mem: 20.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass, time: 9.7s, mem: 11.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass, time: 12.7s, mem: 7.4GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail, time: 6.8s, mem: 15.5GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 8.5s, mem: 14.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass, time: 12.6s, mem: 26.6GB, All tests passed. 
-#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass, time: 16.1s, mem: 19.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass, time: 13.3s, mem: 18.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass, time: 12.4s, mem: 32.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass, time: 52.0s, mem: 72.3GB, All tests passed. -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass, time: 7.9s, mem: 8.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass, time: 27.1s, mem: 31.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail, time: 20.1s, mem: 18.0GB, -#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit" # test pass, time: 14.5s, mem: 7.8GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass, time: 21.3s, mem: 12.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass, time: 24.7s, mem: 12.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass, time: 13.2s, mem: 9.2GB, All tests passed. -#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass, time: 19.2s, mem: 10.7GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail, time: 5.0s, mem: 5.7GB, -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 11.0s, mem: 5.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass, time: 15.8s, mem: 8.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 26.6s, mem: 8.9GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass, time: 15.5s, mem: 9.6GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass, time: 13.3s, mem: 14.5GB, All tests passed. -#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass, time: 54.8s, mem: 26.0GB, All tests passed. -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --load-in-4bit --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --load-in-4bit --use-flash-attn" # test fail, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). -#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass, time: 7.4s, mem: 5.6GB, All tests passed. 
\ No newline at end of file
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream2 --use-flash-attn" # test pass✅, time: 4.1s, mem: 4.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m vikhyatk/moondream1" # test fail❌, time: 3.6s, mem: 4.8GB, Test failed with Exception: Internal Server Error
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --device-map cuda:0" # test pass✅, time: 13.5s, mem: 51.9GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.1s, mem: 22.3GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --device-map cuda:0" # test pass✅, time: 7.9s, mem: 8.0GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey" # test pass✅, time: 6.5s, mem: 21.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m echo840/Monkey-Chat" # test pass✅, time: 8.2s, mem: 21.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf" # test pass✅, time: 14.1s, mem: 36.2GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf" # test pass✅, time: 15.3s, mem: 37.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m Qwen/Qwen-VL-Chat" # test pass✅, time: 5.0s, mem: 19.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m adept/fuyu-8b --device-map cuda:0" # test pass✅, time: 14.4s, mem: 24.8GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.6s, mem: 18.9GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b --use-flash-attn --device-map cuda:0" # test pass✅, time: 16.5s, mem: 20.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V-2 --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.9s, mem: 11.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m openbmb/MiniCPM-V --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 7.5GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --use-flash-attn --device-map cuda:0" # test fail❌, time: 2.0s, mem: 15.6GB, 0/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.1s, mem: 14.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --use-flash-attn --device-map cuda:0" # test pass✅, time: 6.8s, mem: 26.7GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --use-flash-attn" # test pass✅, time: 12.8s, mem: 19.1GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --use-flash-attn" # test pass✅, time: 8.6s, mem: 18.9GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --use-flash-attn" # test pass✅, time: 9.6s, mem: 32.6GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --use-flash-attn" # test pass✅, time: 46.2s, mem: 72.4GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
+#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit).
+#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn" # test pass✅, time: 4.5s, mem: 8.3GB, 8/8 tests passed.
+#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5-Int8 --device-map cuda:0" # test pass✅, time: 27.6s, mem: 32.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m OpenGVLab/InternVL-Chat-V1-5 --load-in-4bit --device-map cuda:0" # test fail❌, time: 17.0s, mem: 18.2GB, 2/8 tests passed. +#CLI_COMMAND="python vision.py -m HuggingFaceM4/idefics2-8b-AWQ --use-flash-attn --device-map cuda:0" # test pass✅, time: 8.2s, mem: 12.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m qnguyen3/nanoLLaVA --use-flash-attn --load-in-4bit --device-map cuda:0" # test pass✅, time: 11.2s, mem: 7.6GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogvlm-chat-hf --load-in-4bit" # test pass✅, time: 19.4s, mem: 12.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m THUDM/cogagent-chat-hf --load-in-4bit" # test pass✅, time: 20.6s, mem: 12.2GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-7b-4bit --use-flash-attn" # test pass✅, time: 11.2s, mem: 9.5GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m internlm/internlm-xcomposer2-vl-7b-4bit --use-flash-attn" # test pass✅, time: 11.3s, mem: 10.9GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/bakLlava-v1-hf --load-in-4bit --use-flash-attn" # test fail❌, time: 2.5s, mem: 5.9GB, 0/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 9.3s, mem: 5.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-1.5-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 10.2s, mem: 9.0GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 19.1s, mem: 9.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-7b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 13.6s, mem: 9.7GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-vicuna-13b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 12.2s, mem: 14.8GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m llava-hf/llava-v1.6-34b-hf --load-in-4bit --use-flash-attn" # test pass✅, time: 54.1s, mem: 26.1GB, 8/8 tests passed. +#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-6B --load-in-4bit --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m 01-ai/Yi-VL-34B --load-in-4bit --use-flash-attn" # test fail❌, time: -1.0s, mem: -1.0GB, Error: Server failed to start (exit). +#CLI_COMMAND="python vision.py -m YanweiLi/MGM-2B --use-flash-attn --load-in-4bit" # test pass✅, time: 7.4s, mem: 5.7GB, 8/8 tests passed. 
\ No newline at end of file
diff --git a/vision_qna.py b/vision_qna.py
index c74e534..f2cfee5 100644
--- a/vision_qna.py
+++ b/vision_qna.py
@@ -7,6 +7,7 @@
 from typing import Optional, List, Literal
 from pydantic import BaseModel
 from transformers import BitsAndBytesConfig
+from transformers.image_utils import load_image
 
 class ImageURL(BaseModel):
     url: str
@@ -116,14 +117,7 @@ def get_generation_params(self, request: ImageChatRequest, default_params = {})
         return params
 
 async def url_to_image(img_url: str) -> Image.Image:
-    if img_url.startswith('http'):
-        response = requests.get(img_url)
-
-        img_data = response.content
-    elif img_url.startswith('data:'):
-        img_data = DataURI(img_url).data
-
-    return Image.open(io.BytesIO(img_data)).convert("RGB")
+    return load_image(img_url)
 
 async def url_to_file(img_url: str) -> str:
     if img_url.startswith('data:'):
@@ -139,6 +133,25 @@ async def url_to_file(img_url: str) -> str:
         f.write(response.content)
         return filename
 
+async def images_hfmessages_from_messages(messages: list[Message], url_handler = url_to_image):
+    hfmessages = []
+    images = []
+
+    for m in messages:
+        content = []
+        for c in m.content:
+            if c.type == 'image_url':
+                image = await url_handler(c.image_url.url)
+                images.extend([image])
+                content.extend([{"type": "image"}])
+            elif c.type == 'text':
+                content.extend([{'type': 'text', 'text': c.text}])
+
+        hfmessages.extend([{'role': m.role, 'content': content}])
+
+    return images, hfmessages
+
+
 async def phi15_prompt_from_messages(messages: list[Message], img_tok = "", img_end = ''): # prompt = ''
     images = []
@@ -464,4 +477,8 @@ def guess_backend(model_name: str) -> str:
         return 'fuyu'
 
     if 'internvl-chat-v1-5' in model_id:
-        return 'internvl-chat-v1-5'
\ No newline at end of file
+        return 'internvl-chat-v1-5'
+
+    if 'idefics2' in model_id:
+        return 'idefics2'
+    
\ No newline at end of file
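
Usage example (not part of the patch above): with the server started as in model_conf_tests.json, e.g. `python vision.py -m HuggingFaceM4/idefics2-8b --use-flash-attn --device-map cuda:0`, the new idefics2 backend is reached through the same OpenAI-compatible chat API as the other models. Below is a minimal client-side sketch using the `openai` Python package; the base URL, port, and API key are assumptions rather than values taken from this diff, so adjust them to the actual deployment. The image URL is the `leaf` test image from test_models.py.

```python
# Minimal sketch: query the idefics2 backend through the OpenAI-compatible API.
# Assumptions (not from this diff): the server listens on http://localhost:5006/v1
# and does not check the API key; change both to match your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="skip")

response = client.chat.completions.create(
    model="HuggingFaceM4/idefics2-8b",  # the model served by vision.py; the server may ignore this field
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://images.freeimages.com/images/large-previews/cd7/gingko-biloba-1058537.jpg"}},
        ],
    }],
    max_tokens=256,
)

print(response.choices[0].message.content)
```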