From 6a24f4f8492f4ec34ef7159270b503fba07be105 Mon Sep 17 00:00:00 2001
From: matatonic
Date: Sat, 6 Apr 2024 14:17:01 -0400
Subject: [PATCH] 0.6.1 +gptq 4bit for internlm

---
 README.md                |  4 +++-
 backend/xcomposer2-vl.py | 27 +++++++++++++++++++++++++--
 backend/xcomposer2.py    | 30 +++++++++++++++++++++++++++---
 chat_with_image.py       | 10 ++++------
 docker-compose.yml       |  2 ++
 requirements.txt         |  3 ++-
 vision_qna.py            |  2 +-
 7 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index b97e7cd..1e685ce 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Model support:
 - [ ] [openbmb/OmniLMM-12B](https://huggingface.co/openbmb/OmniLMM-12B)
 - [ ] [echo840/Monkey](https://huggingface.co/echo840/Monkey)
 - [ ] [YanweiLi/MiniGemini](https://huggingface.co/collections/YanweiLi/)
+- [ ] [NousResearch/Obsidian-3B-V0.5](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [ ] ...
 
 
@@ -27,9 +28,10 @@ Some vision systems include their own OpenAI compatible API server. Also include
 - [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM) ([cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf), [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)), `docker-compose.cogvlm.yml` **Recommended for 16GB-40GB GPU**s
 - [X] [01-ai](https://huggingface.co/01-ai)/Yi-VL ([Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B), [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)), `docker-compose.yi-vl.yml`
 
-Version: 0.6.0
+Version: 0.6.1
 
 Recent updates:
+- AutoGPTQ support for internlm/internlm-xcomposer2-7b-4bit, internlm/internlm-xcomposer2-vl-7b-4bit
 - Automatic selection of backend, based on the model name
 - Enable trust_remote_code by default
 - Improved parameter support: temperature, top_p, max_tokens, system prompts
diff --git a/backend/xcomposer2-vl.py b/backend/xcomposer2-vl.py
index abf5324..78c45be 100644
--- a/backend/xcomposer2-vl.py
+++ b/backend/xcomposer2-vl.py
@@ -1,9 +1,23 @@
 import os
 from transformers import AutoTokenizer, AutoModel
-
 from vision_qna import *
+import auto_gptq
+import torch
 
 # internlm/internlm-xcomposer2-vl-7b
+# internlm/internlm-xcomposer2-vl-7b-4bit
+
+class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM):
+    layers_block_name = "model.layers"
+    outside_layer_modules = [
+        'vit', 'vision_proj', 'model.tok_embeddings', 'model.norm', 'output',
+    ]
+    inside_layer_modules = [
+        ["attention.wqkv.linear"],
+        ["attention.wo.linear"],
+        ["feed_forward.w1.linear", "feed_forward.w3.linear"],
+        ["feed_forward.w2.linear"],
+    ]
 
 class VisionQnA(VisionQnABase):
     model_name: str = "xcomposer2-vl"
@@ -12,7 +26,16 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
         super().__init__(model_id, device, extra_params, format)
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
-        self.model = AutoModel.from_pretrained(**self.params).eval()
+
+        if '-4bit' in model_id:
+            if self.params['torch_dtype'] == torch.bfloat16:
+                self.params['torch_dtype'] = torch.float16
+
+            torch.set_grad_enabled(False)
+            auto_gptq.modeling._base.SUPPORTED_MODELS = ["internlm"]
+            self.model = InternLMXComposer2QForCausalLM.from_quantized(model_name_or_path=model_id, **self.params).eval()
+        else:
+            self.model = AutoModel.from_pretrained(**self.params).eval()
 
         print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")
 
diff --git a/backend/xcomposer2.py b/backend/xcomposer2.py
index 788bc28..ea2ab5e 100644
--- a/backend/xcomposer2.py
+++ b/backend/xcomposer2.py
@@ -1,7 +1,23 @@
 import os
 from transformers import AutoTokenizer, AutoModel
-
 from vision_qna import *
+import auto_gptq
+import torch
+
+# internlm/internlm-xcomposer2-7b
+# internlm/internlm-xcomposer2-7b-4bit
+
+class InternLMXComposer2QForCausalLM(auto_gptq.modeling.BaseGPTQForCausalLM):
+    layers_block_name = "model.layers"
+    outside_layer_modules = [
+        'vit', 'vision_proj', 'model.tok_embeddings', 'model.norm', 'output',
+    ]
+    inside_layer_modules = [
+        ["attention.wqkv.linear"],
+        ["attention.wo.linear"],
+        ["feed_forward.w1.linear", "feed_forward.w3.linear"],
+        ["feed_forward.w2.linear"],
+    ]
 
 # internlm/internlm-xcomposer2-7b
 
@@ -12,8 +28,16 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
         super().__init__(model_id, device, extra_params, format)
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
-        self.model = AutoModel.from_pretrained(**self.params).eval()
-
+        if '-4bit' in model_id:
+            if self.params['torch_dtype'] == torch.bfloat16:
+                self.params['torch_dtype'] = torch.float16
+
+            torch.set_grad_enabled(False)
+            auto_gptq.modeling._base.SUPPORTED_MODELS = ["internlm"]
+            self.model = InternLMXComposer2QForCausalLM.from_quantized(model_name_or_path=model_id, **self.params).eval()
+        else:
+            self.model = AutoModel.from_pretrained(**self.params).eval()
+
         print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")
 
     async def chat_with_images(self, request: ImageChatRequest) -> str:
diff --git a/chat_with_image.py b/chat_with_image.py
index b5f0230..545a48a 100755
--- a/chat_with_image.py
+++ b/chat_with_image.py
@@ -48,10 +48,8 @@ def url_to_data_url(img_url: str) -> str:
         image_url = str(DataURI.from_file(image_url))
 
     messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else []
-    content = [
-        { "type": "image_url", "image_url": { "url": image_url } },
-        { "type": "text", "text": ' '.join(args.questions) },
-    ]
+    content = [{ "type": "image_url", "image_url": { "url": image_url } },
+               { "type": "text", "text": ' '.join(args.questions) }]
     messages.extend([{ "role": "user", "content": content }])
 
     while True:
@@ -70,8 +68,8 @@ def url_to_data_url(img_url: str) -> str:
             break
 
         content = [{"type": "image_url", "image_url": { "url": image_url } }] if image_url else []
-        content.extend([{ 'type': 'text', 'text': response.choices[0].message.content } ])
+        content.extend([{ 'type': 'text', 'text': response.choices[0].message.content }])
 
         messages.extend([{ "role": "assistant", "content": content },
-                         { "role": "user", "content": [ { 'type': 'text', 'text': q } ] } ])
+                         { "role": "user", "content": [{ 'type': 'text', 'text': q }] }])
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 6e11d6c..e686d59 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,7 +13,9 @@ services:
     ports:
       - 5006:5006
     #command: ["python", "vision.py", "-m", "internlm/internlm-xcomposer2-7b", "--use-flash-attn"]
+    #command: ["python", "vision.py", "-m", "internlm/internlm-xcomposer2-7b-4bit", "--use-flash-attn"]
     #command: ["python", "vision.py", "-m", "internlm/internlm-xcomposer2-vl-7b", "--use-flash-attn"]
+    #command: ["python", "vision.py", "-m", "internlm/internlm-xcomposer2-vl-7b-4bit", "--use-flash-attn"]
    #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "llava-hf/llava-v1.6-34b-hf", "--load-in-4bit", "--use-flash-attn"] # WIP
    #command: ["python", "vision.py", "-m", "echo840/Monkey-Chat"] # broken
    #command: ["python", "vision.py", "-m", "openbmb/OmniLMM-12B"] # WIP
diff --git a/requirements.txt b/requirements.txt
index a933ac0..385dd6d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,5 @@ bitsandbytes
 flash_attn
 sentencepiece
 protobuf
-peft
\ No newline at end of file
+peft
+auto_gptq
\ No newline at end of file
diff --git a/vision_qna.py b/vision_qna.py
index 8d2bffc..a2fdac0 100644
--- a/vision_qna.py
+++ b/vision_qna.py
@@ -47,7 +47,7 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
             'quantization_config': {
                 'load_in_4bit': True,
                 'bnb_4bit_quant_type': "nf4",
-                'bnb_4bit_use_double_quant': True,
+                'bnb_4bit_use_double_quant': True, # XXX make this an option
                 'bnb_4bit_compute_dtype': self.dtype,
             }
         }