diff --git a/README.md b/README.md index 0332cbc..b97e7cd 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,12 @@ An OpenAI API compatible vision server, it functions like `gpt-4-vision-preview` - Does not connect to the OpenAI API and does not require an OpenAI API Key - Not affiliated with OpenAI in any way -Backend Model support: +Model support: +- [X] [InternLM-XComposer2](https://huggingface.co/internlm/internlm-xcomposer2-7b) [finetune] (multi-image chat model, lots of warnings on startup, but works fine) +- [X] [InternLM-XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] *(only supports a single image, also lots of warnings) - [X] [LlavaNext](https://huggingface.co/llava-hf) - (llava-v1.6-mistral-7b-hf, llava-v1.6-34b-hf - llava-v1.6-34b-hf is not working well yet) *(only supports a single image) - [X] [Llava](https://huggingface.co/llava-hf) - (llava-v1.5-vicuna-7b-hf, llava-v1.5-vicuna-13b-hf, llava-v1.5-bakLlava-7b-hf) *(only supports a single image) - [X] [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat) -- [X] [InternLM-XComposer2](https://huggingface.co/internlm/internlm-xcomposer2-7b) [finetune] (multi-image chat model, you may need to add "in English" to the first prompt.) -- [X] [InternLM-XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] *(only supports a single image) - [X] Moondream2 - [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image) - [ ] Moondream1 - [vikhyatk/moondream1](https://huggingface.co/vikhyatk/moondream1) - [ ] Deepseek-VL - [deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat) @@ -27,20 +27,15 @@ Some vision systems include their own OpenAI compatible API server. Also include - [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM) ([cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf), [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)), `docker-compose.cogvlm.yml` **Recommended for 16GB-40GB GPU**s - [X] [01-ai](https://huggingface.co/01-ai)/Yi-VL ([Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B), [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)), `docker-compose.yi-vl.yml` -Version: 0.5.0 +Version: 0.6.0 Recent updates: -- new backend: XComposer2 (multi-image finetuned chat model) -- new backend: XComposer2-VL (single image pretrained model) -- new backend: MiniCPM-V aka. OmniLMM-3B -- Yi-VL and CogVLM (docker containers only) -- new backend: Qwen-VL -- new backend: llava (1.5) -- new backend: llavanext (1.6+) -- multi-turn questions & answers -- chat_with_images.py test tool and code sample -- selectable chat formats -- flash attention 2, accelerate (device split), bitsandbytes (4bit, 8bit) support +- Automatic selection of backend, based on the model name +- Enable trust_remote_code by default +- Improved parameter support: temperature, top_p, max_tokens, system prompts +- Improved default generation parameters and sampler settings +- Improved system prompt for InternLM-XComposer2 & InternLM-XComposer2-VL: fewer refusals, and "in English" should rarely be needed in the first prompt while still supporting Chinese.
+- Fix: chat_with_image.py url filename bug See: [OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard) @@ -54,6 +49,8 @@ API Documentation Installation instructions ------------------------- +(**Docker Recommended**) + ```shell # install the python dependencies pip install -r requirements.txt @@ -61,29 +58,29 @@ pip install -r requirements.txt pip install -r requirements.moondream.txt -r requirements.qwen-vl.txt # install the package pip install . -# run the server -python vision.py +# run the server with your chosen model +python vision.py --model vikhyatk/moondream2 ``` Usage ----- ``` -usage: vision.py [-h] [-m MODEL] [-b BACKEND] [-f FORMAT] [-d DEVICE] [--no-trust-remote-code] [-4] [-8] [-F] [-P PORT] [-H HOST] [--preload] +usage: vision.py [-h] -m MODEL [-b BACKEND] [-f FORMAT] [-d DEVICE] [--no-trust-remote-code] [-4] [-8] [-F] [-P PORT] [-H HOST] [--preload] OpenedAI Vision API Server options: -h, --help show this help message and exit -m MODEL, --model MODEL - The model to use, Ex. llava-hf/llava-v1.6-mistral-7b-hf (default: vikhyatk/moondream2) + The model to use, Ex. llava-hf/llava-v1.6-mistral-7b-hf (default: None) -b BACKEND, --backend BACKEND Force the backend to use (moondream1, moondream2, llavanext, llava, qwen-vl) (default: None) -f FORMAT, --format FORMAT Force a specific chat format. (vicuna, mistral, chatml, llama2, phi15, gemma) (doesn't work with all models) (default: None) -d DEVICE, --device DEVICE Set the torch device for the model. Ex. cuda:1 (default: auto) --no-trust-remote-code Don't trust remote code (required for some models) (default: False) -4, --load-in-4bit load in 4bit (doesn't work with all models) (default: False) -8, --load-in-8bit load in 8bit (doesn't work with all models) (default: False) @@ -96,9 +93,15 @@ options: Docker support -------------- -You can run the server via docker like so: +1) Edit the docker-compose file to suit your needs. + +2) You can run the server via docker like so: ```shell docker compose up +# for CogVLM +docker compose -f docker-compose.cogvlm.yml up +# for Yi-VL +docker compose -f docker-compose.yi-vl.yml up ``` Sample API Usage @@ -109,11 +112,23 @@ Sample API Usage Example: ``` $ python chat_with_image.py https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg -Answer: This is a beautiful image of a wooden path leading through a lush green field. The path appears to be well-trodden, suggesting it's a popular route for walking or hiking. The sky is a clear blue with some scattered clouds, indicating a pleasant day with good weather. The field is vibrant and seems to be well-maintained, which could suggest it's part of a park or nature reserve. The overall scene is serene and inviting, perfect for a peaceful walk in nature. +Answer: The image captures a serene landscape of a grassy field, where a wooden walkway cuts through the center. The path is flanked by tall, lush green grass on either side, leading the eye towards the horizon. A few trees and bushes are scattered in the distance, adding depth to the scene. Above, the sky is a clear blue, dotted with white clouds that add to the tranquil atmosphere. + Question: Are there any animals in the picture? -Answer: No, there are no animals visible in the picture. The focus is on the path and the surrounding natural landscape. +Answer: No, there are no animals visible in the picture.
-Question: +Question: ^D +$ +``` + +Known Bugs & Workarounds +------------------------ + +1. Related to the cuda device split. If you get: +``` +RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument tensors in method wrapper_CUDA_cat) ``` +Try to specify a single cuda device with `CUDA_VISIBLE_DEVICES=1` (or the # of your GPU) before running the script, or set the device via `--device` on the command line. +2. 4bit/8bit and flash attention 2 don't work for all the models. No workaround. diff --git a/backend/deepseek-vl.py b/backend/deepseek-vl.py index d596498..dd02977 100644 --- a/backend/deepseek-vl.py +++ b/backend/deepseek-vl.py @@ -19,7 +19,7 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: # XXX WIP conversation = [ { diff --git a/backend/generic.py b/backend/generic.py index 57d66b7..6d0ad0b 100644 --- a/backend/generic.py +++ b/backend/generic.py @@ -16,12 +16,15 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: - images, prompt = await prompt_from_messages(messages, self.format) + async def chat_with_images(self, request: ImageChatRequest) -> str: + images, prompt = await prompt_from_messages(request.messages, self.format) encoded_images = self.model.encode_image(images) inputs = self.tokenizer(prompt, encoded_images, return_tensors="pt") - output = self.model.generate(**inputs, max_new_tokens=max_tokens) + + params = self.get_generation_params(request) + + output = self.model.generate(**inputs, **params) response = self.tokenizer.decode(output[0], skip_special_tokens=True) return answer_from_response(response, self.format) diff --git a/backend/llava.py b/backend/llava.py index 8815fbe..1cbdda8 100644 --- a/backend/llava.py +++ b/backend/llava.py @@ -21,12 +21,14 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: - images, prompt = await prompt_from_messages(messages, self.format) + images, prompt = await prompt_from_messages(request.messages, self.format) inputs = self.processor(prompt, images, return_tensors="pt").to(self.device) - output = self.model.generate(**inputs, max_new_tokens=max_tokens) + params = self.get_generation_params(request) + + output = self.model.generate(**inputs, **params) response = self.processor.decode(output[0], skip_special_tokens=True) return answer_from_response(response, self.format) diff --git a/backend/llavanext.py b/backend/llavanext.py index 1e274cb..a1d4b49 100644 --- a/backend/llavanext.py +++ b/backend/llavanext.py @@ -21,12 +21,14 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: - - images, prompt = await prompt_from_messages(messages,
self.format) + async def chat_with_images(self, request: ImageChatRequest) -> str: + + images, prompt = await prompt_from_messages(request.messages, self.format) inputs = self.processor(prompt, images, return_tensors="pt").to(self.model.device) - output = self.model.generate(**inputs, max_new_tokens=max_tokens) + params = self.get_generation_params(request) + + output = self.model.generate(**inputs, **params) response = self.processor.decode(output[0], skip_special_tokens=True) return answer_from_response(response, self.format) diff --git a/backend/minigemini.py b/backend/minigemini.py index db333b6..c64a34f 100644 --- a/backend/minigemini.py +++ b/backend/minigemini.py @@ -22,8 +22,8 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: - images, prompt = await prompt_from_messages(messages, self.format) + async def chat_with_images(self, request: ImageChatRequest) -> str: + images, prompt = await prompt_from_messages(request.messages, self.format) #encoded_images = self.model.encode_image(images).to(self.device) # square? @@ -32,18 +32,19 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device) + params = self.get_generation_params(request) + with torch.inference_mode(): output_ids = self.model.generate( input_ids, images=image_tensor, images_aux=None, - do_sample=False, - temperature=0.0, - max_new_tokens=max_tokens, bos_token_id=self.tokenizer.bos_token_id, # Begin of sequence token eos_token_id=self.tokenizer.eos_token_id, # End of sequence token pad_token_id=self.tokenizer.pad_token_id, # Pad token - use_cache=True) + use_cache=True, + **params, + ) answer = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() diff --git a/backend/monkey.py b/backend/monkey.py index 192a355..c800ebf 100644 --- a/backend/monkey.py +++ b/backend/monkey.py @@ -14,19 +14,19 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) super().__init__(model_id, device, extra_params, format) # XXX currently bugged https://huggingface.co/echo840/Monkey/discussions/4 - self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval() + self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False)) self.tokenizer.padding_side = 'left' self.tokenizer.pad_token_id = self.tokenizer.eod_id print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: files = [] prompt = '' - for m in messages: + for m in request.messages: if m.role == 'user': p = '' for c in m.content: @@ -48,12 +48,12 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st attention_mask = input_ids.attention_mask.to(self.model.device) input_ids = input_ids.input_ids.to(self.model.device) + params = self.get_generation_params(request) + pred = self.model.generate( input_ids=input_ids, attention_mask=attention_mask, - do_sample=False, num_beams=1, - max_new_tokens=512, 
min_new_tokens=1, length_penalty=1, num_return_sequences=1, @@ -61,6 +61,7 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st use_cache=True, pad_token_id=self.tokenizer.eod_id, eos_token_id=self.tokenizer.eod_id, + **params, ) response = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip() diff --git a/backend/moondream1.py b/backend/moondream1.py index 966a3ca..b9826b9 100644 --- a/backend/moondream1.py +++ b/backend/moondream1.py @@ -22,10 +22,12 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: - images, prompt = await prompt_from_messages(messages, self.format) + async def chat_with_images(self, request: ImageChatRequest) -> str: + images, prompt = await prompt_from_messages(request.messages, self.format) encoded_images = self.model.encode_image(images[0]).to(self.model.device) + params = self.get_generation_params(request) + # XXX currently broken here... """ File "hf_home/modules/transformers_modules/vikhyatk/moondream1/f6e9da68e8f1b78b8f3ee10905d56826db7a5802/modeling_phi.py", line 318, in forward @@ -37,7 +39,7 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st prompt, eos_text="", tokenizer=self.tokenizer, - max_new_tokens=max_tokens, + **params, )[0] answer = re.sub("<$| str: - images, prompt = await phi15_prompt_from_messages(messages) + async def chat_with_images(self, request: ImageChatRequest) -> str: + images, prompt = await prompt_from_messages(request.messages, format=self.format) encoded_images = self.model.encode_image(images).to(self.device) + params = self.get_generation_params(request) + answer = self.model.generate( encoded_images, prompt, eos_text="", tokenizer=self.tokenizer, - max_new_tokens=max_tokens, - #**kwargs, + **params, )[0] answer = re.sub("<$| str: + async def chat_with_images(self, request: ImageChatRequest) -> str: # 3B image = None msgs = [] - for m in messages: + for m in request.messages: if m.role == 'user': for c in m.content: if c.type == 'image_url': @@ -32,12 +32,14 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st if c.type == 'text': msgs.extend([{ 'role': 'assistant', 'content': c.text }]) + params = self.get_generation_params(request) + answer, context, _ = self.model.chat( image=image, msgs=msgs, context=None, tokenizer=self.tokenizer, - max_new_tokens=max_tokens + **params, ) return answer diff --git a/backend/omnilmm3b.py b/backend/omnilmm3b.py index 347d38c..05dcf9a 100644 --- a/backend/omnilmm3b.py +++ b/backend/omnilmm3b.py @@ -17,12 +17,19 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: # 3B image = None msgs = [] + #system_prompt = '' + default_sampling_params = { + 'do_sample': True, + 'top_p': 0.8, + 'top_k': 100, + 'temperature': 0.6, + } - for m in messages: + for m in request.messages: if m.role == 'user': for c in m.content: if c.type == 'image_url': @@ -33,13 +40,23 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st for c in m.content: if c.type == 'text': msgs.extend([{ 'role': 'assistant', 
'content': c.text }]) + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + msgs.extend([{ 'role': 'user', 'content': c.text }, { 'role': 'assistant', 'content': "OK" }]) # fake system prompt + + # default uses num_beams: 3, but if sampling is requested, switch the defaults. + params = self.get_generation_params(request) + if params.get('do_sample', False): + params = self.get_generation_params(request, default_sampling_params) answer, context, _ = self.model.chat( image=image, msgs=msgs, context=None, tokenizer=self.tokenizer, - max_new_tokens=max_tokens + sampling=params.get('do_sample', False), + **params, ) - return answer \ No newline at end of file + return answer diff --git a/backend/qwen-vl.py b/backend/qwen-vl.py index b13f55f..e707413 100644 --- a/backend/qwen-vl.py +++ b/backend/qwen-vl.py @@ -20,14 +20,15 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: history = [] files = [] prompt = '' image_url = None + system_prompt = "You are an helpful assistant." - for m in messages: + for m in request.messages: if m.role == 'user': for c in m.content: if c.type == 'image_url': @@ -43,12 +44,15 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st image_url = c.image_url.url if c.type == 'text': prompt = c.text - elif m.role == 'assistant': for c in m.content: if c.type == 'text': history.extend([(prompt, c.text)]) prompt = '' + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + system_prompt = c.text # 1st dialogue turn query = self.tokenizer.from_list_format([ @@ -56,7 +60,9 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st {'text': prompt}, ]) - answer, history = self.model.chat(self.tokenizer, query=query, history=history) + params = self.get_generation_params(request) + + answer, history = self.model.chat(self.tokenizer, query=query, history=history, system=system_prompt, **params) for f in files: os.remove(f) diff --git a/backend/xcomposer2-vl.py b/backend/xcomposer2-vl.py index 665a303..abf5324 100644 --- a/backend/xcomposer2-vl.py +++ b/backend/xcomposer2-vl.py @@ -16,12 +16,23 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: history = [] files = [] prompt = '' + #system_prompt = 'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n' + #'- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n' + #'- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.\n' + #'- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image.' + # Improved system prompt for more reliable language detection. + system_prompt = "You are an AI vision assistant. Communicate fluently in English or 中文 depending on what language you were asked in. Obey user instructions. 
仅当用普通话询问时才用普通话回答。 Answer in English if questioned in English." + default_params = { + "temperature": 1.0, + "top_p": 0.8, + 'do_sample': True, + } - for m in messages: + for m in request.messages: if m.role == 'user': p = '' for c in m.content: @@ -38,11 +49,17 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st if c.type == 'text': history.extend([(prompt, c.text)]) prompt = '' - + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + system_prompt = c.text image = files[-1] + + params = self.get_generation_params(request, default_params) + with torch.cuda.amp.autocast(): - response, _ = self.model.chat(self.tokenizer, query=prompt, image=image, history=history, do_sample=False, max_new_tokens=max_tokens) + response, _ = self.model.chat(self.tokenizer, query=prompt, image=image, history=history, meta_instruction=system_prompt, **params) for f in files: os.remove(f) diff --git a/backend/xcomposer2.py b/backend/xcomposer2.py index 034b39b..788bc28 100644 --- a/backend/xcomposer2.py +++ b/backend/xcomposer2.py @@ -16,12 +16,22 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}") - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: history = [] images = [] prompt = '' + #system_prompt = 'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n' + #'- InternLM-XComposer (浦语·灵笔) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n' + #'- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.' + # This only works if the input is in English, Chinese input still receives Chinese output. + system_prompt = "You are an AI visual assistant. Communicate in English. Do what the user instructs." 
+ default_params = { + "temperature": 1.0, + "top_p": 0.8, + 'do_sample': True, + } - for m in messages: + for m in request.messages: if m.role == 'user': p = '' for c in m.content: @@ -39,11 +49,16 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st if c.type == 'text': history.extend([(prompt, c.text)]) prompt = '' + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + system_prompt = c.text + + params = self.get_generation_params(request, default_params) - print (prompt) - print (history) image = torch.stack(images) with torch.cuda.amp.autocast(): - response, _ = self.model.chat(self.tokenizer, query=prompt, image=image, history=history, do_sample=False, max_new_tokens=max_tokens) + response, _ = self.model.chat(self.tokenizer, query=prompt, image=image, history=history, meta_instruction=system_prompt, **params) + return response diff --git a/chat_with_image.py b/chat_with_image.py index adafbcf..b5f0230 100755 --- a/chat_with_image.py +++ b/chat_with_image.py @@ -5,62 +5,73 @@ from datauri import DataURI from openai import OpenAI -# Initialize argparse -parser = argparse.ArgumentParser(description='Test vision using OpenAI') -parser.add_argument('image_url', type=str, help='URL or image file to be tested') -parser.add_argument('questions', type=str, nargs='*', help='The question to ask the image') -args = parser.parse_args() -client = OpenAI(base_url='http://localhost:5006/v1', api_key='skip') - -image_url = args.image_url +def url_to_data_url(img_url: str) -> str: + if img_url.startswith('http'): + response = requests.get(img_url) + + img_data = response.content + elif img_url.startswith('data:'): + return img_url + elif img_url.startswith('file:'): + img_url = img_url.replace('file:', '') + return str(DataURI.from_file(img_url)) + else: + raise ValueError(f'Unsupported image URL: {img_url}') + return str(DataURI(io.BytesIO(img_data))) +if __name__ == '__main__': + # Initialize argparse + parser = argparse.ArgumentParser(description='Test vision using OpenAI') + parser.add_argument('-s', '--system-prompt', type=str, default=None) + parser.add_argument('-m', '--max-tokens', type=int, default=None) + parser.add_argument('-t', '--temperature', type=float, default=None) + parser.add_argument('-p', '--top_p', type=float, default=None) + parser.add_argument('image_url', type=str, help='URL or image file to be tested') + parser.add_argument('questions', type=str, nargs='*', help='The question to ask the image') + args = parser.parse_args() -def url_to_data_url(img_url: str) -> str: - if img_url.startswith('http'): - response = requests.get(img_url) - - img_data = response.content - elif img_url.startswith('data:'): - return img_url - elif img_url.startswith('file:'): - img_url = img_url.replace('file:', '') - return str(DataURI.from_file(img_url)) - else: - raise ValueError(f'Unsupported image URL: {img_url}') + client = OpenAI(base_url='http://localhost:5006/v1', api_key='skip') - return str(DataURI(io.BytesIO(img_data))) + params = {} + if args.max_tokens is not None: + params['max_tokens'] = args.max_tokens + if args.temperature is not None: + params['temperature'] = args.temperature + if args.top_p is not None: + params['top_p'] = args.top_p + image_url = args.image_url -if not image_url.startswith('http'): - image_url = str(DataURI.from_file(image_url)) + if not image_url.startswith('http'): + image_url = str(DataURI.from_file(image_url)) -messages = [ { "role": "user", "content": [ - { "type": "text", "text": ' '.join(args.questions) 
}, - {"type": "image_url", "image_url": { "url": image_url } } - ]}] + messages = [{ "role": "system", "content": [{ 'type': 'text', 'text': args.system_prompt }] }] if args.system_prompt else [] + content = [ + { "type": "image_url", "image_url": { "url": image_url } }, + { "type": "text", "text": ' '.join(args.questions) }, + ] + messages.extend([{ "role": "user", "content": content }]) -while True: - response = client.chat.completions.create(model="gpt-4-vision-preview", messages=messages, max_tokens=512,) - print(f"Answer: {response.choices[0].message.content}\n") - - image_url = None - try: - q = input("Question: ") - if q.startswith('http') or q.startswith('data:') or q.startswith('file:'): - image_url = url_to_data_url(q) - q = input("Question: ") - except EOFError as e: - break - - messages.extend([{ "role": "assistant", "content": [ { 'type': 'text', 'text': response.choices[0].message.content } ] }, - { "role": "user", "content": [ { 'type': 'text', 'text': q } ] } - ]) + while True: + response = client.chat.completions.create(model="gpt-4-vision-preview", messages=messages, **params) - if image_url: - # prepend the new image to the user message - messages[-1]['content'] = [ {"type": "image_url", "image_url": { "url": image_url } }, - messages[-1]['content'][0] ] + print(f"Answer: {response.choices[0].message.content}\n") + + image_url = None + try: + q = input("Question: ") + if q.startswith('http') or q.startswith('data:') or q.startswith('file:'): + image_url = url_to_data_url(q) + q = input("Question: ") + except EOFError as e: + print('') + break + + content = [{"type": "image_url", "image_url": { "url": image_url } }] if image_url else [] + content.extend([{ 'type': 'text', 'text': response.choices[0].message.content } ]) + messages.extend([{ "role": "assistant", "content": content }, + { "role": "user", "content": [ { 'type': 'text', 'text': q } ] } ]) diff --git a/docker-compose.yml b/docker-compose.yml index 6102b44..6e11d6c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,21 +7,25 @@ services: image: ghcr.io/matatonic/openedai-vision environment: - HF_HOME=/app/hf_home - #- CUDA_VISIBLE_DEVICES=1,0 + #- CUDA_VISIBLE_DEVICES=1 volumes: - ./hf_home:/app/hf_home ports: - 5006:5006 - #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "Qwen/Qwen-VL-Chat"] - #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "llava-hf/llava-1.5-7b-hf"`, "--load-in-4bit", "--use-flash-attn"] - #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "llava-hf/llava-v1.6-mistral-7b-hf", "--load-in-4bit", "--use-flash-attn"] - #command: ["python", "vision.py", "--host", "0.0.0.0", "--p", "5006", "-m", "vikhyatk/moondream2", "--use-flash-attn"] #command: ["python", "vision.py", "-m", "internlm/internlm-xcomposer2-7b", "--use-flash-attn"] #command: ["python", "vision.py", "-m", "internlm/internlm-xcomposer2-vl-7b", "--use-flash-attn"] - #command: ["python", "vision.py", "-m", "openbmb/MiniCPM-V", "--use-flash-attn"] + #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "llava-hf/llava-v1.6-34b-hf", "--load-in-4bit", "--use-flash-attn"] # WIP + #command: ["python", "vision.py", "-m", "echo840/Monkey-Chat"] # broken #command: ["python", "vision.py", "-m", "openbmb/OmniLMM-12B"] # WIP + #command: ["python", "vision.py", "-m", "deepseek-ai/deepseek-vl-7b-chat"] # WIP + #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", 
"llava-hf/llava-v1.6-mistral-7b-hf", "--load-in-4bit", "--use-flash-attn"] + #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "llava-hf/llava-v1.6-vicuna-13b-hf", "--load-in-4bit", "--use-flash-attn"] + #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "Qwen/Qwen-VL-Chat"] + #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "llava-hf/llava-1.5-13b-hf"`, "--load-in-4bit", "--use-flash-attn"] #command: ["python", "vision.py", "-m", "echo840/Monkey"] # broken - #command: ["python", "vision.py", "-m", "deepseek-ai/deepseek-vl-7b-chat"] # broken + #command: ["python", "vision.py", "-m", "openbmb/MiniCPM-V", "--use-flash-attn"] + #command: ["python", "vision.py", "--host", "0.0.0.0", "--port", "5006", "--model", "llava-hf/llava-1.5-7b-hf"`, "--load-in-4bit", "--use-flash-attn"] + command: ["python", "vision.py", "-H", "0.0.0.0", "-P", "5006", "-m", "vikhyatk/moondream2", "--use-flash-attn"] #command: ["python", "vision.py", "-m", "vikhyatk/moondream1"] # broken runtime: nvidia deploy: diff --git a/requirements.txt b/requirements.txt index 9c5c3de..a933ac0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,6 @@ torch==2.2.* accelerate bitsandbytes flash_attn -sentencepiece \ No newline at end of file +sentencepiece +protobuf +peft \ No newline at end of file diff --git a/vision.py b/vision.py index e9fb956..6bbb511 100644 --- a/vision.py +++ b/vision.py @@ -15,7 +15,7 @@ @app.post(path="/v1/chat/completions") async def vision_chat_completions(request: ImageChatRequest): - text = await vision_qna.chat_with_images(request.messages, max_tokens=request.max_tokens) + text = await vision_qna.chat_with_images(request) choices = [ { "index": 0, @@ -49,7 +49,7 @@ def parse_args(argv=None): description='OpenedAI Vision API Server', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('-m', '--model', action='store', default="vikhyatk/moondream2", help="The model to use, Ex. llava-hf/llava-v1.6-mistral-7b-hf") + parser.add_argument('-m', '--model', action='store', default=None, help="The model to use, Ex. llava-hf/llava-v1.6-mistral-7b-hf", required=True) parser.add_argument('-b', '--backend', action='store', default=None, help="Force the backend to use (moondream1, moondream2, llavanext, llava, qwen-vl)") parser.add_argument('-f', '--format', action='store', default=None, help="Force a specific chat format. (vicuna, mistral, chatml, llama2, phi15, gemma) (doesn't work with all models)") parser.add_argument('-d', '--device', action='store', default="auto", help="Set the torch device for the model. Ex. 
cuda:1") diff --git a/vision_qna.py b/vision_qna.py index e1d3f43..8d2bffc 100644 --- a/vision_qna.py +++ b/vision_qna.py @@ -24,6 +24,8 @@ class ImageChatRequest(BaseModel): model: str # = "gpt-4-vision-preview" messages: List[Message] max_tokens: int = 512 + temperature: float = None + top_p: float = None class VisionQnABase: model_name: str = None @@ -72,7 +74,7 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None) if format: self.format = format - + def select_device(self): return 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' @@ -85,9 +87,30 @@ def select_device_dtype(self, device): dtype = self.select_dtype(device) return device, dtype - async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str: + async def chat_with_images(self, request: ImageChatRequest) -> str: pass + def get_generation_params(self, request: ImageChatRequest, default_params = {}) -> dict: + params = { + "top_k": None, + 'do_sample': False, + } + params.update(default_params) + + if request.max_tokens: + params["max_new_tokens"] = request.max_tokens + + if request.temperature is not None: + if request.temperature > 0: + params["do_sample"] = True + params["temperature"] = request.temperature + + if request.top_p is not None and request.top_p != params.get('top_p', 1.0): + params["do_sample"] = True + params["top_p"] = request.top_p + + return params + async def url_to_image(img_url: str) -> Image.Image: if img_url.startswith('http'): response = requests.get(img_url) @@ -107,6 +130,7 @@ async def url_to_file(img_url: str) -> str: return filename else: response = requests.get(img_url) + filename = f"/tmp/{uuid.uuid4()}" with open(filename, 'wb') as f: f.write(response.content) return filename @@ -129,6 +153,10 @@ async def phi15_prompt_from_messages(messages: list[Message], img_tok = " for c in m.content: if c.type == 'text': prompt += f"Answer: {c.text}\n\n" + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + prompt += f"{c.text}\n\n" # fake system prompt prompt += "Answer:" @@ -155,7 +183,11 @@ async def vicuna_prompt_from_messages(messages: list[Message], img_tok = "\n" + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + prompt += f"{c.text}\n\n" prompt += "ASSISTANT:" @@ -178,11 +210,15 @@ async def llama2_prompt_from_messages(messages: list[Message], img_tok = "[INST] {img_tag}{text} [/INST]" elif m.role == 'assistant': for c in m.content: if c.type == 'text': - prompt += f" {c.text}" + prompt += f" {c.text}" + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + prompt += f"[INST] <>\n{c.text}\n<> [/INST]" # not quite right, but it's a start return images, prompt @@ -208,6 +244,10 @@ async def chatml_prompt_from_messages(messages: list[Message], img_tok = "assistant\n{c.text}<|im_end|>" + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + prompt += f"<|im_start|>system\n{c.text}<|im_end|>" prompt += f"<|im_start|>assistant\n" @@ -235,6 +275,11 @@ async def gemma_prompt_from_messages(messages: list[Message], img_tok = " for c in m.content: if c.type == 'text': prompt += f"<|im_start|>assistant\n{c.text}<|im_end|>" + elif m.role == 'system': + for c in m.content: + if c.type == 'text': + prompt += f"<|im_start|>system\n{c.text}<|im_end|>" + prompt += f"<|im_start|>assistant\n" @@ -291,16 +336,16 @@ def guess_backend(model_name: str) -> str: model_id = model_name.lower() if 'llava' in model_id: - if '1.5' in model_id: - return 
'llava' - elif '1.6' in model_id: + if 'v1.6' in model_id: return 'llavanext' + return 'llava' if 'qwen' in model_id: return 'qwen-vl' if 'moondream1' in model_id: return 'moondream1' + if 'moondream2' in model_id: return 'moondream2' @@ -321,5 +366,6 @@ def guess_backend(model_name: str) -> str: if 'xcomposer2-vl' in model_id: return 'xcomposer2-vl' - elif 'xcomposer2' in model_id: + + if 'xcomposer2' in model_id: return 'xcomposer2' \ No newline at end of file
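
A minimal sketch (not part of this diff) of exercising the new request parameters end-to-end, assuming the server was started as in the README (e.g. `python vision.py --model vikhyatk/moondream2`) and is listening on the default port 5006; the system prompt text is illustrative only, and the flow mirrors what chat_with_image.py now sends:

```python
from openai import OpenAI

client = OpenAI(base_url='http://localhost:5006/v1', api_key='skip')

response = client.chat.completions.create(
    # The model name is a placeholder, as in chat_with_image.py; the server
    # answers with whatever --model it was started with.
    model="gpt-4-vision-preview",
    messages=[
        # system prompts are now passed through to the backends
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful vision assistant."}]},
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"}},
            {"type": "text", "text": "Describe the image."},
        ]},
    ],
    max_tokens=256,      # becomes max_new_tokens on the backend
    temperature=0.7,     # a value > 0 switches the backend to sampling
    top_p=0.8,
)
print(response.choices[0].message.content)
```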
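A standalone illustration of how the new `get_generation_params()` in vision_qna.py maps request fields onto `generate()` keyword arguments; the mapping logic is copied from the diff above, and `SimpleNamespace` merely stands in for an `ImageChatRequest`:

```python
from types import SimpleNamespace

def get_generation_params(request, default_params={}):
    params = {"top_k": None, "do_sample": False}  # greedy decoding unless sampling is requested
    params.update(default_params)                 # per-backend defaults, e.g. XComposer2's temperature/top_p
    if request.max_tokens:
        params["max_new_tokens"] = request.max_tokens
    if request.temperature is not None and request.temperature > 0:
        params["do_sample"] = True
        params["temperature"] = request.temperature
    if request.top_p is not None and request.top_p != params.get("top_p", 1.0):
        params["do_sample"] = True
        params["top_p"] = request.top_p
    return params

# max_tokens only: greedy decoding with a token limit
print(get_generation_params(SimpleNamespace(max_tokens=512, temperature=None, top_p=None)))
# {'top_k': None, 'do_sample': False, 'max_new_tokens': 512}

# temperature/top_p set: sampling is enabled and the values are passed through
print(get_generation_params(SimpleNamespace(max_tokens=256, temperature=0.7, top_p=0.8)))
# {'top_k': None, 'do_sample': True, 'max_new_tokens': 256, 'temperature': 0.7, 'top_p': 0.8}
```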