
[WIP] Llava #2639

Closed
wants to merge 41 commits
Changes from 10 commits
Commits
41 commits
e2d9db7
init commit
BabyChouSr Nov 4, 2023
ebe4013
loading model
BabyChouSr Nov 4, 2023
5b40cc2
text model test works
BabyChouSr Nov 4, 2023
ef81e0d
add ability to have images
BabyChouSr Nov 5, 2023
ce54a29
reset convo for new image
BabyChouSr Nov 5, 2023
cc75581
simply warn if file doesn't exist
BabyChouSr Nov 6, 2023
b3da05a
fix model worker endpoint
BabyChouSr Nov 6, 2023
a443383
init gradio single vision language model interface
BabyChouSr Nov 6, 2023
5f4e439
create new file for vision server
BabyChouSr Nov 7, 2023
9388048
format
BabyChouSr Nov 7, 2023
547ca26
format issues, refactor vision server, rename constants
BabyChouSr Nov 10, 2023
e5388df
remove load image from multimodal worker
BabyChouSr Nov 10, 2023
915b4cd
remove some unused imports
BabyChouSr Nov 10, 2023
f1602bf
add model support
BabyChouSr Nov 10, 2023
8173ee3
fix some imports
BabyChouSr Nov 11, 2023
86039cf
remove unused args and create utility function
BabyChouSr Nov 11, 2023
21ee79e
add conversion to openai format
BabyChouSr Nov 12, 2023
f9d8962
rename flag
BabyChouSr Nov 12, 2023
fd32e20
Merge branch 'main' of https://github.com/BabyChouSr/FastChat into pr…
BabyChouSr Nov 12, 2023
502cf23
clean and test
BabyChouSr Nov 12, 2023
ec21cb6
fix conversation description
BabyChouSr Nov 12, 2023
78181b1
fix and format
BabyChouSr Nov 16, 2023
b48faf4
fix get_images function for multiple images
BabyChouSr Nov 16, 2023
921fa3d
move imports into the function
BabyChouSr Nov 22, 2023
d942f3e
Merge remote-tracking branch 'origin' into pr-multimodal
BabyChouSr Nov 22, 2023
60960a0
format
BabyChouSr Nov 22, 2023
90af475
consolidate multimodal model worker and model worker
BabyChouSr Nov 23, 2023
919462a
change gradio server to just a tab in multi-view
BabyChouSr Nov 23, 2023
0ff62c0
significantly reduce code in vision file
BabyChouSr Nov 23, 2023
7faf3dc
separate get_model_list for text vs. vision language models
BabyChouSr Nov 24, 2023
d9bc4ac
Merge branch 'main' of https://github.com/BabyChouSr/FastChat into pr…
BabyChouSr Nov 25, 2023
825c0e2
add vision optional dependency
BabyChouSr Nov 25, 2023
bd9a5e2
Merge branch 'main' of https://github.com/BabyChouSr/FastChat into pr…
BabyChouSr Dec 1, 2023
57e692f
Update server for open_ai compatibility
BabyChouSr Dec 3, 2023
857132e
format
BabyChouSr Dec 3, 2023
96e00c5
Change burden for image token to the Conversation class
BabyChouSr Dec 4, 2023
1501a7c
Merge branch 'main' of https://github.com/BabyChouSr/FastChat into pr…
BabyChouSr Dec 17, 2023
dc22df2
Format
BabyChouSr Dec 17, 2023
0cc1bd8
Use huggingface llava
BabyChouSr Dec 18, 2023
a3f2011
Fix conversation and remove old LLaVA files
BabyChouSr Dec 19, 2023
487e987
Remove LLaVA specific stream_func
BabyChouSr Dec 22, 2023
3 changes: 3 additions & 0 deletions fastchat/constants.py
@@ -16,6 +16,9 @@
INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
# Maximum input length
INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 3072))
INPUT_CHAR_LEN_LIMIT_WITH_IMAGE = int(
os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT_WITH_IMAGE", 2816)
)
# Maximum conversation turns
CONVERSATION_TURN_LIMIT = 50
# Session expiration time
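The lower limit leaves headroom in the request payload for the base64-encoded image that travels alongside the text. A minimal sketch of how a serving endpoint might pick between the two limits; the helper name truncate_user_input and its call site are illustrative and not part of this diff:

from fastchat.constants import INPUT_CHAR_LEN_LIMIT, INPUT_CHAR_LEN_LIMIT_WITH_IMAGE


def truncate_user_input(text: str, has_image: bool) -> str:
    # Hypothetical helper: apply the tighter limit when an image is attached.
    limit = INPUT_CHAR_LEN_LIMIT_WITH_IMAGE if has_image else INPUT_CHAR_LEN_LIMIT
    return text[:limit]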
112 changes: 111 additions & 1 deletion fastchat/conversation.py
@@ -130,6 +130,8 @@ def get_prompt(self) -> str:
            for i, (role, message) in enumerate(self.messages):
                tag = self.roles[i % 2]
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    if i == 0:
                        ret += message + " "
                    else:
@@ -216,6 +218,65 @@ def get_prompt(self) -> str:
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

    def get_images(self, return_pil=False):
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO
                    from PIL import Image

                    msg, image, image_process_mode = msg
                    if image_process_mode == "Pad":

                        def expand2square(pil_img, background_color=(122, 116, 104)):
                            width, height = pil_img.size
                            if width == height:
                                return pil_img
                            elif width > height:
                                result = Image.new(
                                    pil_img.mode, (width, width), background_color
                                )
                                result.paste(pil_img, (0, (width - height) // 2))
                                return result
                            else:
                                result = Image.new(
                                    pil_img.mode, (height, height), background_color
                                )
                                result.paste(pil_img, ((height - width) // 2, 0))
                                return result

                        image = expand2square(image)
                    elif image_process_mode in ["Default", "Crop"]:
                        pass
                    elif image_process_mode == "Resize":
                        image = image.resize((336, 336))
                    else:
                        raise ValueError(
                            f"Invalid image_process_mode: {image_process_mode}"
                        )
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if longest_edge != max(image.size):
                        if H > W:
                            H, W = longest_edge, shortest_edge
                        else:
                            H, W = shortest_edge, longest_edge
                        image = image.resize((W, H))
                    if return_pil:
                        images.append(image)
                    else:
                        buffered = BytesIO()
                        image.save(buffered, format="PNG")
                        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                        images.append(img_b64_str)
        return images

    def set_system_message(self, system_message: str):
        """Set the system message."""
        self.system_message = system_message
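A sketch of how get_images() could be exercised once a user message carries a (text, image, image_process_mode) tuple; the "llava" template is registered further down in this diff, and the image path is a placeholder:

from PIL import Image

from fastchat.conversation import get_conv_template

conv = get_conv_template("llava")
img = Image.open("example.jpg")  # placeholder path
conv.append_message(conv.roles[0], ("Describe this image. <image>", img, "Default"))
conv.append_message(conv.roles[1], None)

pil_images = conv.get_images(return_pil=True)   # resized PIL images
b64_images = conv.get_images(return_pil=False)  # base64-encoded PNG strings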
@@ -237,7 +298,30 @@ def to_gradio_chatbot(self):
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append([msg, None])
                if type(msg) is tuple:
                    import base64
                    from io import BytesIO

                    msg, image, image_process_mode = msg
                    max_hw, min_hw = max(image.size), min(image.size)
                    aspect_ratio = max_hw / min_hw
                    max_len, min_len = 800, 400
                    shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
                    longest_edge = int(shortest_edge * aspect_ratio)
                    W, H = image.size
                    if H > W:
                        H, W = longest_edge, shortest_edge
                    else:
                        H, W = shortest_edge, longest_edge
                    image = image.resize((W, H))
                    buffered = BytesIO()
                    image.save(buffered, format="JPEG")
                    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace("<image>", "").strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret
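Continuing the same sketch, to_gradio_chatbot() renders the tuple message as an inline <img> tag followed by the user text with the "<image>" placeholder stripped:

history = conv.to_gradio_chatbot()
# history[0][0] starts with '<img src="data:image/jpeg;base64,' and ends with
# "Describe this image."; history[0][1] stays None until the assistant replies.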
@@ -270,6 +354,16 @@ def copy(self):
        )

    def dict(self):
        if len(self.get_images()) > 0:
            return {
                "template_name": self.name,
                "system_message": self.system_message,
                "roles": self.roles,
                "messages": [
                    [x, y[0] if type(y) is tuple else y] for x, y in self.messages
                ],
                "offset": self.offset,
            }
        return {
            "template_name": self.name,
            "system_message": self.system_message,
@@ -1153,6 +1247,22 @@ def get_conv_template(name: str) -> Conversation:
    )
)

# Llava template
# reference: conv_llava_llama_2 from https://github.com/haotian-liu/LLaVA/blob/main/llava/conversation.py
register_conv_template(
    Conversation(
        name="llava",
        system_template="You are a helpful language and vision assistant. "
        "You are able to understand the visual content that the user provides, "
        "and assist the user with a variety of tasks using natural language.",
        roles=("USER", "ASSISTANT"),
        sep_style=SeparatorStyle.LLAMA2,
        sep="<s>",
        sep2="</s>",
        stop_str="</s>",
    )
)


if __name__ == "__main__":
    from fastchat.conversation import get_conv_template
8 changes: 8 additions & 0 deletions fastchat/model/llava/constants.py
@@ -0,0 +1,8 @@
# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
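These mirror the constants in the upstream LLaVA repository: prompts carry the literal DEFAULT_IMAGE_TOKEN, and each occurrence is mapped to the out-of-vocabulary sentinel IMAGE_TOKEN_INDEX so the model knows where to splice in image embeddings. A simplified, illustrative stand-in for LLaVA's tokenizer_image_token helper (that helper is not part of this diff):

from fastchat.model.llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX


def insert_image_tokens(prompt, tokenizer):
    # Tokenize the text around each "<image>" occurrence separately.
    chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(DEFAULT_IMAGE_TOKEN)]
    input_ids = chunks[0]
    for chunk in chunks[1:]:
        # Mark the image slot, dropping the BOS token the tokenizer prepends to each
        # chunk (assumes a Llama-style tokenizer that adds BOS).
        input_ids = input_ids + [IMAGE_TOKEN_INDEX] + chunk[1:]
    return input_ids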
169 changes: 169 additions & 0 deletions fastchat/model/llava/language_model/llava_llama.py
@@ -0,0 +1,169 @@
# Adapted from haotian-liu/LLaVA
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers import (
AutoConfig,
AutoModelForCausalLM,
LlamaConfig,
LlamaModel,
LlamaForCausalLM,
)

from transformers.modeling_outputs import CausalLMOutputWithPast

from fastchat.model.llava.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaConfig(LlamaConfig):
    model_type = "llava"


class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    config_class = LlavaConfig

    def __init__(self, config: LlamaConfig):
        super(LlavaLlamaModel, self).__init__(config)


class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    config_class = LlavaConfig

    def __init__(self, config):
        super(LlamaForCausalLM, self).__init__(config)
        self.model = LlavaLlamaModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        (
            input_ids,
            attention_mask,
            past_key_values,
            inputs_embeds_prepared,
            labels,
        ) = self.prepare_inputs_labels_for_multimodal(
            input_ids, attention_mask, past_key_values, labels, images
        )
        if inputs_embeds is None:
            inputs_embeds = inputs_embeds_prepared

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model/pipeline parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "images": kwargs.get("images", None),
            }
        )
        return model_inputs


AutoConfig.register("llava", LlavaConfig)
AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
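Because the config and model classes are registered with the Auto classes at import time, a checkpoint whose config declares model_type "llava" can then be loaded through the usual transformers entry points. A sketch, assuming the module above has been imported so the registration has run and using a placeholder checkpoint id:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import fastchat.model.llava.language_model.llava_llama  # noqa: F401  (registers "llava")

model = AutoModelForCausalLM.from_pretrained(
    "liuhaotian/llava-v1.5-7b",  # placeholder checkpoint id
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained("liuhaotian/llava-v1.5-7b")
# model.generate() can then be given an extra `images` tensor, which
# prepare_inputs_for_generation forwards to forward() above.
# output_ids = model.generate(input_ids, images=image_tensor, max_new_tokens=128)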