Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/support qwenvl glm4-v phi3-v (conflict resolving) #4377

Closed
wants to merge 41 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
fbf19f8
Basic support for webui.
marko1616 Jun 19, 2024
95b8a1d
Basic support for GLM4V
marko1616 Jun 19, 2024
61a0880
Merge branch 'hiyouga:main' into feature/Support-Qwenvl
marko1616 Jun 19, 2024
8044804
Pass ruff check.
marko1616 Jun 19, 2024
c58be83
Half of sft support and bug fix.
marko1616 Jun 20, 2024
4b01584
GLM4v lora sft support
marko1616 Jun 21, 2024
c233520
Little fix
marko1616 Jun 22, 2024
078c85d
Merge branch 'main' into feature/Support-Qwenvl
hiyouga Jun 24, 2024
67542a0
Fix requirements.txt
marko1616 Jun 25, 2024
e6aa967
fix conflict
BUAADreamer Jun 28, 2024
f698b43
QwenVL sft & webui train bugfix.
marko1616 Jun 29, 2024
3fa3a0b
phi3v infer support & rename.
marko1616 Jun 30, 2024
06823f4
Add rm,pt,ppo,kto,dpo support for glm4v(Not tested).
marko1616 Jun 30, 2024
40e817c
Merge branch 'hiyouga:main' into feature/Support-Qwenvl
marko1616 Jun 30, 2024
4e4f959
little fix
marko1616 Jun 30, 2024
4f564a1
Pass ruff
marko1616 Jun 30, 2024
5065e87
Merge branch 'main' into feature/Support-Qwenvl
marko1616 Jun 30, 2024
c37465e
Style check & fix requirements.txt
marko1616 Jul 1, 2024
9e7bb3f
Bugfix
marko1616 Jul 2, 2024
17e5d7d
Merge branch 'main' into feature/Support-Qwenvl
marko1616 Jul 2, 2024
5fe2862
Change implementation.
marko1616 Jul 2, 2024
e871b03
Merge remote
marko1616 Jul 2, 2024
b8cf95a
Update README, fix template constant, and add download source for phi3v.
Jul 2, 2024
c4ac67a
Merge pull request #1 from Radeon-grapchis/feature/Support-Qwenvl
marko1616 Jul 2, 2024
e6099f5
Name style fix.
marko1616 Jul 2, 2024
eb38fe2
modify glm_4v 9B desc
BUAADreamer Jul 3, 2024
51931b9
add torchvision to pass test
BUAADreamer Jul 3, 2024
a0ad0b5
modify dict in common
BUAADreamer Jul 3, 2024
3acefbc
Support latest glm4v.
marko1616 Jul 3, 2024
4146242
Phi3v lora sft fix.
marko1616 Jul 3, 2024
70ac8ea
fix get_template.
marko1616 Jul 3, 2024
ea60231
Update for unsupervised dataset.
marko1616 Jul 4, 2024
b932bc0
Phi3v dataset processor fix.
marko1616 Jul 6, 2024
36932dd
Merge branch 'main' into feature/Support-Qwenvl
marko1616 Jul 18, 2024
3c2ecba
Conflict fix
marko1616 Jul 18, 2024
3f9ccb3
RLHF support.
marko1616 Jul 19, 2024
9c6587e
glm4v pairwise dataset support
marko1616 Jul 19, 2024
cfe0652
Merge branch 'main' into feature/Support-Qwenvl
marko1616 Jul 31, 2024
19a4cf7
Merge branch 'main' into feature/Support-Qwenvl
marko1616 Aug 20, 2024
e9d902b
Name fix.
marko1616 Aug 22, 2024
65b64be
ruff pass.
marko1616 Aug 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 35 additions & 4 deletions src/llamafactory/chat/hf_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,20 @@
import asyncio
import concurrent.futures
import os
import pathlib
from threading import Thread
from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple, Union

import torch
import torchvision
from PIL import Image
from transformers import GenerationConfig, TextIteratorStreamer

from ..data import get_template_and_fix_tokenizer
from ..extras.logging import get_logger
from ..extras.misc import get_logits_processor
from ..model import load_model, load_tokenizer
from ..webui.common import DEFAULT_CACHE_DIR
from .base_engine import BaseEngine, Response


Expand Down Expand Up @@ -58,6 +62,7 @@ def __init__(
self.model = load_model(
self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
) # must after fixing tokenizer to resize vocab
self.model_args = model_args
self.generating_args = generating_args.to_dict()
try:
asyncio.get_event_loop()
Expand All @@ -75,6 +80,7 @@ def _process_args(
processor: Optional["ProcessorMixin"],
template: "Template",
generating_args: Dict[str, Any],
model_args: "ModelArguments",
messages: Sequence[Dict[str, str]],
system: Optional[str] = None,
tools: Optional[str] = None,
Expand All @@ -86,16 +92,26 @@ def _process_args(
and image is not None
and not hasattr(processor, "image_seq_length")
and template.image_token not in messages[0]["content"]
): # llava-like models
and model_args.visual_inputs_type == "vision_tower"
):
# llava-like models
messages[0]["content"] = template.image_token + messages[0]["content"]
elif image is not None and model_args.visual_inputs_type == "vision_token":
# Add image pathlike token as vision input
image_path = pathlib.Path(DEFAULT_CACHE_DIR) / "temp.png"
marko1616 marked this conversation as resolved.
Show resolved Hide resolved
Image.fromarray(image).convert("RGB").save(image_path)
messages[-1]["content"] = template.format_image.apply(content=os.fspath(image_path))[0] + messages[-1]["content"]
elif image is not None and model_args.visual_inputs_type == "vision_message_embed":
messages[-1]["content"] = template.format_image.apply()[0] + messages[-1]["content"]
marko1616 marked this conversation as resolved.
Show resolved Hide resolved

paired_messages = messages + [{"role": "assistant", "content": ""}]
system = system or generating_args["default_system"]
pixel_values = None
prompt_ids, _ = template.encode_oneturn(
tokenizer=tokenizer, messages=paired_messages, system=system, tools=tools
)
if processor is not None and image is not None: # add image features
# add image features for vision tower
if processor is not None and image is not None and template.format_image is None:
image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
batch_feature = image_processor(image, return_tensors="pt")
pixel_values = batch_feature.to(model.device)["pixel_values"] # shape (B, C, H, W)
Expand Down Expand Up @@ -163,6 +179,17 @@ def _process_args(
generation_config=GenerationConfig(**generating_args),
logits_processor=get_logits_processor(),
)
if image is not None and model_args.visual_inputs_type == "vision_message_embed":
transform = torchvision.transforms.Compose(
[
torchvision.transforms.Resize(
(1120, 1120), interpolation=torchvision.transforms.InterpolationMode.BICUBIC
),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
]
)
gen_kwargs["images"] = transform(Image.fromarray(image)).unsqueeze(0).to(model.device).to(model_args.compute_dtype)

if pixel_values is not None:
gen_kwargs["pixel_values"] = pixel_values
Expand All @@ -177,14 +204,15 @@ def _chat(
processor: Optional["ProcessorMixin"],
template: "Template",
generating_args: Dict[str, Any],
model_args: "ModelArguments",
messages: Sequence[Dict[str, str]],
system: Optional[str] = None,
tools: Optional[str] = None,
image: Optional["NDArray"] = None,
input_kwargs: Optional[Dict[str, Any]] = {},
) -> List["Response"]:
gen_kwargs, prompt_length = HuggingfaceEngine._process_args(
model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs
model, tokenizer, processor, template, generating_args, model_args, messages, system, tools, image, input_kwargs
)
generate_output = model.generate(**gen_kwargs)
response_ids = generate_output[:, prompt_length:]
Expand Down Expand Up @@ -212,14 +240,15 @@ def _stream_chat(
processor: Optional["ProcessorMixin"],
template: "Template",
generating_args: Dict[str, Any],
model_args: "ModelArguments",
messages: Sequence[Dict[str, str]],
system: Optional[str] = None,
tools: Optional[str] = None,
image: Optional["NDArray"] = None,
input_kwargs: Optional[Dict[str, Any]] = {},
) -> Callable[[], str]:
gen_kwargs, _ = HuggingfaceEngine._process_args(
model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs
model, tokenizer, processor, template, generating_args, model_args, messages, system, tools, image, input_kwargs
)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
gen_kwargs["streamer"] = streamer
Expand Down Expand Up @@ -285,6 +314,7 @@ async def chat(
self.processor,
self.template,
self.generating_args,
self.model_args,
messages,
system,
tools,
Expand Down Expand Up @@ -313,6 +343,7 @@ async def stream_chat(
self.processor,
self.template,
self.generating_args,
self.model_args,
messages,
system,
tools,
Expand Down
2 changes: 1 addition & 1 deletion src/llamafactory/chat/vllm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def __init__(
"max_lora_rank": model_args.vllm_max_lora_rank,
}

if model_args.visual_inputs:
if model_args.visual_inputs and model_args.visual_inputs_type == "vision_tower":
marko1616 marked this conversation as resolved.
Show resolved Hide resolved
image_size = config.vision_config.image_size
patch_size = config.vision_config.patch_size
self.image_feature_size = (image_size // patch_size) ** 2
Expand Down
6 changes: 5 additions & 1 deletion src/llamafactory/data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def get_dataset(

with training_args.main_process_first(desc="pre-process dataset"):
preprocess_func, print_function = get_preprocess_and_print_func(
data_args, training_args, stage, template, tokenizer, processor
data_args, training_args, model_args, stage, template, tokenizer, processor
)
column_names = list(next(iter(dataset)).keys())
kwargs = {}
Expand All @@ -190,6 +190,10 @@ def get_dataset(

dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)

if model_args.visual_inputs_type == "vision_message_embed":
dataset = dataset.rename_column("image_inputs","images")
print(dataset["images"])

marko1616 marked this conversation as resolved.
Show resolved Hide resolved
if data_args.tokenized_path is not None:
if training_args.should_save:
dataset.save_to_disk(data_args.tokenized_path)
Expand Down
4 changes: 3 additions & 1 deletion src/llamafactory/data/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,14 @@
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments

from ..hparams import DataArguments
from ..hparams import DataArguments, ModelArguments
from .template import Template


def get_preprocess_and_print_func(
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
model_args: "ModelArguments",
stage: Literal["pt", "sft", "rm", "ppo", "kto"],
template: "Template",
tokenizer: "PreTrainedTokenizer",
Expand Down Expand Up @@ -63,6 +64,7 @@ def get_preprocess_and_print_func(
tokenizer=tokenizer,
processor=processor,
data_args=data_args,
model_args=model_args,
)

print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
Expand Down
21 changes: 17 additions & 4 deletions src/llamafactory/data/processors/processor_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import bisect
from typing import TYPE_CHECKING, List, Sequence
from torchvision import transforms

from ...extras.packages import is_pillow_available

Expand Down Expand Up @@ -61,13 +62,25 @@ def greedy_knapsack(numbers: List[int], capacity: int) -> List[List[int]]:
return knapsacks


def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin") -> "NDArray":
def get_pixel_values(
    images: Sequence["ImageObject"], processor: "ProcessorMixin", vision_type: str = "vision_tower"
) -> "NDArray":
    r"""
    Processes visual inputs. (currently only supports a single image)

    Args:
        images: sequence of PIL images; only the first image is used. An empty
            sequence yields a blank white placeholder image.
        processor: the HF processor providing an ``image_processor``; only
            consulted when ``vision_type == "vision_tower"`` (may be ``None``
            otherwise).
        vision_type: ``"vision_tower"`` for llava-like models that consume
            processor pixel values, or ``"vision_message_embed"`` for models
            (e.g. GLM-4V) that embed the image tensor directly in the message.

    Returns:
        The image tensor with shape (C, H, W).

    Raises:
        ValueError: if ``vision_type`` is not one of the supported modes
            (previously an unknown mode silently returned ``None``).
    """
    if vision_type == "vision_tower":
        image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
        image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255))
        return image_processor(image, return_tensors="pt")["pixel_values"][0]  # shape (C, H, W)
    elif vision_type == "vision_message_embed":
        # CLIP-style mean/std normalization at a fixed 1120x1120 input resolution
        # (matches the preprocessing used at inference in hf_engine for this mode).
        transform = transforms.Compose(
            [
                transforms.Resize((1120, 1120), interpolation=transforms.InterpolationMode.BICUBIC),
                transforms.ToTensor(),
                transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
            ]
        )
        image = images[0] if len(images) != 0 else Image.new("RGB", (1120, 1120), (255, 255, 255))
        return transform(image)
    else:
        raise ValueError("Unknown vision_type: {}".format(vision_type))
marko1616 marked this conversation as resolved.
Show resolved Hide resolved


def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> List[int]:
Expand Down
13 changes: 10 additions & 3 deletions src/llamafactory/data/processors/supervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, ProcessorMixin

from ...hparams import DataArguments
from ...hparams import DataArguments, ModelArguments
from ..template import Template


Expand Down Expand Up @@ -78,19 +78,26 @@ def preprocess_supervised_dataset(
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
data_args: "DataArguments",
model_args: "ModelArguments",
) -> Dict[str, List[List[int]]]:
# build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
# for multiturn examples, we only mask the prompt part in each prompt-response pair.
model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
if processor is not None:
if processor is not None and model_args.visual_inputs_type == "vision_tower":
model_inputs["pixel_values"] = []
if hasattr(processor, "image_seq_length"): # paligemma models
model_inputs["token_type_ids"] = []
elif model_args.visual_inputs_type == "vision_message_embed":
model_inputs["image_inputs"] = []

for i in range(len(examples["prompt"])):
if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
continue
if model_args.visual_inputs_type == "vision_message_embed":
assert len(examples["images"][i]) <= 1,"GLM4v only support 1 image train yet."
model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "vision_message_embed"))
examples["prompt"][i][-1]["content"] = template.format_image.apply()[0] + examples["prompt"][i][-1]["content"]

input_ids, labels = _encode_supervised_example(
prompt=examples["prompt"][i],
Expand All @@ -105,7 +112,7 @@ def preprocess_supervised_dataset(
model_inputs["input_ids"].append(input_ids)
model_inputs["attention_mask"].append([1] * len(input_ids))
model_inputs["labels"].append(labels)
if processor is not None:
if processor is not None and model_args.visual_inputs_type == "vision_tower":
model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor))
if hasattr(processor, "image_seq_length"): # paligemma models
model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor))
Expand Down
31 changes: 31 additions & 0 deletions src/llamafactory/data/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class Template:
format_tools: "Formatter"
format_separator: "Formatter"
format_prefix: "Formatter"
format_image: "Formatter"
default_system: str
stop_words: List[str]
image_token: str
Expand Down Expand Up @@ -239,6 +240,7 @@ def _register_template(
format_tools: Optional["Formatter"] = None,
format_separator: Optional["Formatter"] = None,
format_prefix: Optional["Formatter"] = None,
format_image: Optional["Formatter"] = None,
default_system: str = "",
stop_words: List[str] = [],
image_token: str = "<image>",
Expand Down Expand Up @@ -290,6 +292,7 @@ def _register_template(
format_tools=format_tools or default_tool_formatter,
format_separator=format_separator or default_separator_formatter,
format_prefix=format_prefix or default_prefix_formatter,
format_image=format_image,
default_system=default_system,
stop_words=stop_words,
image_token=image_token,
Expand Down Expand Up @@ -686,6 +689,21 @@ def get_template_and_fix_tokenizer(
)


# GLM-4V chat template: reuses the GLM-4 text format and adds an image
# placeholder (format_image) whose special tokens mark where the vision
# embedding is spliced into the message ("vision_message_embed" path).
_register_template(
    name="glm4v",
    format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
    format_assistant=StringFormatter(slots=["\n{{content}}"]),
    format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
    format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]),
    format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
    format_tools=ToolFormatter(tool_format="glm4"),
    format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
    # Fixed placeholder: the single <|endoftext|> between the image-boundary
    # tokens stands in for the image embedding; it takes no content argument.
    format_image=EmptyFormatter(slots=["<|begin_of_image|><|endoftext|><|end_of_image|>"]),
    stop_words=["<|user|>", "<|observation|>"],
    efficient_eos=True,  # GLM-4 relies on stop words rather than an EOS token per turn
)


_register_template(
name="intern",
format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]),
Expand Down Expand Up @@ -815,6 +833,19 @@ def get_template_and_fix_tokenizer(
)


# Qwen-VL chat template: ChatML-style text format plus an image formatter
# that wraps an image file path in <img>...</img> tags, which Qwen-VL's
# tokenizer resolves to vision tokens ("vision_token" path).
_register_template(
    name="qwenvl",
    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
    format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
    format_separator=EmptyFormatter(slots=["\n"]),
    # {{content}} is filled with a path-like string to the image file.
    format_image=StringFormatter(slots=["<img>{{content}}</img>"]),
    default_system="You are a helpful assistant.",
    stop_words=["<|im_end|>"],
    replace_eos=True,  # use <|im_end|> as the effective EOS token
)


_register_template(
name="solar",
format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
Expand Down
Loading