From fbf19f844d2c32c05d48c0b942c3fa2047e1ad1d Mon Sep 17 00:00:00 2001 From: marko1616 Date: Wed, 19 Jun 2024 14:11:14 +0800 Subject: [PATCH 01/30] Basic support for webui. --- src/llamafactory/chat/hf_engine.py | 14 +++++++++++++- src/llamafactory/data/data_utils.py | 1 + src/llamafactory/data/template.py | 19 +++++++++++++++++++ src/llamafactory/extras/constants.py | 22 +++++++++++++++++++++- src/llamafactory/hparams/model_args.py | 5 +++++ src/llamafactory/model/loader.py | 3 ++- src/llamafactory/model/patcher.py | 3 ++- src/llamafactory/webui/chatter.py | 1 + src/llamafactory/webui/common.py | 11 ++++++++--- src/llamafactory/webui/components/top.py | 4 +++- src/llamafactory/webui/manager.py | 1 + 11 files changed, 76 insertions(+), 8 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 9e60175b0f..851d1525f9 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -15,15 +15,19 @@ import asyncio import concurrent.futures import os +import pathlib from threading import Thread from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple, Union import torch from transformers import GenerationConfig, TextIteratorStreamer +from PIL import Image + from ..data import get_template_and_fix_tokenizer from ..extras.logging import get_logger from ..extras.misc import get_logits_processor +from ..webui.common import DEFAULT_CACHE_DIR from ..model import load_model, load_tokenizer from .base_engine import BaseEngine, Response @@ -86,8 +90,15 @@ def _process_args( and image is not None and not hasattr(processor, "image_seq_length") and template.image_token not in messages[0]["content"] + and template.format_image is None ): # llava-like models messages[0]["content"] = template.image_token + messages[0]["content"] + # Add image token as vision input + if image is not None and template.format_image is not None: + image_path = pathlib.Path(DEFAULT_CACHE_DIR) / "temp.png" + Image.fromarray(image).save(image_path) + messages[-1]["role"] = "user_with_image" + messages[-1]["image"] = os.fspath(image_path) paired_messages = messages + [{"role": "assistant", "content": ""}] system = system or generating_args["default_system"] @@ -95,7 +106,8 @@ def _process_args( prompt_ids, _ = template.encode_oneturn( tokenizer=tokenizer, messages=paired_messages, system=system, tools=tools ) - if processor is not None and image is not None: # add image features + # add image features for vision tower + if processor is not None and image is not None and template.format_image is None: image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") batch_feature = image_processor(image, return_tensors="pt") pixel_values = batch_feature.to(model.device)["pixel_values"] # shape (B, C, H, W) diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index cc9761b107..5c613f0e47 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -37,6 +37,7 @@ class Role(str, Enum): SYSTEM = "system" FUNCTION = "function" OBSERVATION = "observation" + USER_WITH_IMAGE = "user_with_image" def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index c9af9605fe..516d05fc21 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -39,6 +39,7 @@ class Template: format_tools: 
"Formatter" format_separator: "Formatter" format_prefix: "Formatter" + format_image: "Formatter" default_system: str stop_words: List[str] image_token: str @@ -116,6 +117,9 @@ def _encode( if message["role"] == Role.USER.value: elements += self.format_user.apply(content=message["content"], idx=str(i // 2)) + elif message["role"] == Role.USER_WITH_IMAGE.value: + elements += self.format_image.apply(content=message["image"]) + elements += self.format_user.apply(content=message["content"], idx=str(i // 2)) elif message["role"] == Role.ASSISTANT.value: elements += self.format_assistant.apply(content=message["content"]) elif message["role"] == Role.OBSERVATION.value: @@ -239,6 +243,7 @@ def _register_template( format_tools: Optional["Formatter"] = None, format_separator: Optional["Formatter"] = None, format_prefix: Optional["Formatter"] = None, + format_image: Optional["Formatter"] = None, default_system: str = "", stop_words: List[str] = [], image_token: str = "", @@ -290,6 +295,7 @@ def _register_template( format_tools=format_tools or default_tool_formatter, format_separator=format_separator or default_separator_formatter, format_prefix=format_prefix or default_prefix_formatter, + format_image=format_image, default_system=default_system, stop_words=stop_words, image_token=image_token, @@ -829,6 +835,19 @@ def get_template_and_fix_tokenizer( ) +_register_template( + name="qwenvl", + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + format_image=StringFormatter(slots=["{{content}}"]), + default_system="You are a helpful assistant.", + stop_words=["<|im_end|>"], + replace_eos=True, +) + + _register_template( name="solar", format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]), diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 36265c8e99..d2041f0aff 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -86,6 +86,8 @@ VISION_MODELS = set() +VISION_TYPES = dict() + class DownloadSource(str, Enum): DEFAULT = "hf" @@ -96,6 +98,7 @@ def register_model_group( models: Dict[str, Dict[DownloadSource, str]], template: Optional[str] = None, vision: bool = False, + vision_type: str = "vision_tower", ) -> None: prefix = None for name, path in models.items(): @@ -108,7 +111,7 @@ def register_model_group( DEFAULT_TEMPLATE[prefix] = template if vision: VISION_MODELS.add(prefix) - + VISION_TYPES[prefix] = vision_type register_model_group( models={ @@ -1204,6 +1207,23 @@ def register_model_group( ) +register_model_group( + models={ + "Qwen-VL-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen-VL-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen2-0.5B", + }, + "Qwen-VL": { + DownloadSource.DEFAULT: "Qwen/Qwen-VL", + DownloadSource.MODELSCOPE: "qwen/wen-VL", + }, + }, + template="qwenvl", + vision=True, + vision_type="vision_token", +) + + register_model_group( models={ "SOLAR-10.7B": { diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 996e913083..8777aa9de9 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -117,6 +117,10 @@ class ModelArguments: default=False, metadata={"help": "Whethor or not to use multimodal 
LLM that accepts visual inputs."}, ) + visual_inputs_type: str = field( + default="", + metadata={"help": "Type of visual inputs."}, + ) moe_aux_loss_coef: Optional[float] = field( default=None, metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, @@ -235,6 +239,7 @@ def __post_init__(self): if self.new_special_tokens is not None: # support multiple special tokens self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] + assert self.visual_inputs and self.visual_inputs_type, "Must specify visual inputs type while using visual input." assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 69cccd931e..96f975b7a8 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -143,7 +143,8 @@ def load_model( if model_args.mixture_of_depths == "load": model = load_mod_pretrained_model(**init_kwargs) - elif model_args.visual_inputs: + elif model_args.visual_inputs and model_args.visual_inputs_type == "vision_tower": + # If model DO NOT have visual token(e.g. Qwen-VL) and model have visual_inputs then choose this. model = AutoModelForVision2Seq.from_pretrained(**init_kwargs) elif model_args.train_from_scratch: model = AutoModelForCausalLM.from_config(config) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index 8fa17d089f..b3da361048 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -127,7 +127,8 @@ def patch_model( if model_args.resize_vocab: resize_embedding_layer(model, tokenizer) - if model_args.visual_inputs: + if model_args.visual_inputs and model_args.visual_inputs_type == "vison_tower": + # If model DO NOT have visual token(e.g. Qwen-VL) and model have visual_inputs then choose this. 
autocast_projector_dtype(model, model_args) if is_trainable: diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index a2b54dce7e..2fa1880faa 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -85,6 +85,7 @@ def load_model(self, data) -> Generator[str, None, None]: flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", use_unsloth=(get("top.booster") == "unsloth"), visual_inputs=get("top.visual_inputs"), + visual_inputs_type=get("top.visual_inputs_type"), rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, infer_backend=get("infer.infer_backend"), ) diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 980428a4b7..2d9f8518c8 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -28,6 +28,7 @@ SUPPORTED_MODELS, TRAINING_STAGES, VISION_MODELS, + VISION_TYPES, DownloadSource, ) from ..extras.logging import get_logger @@ -129,7 +130,8 @@ def get_model_info(model_name: str) -> Tuple[str, str, bool]: template (str) visual (bool) """ - return get_model_path(model_name), get_template(model_name), get_visual(model_name) + visual = get_visual(model_name) + return get_model_path(model_name), get_template(model_name), visual[0], visual[1] def get_template(model_name: str) -> str: @@ -141,11 +143,14 @@ def get_template(model_name: str) -> str: return "default" -def get_visual(model_name: str) -> bool: +def get_visual(model_name: str) -> dict[bool,str]: r""" Judges if the model is a vision language model. """ - return get_prefix(model_name) in VISION_MODELS + if get_prefix(model_name) in VISION_MODELS: + return True, VISION_TYPES[get_prefix(model_name)] + else: + return False, "none" def list_checkpoints(model_name: str, finetuning_type: str) -> "gr.Dropdown": diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 2515a83d8a..87234156be 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -48,8 +48,9 @@ def create_top() -> Dict[str, "Component"]: rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=3) booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) visual_inputs = gr.Checkbox(scale=1) + visual_inputs_type = gr.Dropdown(choices=["none", "vision_tower", "vision_token"], value="none", scale=3) - model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False) + model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs, visual_inputs_type], queue=False) model_name.input(save_config, inputs=[lang, model_name], queue=False) model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False) finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False) @@ -67,4 +68,5 @@ def create_top() -> Dict[str, "Component"]: rope_scaling=rope_scaling, booster=booster, visual_inputs=visual_inputs, + visual_inputs_type=visual_inputs_type, ) diff --git a/src/llamafactory/webui/manager.py b/src/llamafactory/webui/manager.py index 7e9b801ae1..3b800026db 100644 --- a/src/llamafactory/webui/manager.py +++ b/src/llamafactory/webui/manager.py @@ -75,4 +75,5 @@ def get_base_elems(self) -> Set["Component"]: self._id_to_elem["top.rope_scaling"], self._id_to_elem["top.booster"], self._id_to_elem["top.visual_inputs"], + self._id_to_elem["top.visual_inputs_type"], } From 
95b8a1d3b0a99639268ed6c7cee1faf51260dc9d Mon Sep 17 00:00:00 2001 From: marko1616 Date: Wed, 19 Jun 2024 20:39:59 +0800 Subject: [PATCH 02/30] Basic support for GLM4V --- src/llamafactory/chat/hf_engine.py | 30 +++++++++++++++++------- src/llamafactory/data/data_utils.py | 1 - src/llamafactory/data/template.py | 18 +++++++++++--- src/llamafactory/extras/constants.py | 13 ++++++++++ src/llamafactory/model/loader.py | 2 +- src/llamafactory/webui/components/top.py | 2 +- src/llamafactory/webui/locales.py | 11 +++++++++ 7 files changed, 62 insertions(+), 15 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 851d1525f9..b569cda18c 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -62,6 +62,7 @@ def __init__( self.model = load_model( self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) ) # must after fixing tokenizer to resize vocab + self.model_args = model_args self.generating_args = generating_args.to_dict() try: asyncio.get_event_loop() @@ -79,6 +80,7 @@ def _process_args( processor: Optional["ProcessorMixin"], template: "Template", generating_args: Dict[str, Any], + model_args: "ModelArguments", messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, @@ -90,15 +92,17 @@ def _process_args( and image is not None and not hasattr(processor, "image_seq_length") and template.image_token not in messages[0]["content"] - and template.format_image is None - ): # llava-like models + and model_args.visual_inputs_type == "vision_tower" + ): + # llava-like models messages[0]["content"] = template.image_token + messages[0]["content"] - # Add image token as vision input - if image is not None and template.format_image is not None: + elif image is not None and model_args.visual_inputs_type == "vision_token": + # Add image pathlike token as vision input image_path = pathlib.Path(DEFAULT_CACHE_DIR) / "temp.png" - Image.fromarray(image).save(image_path) - messages[-1]["role"] = "user_with_image" - messages[-1]["image"] = os.fspath(image_path) + Image.fromarray(image).convert("RGB").save(image_path) + messages[-1]["content"] = template.format_image.apply(content=os.fspath(image_path))[0] + messages[-1]["content"] + elif image is not None and model_args.visual_inputs_type == "vision_message_embed": + messages[-1]["content"] = template.format_image.apply()[0] + messages[-1]["content"] paired_messages = messages + [{"role": "assistant", "content": ""}] system = system or generating_args["default_system"] @@ -175,6 +179,10 @@ def _process_args( generation_config=GenerationConfig(**generating_args), logits_processor=get_logits_processor(), ) + if image is not None and model_args.visual_inputs_type == "vision_message_embed": + gen_kwargs["images"] = torch.tensor(tokenizer.apply_chat_template([{"role": "user", "image": Image.fromarray(image).convert("RGB"), "content": ""}], + add_generation_prompt=True, tokenize=True, return_tensors="pt", + return_dict=True)["images"],dtype=model_args.compute_dtype,device="cuda") if pixel_values is not None: gen_kwargs["pixel_values"] = pixel_values @@ -189,6 +197,7 @@ def _chat( processor: Optional["ProcessorMixin"], template: "Template", generating_args: Dict[str, Any], + model_args: "ModelArguments", messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, @@ -196,7 +205,7 @@ def _chat( input_kwargs: Optional[Dict[str, Any]] = {}, ) -> List["Response"]: 
gen_kwargs, prompt_length = HuggingfaceEngine._process_args( - model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs + model, tokenizer, processor, template, generating_args, model_args, messages, system, tools, image, input_kwargs ) generate_output = model.generate(**gen_kwargs) response_ids = generate_output[:, prompt_length:] @@ -224,6 +233,7 @@ def _stream_chat( processor: Optional["ProcessorMixin"], template: "Template", generating_args: Dict[str, Any], + model_args: "ModelArguments", messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, @@ -231,7 +241,7 @@ def _stream_chat( input_kwargs: Optional[Dict[str, Any]] = {}, ) -> Callable[[], str]: gen_kwargs, _ = HuggingfaceEngine._process_args( - model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs + model, tokenizer, processor, template, generating_args, model_args, messages, system, tools, image, input_kwargs ) streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) gen_kwargs["streamer"] = streamer @@ -297,6 +307,7 @@ async def chat( self.processor, self.template, self.generating_args, + self.model_args, messages, system, tools, @@ -325,6 +336,7 @@ async def stream_chat( self.processor, self.template, self.generating_args, + self.model_args, messages, system, tools, diff --git a/src/llamafactory/data/data_utils.py b/src/llamafactory/data/data_utils.py index 5c613f0e47..cc9761b107 100644 --- a/src/llamafactory/data/data_utils.py +++ b/src/llamafactory/data/data_utils.py @@ -37,7 +37,6 @@ class Role(str, Enum): SYSTEM = "system" FUNCTION = "function" OBSERVATION = "observation" - USER_WITH_IMAGE = "user_with_image" def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 516d05fc21..cb147e8fc6 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -117,9 +117,6 @@ def _encode( if message["role"] == Role.USER.value: elements += self.format_user.apply(content=message["content"], idx=str(i // 2)) - elif message["role"] == Role.USER_WITH_IMAGE.value: - elements += self.format_image.apply(content=message["image"]) - elements += self.format_user.apply(content=message["content"], idx=str(i // 2)) elif message["role"] == Role.ASSISTANT.value: elements += self.format_assistant.apply(content=message["content"]) elif message["role"] == Role.OBSERVATION.value: @@ -699,6 +696,21 @@ def get_template_and_fix_tokenizer( ) +_register_template( + name="glm4v", + format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), + format_assistant=StringFormatter(slots=["\n{{content}}"]), + format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), + format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), + format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), + format_tools=ToolFormatter(tool_format="glm4"), + format_prefix=EmptyFormatter(slots=["[gMASK]"]), + format_image=EmptyFormatter(slots=["<|begin_of_image|><|endoftext|><|end_of_image|>"]), + stop_words=["<|user|>", "<|observation|>"], + efficient_eos=True, +) + + _register_template( name="intern", format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 
d2041f0aff..d60fbcf502 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -534,6 +534,19 @@ def register_model_group( ) +register_model_group( + models={ + "GLM-4v-9B": { + DownloadSource.DEFAULT: "THUDM/glm-4v-9b", + DownloadSource.MODELSCOPE: "ZhipuAI/glm-4v-9b", + } + }, + template="glm4v", + vision=True, + vision_type="vision_message_embed", +) + + register_model_group( models={ "InternLM-7B": { diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 96f975b7a8..2b59f67e4f 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -91,7 +91,7 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": patch_tokenizer(tokenizer) - if model_args.visual_inputs: + if model_args.visual_inputs and model_args.visual_inputs_type == "vision_tower": try: processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs) setattr(processor, "tokenizer", tokenizer) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 87234156be..9526dc5227 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -48,7 +48,7 @@ def create_top() -> Dict[str, "Component"]: rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=3) booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) visual_inputs = gr.Checkbox(scale=1) - visual_inputs_type = gr.Dropdown(choices=["none", "vision_tower", "vision_token"], value="none", scale=3) + visual_inputs_type = gr.Dropdown(choices=["none", "vision_tower", "vision_token", "vision_message_embed"], value="none", scale=3) model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs, visual_inputs_type], queue=False) model_name.input(save_config, inputs=[lang, model_name], queue=False) diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index 8e8d6fce7c..438f4a92d1 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -143,6 +143,17 @@ "label": "图像输入", }, }, + "visual_inputs_type": { + "en": { + "label": "Visual inputs type", + }, + "ru": { + "label": "Тип визуальных входов", + }, + "zh": { + "label": "图像输入类型", + }, + }, "training_stage": { "en": { "label": "Stage", From 8044804c1795ec4a2fc748ec133d125d9014a9ae Mon Sep 17 00:00:00 2001 From: marko1616 Date: Wed, 19 Jun 2024 20:52:52 +0800 Subject: [PATCH 03/30] Pass ruff check. 
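
The ruff pass also reformats the webui helper that maps a model prefix to its vision type. For reference, a minimal standalone sketch of that lookup as introduced earlier in this series (VISION_MODELS / VISION_TYPES live in extras/constants.py; the two type strings below are the ones registered so far and are renamed later in the series):

    VISION_MODELS = {"Qwen-VL", "GLM-4v-9B"}
    VISION_TYPES = {"Qwen-VL": "vision_token", "GLM-4v-9B": "vision_message_embed"}

    def get_visual_info(prefix):
        # Mirrors webui.common.get_visual(): returns (is_vision_model, vision_type).
        # The real helper derives the prefix from the model name via get_prefix().
        if prefix in VISION_MODELS:
            return True, VISION_TYPES[prefix]
        return False, "none"

    assert get_visual_info("GLM-4v-9B") == (True, "vision_message_embed")
    assert get_visual_info("Llama-3-8B") == (False, "none")
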
--- src/llamafactory/chat/hf_engine.py | 7 +++---- src/llamafactory/webui/common.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index b569cda18c..31dd4770d5 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -20,15 +20,14 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple, Union import torch -from transformers import GenerationConfig, TextIteratorStreamer - from PIL import Image +from transformers import GenerationConfig, TextIteratorStreamer from ..data import get_template_and_fix_tokenizer from ..extras.logging import get_logger from ..extras.misc import get_logits_processor -from ..webui.common import DEFAULT_CACHE_DIR from ..model import load_model, load_tokenizer +from ..webui.common import DEFAULT_CACHE_DIR from .base_engine import BaseEngine, Response @@ -93,7 +92,7 @@ def _process_args( and not hasattr(processor, "image_seq_length") and template.image_token not in messages[0]["content"] and model_args.visual_inputs_type == "vision_tower" - ): + ): # llava-like models messages[0]["content"] = template.image_token + messages[0]["content"] elif image is not None and model_args.visual_inputs_type == "vision_token": diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 2d9f8518c8..b1ce33a90f 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -149,7 +149,7 @@ def get_visual(model_name: str) -> dict[bool,str]: """ if get_prefix(model_name) in VISION_MODELS: return True, VISION_TYPES[get_prefix(model_name)] - else: + else: return False, "none" From c58be83523b431be1ba968de50d0c8ecf62216d0 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Thu, 20 Jun 2024 19:39:44 +0800 Subject: [PATCH 04/30] Half of sft support and bug fix. 
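
The GLM-4V half of this commit feeds images to the model without an AutoProcessor, so preprocessing is done with torchvision directly. A standalone sketch of the transform used in hf_engine.py and get_pixel_values() below (resize to 1120x1120, then CLIP-style normalization; the white placeholder image is only for illustration):

    from PIL import Image
    from torchvision import transforms

    glm4v_transform = transforms.Compose([
        transforms.Resize((1120, 1120), interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                             (0.26862954, 0.26130258, 0.27577711)),
    ])

    # Produce a (1, 3, 1120, 1120) tensor, matching the shape fed to gen_kwargs["images"].
    pixels = glm4v_transform(Image.new("RGB", (640, 480), (255, 255, 255))).unsqueeze(0)
    assert tuple(pixels.shape) == (1, 3, 1120, 1120)
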
--- src/llamafactory/chat/hf_engine.py | 14 ++++++++++--- src/llamafactory/chat/vllm_engine.py | 2 +- src/llamafactory/data/loader.py | 6 +++++- src/llamafactory/data/preprocess.py | 4 +++- .../data/processors/processor_utils.py | 21 +++++++++++++++---- .../data/processors/supervised.py | 13 +++++++++--- src/llamafactory/hparams/finetuning_args.py | 6 +++--- src/llamafactory/hparams/model_args.py | 3 ++- src/llamafactory/model/adapter.py | 16 +++++++------- src/llamafactory/model/model_utils/misc.py | 4 ++-- src/llamafactory/model/patcher.py | 3 +++ src/llamafactory/train/trainer_utils.py | 2 +- 12 files changed, 66 insertions(+), 28 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 31dd4770d5..32da6b4d0c 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -20,6 +20,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple, Union import torch +import torchvision from PIL import Image from transformers import GenerationConfig, TextIteratorStreamer @@ -179,9 +180,16 @@ def _process_args( logits_processor=get_logits_processor(), ) if image is not None and model_args.visual_inputs_type == "vision_message_embed": - gen_kwargs["images"] = torch.tensor(tokenizer.apply_chat_template([{"role": "user", "image": Image.fromarray(image).convert("RGB"), "content": ""}], - add_generation_prompt=True, tokenize=True, return_tensors="pt", - return_dict=True)["images"],dtype=model_args.compute_dtype,device="cuda") + transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize( + (1120, 1120), interpolation=torchvision.transforms.InterpolationMode.BICUBIC + ), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + gen_kwargs["images"] = transform(Image.fromarray(image)).unsqueeze(0).to(model.device).to(model_args.compute_dtype) if pixel_values is not None: gen_kwargs["pixel_values"] = pixel_values diff --git a/src/llamafactory/chat/vllm_engine.py b/src/llamafactory/chat/vllm_engine.py index 2626d61253..854ad8b06d 100644 --- a/src/llamafactory/chat/vllm_engine.py +++ b/src/llamafactory/chat/vllm_engine.py @@ -77,7 +77,7 @@ def __init__( "max_lora_rank": model_args.vllm_max_lora_rank, } - if model_args.visual_inputs: + if model_args.visual_inputs and model_args.visual_inputs_type == "vision_tower": image_size = config.vision_config.image_size patch_size = config.vision_config.patch_size self.image_feature_size = (image_size // patch_size) ** 2 diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index f44ef5de6e..2c7bc85845 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -177,7 +177,7 @@ def get_dataset( with training_args.main_process_first(desc="pre-process dataset"): preprocess_func, print_function = get_preprocess_and_print_func( - data_args, training_args, stage, template, tokenizer, processor + data_args, training_args, model_args, stage, template, tokenizer, processor ) column_names = list(next(iter(dataset)).keys()) kwargs = {} @@ -190,6 +190,10 @@ def get_dataset( dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) + if model_args.visual_inputs_type == "vision_message_embed": + dataset = dataset.rename_column("image_inputs","images") + print(dataset["images"]) + if data_args.tokenized_path is not None: if training_args.should_save: 
dataset.save_to_disk(data_args.tokenized_path) diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index 9a8b97f326..e6ff34371a 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -29,13 +29,14 @@ if TYPE_CHECKING: from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments - from ..hparams import DataArguments + from ..hparams import DataArguments, ModelArguments from .template import Template def get_preprocess_and_print_func( data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", + model_args: "ModelArguments", stage: Literal["pt", "sft", "rm", "ppo", "kto"], template: "Template", tokenizer: "PreTrainedTokenizer", @@ -63,6 +64,7 @@ def get_preprocess_and_print_func( tokenizer=tokenizer, processor=processor, data_args=data_args, + model_args=model_args, ) print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py index 93df0cd546..8865ba59d3 100644 --- a/src/llamafactory/data/processors/processor_utils.py +++ b/src/llamafactory/data/processors/processor_utils.py @@ -14,6 +14,7 @@ import bisect from typing import TYPE_CHECKING, List, Sequence +from torchvision import transforms from ...extras.packages import is_pillow_available @@ -61,13 +62,25 @@ def greedy_knapsack(numbers: List[int], capacity: int) -> List[List[int]]: return knapsacks -def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin") -> "NDArray": +def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin", vision_type: str = "vision_tower") -> "NDArray": r""" Processes visual inputs. 
(currently only supports a single image) """ - image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") - image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255)) - return image_processor(image, return_tensors="pt")["pixel_values"][0] # shape (C, H, W) + if vision_type == "vision_tower": + image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") + image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255)) + return image_processor(image, return_tensors="pt")["pixel_values"][0] # shape (C, H, W) + elif vision_type == "vision_message_embed": + transform = transforms.Compose( + [ + transforms.Resize( + (1120, 1120), interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + return transform(images[0]) if len(images) != 0 else transform(Image.new("RGB", (1120, 1120), (255, 255, 255))) def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> List[int]: diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index eb5ffb1a0c..01d18db0e9 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -23,7 +23,7 @@ if TYPE_CHECKING: from transformers import PreTrainedTokenizer, ProcessorMixin - from ...hparams import DataArguments + from ...hparams import DataArguments, ModelArguments from ..template import Template @@ -78,19 +78,26 @@ def preprocess_supervised_dataset( tokenizer: "PreTrainedTokenizer", processor: Optional["ProcessorMixin"], data_args: "DataArguments", + model_args: "ModelArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - if processor is not None: + if processor is not None and model_args.visual_inputs_type == "vision_tower": model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"] = [] + elif model_args.visual_inputs_type == "vision_message_embed": + model_inputs["image_inputs"] = [] for i in range(len(examples["prompt"])): if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue + if model_args.visual_inputs_type == "vision_message_embed": + assert len(examples["images"][i]) <= 1,"GLM4v only support 1 image train yet." 
+ model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "vision_message_embed")) + examples["prompt"][i][-1]["content"] = template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] input_ids, labels = _encode_supervised_example( prompt=examples["prompt"][i], @@ -105,7 +112,7 @@ def preprocess_supervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - if processor is not None: + if processor is not None and model_args.visual_inputs_type == "vision_tower": model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index b676891e9b..e1320dc845 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -326,9 +326,9 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to make only the parameters in the expanded blocks trainable."}, ) - freeze_vision_tower: bool = field( + freeze_vision: bool = field( default=True, - metadata={"help": "Whether ot not to freeze vision tower in MLLM training."}, + metadata={"help": "Whether ot not to freeze vision module in MLLM training."}, ) train_mm_proj_only: bool = field( default=False, @@ -351,7 +351,7 @@ def split_arg(arg): self.lora_target: List[str] = split_arg(self.lora_target) self.additional_target: Optional[List[str]] = split_arg(self.additional_target) self.galore_target: List[str] = split_arg(self.galore_target) - self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only + self.freeze_vision = self.freeze_vision or self.train_mm_proj_only self.use_ref_model = self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"] assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 8777aa9de9..a3e0bd0de4 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -239,7 +239,8 @@ def __post_init__(self): if self.new_special_tokens is not None: # support multiple special tokens self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] - assert self.visual_inputs and self.visual_inputs_type, "Must specify visual inputs type while using visual input." + if self.visual_inputs: + assert self.visual_inputs_type is not None, "Must specify visual inputs type while using visual input." assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." 
diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 34518878a6..3307c7ba0a 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -33,7 +33,7 @@ logger = get_logger(__name__) - +VISION_FREEZE_MAP = {"none":"","vision_tower":"vision_tower","vision_message_embed":"vision","vision_token":"visual"} def _setup_full_tuning( model: "PreTrainedModel", @@ -47,8 +47,8 @@ def _setup_full_tuning( logger.info("Fine-tuning method: Full") forbidden_modules = set() - if model_args.visual_inputs and finetuning_args.freeze_vision_tower: - forbidden_modules.add("vision_tower") + if model_args.visual_inputs and finetuning_args.freeze_vision: + forbidden_modules.add(VISION_FREEZE_MAP[model_args.visual_inputs_type]) if model_args.visual_inputs and finetuning_args.train_mm_proj_only: forbidden_modules.add("language_model") @@ -131,8 +131,8 @@ def _setup_freeze_tuning( trainable_layers.append(module_name) forbidden_modules = set() - if model_args.visual_inputs and finetuning_args.freeze_vision_tower: - forbidden_modules.add("vision_tower") + if model_args.visual_inputs and finetuning_args.freeze_vision: + forbidden_modules.add(VISION_FREEZE_MAP[model_args.visual_inputs_type]) for name, param in model.named_parameters(): if any(trainable_layer in name for trainable_layer in trainable_layers) and not any( @@ -204,15 +204,15 @@ def _setup_lora_tuning( if is_trainable and adapter_to_resume is None: # create new lora weights while training if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": - target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) + target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision) else: target_modules = finetuning_args.lora_target if finetuning_args.use_llama_pro: target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers) - if model_args.visual_inputs and finetuning_args.freeze_vision_tower: - target_modules = "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules)) + if model_args.visual_inputs and finetuning_args.freeze_vision: + target_modules = f"^(?!.*{VISION_FREEZE_MAP[model_args.visual_inputs_type]})."+"*(?:{}).*".format("|".join(target_modules)) if ( finetuning_args.use_dora diff --git a/src/llamafactory/model/model_utils/misc.py b/src/llamafactory/model/model_utils/misc.py index a2812228ea..8fbaf28207 100644 --- a/src/llamafactory/model/model_utils/misc.py +++ b/src/llamafactory/model/model_utils/misc.py @@ -24,7 +24,7 @@ logger = get_logger(__name__) -def find_all_linear_modules(model: "PreTrainedModel", freeze_vision_tower: bool) -> List[str]: +def find_all_linear_modules(model: "PreTrainedModel", freeze_vision: bool) -> List[str]: r""" Finds all available modules to apply lora or galore. """ @@ -37,7 +37,7 @@ def find_all_linear_modules(model: "PreTrainedModel", freeze_vision_tower: bool) elif model.config.model_type in ["llava", "paligemma"]: forbidden_modules.add("multi_modal_projector") - if freeze_vision_tower: + if freeze_vision: forbidden_modules.add("vision_tower") module_names = set() diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index b3da361048..f5f81ca987 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -130,6 +130,9 @@ def patch_model( if model_args.visual_inputs and model_args.visual_inputs_type == "vison_tower": # If model DO NOT have visual token(e.g. 
Qwen-VL) and model have visual_inputs then choose this. autocast_projector_dtype(model, model_args) + elif model_args.visual_inputs and model_args.visual_inputs_type == "vision_message_embed": + # Patch glm4v + model.transformer.vision.to(model_args.compute_dtype) if is_trainable: prepare_model_for_training(model, model_args) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index c1b9015546..255acecc46 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -234,7 +234,7 @@ def _create_galore_optimizer( finetuning_args: "FinetuningArguments", ) -> "torch.optim.Optimizer": if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all": - galore_targets = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) + galore_targets = find_all_linear_modules(model, finetuning_args.freeze_vision) else: galore_targets = finetuning_args.galore_target From 4b01584758b7aecf78e451e9291eea56d1d75151 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Fri, 21 Jun 2024 16:08:21 +0800 Subject: [PATCH 05/30] GLM4v lora sft support --- src/llamafactory/chat/hf_engine.py | 2 ++ src/llamafactory/data/loader.py | 1 - src/llamafactory/data/template.py | 2 +- src/llamafactory/extras/constants.py | 2 +- src/llamafactory/model/adapter.py | 2 +- src/llamafactory/train/sft/trainer.py | 28 +++++++++++++++++++++ src/llamafactory/train/sft/workflow.py | 34 +++++++++++++++++--------- 7 files changed, 56 insertions(+), 15 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 32da6b4d0c..3dd353b3e0 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -190,6 +190,8 @@ def _process_args( ] ) gen_kwargs["images"] = transform(Image.fromarray(image)).unsqueeze(0).to(model.device).to(model_args.compute_dtype) + elif model_args.visual_inputs_type == "vision_message_embed": + gen_kwargs["images"] = [] if pixel_values is not None: gen_kwargs["pixel_values"] = pixel_values diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 2c7bc85845..a11a638e95 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -192,7 +192,6 @@ def get_dataset( if model_args.visual_inputs_type == "vision_message_embed": dataset = dataset.rename_column("image_inputs","images") - print(dataset["images"]) if data_args.tokenized_path is not None: if training_args.should_save: diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index ec49818138..f45e4579c8 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -690,7 +690,7 @@ def get_template_and_fix_tokenizer( _register_template( - name="glm4v", + name="glm4_v", format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}"]), format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index d60fbcf502..7bbe6d6cd7 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -541,7 +541,7 @@ def register_model_group( DownloadSource.MODELSCOPE: "ZhipuAI/glm-4v-9b", } }, - template="glm4v", + template="glm4_v", vision=True, vision_type="vision_message_embed", ) diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 3307c7ba0a..1988b10cae 100644 --- 
a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -211,7 +211,7 @@ def _setup_lora_tuning( if finetuning_args.use_llama_pro: target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers) - if model_args.visual_inputs and finetuning_args.freeze_vision: + if finetuning_args.freeze_vision and model_args.visual_inputs_type !="none": target_modules = f"^(?!.*{VISION_FREEZE_MAP[model_args.visual_inputs_type]})."+"*(?:{}).*".format("|".join(target_modules)) if ( diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 921e49abda..1e3ca81209 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -158,3 +158,31 @@ def save_predictions(self, dataset: "Dataset", predict_results: "PredictionOutpu for text, label, pred in zip(decoded_inputs, decoded_labels, decoded_preds): res.append(json.dumps({"prompt": text, "label": label, "predict": pred}, ensure_ascii=False)) writer.write("\n".join(res)) + +class GLM4VSeq2SeqTrainer(CustomSeq2SeqTrainer): + def compute_loss(self, model, inputs, return_outputs=False): + # Padding for labels and attention masks cuz modeling_glm4 will auto filling 1600 image tokens. + boi_ids = self.tokenizer.all_special_ids[self.tokenizer.all_special_tokens.index("<|begin_of_image|>")] + padded_ids = None + padded_labels = None + padded_attention_masks = None + if any(boi_ids == inputs["input_ids"].flatten()): + for input_id, label, attention_mask in zip(inputs["input_ids"], inputs["labels"], inputs["attention_mask"]): + if any(boi_ids == input_id.flatten()): + boi_index = input_id.tolist().index(boi_ids) + # GLM will auto filling this. + input_id_padded = input_id.unsqueeze(0) + label_padded = torch.cat((label[:boi_index],-100*torch.ones(1600,device=label.device,dtype=label.dtype),label[boi_index+1:])).unsqueeze(0) + attention_mask_padded = torch.cat((attention_mask[:boi_index],torch.ones(1600,device=attention_mask.device,dtype=attention_mask.dtype),attention_mask[boi_index+1:])).unsqueeze(0) + else: + input_id_padded = torch.cat((input_id,torch.ones(1600,device=input_id.device,dtype=input_id.dtype))).unsqueeze(0) + label_padded = torch.cat((label,-100*torch.ones(1600,device=label.device,dtype=label.dtype))).unsqueeze(0) + attention_mask_padded = torch.cat((attention_mask,torch.zeros(1600,device=attention_mask.device,dtype=attention_mask.dtype))).unsqueeze(0) + padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids,input_id_padded)) + padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels,label_padded)) + padded_attention_masks = attention_mask_padded if padded_attention_masks is None else torch.cat((padded_attention_masks,attention_mask_padded)) + inputs["input_ids"] = padded_ids.contiguous() + inputs["labels"] = padded_labels.contiguous() + inputs["attention_mask"] = padded_attention_masks.contiguous() + + super().compute_loss(model, inputs, return_outputs) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 885bc7ac7e..064c208416 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -26,7 +26,7 @@ from ...model import load_model, load_tokenizer from ..trainer_utils import create_modelcard_and_push from .metric import ComputeMetrics -from .trainer import CustomSeq2SeqTrainer +from .trainer import CustomSeq2SeqTrainer, GLM4VSeq2SeqTrainer if TYPE_CHECKING: @@ -66,16 +66,28 @@ def 
run_sft( training_args.remove_unused_columns = False if model_args.visual_inputs else training_args.remove_unused_columns # Initialize our Trainer - trainer = CustomSeq2SeqTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - data_collator=data_collator, - callbacks=callbacks, - compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, - **tokenizer_module, - **split_dataset(dataset, data_args, training_args), - ) + if model_args.visual_inputs_type == "vision_message_embed": + trainer = GLM4VSeq2SeqTrainer( + model=model, + args=training_args, + finetuning_args=finetuning_args, + data_collator=data_collator, + callbacks=callbacks, + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + **tokenizer_module, + **split_dataset(dataset, data_args, training_args), + ) + else: + trainer = CustomSeq2SeqTrainer( + model=model, + args=training_args, + finetuning_args=finetuning_args, + data_collator=data_collator, + callbacks=callbacks, + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + **tokenizer_module, + **split_dataset(dataset, data_args, training_args), + ) # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() From c233520cc97318068630965d14cab8ec24cd4017 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Sat, 22 Jun 2024 21:07:37 +0800 Subject: [PATCH 06/30] Little fix --- src/llamafactory/train/sft/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 1e3ca81209..99763cba72 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -185,4 +185,4 @@ def compute_loss(self, model, inputs, return_outputs=False): inputs["labels"] = padded_labels.contiguous() inputs["attention_mask"] = padded_attention_masks.contiguous() - super().compute_loss(model, inputs, return_outputs) + return super().compute_loss(model, inputs, return_outputs) From 67542a0f8361d5696d86c366bdf3c5d4568c88bf Mon Sep 17 00:00:00 2001 From: marko1616 Date: Tue, 25 Jun 2024 15:02:39 +0800 Subject: [PATCH 07/30] Fix requirements.txt --- requirements.txt | 1 + src/llamafactory/chat/hf_engine.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7380add46e..f31404b0b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ peft>=0.11.1 trl>=0.8.6 gradio>=4.0.0 pandas>=2.0.0 +torchvision scipy einops sentencepiece diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index edc0273d02..3efb4346d0 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -191,7 +191,7 @@ def _process_args( ) gen_kwargs["images"] = transform(Image.fromarray(image)).unsqueeze(0).to(model.device).to(model_args.compute_dtype) elif model_args.visual_inputs_type == "vision_message_embed": - gen_kwargs["images"] = [] + gen_kwargs["images"] = None if pixel_values is not None: gen_kwargs["pixel_values"] = pixel_values From f698b43412b5b111cca75d54867c692846b61b5d Mon Sep 17 00:00:00 2001 From: marko1616 Date: Sat, 29 Jun 2024 20:43:52 +0800 Subject: [PATCH 08/30] QwenVL sft & webui train buxfix. 
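
For the Qwen-VL ("vision_token") path the dataset keeps images as relative path strings; supervised preprocessing joins them with dataset_dir and prepends the result to the last user turn through the template's image formatter. A rough standalone sketch of that injection step (image_slot stands in for the qwenvl template's format_image and is only an assumption for illustration):

    import os

    def inject_qwenvl_image(prompt, image_paths, dataset_dir, image_slot="{content}"):
        # Mirrors the supervised.py hunk below: at most one image per sample for now.
        assert len(image_paths) <= 1, "only a single image per sample is supported here"
        if image_paths:
            image_ref = image_slot.format(content=os.path.join(dataset_dir, image_paths[-1]))
            prompt[-1]["content"] = image_ref + prompt[-1]["content"]
        return prompt

    messages = [{"role": "user", "content": "What is in the picture?"}]
    inject_qwenvl_image(messages, ["images/cat.png"], "data")
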
--- src/llamafactory/data/aligner.py | 22 +++++++++++++------ src/llamafactory/data/loader.py | 2 +- .../data/processors/supervised.py | 4 ++++ src/llamafactory/webui/runner.py | 1 + 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index 299bdca32d..ecf7ce5eb7 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -19,6 +19,7 @@ from datasets import Features from ..extras.logging import get_logger +from ..hparams import ModelArguments from .data_utils import Role @@ -49,7 +50,7 @@ def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: " def convert_alpaca( - examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments" + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments", model_args: "ModelArguments", ) -> Dict[str, List[Any]]: r""" Converts alpaca format dataset to the standard format. @@ -96,13 +97,16 @@ def convert_alpaca( outputs["response"].append(response) outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) + if model_args.visual_inputs_type == "vision_token": + outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) + else: + outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) return outputs def convert_sharegpt( - examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments" + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments", model_args: "ModelArguments", ) -> Dict[str, List[Any]]: r""" Converts sharegpt format dataset to the standard format. 
@@ -184,7 +188,10 @@ def convert_sharegpt( outputs["response"].append(response) outputs["system"].append(system) outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) + if model_args.visual_inputs_type == "vision_token": + outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) + else: + outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) return outputs @@ -194,6 +201,7 @@ def align_dataset( dataset_attr: "DatasetAttr", data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", + model_args: "ModelArguments", ) -> Union["Dataset", "IterableDataset"]: r""" Aligned dataset: @@ -204,9 +212,9 @@ def align_dataset( images: [], """ if dataset_attr.formatting == "alpaca": - convert_func = partial(convert_alpaca, dataset_attr=dataset_attr, data_args=data_args) + convert_func = partial(convert_alpaca, dataset_attr=dataset_attr, data_args=data_args, model_args=model_args) else: - convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr, data_args=data_args) + convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr, data_args=data_args, model_args=model_args) column_names = list(next(iter(dataset)).keys()) features = Features.from_dict( @@ -219,7 +227,7 @@ def align_dataset( ], "system": {"dtype": "string", "_type": "Value"}, "tools": {"dtype": "string", "_type": "Value"}, - "images": [{"_type": "Image"}], + "images": [{"dtype": "string", "_type": "Value"} if model_args.visual_inputs_type == "vision_token" else {"_type": "Image"}], } ) kwargs = {} diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 39f92d0599..baedce8eaa 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -137,7 +137,7 @@ def load_single_dataset( max_samples = min(data_args.max_samples, len(dataset)) dataset = dataset.select(range(max_samples)) - return align_dataset(dataset, dataset_attr, data_args, training_args) + return align_dataset(dataset, dataset_attr, data_args, training_args, model_args) def get_dataset( diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index 01d18db0e9..c2ee3dfeaa 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple @@ -98,6 +99,9 @@ def preprocess_supervised_dataset( assert len(examples["images"][i]) <= 1,"GLM4v only support 1 image train yet." model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "vision_message_embed")) examples["prompt"][i][-1]["content"] = template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] + elif model_args.visual_inputs_type == "vision_token": + assert len(examples["images"][i]) <= 1,"Qwenvl only support 1 image train yet." 
+ examples["prompt"][i][-1]["content"] = template.format_image.apply(content=os.path.join(data_args.dataset_dir,examples["images"][i][-1]))[0] + examples["prompt"][i][-1]["content"] input_ids, labels = _encode_supervised_example( prompt=examples["prompt"][i], diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 131d180d0a..52d34c9b0f 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -123,6 +123,7 @@ def _parse_train_args(self, data: Dict["Component", Any]) -> Dict[str, Any]: flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", use_unsloth=(get("top.booster") == "unsloth"), visual_inputs=get("top.visual_inputs"), + visual_inputs_type=get("top.visual_inputs_type"), dataset_dir=get("train.dataset_dir"), dataset=",".join(get("train.dataset")), cutoff_len=get("train.cutoff_len"), From 3fa3a0b0cea064c2a2f0129c0b7b3c32bd54e078 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Mon, 1 Jul 2024 00:13:02 +0800 Subject: [PATCH 09/30] phi3v infer support & rename. --- src/llamafactory/chat/hf_engine.py | 20 +++++++++++++------ src/llamafactory/data/aligner.py | 6 +++--- src/llamafactory/data/loader.py | 2 +- .../data/processors/processor_utils.py | 2 +- .../data/processors/supervised.py | 8 ++++---- src/llamafactory/data/template.py | 12 +++++++++++ src/llamafactory/extras/constants.py | 17 ++++++++++++++-- src/llamafactory/model/adapter.py | 2 +- src/llamafactory/model/loader.py | 2 +- src/llamafactory/model/patcher.py | 2 +- src/llamafactory/train/sft/workflow.py | 2 +- src/llamafactory/webui/components/top.py | 2 +- 12 files changed, 55 insertions(+), 22 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 3efb4346d0..558cd5ef1d 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -92,32 +92,37 @@ def _process_args( and image is not None and not hasattr(processor, "image_seq_length") and template.image_token not in messages[0]["content"] - and model_args.visual_inputs_type == "vision_tower" + and model_args.visual_inputs_type in ["vision_tower","phi3v_like"] ): # llava-like models messages[0]["content"] = template.image_token + messages[0]["content"] - elif image is not None and model_args.visual_inputs_type == "vision_token": + elif image is not None and model_args.visual_inputs_type == "qwenvl_like": # Add image pathlike token as vision input image_path = pathlib.Path(DEFAULT_CACHE_DIR) / "temp.png" Image.fromarray(image).convert("RGB").save(image_path) messages[-1]["content"] = template.format_image.apply(content=os.fspath(image_path))[0] + messages[-1]["content"] - elif image is not None and model_args.visual_inputs_type == "vision_message_embed": + elif image is not None and model_args.visual_inputs_type == "glm4v_like": messages[-1]["content"] = template.format_image.apply()[0] + messages[-1]["content"] paired_messages = messages + [{"role": "assistant", "content": ""}] system = system or generating_args["default_system"] pixel_values = None + image_sizes = None prompt_ids, _ = template.encode_oneturn( tokenizer=tokenizer, messages=paired_messages, system=system, tools=tools ) # add image features for vision tower if processor is not None and image is not None and template.format_image is None: image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") - batch_feature = image_processor(image, return_tensors="pt") + batch_feature = image_processor(image, return_tensors="pt") if 
model_args.visual_inputs_type == "vision_tower" else image_processor(Image.fromarray(image), return_tensors="pt") pixel_values = batch_feature.to(model.device)["pixel_values"] # shape (B, C, H, W) if hasattr(processor, "image_seq_length"): # paligemma models image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids + if model_args.visual_inputs_type == "phi3v_like": + image_sizes = batch_feature["image_sizes"] + index_image = prompt_ids.index(tokenizer.vocab["<|image|>"]) + prompt_ids = prompt_ids[:index_image] + [-1]*batch_feature["num_img_tokens"].item() + prompt_ids[index_image+1:] prompt_length = len(prompt_ids) inputs = torch.tensor([prompt_ids], device=model.device) @@ -179,7 +184,7 @@ def _process_args( generation_config=GenerationConfig(**generating_args), logits_processor=get_logits_processor(), ) - if image is not None and model_args.visual_inputs_type == "vision_message_embed": + if image is not None and model_args.visual_inputs_type == "glm4v_like": transform = torchvision.transforms.Compose( [ torchvision.transforms.Resize( @@ -190,12 +195,15 @@ def _process_args( ] ) gen_kwargs["images"] = transform(Image.fromarray(image)).unsqueeze(0).to(model.device).to(model_args.compute_dtype) - elif model_args.visual_inputs_type == "vision_message_embed": + elif model_args.visual_inputs_type == "glm4v_like": gen_kwargs["images"] = None if pixel_values is not None: gen_kwargs["pixel_values"] = pixel_values + if image_sizes is not None and model_args.visual_inputs_type == "phi3v_like": + gen_kwargs["image_sizes"] = image_sizes + return gen_kwargs, prompt_length @staticmethod diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index ecf7ce5eb7..dbfa19b0f1 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -97,7 +97,7 @@ def convert_alpaca( outputs["response"].append(response) outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - if model_args.visual_inputs_type == "vision_token": + if model_args.visual_inputs_type == "qwenvl_like": outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) else: outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) @@ -188,7 +188,7 @@ def convert_sharegpt( outputs["response"].append(response) outputs["system"].append(system) outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - if model_args.visual_inputs_type == "vision_token": + if model_args.visual_inputs_type == "qwenvl_like": outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) else: outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) @@ -227,7 +227,7 @@ def align_dataset( ], "system": {"dtype": "string", "_type": "Value"}, "tools": {"dtype": "string", "_type": "Value"}, - "images": [{"dtype": "string", "_type": "Value"} if model_args.visual_inputs_type == "vision_token" else {"_type": "Image"}], + "images": [{"dtype": "string", "_type": "Value"} if model_args.visual_inputs_type == "qwenvl_like" else {"_type": "Image"}], } ) kwargs = {} diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index baedce8eaa..e3d1b30e31 100644 --- a/src/llamafactory/data/loader.py +++ 
b/src/llamafactory/data/loader.py @@ -190,7 +190,7 @@ def get_dataset( dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) - if model_args.visual_inputs_type == "vision_message_embed": + if model_args.visual_inputs_type == "glm4v_like": dataset = dataset.rename_column("image_inputs","images") if data_args.tokenized_path is not None: diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py index 8865ba59d3..e8888fab46 100644 --- a/src/llamafactory/data/processors/processor_utils.py +++ b/src/llamafactory/data/processors/processor_utils.py @@ -70,7 +70,7 @@ def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255)) return image_processor(image, return_tensors="pt")["pixel_values"][0] # shape (C, H, W) - elif vision_type == "vision_message_embed": + elif vision_type == "glm4v_like": transform = transforms.Compose( [ transforms.Resize( diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index c2ee3dfeaa..4c3eef6410 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -88,18 +88,18 @@ def preprocess_supervised_dataset( model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"] = [] - elif model_args.visual_inputs_type == "vision_message_embed": + elif model_args.visual_inputs_type == "glm4v_like": model_inputs["image_inputs"] = [] for i in range(len(examples["prompt"])): if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue - if model_args.visual_inputs_type == "vision_message_embed": + if model_args.visual_inputs_type == "glm4v_like": assert len(examples["images"][i]) <= 1,"GLM4v only support 1 image train yet." - model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "vision_message_embed")) + model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "glm4v_like")) examples["prompt"][i][-1]["content"] = template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] - elif model_args.visual_inputs_type == "vision_token": + elif model_args.visual_inputs_type == "qwenvl_like": assert len(examples["images"][i]) <= 1,"Qwenvl only support 1 image train yet." 
examples["prompt"][i][-1]["content"] = template.format_image.apply(content=os.path.join(data_args.dataset_dir,examples["images"][i][-1]))[0] + examples["prompt"][i][-1]["content"] diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index bc2184200f..90551bf20e 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -806,6 +806,18 @@ def get_template_and_fix_tokenizer( ) +_register_template( + name="phi_v", + format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), + format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), + stop_words=["<|end|>"], + image_token = "<|image|>", + replace_eos=True, +) + + _register_template( name="qwen", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index d4e4d62318..c988f82007 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -555,7 +555,7 @@ def register_model_group( }, template="glm4_v", vision=True, - vision_type="vision_message_embed", + vision_type="glm4v_like", ) @@ -879,6 +879,7 @@ def register_model_group( }, }, vision=True, + vision_type="vision_tower", ) @@ -927,6 +928,18 @@ def register_model_group( ) +register_model_group( + models={ + "Phi-3-vision-128k-instruct": { + DownloadSource.DEFAULT: "microsoft/Phi-3-vision-128k-instruct", + } + }, + template="phi", + vision=True, + vision_type="phi3v_like", +) + + register_model_group( models={ "Qwen-1.8B": { @@ -1245,7 +1258,7 @@ def register_model_group( }, template="qwenvl", vision=True, - vision_type="vision_token", + vision_type="qwenvl_like", ) diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 2bdb778bdb..9bbb0e4c11 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -33,7 +33,7 @@ logger = get_logger(__name__) -VISION_FREEZE_MAP = {"none":"","vision_tower":"vision_tower","vision_message_embed":"vision","vision_token":"visual"} +VISION_FREEZE_MAP = {"none":"","vision_tower":"vision_tower","glm4v_like":"vision","qwenvl_like":"visual","phi3v_like":"vision_embed_tokens"} def _setup_full_tuning( model: "PreTrainedModel", diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index b734de2adf..83bffb2c27 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -92,7 +92,7 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": patch_tokenizer(tokenizer) - if model_args.visual_inputs and model_args.visual_inputs_type == "vision_tower": + if model_args.visual_inputs and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: try: processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs) setattr(processor, "tokenizer", tokenizer) diff --git a/src/llamafactory/model/patcher.py b/src/llamafactory/model/patcher.py index b241f75f47..4e4876363f 100644 --- a/src/llamafactory/model/patcher.py +++ b/src/llamafactory/model/patcher.py @@ -130,7 +130,7 @@ def patch_model( if model_args.visual_inputs and model_args.visual_inputs_type == "vison_tower": # If model DO NOT have visual token(e.g. Qwen-VL) and model have visual_inputs then choose this. 
autocast_projector_dtype(model, model_args) - elif model_args.visual_inputs and model_args.visual_inputs_type == "vision_message_embed": + elif model_args.visual_inputs and model_args.visual_inputs_type == "glm4v_like": # Patch glm4v model.transformer.vision.to(model_args.compute_dtype) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 064c208416..b5e1f08810 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -66,7 +66,7 @@ def run_sft( training_args.remove_unused_columns = False if model_args.visual_inputs else training_args.remove_unused_columns # Initialize our Trainer - if model_args.visual_inputs_type == "vision_message_embed": + if model_args.visual_inputs_type == "glm4v_like": trainer = GLM4VSeq2SeqTrainer( model=model, args=training_args, diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 75ffa95567..d2a3c5dfb1 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -49,7 +49,7 @@ def create_top() -> Dict[str, "Component"]: rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=2) booster = gr.Radio(choices=["auto", "flashattn2", "unsloth"], value="auto", scale=2) visual_inputs = gr.Checkbox(scale=1) - visual_inputs_type = gr.Dropdown(choices=["none", "vision_tower", "vision_token", "vision_message_embed"], value="none", scale=3) + visual_inputs_type = gr.Dropdown(choices=["none", "vision_tower", "qwenvl_like", "glm4v_like", "phi3v_like"], value="none", scale=4) model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs, visual_inputs_type], queue=False).then( list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False From 06823f4608d63c973a7187886c0c21296c837633 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Mon, 1 Jul 2024 00:29:25 +0800 Subject: [PATCH 10/30] Add rm,pt,ppo,kto,dpo support for glm4v(Not tested). 
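
The glm4v path reuses the existing rm/pt/ppo/kto/dpo/sft trainers through a small class factory instead of adding one hand-written subclass per trainer. What follows is a minimal, self-contained sketch of that pattern, not the project code as-is: NUM_IMAGE_TOKENS and DummyTrainer are illustrative stand-ins, and the real factory (added in trainer_utils.py) splices the padding at the <|begin_of_image|> position and also pads input_ids and attention_mask rather than simply appending to labels.

from typing import Any, Dict, Type

import torch

NUM_IMAGE_TOKENS = 1600  # assumed constant: GLM-4V fills this many image positions on its own


def factory_glm4v_trainer(base_trainer: Type) -> Type:
    """Return a subclass of `base_trainer` whose compute_loss pads labels for the image tokens."""

    class GLM4VTrainer(base_trainer):
        def compute_loss(self, model, inputs: Dict[str, Any], return_outputs: bool = False):
            labels = inputs["labels"]
            ignore = -100 * torch.ones(
                labels.size(0), NUM_IMAGE_TOKENS, dtype=labels.dtype, device=labels.device
            )
            # padded image positions never contribute to the loss; the wrapped trainer does the rest
            inputs["labels"] = torch.cat((labels, ignore), dim=-1)
            return super().compute_loss(model, inputs, return_outputs)

    return GLM4VTrainer


class DummyTrainer:  # stand-in for CustomDPOTrainer, PairwiseTrainer, CustomSeq2SeqTrainer, ...
    def compute_loss(self, model, inputs, return_outputs=False):
        return inputs["labels"].float().mean()


if __name__ == "__main__":
    trainer = factory_glm4v_trainer(DummyTrainer)()
    print(trainer.compute_loss(None, {"labels": torch.zeros(2, 8, dtype=torch.long)}))

Each workflow then selects the wrapped class only when visual_inputs_type is "glm4v_like", e.g. factory_glm4v_trainer(CustomDPOTrainer), and keeps the plain trainer otherwise.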
--- src/llamafactory/train/dpo/workflow.py | 5 ++-- src/llamafactory/train/kto/workflow.py | 5 ++-- src/llamafactory/train/ppo/workflow.py | 5 ++-- src/llamafactory/train/pt/workflow.py | 5 ++-- src/llamafactory/train/rm/workflow.py | 5 ++-- src/llamafactory/train/sft/trainer.py | 28 ------------------- src/llamafactory/train/sft/workflow.py | 37 +++++++++----------------- 7 files changed, 28 insertions(+), 62 deletions(-) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 431b52856b..fc313f7a58 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -22,7 +22,7 @@ from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, create_ref_model +from ..trainer_utils import create_modelcard_and_push, create_ref_model, factory_glm4v_trainer from .trainer import CustomDPOTrainer @@ -63,7 +63,8 @@ def run_dpo( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - trainer = CustomDPOTrainer( + Trainer = factory_glm4v_trainer(CustomDPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomDPOTrainer + trainer = Trainer( model=model, ref_model=ref_model, args=training_args, diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index 8182a1844e..89a789dbd4 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -22,7 +22,7 @@ from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, create_ref_model +from ..trainer_utils import create_modelcard_and_push, create_ref_model, factory_glm4v_trainer from .trainer import CustomKTOTrainer @@ -60,7 +60,8 @@ def run_kto( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - trainer = CustomKTOTrainer( + Trainer = factory_glm4v_trainer(CustomKTOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomKTOTrainer + trainer = Trainer( model=model, ref_model=ref_model, args=training_args, diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 651296f36b..9dd81d5a24 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -23,7 +23,7 @@ from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..callbacks import FixValueHeadModelCallback, fix_valuehead_checkpoint -from ..trainer_utils import create_ref_model, create_reward_model +from ..trainer_utils import create_ref_model, create_reward_model, factory_glm4v_trainer from .trainer import CustomPPOTrainer @@ -54,7 +54,8 @@ def run_ppo( reward_model = create_reward_model(model, model_args, finetuning_args) # Initialize our Trainer - ppo_trainer = CustomPPOTrainer( + Trainer = factory_glm4v_trainer(CustomPPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomPPOTrainer + ppo_trainer = Trainer( model_args=model_args, training_args=training_args, finetuning_args=finetuning_args, diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index b84a0e7da8..d9d3ad1df8 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -23,7 +23,7 @@ from ...data import get_dataset, 
split_dataset from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push +from ..trainer_utils import create_modelcard_and_push, factory_glm4v_trainer from .trainer import CustomTrainer @@ -47,7 +47,8 @@ def run_pt( data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # Initialize our Trainer - trainer = CustomTrainer( + Trainer = factory_glm4v_trainer(CustomTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomTrainer + trainer = Trainer( model=model, args=training_args, finetuning_args=finetuning_args, diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index e0b32b7782..1814b7b8f5 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -43,7 +43,7 @@ from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..callbacks import fix_valuehead_checkpoint -from ..trainer_utils import create_modelcard_and_push +from ..trainer_utils import create_modelcard_and_push, factory_glm4v_trainer from .metric import compute_accuracy from .trainer import PairwiseTrainer @@ -71,7 +71,8 @@ def run_rm( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - trainer = PairwiseTrainer( + Trainer = factory_glm4v_trainer(PairwiseTrainer) if model_args.visual_inputs_type == "glm4v_like" else PairwiseTrainer + trainer = Trainer( model=model, args=training_args, finetuning_args=finetuning_args, diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py index 3868ca230d..06bd2b6bba 100644 --- a/src/llamafactory/train/sft/trainer.py +++ b/src/llamafactory/train/sft/trainer.py @@ -153,31 +153,3 @@ def save_predictions(self, dataset: "Dataset", predict_results: "PredictionOutpu for text, label, pred in zip(decoded_inputs, decoded_labels, decoded_preds): res.append(json.dumps({"prompt": text, "label": label, "predict": pred}, ensure_ascii=False)) writer.write("\n".join(res)) - -class GLM4VSeq2SeqTrainer(CustomSeq2SeqTrainer): - def compute_loss(self, model, inputs, return_outputs=False): - # Padding for labels and attention masks cuz modeling_glm4 will auto filling 1600 image tokens. - boi_ids = self.tokenizer.all_special_ids[self.tokenizer.all_special_tokens.index("<|begin_of_image|>")] - padded_ids = None - padded_labels = None - padded_attention_masks = None - if any(boi_ids == inputs["input_ids"].flatten()): - for input_id, label, attention_mask in zip(inputs["input_ids"], inputs["labels"], inputs["attention_mask"]): - if any(boi_ids == input_id.flatten()): - boi_index = input_id.tolist().index(boi_ids) - # GLM will auto filling this. 
- input_id_padded = input_id.unsqueeze(0) - label_padded = torch.cat((label[:boi_index],-100*torch.ones(1600,device=label.device,dtype=label.dtype),label[boi_index+1:])).unsqueeze(0) - attention_mask_padded = torch.cat((attention_mask[:boi_index],torch.ones(1600,device=attention_mask.device,dtype=attention_mask.dtype),attention_mask[boi_index+1:])).unsqueeze(0) - else: - input_id_padded = torch.cat((input_id,torch.ones(1600,device=input_id.device,dtype=input_id.dtype))).unsqueeze(0) - label_padded = torch.cat((label,-100*torch.ones(1600,device=label.device,dtype=label.dtype))).unsqueeze(0) - attention_mask_padded = torch.cat((attention_mask,torch.zeros(1600,device=attention_mask.device,dtype=attention_mask.dtype))).unsqueeze(0) - padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids,input_id_padded)) - padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels,label_padded)) - padded_attention_masks = attention_mask_padded if padded_attention_masks is None else torch.cat((padded_attention_masks,attention_mask_padded)) - inputs["input_ids"] = padded_ids.contiguous() - inputs["labels"] = padded_labels.contiguous() - inputs["attention_mask"] = padded_attention_masks.contiguous() - - return super().compute_loss(model, inputs, return_outputs) diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index b5e1f08810..bbf407c23c 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -24,9 +24,9 @@ from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push +from ..trainer_utils import create_modelcard_and_push, factory_glm4v_trainer from .metric import ComputeMetrics -from .trainer import CustomSeq2SeqTrainer, GLM4VSeq2SeqTrainer +from .trainer import CustomSeq2SeqTrainer if TYPE_CHECKING: @@ -66,28 +66,17 @@ def run_sft( training_args.remove_unused_columns = False if model_args.visual_inputs else training_args.remove_unused_columns # Initialize our Trainer - if model_args.visual_inputs_type == "glm4v_like": - trainer = GLM4VSeq2SeqTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - data_collator=data_collator, - callbacks=callbacks, - compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, - **tokenizer_module, - **split_dataset(dataset, data_args, training_args), - ) - else: - trainer = CustomSeq2SeqTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - data_collator=data_collator, - callbacks=callbacks, - compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, - **tokenizer_module, - **split_dataset(dataset, data_args, training_args), - ) + Trainer = factory_glm4v_trainer(CustomSeq2SeqTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomSeq2SeqTrainer + trainer = Trainer( + model=model, + args=training_args, + finetuning_args=finetuning_args, + data_collator=data_collator, + callbacks=callbacks, + compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + **tokenizer_module, + **split_dataset(dataset, data_args, training_args), + ) # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() From 4e4f9590d23f2c96a5480d9d584f4b796f5773c4 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Mon, 1 Jul 2024 00:56:24 +0800 Subject: [PATCH 11/30] 
little fix --- src/llamafactory/train/trainer_utils.py | 31 +++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 4de4cc2323..6a02d998a2 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -425,3 +425,34 @@ def get_batch_logps( labels[labels == label_pad_token_id] = 0 # dummy token per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) return (per_token_logps * loss_mask).sum(-1), loss_mask.sum(-1) + + +def factory_glm4v_trainer(BastTrainer): + class GLM4VTrainer(BastTrainer): + def compute_loss(self, model, inputs, return_outputs=False): + # Padding for labels and attention masks cuz modeling_glm4 will auto filling 1600 image tokens. + boi_ids = self.tokenizer.all_special_ids[self.tokenizer.all_special_tokens.index("<|begin_of_image|>")] + padded_ids = None + padded_labels = None + padded_attention_masks = None + if any(boi_ids == inputs["input_ids"].flatten()): + for input_id, label, attention_mask in zip(inputs["input_ids"], inputs["labels"], inputs["attention_mask"]): + if any(boi_ids == input_id.flatten()): + boi_index = input_id.tolist().index(boi_ids) + # GLM will auto filling this. + input_id_padded = input_id.unsqueeze(0) + label_padded = torch.cat((label[:boi_index],-100*torch.ones(1600,device=label.device,dtype=label.dtype),label[boi_index+1:])).unsqueeze(0) + attention_mask_padded = torch.cat((attention_mask[:boi_index],torch.ones(1600,device=attention_mask.device,dtype=attention_mask.dtype),attention_mask[boi_index+1:])).unsqueeze(0) + else: + input_id_padded = torch.cat((input_id,torch.ones(1600,device=input_id.device,dtype=input_id.dtype))).unsqueeze(0) + label_padded = torch.cat((label,-100*torch.ones(1600,device=label.device,dtype=label.dtype))).unsqueeze(0) + attention_mask_padded = torch.cat((attention_mask,torch.zeros(1600,device=attention_mask.device,dtype=attention_mask.dtype))).unsqueeze(0) + padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids,input_id_padded)) + padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels,label_padded)) + padded_attention_masks = attention_mask_padded if padded_attention_masks is None else torch.cat((padded_attention_masks,attention_mask_padded)) + inputs["input_ids"] = padded_ids.contiguous() + inputs["labels"] = padded_labels.contiguous() + inputs["attention_mask"] = padded_attention_masks.contiguous() + + return super().compute_loss(model, inputs, return_outputs) + return GLM4VTrainer \ No newline at end of file From 4f564a1d003775a4b52e1bf8737d272916cfb534 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Mon, 1 Jul 2024 01:15:08 +0800 Subject: [PATCH 12/30] Pass ruff --- src/llamafactory/chat/hf_engine.py | 2 +- src/llamafactory/data/processors/processor_utils.py | 1 + src/llamafactory/train/trainer_utils.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 558cd5ef1d..306d0fe7ed 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -121,7 +121,7 @@ def _process_args( prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids if model_args.visual_inputs_type == "phi3v_like": image_sizes = batch_feature["image_sizes"] - index_image = prompt_ids.index(tokenizer.vocab["<|image|>"]) + index_image = 
prompt_ids.index(tokenizer.vocab["<|image|>"]) prompt_ids = prompt_ids[:index_image] + [-1]*batch_feature["num_img_tokens"].item() + prompt_ids[index_image+1:] prompt_length = len(prompt_ids) diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py index e8888fab46..7ceb89ac6d 100644 --- a/src/llamafactory/data/processors/processor_utils.py +++ b/src/llamafactory/data/processors/processor_utils.py @@ -14,6 +14,7 @@ import bisect from typing import TYPE_CHECKING, List, Sequence + from torchvision import transforms from ...extras.packages import is_pillow_available diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 6a02d998a2..c308160728 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -455,4 +455,4 @@ def compute_loss(self, model, inputs, return_outputs=False): inputs["attention_mask"] = padded_attention_masks.contiguous() return super().compute_loss(model, inputs, return_outputs) - return GLM4VTrainer \ No newline at end of file + return GLM4VTrainer From c37465e1a00e46f10cbf4e2e847d8473c496d817 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Mon, 1 Jul 2024 20:52:27 +0800 Subject: [PATCH 13/30] Style check & fix requirements.txt --- requirements.txt | 1 - src/llamafactory/chat/hf_engine.py | 50 ++++++++++++++++--- src/llamafactory/data/aligner.py | 16 ++++-- src/llamafactory/data/loader.py | 2 +- .../data/processors/processor_utils.py | 8 +-- .../data/processors/supervised.py | 13 +++-- src/llamafactory/data/template.py | 2 +- src/llamafactory/extras/constants.py | 1 + src/llamafactory/model/adapter.py | 15 ++++-- src/llamafactory/model/loader.py | 2 +- src/llamafactory/train/dpo/workflow.py | 4 +- src/llamafactory/train/kto/workflow.py | 4 +- src/llamafactory/train/ppo/workflow.py | 4 +- src/llamafactory/train/rm/workflow.py | 4 +- src/llamafactory/train/sft/workflow.py | 6 ++- src/llamafactory/train/trainer_utils.py | 46 +++++++++++++---- src/llamafactory/webui/common.py | 2 +- src/llamafactory/webui/components/top.py | 10 ++-- 18 files changed, 145 insertions(+), 45 deletions(-) diff --git a/requirements.txt b/requirements.txt index f31404b0b6..7380add46e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ peft>=0.11.1 trl>=0.8.6 gradio>=4.0.0 pandas>=2.0.0 -torchvision scipy einops sentencepiece diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 306d0fe7ed..921042ff57 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -92,7 +92,7 @@ def _process_args( and image is not None and not hasattr(processor, "image_seq_length") and template.image_token not in messages[0]["content"] - and model_args.visual_inputs_type in ["vision_tower","phi3v_like"] + and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"] ): # llava-like models messages[0]["content"] = template.image_token + messages[0]["content"] @@ -100,7 +100,9 @@ def _process_args( # Add image pathlike token as vision input image_path = pathlib.Path(DEFAULT_CACHE_DIR) / "temp.png" Image.fromarray(image).convert("RGB").save(image_path) - messages[-1]["content"] = template.format_image.apply(content=os.fspath(image_path))[0] + messages[-1]["content"] + messages[-1]["content"] = ( + template.format_image.apply(content=os.fspath(image_path))[0] + messages[-1]["content"] + ) elif image is not None and model_args.visual_inputs_type == "glm4v_like": messages[-1]["content"] = 
template.format_image.apply()[0] + messages[-1]["content"] @@ -114,7 +116,11 @@ def _process_args( # add image features for vision tower if processor is not None and image is not None and template.format_image is None: image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") - batch_feature = image_processor(image, return_tensors="pt") if model_args.visual_inputs_type == "vision_tower" else image_processor(Image.fromarray(image), return_tensors="pt") + batch_feature = ( + image_processor(image, return_tensors="pt") + if model_args.visual_inputs_type == "vision_tower" + else image_processor(Image.fromarray(image), return_tensors="pt") + ) pixel_values = batch_feature.to(model.device)["pixel_values"] # shape (B, C, H, W) if hasattr(processor, "image_seq_length"): # paligemma models image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) @@ -122,7 +128,11 @@ def _process_args( if model_args.visual_inputs_type == "phi3v_like": image_sizes = batch_feature["image_sizes"] index_image = prompt_ids.index(tokenizer.vocab["<|image|>"]) - prompt_ids = prompt_ids[:index_image] + [-1]*batch_feature["num_img_tokens"].item() + prompt_ids[index_image+1:] + prompt_ids = ( + prompt_ids[:index_image] + + [-1] * batch_feature["num_img_tokens"].item() + + prompt_ids[index_image + 1 :] + ) prompt_length = len(prompt_ids) inputs = torch.tensor([prompt_ids], device=model.device) @@ -191,10 +201,14 @@ def _process_args( (1120, 1120), interpolation=torchvision.transforms.InterpolationMode.BICUBIC ), torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + torchvision.transforms.Normalize( + (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711) + ), ] ) - gen_kwargs["images"] = transform(Image.fromarray(image)).unsqueeze(0).to(model.device).to(model_args.compute_dtype) + gen_kwargs["images"] = ( + transform(Image.fromarray(image)).unsqueeze(0).to(model.device).to(model_args.compute_dtype) + ) elif model_args.visual_inputs_type == "glm4v_like": gen_kwargs["images"] = None @@ -222,7 +236,17 @@ def _chat( input_kwargs: Optional[Dict[str, Any]] = {}, ) -> List["Response"]: gen_kwargs, prompt_length = HuggingfaceEngine._process_args( - model, tokenizer, processor, template, generating_args, model_args, messages, system, tools, image, input_kwargs + model, + tokenizer, + processor, + template, + generating_args, + model_args, + messages, + system, + tools, + image, + input_kwargs, ) generate_output = model.generate(**gen_kwargs) response_ids = generate_output[:, prompt_length:] @@ -258,7 +282,17 @@ def _stream_chat( input_kwargs: Optional[Dict[str, Any]] = {}, ) -> Callable[[], str]: gen_kwargs, _ = HuggingfaceEngine._process_args( - model, tokenizer, processor, template, generating_args, model_args, messages, system, tools, image, input_kwargs + model, + tokenizer, + processor, + template, + generating_args, + model_args, + messages, + system, + tools, + image, + input_kwargs, ) streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) gen_kwargs["streamer"] = streamer diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index dbfa19b0f1..4b2610e538 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -50,7 +50,10 @@ def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: " def convert_alpaca( - examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", 
data_args: "DataArguments", model_args: "ModelArguments", + examples: Dict[str, List[Any]], + dataset_attr: "DatasetAttr", + data_args: "DataArguments", + model_args: "ModelArguments", ) -> Dict[str, List[Any]]: r""" Converts alpaca format dataset to the standard format. @@ -106,7 +109,10 @@ def convert_alpaca( def convert_sharegpt( - examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments", model_args: "ModelArguments", + examples: Dict[str, List[Any]], + dataset_attr: "DatasetAttr", + data_args: "DataArguments", + model_args: "ModelArguments", ) -> Dict[str, List[Any]]: r""" Converts sharegpt format dataset to the standard format. @@ -227,7 +233,11 @@ def align_dataset( ], "system": {"dtype": "string", "_type": "Value"}, "tools": {"dtype": "string", "_type": "Value"}, - "images": [{"dtype": "string", "_type": "Value"} if model_args.visual_inputs_type == "qwenvl_like" else {"_type": "Image"}], + "images": [ + {"dtype": "string", "_type": "Value"} + if model_args.visual_inputs_type == "qwenvl_like" + else {"_type": "Image"} + ], } ) kwargs = {} diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index e3d1b30e31..944d603ad9 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -191,7 +191,7 @@ def get_dataset( dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) if model_args.visual_inputs_type == "glm4v_like": - dataset = dataset.rename_column("image_inputs","images") + dataset = dataset.rename_column("image_inputs", "images") if data_args.tokenized_path is not None: if training_args.should_save: diff --git a/src/llamafactory/data/processors/processor_utils.py b/src/llamafactory/data/processors/processor_utils.py index 3bb81a4a4c..79a9e5d2b0 100644 --- a/src/llamafactory/data/processors/processor_utils.py +++ b/src/llamafactory/data/processors/processor_utils.py @@ -63,7 +63,9 @@ def greedy_knapsack(numbers: List[int], capacity: int) -> List[List[int]]: return knapsacks -def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin", vision_type: str = "vision_tower") -> "NDArray": +def get_pixel_values( + images: Sequence["ImageObject"], processor: "ProcessorMixin", vision_type: str = "vision_tower" +) -> "NDArray": r""" Processes visual inputs. (currently only supports a single image) """ @@ -74,9 +76,7 @@ def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin elif vision_type == "glm4v_like": transform = transforms.Compose( [ - transforms.Resize( - (1120, 1120), interpolation=transforms.InterpolationMode.BICUBIC - ), + transforms.Resize((1120, 1120), interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), ] diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index a425a47b51..6a27b57f0b 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -103,12 +103,17 @@ def preprocess_supervised_dataset( logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue if model_args.visual_inputs_type == "glm4v_like": - assert len(examples["images"][i]) <= 1,"GLM4v only support 1 image train yet." + assert len(examples["images"][i]) <= 1, "GLM4v only support 1 image train yet." 
model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "glm4v_like")) - examples["prompt"][i][-1]["content"] = template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] + examples["prompt"][i][-1]["content"] = ( + template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] + ) elif model_args.visual_inputs_type == "qwenvl_like": - assert len(examples["images"][i]) <= 1,"Qwenvl only support 1 image train yet." - examples["prompt"][i][-1]["content"] = template.format_image.apply(content=os.path.join(data_args.dataset_dir,examples["images"][i][-1]))[0] + examples["prompt"][i][-1]["content"] + assert len(examples["images"][i]) <= 1, "Qwenvl only support 1 image train yet." + examples["prompt"][i][-1]["content"] = ( + template.format_image.apply(content=os.path.join(data_args.dataset_dir, examples["images"][i][-1]))[0] + + examples["prompt"][i][-1]["content"] + ) input_ids, labels = _encode_supervised_example( prompt=examples["prompt"][i], diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index f86721a69c..1c1ff50fe7 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -777,7 +777,7 @@ def get_template_and_fix_tokenizer( format_separator=EmptyFormatter(slots=["\n"]), format_prefix=EmptyFormatter(slots=[{"bos_token"}]), stop_words=["<|end|>"], - image_token = "<|image|>", + image_token="<|image|>", replace_eos=True, ) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index c988f82007..9144c943b1 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -113,6 +113,7 @@ def register_model_group( VISION_MODELS.add(prefix) VISION_TYPES[prefix] = vision_type + register_model_group( models={ "Aya-23-8B-Chat": { diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 9bbb0e4c11..dc1de9fbb2 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -33,7 +33,14 @@ logger = get_logger(__name__) -VISION_FREEZE_MAP = {"none":"","vision_tower":"vision_tower","glm4v_like":"vision","qwenvl_like":"visual","phi3v_like":"vision_embed_tokens"} +VISION_FREEZE_MAP = { + "none": "", + "vision_tower": "vision_tower", + "glm4v_like": "vision", + "qwenvl_like": "visual", + "phi3v_like": "vision_embed_tokens", +} + def _setup_full_tuning( model: "PreTrainedModel", @@ -211,8 +218,10 @@ def _setup_lora_tuning( if finetuning_args.use_llama_pro: target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers) - if finetuning_args.freeze_vision and model_args.visual_inputs_type !="none": - target_modules = f"^(?!.*{VISION_FREEZE_MAP[model_args.visual_inputs_type]})."+"*(?:{}).*".format("|".join(target_modules)) + if finetuning_args.freeze_vision and model_args.visual_inputs_type != "none": + target_modules = f"^(?!.*{VISION_FREEZE_MAP[model_args.visual_inputs_type]})." 
+ "*(?:{}).*".format( + "|".join(target_modules) + ) if ( finetuning_args.use_dora diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index a14ca98f09..1ce371b4e0 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -93,7 +93,7 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": patch_tokenizer(tokenizer) - if model_args.visual_inputs and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: + if model_args.visual_inputs and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"]: try: processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs) setattr(processor, "tokenizer", tokenizer) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index fc313f7a58..024ae26c4c 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -63,7 +63,9 @@ def run_dpo( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - Trainer = factory_glm4v_trainer(CustomDPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomDPOTrainer + Trainer = ( + factory_glm4v_trainer(CustomDPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomDPOTrainer + ) trainer = Trainer( model=model, ref_model=ref_model, diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index 89a789dbd4..2a7a657fae 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -60,7 +60,9 @@ def run_kto( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - Trainer = factory_glm4v_trainer(CustomKTOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomKTOTrainer + Trainer = ( + factory_glm4v_trainer(CustomKTOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomKTOTrainer + ) trainer = Trainer( model=model, ref_model=ref_model, diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 9dd81d5a24..b493b22755 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -54,7 +54,9 @@ def run_ppo( reward_model = create_reward_model(model, model_args, finetuning_args) # Initialize our Trainer - Trainer = factory_glm4v_trainer(CustomPPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomPPOTrainer + Trainer = ( + factory_glm4v_trainer(CustomPPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomPPOTrainer + ) ppo_trainer = Trainer( model_args=model_args, training_args=training_args, diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 1814b7b8f5..2fe851eb9d 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -71,7 +71,9 @@ def run_rm( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - Trainer = factory_glm4v_trainer(PairwiseTrainer) if model_args.visual_inputs_type == "glm4v_like" else PairwiseTrainer + Trainer = ( + factory_glm4v_trainer(PairwiseTrainer) if model_args.visual_inputs_type == "glm4v_like" else PairwiseTrainer + ) trainer = Trainer( model=model, args=training_args, diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 4a0b7e65e3..94c95c19fb 100644 --- a/src/llamafactory/train/sft/workflow.py +++ 
b/src/llamafactory/train/sft/workflow.py @@ -66,7 +66,11 @@ def run_sft( training_args.remove_unused_columns = False if model_args.visual_inputs else training_args.remove_unused_columns # Initialize our Trainer - Trainer = factory_glm4v_trainer(CustomSeq2SeqTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomSeq2SeqTrainer + Trainer = ( + factory_glm4v_trainer(CustomSeq2SeqTrainer) + if model_args.visual_inputs_type == "glm4v_like" + else CustomSeq2SeqTrainer + ) trainer = Trainer( model=model, args=training_args, diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index c308160728..313645159b 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -436,23 +436,51 @@ def compute_loss(self, model, inputs, return_outputs=False): padded_labels = None padded_attention_masks = None if any(boi_ids == inputs["input_ids"].flatten()): - for input_id, label, attention_mask in zip(inputs["input_ids"], inputs["labels"], inputs["attention_mask"]): + for input_id, label, attention_mask in zip( + inputs["input_ids"], inputs["labels"], inputs["attention_mask"] + ): if any(boi_ids == input_id.flatten()): boi_index = input_id.tolist().index(boi_ids) # GLM will auto filling this. input_id_padded = input_id.unsqueeze(0) - label_padded = torch.cat((label[:boi_index],-100*torch.ones(1600,device=label.device,dtype=label.dtype),label[boi_index+1:])).unsqueeze(0) - attention_mask_padded = torch.cat((attention_mask[:boi_index],torch.ones(1600,device=attention_mask.device,dtype=attention_mask.dtype),attention_mask[boi_index+1:])).unsqueeze(0) + label_padded = torch.cat( + ( + label[:boi_index], + -100 * torch.ones(1600, device=label.device, dtype=label.dtype), + label[boi_index + 1 :], + ) + ).unsqueeze(0) + attention_mask_padded = torch.cat( + ( + attention_mask[:boi_index], + torch.ones(1600, device=attention_mask.device, dtype=attention_mask.dtype), + attention_mask[boi_index + 1 :], + ) + ).unsqueeze(0) else: - input_id_padded = torch.cat((input_id,torch.ones(1600,device=input_id.device,dtype=input_id.dtype))).unsqueeze(0) - label_padded = torch.cat((label,-100*torch.ones(1600,device=label.device,dtype=label.dtype))).unsqueeze(0) - attention_mask_padded = torch.cat((attention_mask,torch.zeros(1600,device=attention_mask.device,dtype=attention_mask.dtype))).unsqueeze(0) - padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids,input_id_padded)) - padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels,label_padded)) - padded_attention_masks = attention_mask_padded if padded_attention_masks is None else torch.cat((padded_attention_masks,attention_mask_padded)) + input_id_padded = torch.cat( + (input_id, torch.ones(1600, device=input_id.device, dtype=input_id.dtype)) + ).unsqueeze(0) + label_padded = torch.cat( + (label, -100 * torch.ones(1600, device=label.device, dtype=label.dtype)) + ).unsqueeze(0) + attention_mask_padded = torch.cat( + ( + attention_mask, + torch.zeros(1600, device=attention_mask.device, dtype=attention_mask.dtype), + ) + ).unsqueeze(0) + padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids, input_id_padded)) + padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels, label_padded)) + padded_attention_masks = ( + attention_mask_padded + if padded_attention_masks is None + else torch.cat((padded_attention_masks, attention_mask_padded)) + ) inputs["input_ids"] = padded_ids.contiguous() 
inputs["labels"] = padded_labels.contiguous() inputs["attention_mask"] = padded_attention_masks.contiguous() return super().compute_loss(model, inputs, return_outputs) + return GLM4VTrainer diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 4830a2eaa8..8ba32bb4da 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -145,7 +145,7 @@ def get_template(model_name: str) -> str: return "default" -def get_visual(model_name: str) -> dict[bool,str]: +def get_visual(model_name: str) -> dict[bool, str]: r""" Judges if the model is a vision language model. """ diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index d2a3c5dfb1..afdb075e7a 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -49,11 +49,13 @@ def create_top() -> Dict[str, "Component"]: rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=2) booster = gr.Radio(choices=["auto", "flashattn2", "unsloth"], value="auto", scale=2) visual_inputs = gr.Checkbox(scale=1) - visual_inputs_type = gr.Dropdown(choices=["none", "vision_tower", "qwenvl_like", "glm4v_like", "phi3v_like"], value="none", scale=4) + visual_inputs_type = gr.Dropdown( + choices=["none", "vision_tower", "qwenvl_like", "glm4v_like", "phi3v_like"], value="none", scale=4 + ) - model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs, visual_inputs_type], queue=False).then( - list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False - ) + model_name.change( + get_model_info, [model_name], [model_path, template, visual_inputs, visual_inputs_type], queue=False + ).then(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) model_name.input(save_config, inputs=[lang, model_name], queue=False) model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False) finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False).then( From 9e7bb3f3abee432ec7de49227c7feda507c3e473 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Tue, 2 Jul 2024 19:37:40 +0800 Subject: [PATCH 14/30] Bugfix --- src/llamafactory/chat/hf_engine.py | 31 ++++++++++++++++++++------ src/llamafactory/data/template.py | 2 +- src/llamafactory/hparams/model_args.py | 2 +- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index 921042ff57..fa6324d12f 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -18,6 +18,7 @@ import pathlib from threading import Thread from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple, Union +from uuid import uuid4 import torch import torchvision @@ -86,7 +87,8 @@ def _process_args( tools: Optional[str] = None, image: Optional["NDArray"] = None, input_kwargs: Optional[Dict[str, Any]] = {}, - ) -> Tuple[Dict[str, Any], int]: + ) -> Tuple[Dict[str, Any], int, Optional[pathlib.Path]]: + image_path = None if ( processor is not None and image is not None @@ -98,7 +100,7 @@ def _process_args( messages[0]["content"] = template.image_token + messages[0]["content"] elif image is not None and model_args.visual_inputs_type == "qwenvl_like": # Add image pathlike token as vision input - image_path = pathlib.Path(DEFAULT_CACHE_DIR) / "temp.png" + image_path = pathlib.Path(DEFAULT_CACHE_DIR) / f"{str(uuid4())}.png" 
Image.fromarray(image).convert("RGB").save(image_path) messages[-1]["content"] = ( template.format_image.apply(content=os.fspath(image_path))[0] + messages[-1]["content"] @@ -218,7 +220,18 @@ def _process_args( if image_sizes is not None and model_args.visual_inputs_type == "phi3v_like": gen_kwargs["image_sizes"] = image_sizes - return gen_kwargs, prompt_length + return gen_kwargs, prompt_length, image_path + + @staticmethod + def image_clean_wrapper(func, temporary_image): + # clean up for qwenvl. + def wrapped_function(**kwargs): + result = func(**kwargs) + if temporary_image: + os.remove(temporary_image) + return result + + return wrapped_function @staticmethod @torch.inference_mode() @@ -235,7 +248,7 @@ def _chat( image: Optional["NDArray"] = None, input_kwargs: Optional[Dict[str, Any]] = {}, ) -> List["Response"]: - gen_kwargs, prompt_length = HuggingfaceEngine._process_args( + gen_kwargs, prompt_length, temporary_image = HuggingfaceEngine._process_args( model, tokenizer, processor, @@ -248,7 +261,7 @@ def _chat( image, input_kwargs, ) - generate_output = model.generate(**gen_kwargs) + generate_output = HuggingfaceEngine.image_clean_wrapper(model.generate, temporary_image)(**gen_kwargs) response_ids = generate_output[:, prompt_length:] response = tokenizer.batch_decode(response_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) results = [] @@ -281,7 +294,7 @@ def _stream_chat( image: Optional["NDArray"] = None, input_kwargs: Optional[Dict[str, Any]] = {}, ) -> Callable[[], str]: - gen_kwargs, _ = HuggingfaceEngine._process_args( + gen_kwargs, _, temporary_image = HuggingfaceEngine._process_args( model, tokenizer, processor, @@ -296,7 +309,11 @@ def _stream_chat( ) streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) gen_kwargs["streamer"] = streamer - thread = Thread(target=model.generate, kwargs=gen_kwargs, daemon=True) + thread = Thread( + target=HuggingfaceEngine.image_clean_wrapper(model.generate, temporary_image), + kwargs=gen_kwargs, + daemon=True, + ) thread.start() def stream(): diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 1c1ff50fe7..78f21af052 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -643,7 +643,7 @@ def get_template_and_fix_tokenizer( format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}"]), format_system=StringFormatter(slots=["<|system|>\n{{content}}"]), - format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), + format_function=FunctionFormatter(slots=[], tool_format="glm4"), format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), format_tools=ToolFormatter(tool_format="glm4"), format_prefix=EmptyFormatter(slots=["[gMASK]"]), diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 9ece56d813..317cb538a2 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -122,7 +122,7 @@ class ModelArguments: metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."}, ) visual_inputs_type: str = field( - default="", + default="none", metadata={"help": "Type of visual inputs."}, ) moe_aux_loss_coef: Optional[float] = field( From 5fe28626ec8833987f2e0e613973b8bd9a5dcbc5 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Tue, 2 Jul 2024 20:00:20 +0800 Subject: [PATCH 15/30] Change implementation. 
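
Rather than generating trainer subclasses, the GLM-4V padding is now attached to an already-constructed trainer by wrapping its compute_loss method. Below is a minimal, hypothetical sketch of that idea with a corrected "wrapper" spelling; the function actually shipped in trainer_utils.py, glm4v_compute_loss_warpper, additionally splices the 1600 padded positions at <|begin_of_image|> and pads input_ids and attention_mask, not just labels.

import torch

NUM_IMAGE_TOKENS = 1600  # illustrative constant matching the padding length used in the diff


def glm4v_compute_loss_wrapper(old_compute_loss):
    """Return a drop-in replacement for a bound compute_loss that pads labels first."""

    def compute_loss(model, inputs, return_outputs=False):
        labels = inputs["labels"]
        ignore = -100 * torch.ones(
            labels.size(0), NUM_IMAGE_TOKENS, dtype=labels.dtype, device=labels.device
        )
        inputs["labels"] = torch.cat((labels, ignore), dim=-1)  # keep image slots out of the loss
        return old_compute_loss(model, inputs, return_outputs)

    return compute_loss

With this, every workflow can build its usual trainer and only patch the instance afterwards, e.g. trainer.compute_loss = glm4v_compute_loss_wrapper(trainer.compute_loss) when visual_inputs_type is "glm4v_like", which is exactly what the workflow diffs below do.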
--- src/llamafactory/train/dpo/workflow.py | 9 +- src/llamafactory/train/kto/workflow.py | 9 +- src/llamafactory/train/ppo/workflow.py | 13 ++- src/llamafactory/train/pt/workflow.py | 7 +- src/llamafactory/train/rm/workflow.py | 9 +- src/llamafactory/train/sft/workflow.py | 11 +-- src/llamafactory/train/trainer_utils.py | 113 ++++++++++++------------ 7 files changed, 82 insertions(+), 89 deletions(-) diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 024ae26c4c..8af96c9d34 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -22,7 +22,7 @@ from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, create_ref_model, factory_glm4v_trainer +from ..trainer_utils import create_modelcard_and_push, create_ref_model, glm4v_compute_loss_warpper from .trainer import CustomDPOTrainer @@ -63,10 +63,7 @@ def run_dpo( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - Trainer = ( - factory_glm4v_trainer(CustomDPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomDPOTrainer - ) - trainer = Trainer( + trainer = CustomDPOTrainer( model=model, ref_model=ref_model, args=training_args, @@ -76,6 +73,8 @@ def run_dpo( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) + if model_args.visual_inputs_type == "glm4v_like": + trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index 2a7a657fae..55a43b1369 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -22,7 +22,7 @@ from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, create_ref_model, factory_glm4v_trainer +from ..trainer_utils import create_modelcard_and_push, create_ref_model, glm4v_compute_loss_warpper from .trainer import CustomKTOTrainer @@ -60,10 +60,7 @@ def run_kto( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - Trainer = ( - factory_glm4v_trainer(CustomKTOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomKTOTrainer - ) - trainer = Trainer( + trainer = CustomKTOTrainer( model=model, ref_model=ref_model, args=training_args, @@ -73,6 +70,8 @@ def run_kto( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) + if model_args.visual_inputs_type == "glm4v_like": + trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index b493b22755..3e0dbfad65 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -22,8 +22,8 @@ from ...data import get_dataset from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..callbacks import FixValueHeadModelCallback, fix_valuehead_checkpoint -from ..trainer_utils import create_ref_model, create_reward_model, factory_glm4v_trainer +from ..callbacks import fix_valuehead_checkpoint +from ..trainer_utils import create_ref_model, create_reward_model, glm4v_compute_loss_warpper from 
.trainer import CustomPPOTrainer @@ -54,15 +54,12 @@ def run_ppo( reward_model = create_reward_model(model, model_args, finetuning_args) # Initialize our Trainer - Trainer = ( - factory_glm4v_trainer(CustomPPOTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomPPOTrainer - ) - ppo_trainer = Trainer( + ppo_trainer = CustomPPOTrainer( model_args=model_args, training_args=training_args, finetuning_args=finetuning_args, generating_args=generating_args, - callbacks=callbacks + [FixValueHeadModelCallback()], + callbacks=callbacks, model=model, reward_model=reward_model, ref_model=ref_model, @@ -70,6 +67,8 @@ def run_ppo( data_collator=data_collator, **tokenizer_module, ) + if model_args.visual_inputs_type == "glm4v_like": + ppo_trainer.compute_loss = glm4v_compute_loss_warpper(ppo_trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index d9d3ad1df8..aa5afaee12 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -23,7 +23,7 @@ from ...data import get_dataset, split_dataset from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, factory_glm4v_trainer +from ..trainer_utils import create_modelcard_and_push, glm4v_compute_loss_warpper from .trainer import CustomTrainer @@ -47,8 +47,7 @@ def run_pt( data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # Initialize our Trainer - Trainer = factory_glm4v_trainer(CustomTrainer) if model_args.visual_inputs_type == "glm4v_like" else CustomTrainer - trainer = Trainer( + trainer = CustomTrainer( model=model, args=training_args, finetuning_args=finetuning_args, @@ -57,6 +56,8 @@ def run_pt( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) + if model_args.visual_inputs_type == "glm4v_like": + trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 2fe851eb9d..6f94174e35 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -43,7 +43,7 @@ from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..callbacks import fix_valuehead_checkpoint -from ..trainer_utils import create_modelcard_and_push, factory_glm4v_trainer +from ..trainer_utils import create_modelcard_and_push, glm4v_compute_loss_warpper from .metric import compute_accuracy from .trainer import PairwiseTrainer @@ -71,10 +71,7 @@ def run_rm( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - Trainer = ( - factory_glm4v_trainer(PairwiseTrainer) if model_args.visual_inputs_type == "glm4v_like" else PairwiseTrainer - ) - trainer = Trainer( + trainer = PairwiseTrainer( model=model, args=training_args, finetuning_args=finetuning_args, @@ -84,6 +81,8 @@ def run_rm( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) + if model_args.visual_inputs_type == "glm4v_like": + trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 94c95c19fb..21a89fd79e 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -24,7 +24,7 @@ from 
...extras.misc import get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, factory_glm4v_trainer +from ..trainer_utils import create_modelcard_and_push, glm4v_compute_loss_warpper from .metric import ComputeMetrics, compute_accuracy, eval_logit_processor from .trainer import CustomSeq2SeqTrainer @@ -66,12 +66,7 @@ def run_sft( training_args.remove_unused_columns = False if model_args.visual_inputs else training_args.remove_unused_columns # Initialize our Trainer - Trainer = ( - factory_glm4v_trainer(CustomSeq2SeqTrainer) - if model_args.visual_inputs_type == "glm4v_like" - else CustomSeq2SeqTrainer - ) - trainer = Trainer( + trainer = CustomSeq2SeqTrainer( model=model, args=training_args, finetuning_args=finetuning_args, @@ -82,6 +77,8 @@ def run_sft( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) + if model_args.visual_inputs_type == "glm4v_like": + trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 313645159b..4e340da197 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -427,60 +427,59 @@ def get_batch_logps( return (per_token_logps * loss_mask).sum(-1), loss_mask.sum(-1) -def factory_glm4v_trainer(BastTrainer): - class GLM4VTrainer(BastTrainer): - def compute_loss(self, model, inputs, return_outputs=False): - # Padding for labels and attention masks cuz modeling_glm4 will auto filling 1600 image tokens. - boi_ids = self.tokenizer.all_special_ids[self.tokenizer.all_special_tokens.index("<|begin_of_image|>")] - padded_ids = None - padded_labels = None - padded_attention_masks = None - if any(boi_ids == inputs["input_ids"].flatten()): - for input_id, label, attention_mask in zip( - inputs["input_ids"], inputs["labels"], inputs["attention_mask"] - ): - if any(boi_ids == input_id.flatten()): - boi_index = input_id.tolist().index(boi_ids) - # GLM will auto filling this. 
- input_id_padded = input_id.unsqueeze(0) - label_padded = torch.cat( - ( - label[:boi_index], - -100 * torch.ones(1600, device=label.device, dtype=label.dtype), - label[boi_index + 1 :], - ) - ).unsqueeze(0) - attention_mask_padded = torch.cat( - ( - attention_mask[:boi_index], - torch.ones(1600, device=attention_mask.device, dtype=attention_mask.dtype), - attention_mask[boi_index + 1 :], - ) - ).unsqueeze(0) - else: - input_id_padded = torch.cat( - (input_id, torch.ones(1600, device=input_id.device, dtype=input_id.dtype)) - ).unsqueeze(0) - label_padded = torch.cat( - (label, -100 * torch.ones(1600, device=label.device, dtype=label.dtype)) - ).unsqueeze(0) - attention_mask_padded = torch.cat( - ( - attention_mask, - torch.zeros(1600, device=attention_mask.device, dtype=attention_mask.dtype), - ) - ).unsqueeze(0) - padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids, input_id_padded)) - padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels, label_padded)) - padded_attention_masks = ( - attention_mask_padded - if padded_attention_masks is None - else torch.cat((padded_attention_masks, attention_mask_padded)) - ) - inputs["input_ids"] = padded_ids.contiguous() - inputs["labels"] = padded_labels.contiguous() - inputs["attention_mask"] = padded_attention_masks.contiguous() - - return super().compute_loss(model, inputs, return_outputs) - - return GLM4VTrainer +def glm4v_compute_loss_warpper(old_compute_loss): + def compute_loss(self, model, inputs, return_outputs=False): + # Padding for labels and attention masks cuz modeling_glm4 will auto filling 1600 image tokens. + boi_ids = self.tokenizer.all_special_ids[self.tokenizer.all_special_tokens.index("<|begin_of_image|>")] + padded_ids = None + padded_labels = None + padded_attention_masks = None + if any(boi_ids == inputs["input_ids"].flatten()): + for input_id, label, attention_mask in zip( + inputs["input_ids"], inputs["labels"], inputs["attention_mask"] + ): + if any(boi_ids == input_id.flatten()): + boi_index = input_id.tolist().index(boi_ids) + # GLM will auto filling this. 
+ input_id_padded = input_id.unsqueeze(0) + label_padded = torch.cat( + ( + label[:boi_index], + -100 * torch.ones(1600, device=label.device, dtype=label.dtype), + label[boi_index + 1 :], + ) + ).unsqueeze(0) + attention_mask_padded = torch.cat( + ( + attention_mask[:boi_index], + torch.ones(1600, device=attention_mask.device, dtype=attention_mask.dtype), + attention_mask[boi_index + 1 :], + ) + ).unsqueeze(0) + else: + input_id_padded = torch.cat( + (input_id, torch.ones(1600, device=input_id.device, dtype=input_id.dtype)) + ).unsqueeze(0) + label_padded = torch.cat( + (label, -100 * torch.ones(1600, device=label.device, dtype=label.dtype)) + ).unsqueeze(0) + attention_mask_padded = torch.cat( + ( + attention_mask, + torch.zeros(1600, device=attention_mask.device, dtype=attention_mask.dtype), + ) + ).unsqueeze(0) + padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids, input_id_padded)) + padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels, label_padded)) + padded_attention_masks = ( + attention_mask_padded + if padded_attention_masks is None + else torch.cat((padded_attention_masks, attention_mask_padded)) + ) + inputs["input_ids"] = padded_ids.contiguous() + inputs["labels"] = padded_labels.contiguous() + inputs["attention_mask"] = padded_attention_masks.contiguous() + + return old_compute_loss(model, inputs, return_outputs) + + return compute_loss From b8cf95a2cdbe41ae927c60b8b66c07e4a85c5a42 Mon Sep 17 00:00:00 2001 From: Radeon_graphics Date: Tue, 2 Jul 2024 21:32:18 +0800 Subject: [PATCH 16/30] Update README, fix template constant, and add download source for phi3v. --- README.md | 3 +++ README_zh.md | 3 +++ src/llamafactory/extras/constants.py | 3 ++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d3feae51a..831c4a9924 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [GLM-4V](https://huggingface.co/THUDM) | 13B(include vision) | glm4_v | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | @@ -171,7 +172,9 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [PaliGemma](https://huggingface.co/google) | 3B | gemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Phi-3-vision](https://huggingface.co/microsoft) | 4B | phi_v | | [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | +| [Qwen-VL](https://huggingface.co/Qwen) | 9B | qwenvl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | | [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | diff --git a/README_zh.md b/README_zh.md index cb5a42e40d..b48ec0a98a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -161,6 +161,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | | [Gemma/Gemma 
2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | +| [GLM-4V](https://huggingface.co/THUDM) | 13B(include vision) | glm4_v | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | @@ -171,7 +172,9 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [PaliGemma](https://huggingface.co/google) | 3B | gemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | +| [Phi-3-vision](https://huggingface.co/microsoft) | 4B | phi_v | | [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | +| [Qwen-VL](https://huggingface.co/Qwen) | 9B | qwenvl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | | [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 97913222db..8dd148c4e0 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -933,9 +933,10 @@ def register_model_group( models={ "Phi-3-vision-128k-instruct": { DownloadSource.DEFAULT: "microsoft/Phi-3-vision-128k-instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-128k-instruct", } }, - template="phi", + template="phi_v", vision=True, vision_type="phi3v_like", ) From e6099f58979eb64fe9216c329bb7a779e271350f Mon Sep 17 00:00:00 2001 From: marko1616 Date: Tue, 2 Jul 2024 22:18:15 +0800 Subject: [PATCH 17/30] Name style fix. 
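This renames qwenvl/qwenvl_like to qwen_vl/qwen_vl_like across the chat engine, data pipeline, template, constants, adapter map, and web UI, and fixes the GLM-4V size entry in README_zh. The hf_engine.py hunk below also touches the qwen_vl_like inference path, which writes the uploaded image to a temporary PNG under DEFAULT_CACHE_DIR; that file is presumably removed again by image_clean_wrapper, whose body is truncated in the visible context. A minimal sketch of such a cleanup wrapper, for illustration only (the exact implementation is not part of this diff):

    import os

    def image_clean_wrapper(func, temporary_image):
        # call the wrapped generation function, then delete the temporary image file
        def wrapped_function(**kwargs):
            try:
                return func(**kwargs)
            finally:
                if temporary_image and os.path.exists(temporary_image):
                    os.remove(temporary_image)

        return wrapped_function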
--- README.md | 2 +- README_zh.md | 4 ++-- src/llamafactory/chat/hf_engine.py | 4 ++-- src/llamafactory/data/aligner.py | 6 +++--- src/llamafactory/data/processors/supervised.py | 4 ++-- src/llamafactory/data/template.py | 2 +- src/llamafactory/extras/constants.py | 4 ++-- src/llamafactory/model/adapter.py | 2 +- src/llamafactory/webui/components/top.py | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 831c4a9924..207879b657 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | | [Phi-3-vision](https://huggingface.co/microsoft) | 4B | phi_v | | [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen-VL](https://huggingface.co/Qwen) | 9B | qwenvl | +| [Qwen-VL](https://huggingface.co/Qwen) | 9B | qwen_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | | [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | diff --git a/README_zh.md b/README_zh.md index b48ec0a98a..7f10bb50a7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -161,7 +161,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| [GLM-4V](https://huggingface.co/THUDM) | 13B(include vision) | glm4_v | +| [GLM-4V](https://huggingface.co/THUDM) | 9B | glm4_v | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | @@ -174,7 +174,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [Phi-3](https://huggingface.co/microsoft) | 4B/7B/14B | phi | | [Phi-3-vision](https://huggingface.co/microsoft) | 4B | phi_v | | [Qwen/Qwen1.5/Qwen2 (Code/MoE)](https://huggingface.co/Qwen) | 0.5B/1.5B/4B/7B/14B/32B/72B/110B | qwen | -| [Qwen-VL](https://huggingface.co/Qwen) | 9B | qwenvl | +| [Qwen-VL](https://huggingface.co/Qwen) | 9B | qwen_vl | | [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | | [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | | [Yi/Yi-1.5](https://huggingface.co/01-ai) | 6B/9B/34B | yi | diff --git a/src/llamafactory/chat/hf_engine.py b/src/llamafactory/chat/hf_engine.py index fa6324d12f..1869f8b0c0 100644 --- a/src/llamafactory/chat/hf_engine.py +++ b/src/llamafactory/chat/hf_engine.py @@ -98,7 +98,7 @@ def _process_args( ): # llava-like models messages[0]["content"] = template.image_token + messages[0]["content"] - elif image is not None and model_args.visual_inputs_type == "qwenvl_like": + elif image is not None and model_args.visual_inputs_type == "qwen_vl_like": # Add image pathlike token as vision input image_path = pathlib.Path(DEFAULT_CACHE_DIR) / f"{str(uuid4())}.png" Image.fromarray(image).convert("RGB").save(image_path) @@ -224,7 +224,7 @@ def _process_args( @staticmethod def image_clean_wrapper(func, temporary_image): - # clean up for qwenvl. + # clean up for qwen_vl. 
def wrapped_function(**kwargs): result = func(**kwargs) if temporary_image: diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py index 4b2610e538..4fb748b23a 100644 --- a/src/llamafactory/data/aligner.py +++ b/src/llamafactory/data/aligner.py @@ -100,7 +100,7 @@ def convert_alpaca( outputs["response"].append(response) outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - if model_args.visual_inputs_type == "qwenvl_like": + if model_args.visual_inputs_type == "qwen_vl_like": outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) else: outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) @@ -194,7 +194,7 @@ def convert_sharegpt( outputs["response"].append(response) outputs["system"].append(system) outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - if model_args.visual_inputs_type == "qwenvl_like": + if model_args.visual_inputs_type == "qwen_vl_like": outputs["images"].append(examples[dataset_attr.images][i] if dataset_attr.images else []) else: outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) @@ -235,7 +235,7 @@ def align_dataset( "tools": {"dtype": "string", "_type": "Value"}, "images": [ {"dtype": "string", "_type": "Value"} - if model_args.visual_inputs_type == "qwenvl_like" + if model_args.visual_inputs_type == "qwen_vl_like" else {"_type": "Image"} ], } diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index 6a27b57f0b..b2fa0d6e86 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -108,8 +108,8 @@ def preprocess_supervised_dataset( examples["prompt"][i][-1]["content"] = ( template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] ) - elif model_args.visual_inputs_type == "qwenvl_like": - assert len(examples["images"][i]) <= 1, "Qwenvl only support 1 image train yet." + elif model_args.visual_inputs_type == "qwen_vl_like": + assert len(examples["images"][i]) <= 1, "qwen_vl only support 1 image train yet." 
examples["prompt"][i][-1]["content"] = ( template.format_image.apply(content=os.path.join(data_args.dataset_dir, examples["images"][i][-1]))[0] + examples["prompt"][i][-1]["content"] diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index 78f21af052..6d1aae83a0 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -795,7 +795,7 @@ def get_template_and_fix_tokenizer( _register_template( - name="qwenvl", + name="qwen_vl", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 8dd148c4e0..990a44a7b9 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1258,9 +1258,9 @@ def register_model_group( DownloadSource.MODELSCOPE: "qwen/wen-VL", }, }, - template="qwenvl", + template="qwen_vl", vision=True, - vision_type="qwenvl_like", + vision_type="qwen_vl_like", ) diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index dc1de9fbb2..1a7ba7bfc2 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -37,7 +37,7 @@ "none": "", "vision_tower": "vision_tower", "glm4v_like": "vision", - "qwenvl_like": "visual", + "qwen_vl_like": "visual", "phi3v_like": "vision_embed_tokens", } diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index afdb075e7a..f7264552f4 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -50,7 +50,7 @@ def create_top() -> Dict[str, "Component"]: booster = gr.Radio(choices=["auto", "flashattn2", "unsloth"], value="auto", scale=2) visual_inputs = gr.Checkbox(scale=1) visual_inputs_type = gr.Dropdown( - choices=["none", "vision_tower", "qwenvl_like", "glm4v_like", "phi3v_like"], value="none", scale=4 + choices=["none", "vision_tower", "qwen_vl_like", "glm4v_like", "phi3v_like"], value="none", scale=4 ) model_name.change( From eb38fe2ee1f8294ad949ecdd3e9202b1f43aeccf Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Wed, 3 Jul 2024 12:47:24 +0800 Subject: [PATCH 18/30] modify glm_4v 9B desc --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 207879b657..79c0b1f62a 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Supported Models | Model | Model size | Template | -| ------------------------------------------------------------ | -------------------------------- | --------- | +| ------------------------------------------------------------ |----------------------------------| --------- | | [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | | [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | | [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | @@ -161,7 +161,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [GLM-4](https://huggingface.co/THUDM) | 9B | glm4 | -| 
[GLM-4V](https://huggingface.co/THUDM) | 13B(include vision) | glm4_v | +| [GLM-4V](https://huggingface.co/THUDM) | 9B | glm4_v | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | intern2 | | [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | From 51931b9607d7608ec60c9e86068c83804e5f6407 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Wed, 3 Jul 2024 12:53:47 +0800 Subject: [PATCH 19/30] add torchvision to pass test --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7380add46e..4b1628513f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ fire packaging pyyaml numpy<2.0.0 +torchvision \ No newline at end of file From a0ad0b54e073daadadf64d5a92e2de06b654b66c Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Wed, 3 Jul 2024 13:04:24 +0800 Subject: [PATCH 20/30] modify dict in common --- src/llamafactory/webui/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 8ba32bb4da..e3e9956fcd 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -123,7 +123,7 @@ def get_prefix(model_name: str) -> str: return model_name.split("-")[0] -def get_model_info(model_name: str) -> Tuple[str, str, bool]: +def get_model_info(model_name: str) -> Tuple[str, str, bool, str]: r""" Gets the necessary information of this model. @@ -145,7 +145,7 @@ def get_template(model_name: str) -> str: return "default" -def get_visual(model_name: str) -> dict[bool, str]: +def get_visual(model_name: str) -> Tuple[bool, str]: r""" Judges if the model is a vision language model. """ From 3acefbc8ac0d8b5dae1ce9261dfe21606573885e Mon Sep 17 00:00:00 2001 From: marko1616 Date: Wed, 3 Jul 2024 19:41:42 +0800 Subject: [PATCH 21/30] Support latest glm4v. --- .../data/processors/supervised.py | 9 +-- src/llamafactory/train/dpo/workflow.py | 4 +- src/llamafactory/train/kto/workflow.py | 4 +- src/llamafactory/train/ppo/workflow.py | 4 +- src/llamafactory/train/pt/workflow.py | 4 +- src/llamafactory/train/rm/workflow.py | 4 +- src/llamafactory/train/sft/workflow.py | 4 +- src/llamafactory/train/trainer_utils.py | 58 ------------------- 8 files changed, 11 insertions(+), 80 deletions(-) diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index b2fa0d6e86..110f30d046 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -102,17 +102,18 @@ def preprocess_supervised_dataset( if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue + if model_args.visual_inputs_type == "glm4v_like": assert len(examples["images"][i]) <= 1, "GLM4v only support 1 image train yet." model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "glm4v_like")) - examples["prompt"][i][-1]["content"] = ( + examples["prompt"][i][0]["content"] = ( template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] ) elif model_args.visual_inputs_type == "qwen_vl_like": assert len(examples["images"][i]) <= 1, "qwen_vl only support 1 image train yet." 
- examples["prompt"][i][-1]["content"] = ( - template.format_image.apply(content=os.path.join(data_args.dataset_dir, examples["images"][i][-1]))[0] - + examples["prompt"][i][-1]["content"] + examples["prompt"][i][0]["content"] = ( + template.format_image.apply(content=os.path.join(data_args.dataset_dir, examples["images"][i][0]))[0] + + examples["prompt"][i][0]["content"] ) input_ids, labels = _encode_supervised_example( diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 8af96c9d34..431b52856b 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -22,7 +22,7 @@ from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, create_ref_model, glm4v_compute_loss_warpper +from ..trainer_utils import create_modelcard_and_push, create_ref_model from .trainer import CustomDPOTrainer @@ -73,8 +73,6 @@ def run_dpo( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) - if model_args.visual_inputs_type == "glm4v_like": - trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/kto/workflow.py b/src/llamafactory/train/kto/workflow.py index 55a43b1369..8182a1844e 100644 --- a/src/llamafactory/train/kto/workflow.py +++ b/src/llamafactory/train/kto/workflow.py @@ -22,7 +22,7 @@ from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, create_ref_model, glm4v_compute_loss_warpper +from ..trainer_utils import create_modelcard_and_push, create_ref_model from .trainer import CustomKTOTrainer @@ -70,8 +70,6 @@ def run_kto( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) - if model_args.visual_inputs_type == "glm4v_like": - trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/ppo/workflow.py b/src/llamafactory/train/ppo/workflow.py index 3e0dbfad65..df22dae5d3 100644 --- a/src/llamafactory/train/ppo/workflow.py +++ b/src/llamafactory/train/ppo/workflow.py @@ -23,7 +23,7 @@ from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..callbacks import fix_valuehead_checkpoint -from ..trainer_utils import create_ref_model, create_reward_model, glm4v_compute_loss_warpper +from ..trainer_utils import create_ref_model, create_reward_model from .trainer import CustomPPOTrainer @@ -67,8 +67,6 @@ def run_ppo( data_collator=data_collator, **tokenizer_module, ) - if model_args.visual_inputs_type == "glm4v_like": - ppo_trainer.compute_loss = glm4v_compute_loss_warpper(ppo_trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/pt/workflow.py b/src/llamafactory/train/pt/workflow.py index aa5afaee12..b84a0e7da8 100644 --- a/src/llamafactory/train/pt/workflow.py +++ b/src/llamafactory/train/pt/workflow.py @@ -23,7 +23,7 @@ from ...data import get_dataset, split_dataset from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, glm4v_compute_loss_warpper +from ..trainer_utils import create_modelcard_and_push from .trainer import CustomTrainer @@ -56,8 +56,6 @@ def run_pt( **tokenizer_module, 
**split_dataset(dataset, data_args, training_args), ) - if model_args.visual_inputs_type == "glm4v_like": - trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/rm/workflow.py b/src/llamafactory/train/rm/workflow.py index 6f94174e35..e0b32b7782 100644 --- a/src/llamafactory/train/rm/workflow.py +++ b/src/llamafactory/train/rm/workflow.py @@ -43,7 +43,7 @@ from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer from ..callbacks import fix_valuehead_checkpoint -from ..trainer_utils import create_modelcard_and_push, glm4v_compute_loss_warpper +from ..trainer_utils import create_modelcard_and_push from .metric import compute_accuracy from .trainer import PairwiseTrainer @@ -81,8 +81,6 @@ def run_rm( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) - if model_args.visual_inputs_type == "glm4v_like": - trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Training if training_args.do_train: diff --git a/src/llamafactory/train/sft/workflow.py b/src/llamafactory/train/sft/workflow.py index 21a89fd79e..0c3f9b11e3 100644 --- a/src/llamafactory/train/sft/workflow.py +++ b/src/llamafactory/train/sft/workflow.py @@ -24,7 +24,7 @@ from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..trainer_utils import create_modelcard_and_push, glm4v_compute_loss_warpper +from ..trainer_utils import create_modelcard_and_push from .metric import ComputeMetrics, compute_accuracy, eval_logit_processor from .trainer import CustomSeq2SeqTrainer @@ -77,8 +77,6 @@ def run_sft( **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) - if model_args.visual_inputs_type == "glm4v_like": - trainer.compute_loss = glm4v_compute_loss_warpper(trainer.compute_loss) # Keyword arguments for `model.generate` gen_kwargs = generating_args.to_dict() diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index 4e340da197..4de4cc2323 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -425,61 +425,3 @@ def get_batch_logps( labels[labels == label_pad_token_id] = 0 # dummy token per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) return (per_token_logps * loss_mask).sum(-1), loss_mask.sum(-1) - - -def glm4v_compute_loss_warpper(old_compute_loss): - def compute_loss(self, model, inputs, return_outputs=False): - # Padding for labels and attention masks cuz modeling_glm4 will auto filling 1600 image tokens. - boi_ids = self.tokenizer.all_special_ids[self.tokenizer.all_special_tokens.index("<|begin_of_image|>")] - padded_ids = None - padded_labels = None - padded_attention_masks = None - if any(boi_ids == inputs["input_ids"].flatten()): - for input_id, label, attention_mask in zip( - inputs["input_ids"], inputs["labels"], inputs["attention_mask"] - ): - if any(boi_ids == input_id.flatten()): - boi_index = input_id.tolist().index(boi_ids) - # GLM will auto filling this. 
- input_id_padded = input_id.unsqueeze(0) - label_padded = torch.cat( - ( - label[:boi_index], - -100 * torch.ones(1600, device=label.device, dtype=label.dtype), - label[boi_index + 1 :], - ) - ).unsqueeze(0) - attention_mask_padded = torch.cat( - ( - attention_mask[:boi_index], - torch.ones(1600, device=attention_mask.device, dtype=attention_mask.dtype), - attention_mask[boi_index + 1 :], - ) - ).unsqueeze(0) - else: - input_id_padded = torch.cat( - (input_id, torch.ones(1600, device=input_id.device, dtype=input_id.dtype)) - ).unsqueeze(0) - label_padded = torch.cat( - (label, -100 * torch.ones(1600, device=label.device, dtype=label.dtype)) - ).unsqueeze(0) - attention_mask_padded = torch.cat( - ( - attention_mask, - torch.zeros(1600, device=attention_mask.device, dtype=attention_mask.dtype), - ) - ).unsqueeze(0) - padded_ids = input_id_padded if padded_ids is None else torch.cat((padded_ids, input_id_padded)) - padded_labels = label_padded if padded_labels is None else torch.cat((padded_labels, label_padded)) - padded_attention_masks = ( - attention_mask_padded - if padded_attention_masks is None - else torch.cat((padded_attention_masks, attention_mask_padded)) - ) - inputs["input_ids"] = padded_ids.contiguous() - inputs["labels"] = padded_labels.contiguous() - inputs["attention_mask"] = padded_attention_masks.contiguous() - - return old_compute_loss(model, inputs, return_outputs) - - return compute_loss From 41462427eed4931e27fe322facb00a14fb0569b5 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Wed, 3 Jul 2024 21:16:19 +0800 Subject: [PATCH 22/30] Phi3v lora sft fix. --- src/llamafactory/model/model_utils/misc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llamafactory/model/model_utils/misc.py b/src/llamafactory/model/model_utils/misc.py index 8fbaf28207..d43ee89b8d 100644 --- a/src/llamafactory/model/model_utils/misc.py +++ b/src/llamafactory/model/model_utils/misc.py @@ -36,6 +36,10 @@ def find_all_linear_modules(model: "PreTrainedModel", freeze_vision: bool) -> Li forbidden_modules.add("output") elif model.config.model_type in ["llava", "paligemma"]: forbidden_modules.add("multi_modal_projector") + elif model.config.model_type == "phi3_v": + forbidden_modules.add("0") + forbidden_modules.add("1") + forbidden_modules.add("2") if freeze_vision: forbidden_modules.add("vision_tower") From 70ac8ea4fdaf294b2284a43b72cc5902fc2af855 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Wed, 3 Jul 2024 21:35:30 +0800 Subject: [PATCH 23/30] fix get_template. --- src/llamafactory/data/processors/supervised.py | 2 +- src/llamafactory/extras/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index 110f30d046..b14b194498 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -102,7 +102,7 @@ def preprocess_supervised_dataset( if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue - + if model_args.visual_inputs_type == "glm4v_like": assert len(examples["images"][i]) <= 1, "GLM4v only support 1 image train yet." 
model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "glm4v_like")) diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 990a44a7b9..5f7279231e 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -549,7 +549,7 @@ def register_model_group( register_model_group( models={ - "GLM-4v-9B": { + "GLM-4v-9B-Chat": { DownloadSource.DEFAULT: "THUDM/glm-4v-9b", DownloadSource.MODELSCOPE: "ZhipuAI/glm-4v-9b", } @@ -931,7 +931,7 @@ def register_model_group( register_model_group( models={ - "Phi-3-vision-128k-instruct": { + "Phi-3-vision-128k-Chat": { DownloadSource.DEFAULT: "microsoft/Phi-3-vision-128k-instruct", DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-128k-instruct", } From ea60231b160112366800ec5d5e99039ea1f3bfb4 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Thu, 4 Jul 2024 19:53:13 +0800 Subject: [PATCH 24/30] Update for unsupervised dataset. --- src/llamafactory/data/preprocess.py | 1 + .../data/processors/unsupervised.py | 23 ++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index e6ff34371a..10f1382e5f 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -93,6 +93,7 @@ def get_preprocess_and_print_func( tokenizer=tokenizer, processor=processor, data_args=data_args, + model_args=model_args, ) print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index b3fc85c929..84ecfe5627 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ b/src/llamafactory/data/processors/unsupervised.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.logging import get_logger @@ -22,7 +23,7 @@ if TYPE_CHECKING: from transformers import PreTrainedTokenizer, ProcessorMixin - from ...hparams import DataArguments + from ...hparams import DataArguments, ModelArguments from ..template import Template @@ -67,19 +68,35 @@ def preprocess_unsupervised_dataset( tokenizer: "PreTrainedTokenizer", processor: Optional["ProcessorMixin"], data_args: "DataArguments", + model_args: "ModelArguments", ) -> Dict[str, List[List[int]]]: # build inputs with format ` X` and labels with format `Y ` model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - if processor is not None: + if processor is not None and model_args.visual_inputs_type == "vision_tower": model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"] = [] + elif model_args.visual_inputs_type == "glm4v_like": + model_inputs["image_inputs"] = [] for i in range(len(examples["prompt"])): if len(examples["prompt"][i]) % 2 != 1: logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue + if model_args.visual_inputs_type == "glm4v_like": + assert len(examples["images"][i]) <= 1, "GLM4v only support 1 image train yet." 
+ model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "glm4v_like")) + examples["prompt"][i][0]["content"] = ( + template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] + ) + elif model_args.visual_inputs_type == "qwen_vl_like": + assert len(examples["images"][i]) <= 1, "qwen_vl only support 1 image train yet." + examples["prompt"][i][0]["content"] = ( + template.format_image.apply(content=os.path.join(data_args.dataset_dir, examples["images"][i][0]))[0] + + examples["prompt"][i][0]["content"] + ) + input_ids, labels = _encode_unsupervised_example( prompt=examples["prompt"][i], response=examples["response"][i], @@ -93,7 +110,7 @@ def preprocess_unsupervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - if processor is not None: + if processor is not None and model_args.visual_inputs_type == "vision_tower": model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) From b932bc0b8e7343f324c8ef7c05763be419403048 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Sat, 6 Jul 2024 20:45:21 +0800 Subject: [PATCH 25/30] Phi3v dataset processor fix. --- src/llamafactory/data/processors/supervised.py | 6 +++--- src/llamafactory/data/processors/unsupervised.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index b14b194498..2c1fbbd20e 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -91,7 +91,7 @@ def preprocess_supervised_dataset( # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - if processor is not None and model_args.visual_inputs_type == "vision_tower": + if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"] = [] @@ -107,7 +107,7 @@ def preprocess_supervised_dataset( assert len(examples["images"][i]) <= 1, "GLM4v only support 1 image train yet." model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "glm4v_like")) examples["prompt"][i][0]["content"] = ( - template.format_image.apply()[0] + examples["prompt"][i][-1]["content"] + template.format_image.apply()[0] + examples["prompt"][i][0]["content"] ) elif model_args.visual_inputs_type == "qwen_vl_like": assert len(examples["images"][i]) <= 1, "qwen_vl only support 1 image train yet." 
@@ -129,7 +129,7 @@ def preprocess_supervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - if processor is not None and model_args.visual_inputs_type == "vision_tower": + if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index 84ecfe5627..8df5f0cc32 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ b/src/llamafactory/data/processors/unsupervised.py @@ -72,7 +72,7 @@ def preprocess_unsupervised_dataset( ) -> Dict[str, List[List[int]]]: # build inputs with format ` X` and labels with format `Y ` model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - if processor is not None and model_args.visual_inputs_type == "vision_tower": + if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"] = [] @@ -110,7 +110,7 @@ def preprocess_unsupervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - if processor is not None and model_args.visual_inputs_type == "vision_tower": + if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) From 3c2ecbab75e104256bfebe61e901a686e641901a Mon Sep 17 00:00:00 2001 From: marko1616 Date: Fri, 19 Jul 2024 03:53:04 +0800 Subject: [PATCH 26/30] Conflict fix --- src/llamafactory/data/loader.py | 5 +++-- src/llamafactory/data/preprocess.py | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 05767bb445..52d1053000 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -159,6 +159,7 @@ def _get_preprocessed_dataset( dataset: Optional[Union["Dataset", "IterableDataset"]], data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", + model_args: "ModelArguments", stage: Literal["pt", "sft", "rm", "ppo", "kto"], template: "Template", tokenizer: "PreTrainedTokenizer", @@ -169,7 +170,7 @@ def _get_preprocessed_dataset( return None preprocess_func, print_function = get_preprocess_and_print_func( - data_args, stage, template, tokenizer, processor, do_generate=(training_args.predict_with_generate and is_eval) + data_args, model_args, stage, template, tokenizer, processor, do_generate=(training_args.predict_with_generate and is_eval) ) column_names = list(next(iter(dataset)).keys()) kwargs = {} @@ -238,7 +239,7 @@ def get_dataset( dataset, data_args, training_args, model_args, stage, template, tokenizer, processor, is_eval=False ) eval_dataset = _get_preprocessed_dataset( - eval_dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=True + eval_dataset, data_args, 
training_args, model_args, stage, template, tokenizer, processor, is_eval=True ) if data_args.val_size > 1e-6: diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index 72a7d5b5c3..6ad3c72b7f 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -35,7 +35,6 @@ def get_preprocess_and_print_func( data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", model_args: "ModelArguments", stage: Literal["pt", "sft", "rm", "ppo", "kto"], template: "Template", From 3f9ccb321c8135c7d754e255ec413e8158daefb3 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Fri, 19 Jul 2024 18:55:53 +0800 Subject: [PATCH 27/30] RLHF support. --- src/llamafactory/data/preprocess.py | 1 + src/llamafactory/data/processors/pairwise.py | 23 +++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/data/preprocess.py b/src/llamafactory/data/preprocess.py index 6ad3c72b7f..bcd4b2aef4 100644 --- a/src/llamafactory/data/preprocess.py +++ b/src/llamafactory/data/preprocess.py @@ -88,6 +88,7 @@ def __init__(self, data, **kwargs): tokenizer=tokenizer, processor=processor, data_args=data_args, + model_args=model_args, ) print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer) elif stage == "kto": diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index 9084c68377..61a97ac649 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX @@ -22,7 +23,7 @@ if TYPE_CHECKING: from transformers import PreTrainedTokenizer, ProcessorMixin - from ...hparams import DataArguments + from ...hparams import DataArguments, ModelArguments from ..template import Template @@ -76,6 +77,7 @@ def preprocess_pairwise_dataset( tokenizer: "PreTrainedTokenizer", processor: Optional["ProcessorMixin"], data_args: "DataArguments", + model_args: "ModelArguments" ) -> Dict[str, List[List[int]]]: # build input pairs with format ` X`, `Y1 ` and `Y2 ` model_inputs = { @@ -86,17 +88,32 @@ def preprocess_pairwise_dataset( "rejected_attention_mask": [], "rejected_labels": [], } - if processor is not None: + if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["chosen_token_type_ids"] = [] model_inputs["rejected_token_type_ids"] = [] + elif model_args.visual_inputs_type == "glm4v_like": + model_inputs["image_inputs"] = [] for i in range(len(examples["prompt"])): if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2: logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue + if model_args.visual_inputs_type == "glm4v_like": + assert len(examples["images"][i]) <= 1, "GLM4v only support 1 image train yet." 
+ model_inputs["image_inputs"].append(get_pixel_values(examples["images"][i], None, "glm4v_like")) + examples["prompt"][i][0]["content"] = ( + template.format_image.apply()[0] + examples["prompt"][i][0]["content"] + ) + elif model_args.visual_inputs_type == "qwen_vl_like": + assert len(examples["images"][i]) <= 1, "qwen_vl only support 1 image train yet." + examples["prompt"][i][0]["content"] = ( + template.format_image.apply(content=os.path.join(data_args.dataset_dir, examples["images"][i][0]))[0] + + examples["prompt"][i][0]["content"] + ) + chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels = _encode_pairwise_example( prompt=examples["prompt"][i], response=examples["response"][i], @@ -113,7 +130,7 @@ def preprocess_pairwise_dataset( model_inputs["rejected_input_ids"].append(rejected_input_ids) model_inputs["rejected_attention_mask"].append([1] * len(rejected_input_ids)) model_inputs["rejected_labels"].append(rejected_labels) - if processor is not None: + if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["chosen_token_type_ids"].append( From 9c6587e303967766a9cce2d7c66d16bbd1e77f2f Mon Sep 17 00:00:00 2001 From: marko1616 Date: Sat, 20 Jul 2024 04:11:24 +0800 Subject: [PATCH 28/30] glm4v pairwise dataset support --- src/llamafactory/data/collator.py | 3 +++ src/llamafactory/data/loader.py | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/llamafactory/data/collator.py b/src/llamafactory/data/collator.py index a603a7e853..34c2eff922 100644 --- a/src/llamafactory/data/collator.py +++ b/src/llamafactory/data/collator.py @@ -103,6 +103,9 @@ def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, "torch.Tenso if "pixel_values" in feature: target_feature["pixel_values"] = feature["pixel_values"] + if "images" in feature: + target_feature["images"] = feature["images"] + if "{}_token_type_ids".format(key) in feature: target_feature["token_type_ids"] = feature["{}_token_type_ids".format(key)] diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 52d1053000..5782731521 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -242,6 +242,11 @@ def get_dataset( eval_dataset, data_args, training_args, model_args, stage, template, tokenizer, processor, is_eval=True ) + if model_args.visual_inputs_type == "glm4v_like": + # Datasets can't set column images because of images is a feature of examples. + dataset = dataset.rename_column("image_inputs", "images") + eval_dataset = eval_dataset.rename_column("image_inputs", "images") if eval_dataset is not None else None + if data_args.val_size > 1e-6: dataset_dict = split_dataset(dataset, data_args, seed=training_args.seed) else: @@ -260,9 +265,6 @@ def get_dataset( dataset_dict = DatasetDict(dataset_dict) - if model_args.visual_inputs_type == "glm4v_like": - dataset = dataset.rename_column("image_inputs", "images") - if data_args.tokenized_path is not None: if training_args.should_save: dataset_dict.save_to_disk(data_args.tokenized_path) From e9d902b6ef01355005f5ce1a8218dad7aec88f75 Mon Sep 17 00:00:00 2001 From: marko1616 Date: Thu, 22 Aug 2024 10:55:16 +0800 Subject: [PATCH 29/30] Name fix. 
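The freeze_vision flag becomes freeze_vision_tower throughout finetuning_args, adapter, model_utils.misc, and trainer_utils. The adapter.py hunk below keeps the regex that drops the frozen vision modules from the LoRA targets; a minimal standalone sketch of that pattern, with module names taken from VISION_FREEZE_MAP in the diff and purely illustrative module paths:

    import re

    VISION_FREEZE_MAP = {"glm4v_like": "vision", "qwen_vl_like": "visual", "phi3v_like": "vision_embed_tokens"}
    target_modules = ["q_proj", "v_proj"]  # stand-in for the resolved lora_target list

    pattern = f"^(?!.*{VISION_FREEZE_MAP['qwen_vl_like']})." + "*(?:{}).*".format("|".join(target_modules))
    # pattern == "^(?!.*visual).*(?:q_proj|v_proj).*"
    assert re.match(pattern, "transformer.h.0.attn.q_proj") is not None      # language-model module: targeted
    assert re.match(pattern, "transformer.visual.attn_pool.q_proj") is None  # frozen vision module: excluded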
--- src/llamafactory/hparams/finetuning_args.py | 2 +- src/llamafactory/model/adapter.py | 8 ++++---- src/llamafactory/model/model_utils/misc.py | 4 ++-- src/llamafactory/train/trainer_utils.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index 5f0a322645..e04fee0f7d 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -359,7 +359,7 @@ def split_arg(arg): self.lora_target: List[str] = split_arg(self.lora_target) self.additional_target: Optional[List[str]] = split_arg(self.additional_target) self.galore_target: List[str] = split_arg(self.galore_target) - self.freeze_vision = self.freeze_vision or self.train_mm_proj_only + self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only self.use_ref_model = self.stage == "dpo" and self.pref_loss not in ["orpo", "simpo"] assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index 1a7ba7bfc2..2fb7d19b53 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -54,7 +54,7 @@ def _setup_full_tuning( logger.info("Fine-tuning method: Full") forbidden_modules = set() - if model_args.visual_inputs and finetuning_args.freeze_vision: + if model_args.visual_inputs and finetuning_args.freeze_vision_tower: forbidden_modules.add(VISION_FREEZE_MAP[model_args.visual_inputs_type]) if model_args.visual_inputs and finetuning_args.train_mm_proj_only: @@ -138,7 +138,7 @@ def _setup_freeze_tuning( trainable_layers.append(module_name) forbidden_modules = set() - if model_args.visual_inputs and finetuning_args.freeze_vision: + if model_args.visual_inputs and finetuning_args.freeze_vision_tower: forbidden_modules.add(VISION_FREEZE_MAP[model_args.visual_inputs_type]) for name, param in model.named_parameters(): @@ -211,14 +211,14 @@ def _setup_lora_tuning( if is_trainable and adapter_to_resume is None: # create new lora weights while training if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": - target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision) + target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) else: target_modules = finetuning_args.lora_target if finetuning_args.use_llama_pro: target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers) - if finetuning_args.freeze_vision and model_args.visual_inputs_type != "none": + if finetuning_args.freeze_vision_tower and model_args.visual_inputs_type != "none": target_modules = f"^(?!.*{VISION_FREEZE_MAP[model_args.visual_inputs_type]})." + "*(?:{}).*".format( "|".join(target_modules) ) diff --git a/src/llamafactory/model/model_utils/misc.py b/src/llamafactory/model/model_utils/misc.py index d43ee89b8d..8a63735bbd 100644 --- a/src/llamafactory/model/model_utils/misc.py +++ b/src/llamafactory/model/model_utils/misc.py @@ -24,7 +24,7 @@ logger = get_logger(__name__) -def find_all_linear_modules(model: "PreTrainedModel", freeze_vision: bool) -> List[str]: +def find_all_linear_modules(model: "PreTrainedModel", freeze_vision_tower: bool) -> List[str]: r""" Finds all available modules to apply lora or galore. 
""" @@ -41,7 +41,7 @@ def find_all_linear_modules(model: "PreTrainedModel", freeze_vision: bool) -> Li forbidden_modules.add("1") forbidden_modules.add("2") - if freeze_vision: + if freeze_vision_tower: forbidden_modules.add("vision_tower") module_names = set() diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py index c13b587eab..2fa149a06c 100644 --- a/src/llamafactory/train/trainer_utils.py +++ b/src/llamafactory/train/trainer_utils.py @@ -188,7 +188,7 @@ def _create_galore_optimizer( finetuning_args: "FinetuningArguments", ) -> "torch.optim.Optimizer": if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all": - galore_targets = find_all_linear_modules(model, finetuning_args.freeze_vision) + galore_targets = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) else: galore_targets = finetuning_args.galore_target From 65b64be1869353d7811941ac6e024d5a2dac496a Mon Sep 17 00:00:00 2001 From: marko1616 Date: Tue, 27 Aug 2024 11:34:56 +0800 Subject: [PATCH 30/30] ruff pass. --- src/llamafactory/data/loader.py | 8 +++++++- src/llamafactory/data/processors/pairwise.py | 6 +++--- src/llamafactory/data/processors/supervised.py | 4 ++-- src/llamafactory/data/processors/unsupervised.py | 4 ++-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/llamafactory/data/loader.py b/src/llamafactory/data/loader.py index 5782731521..b8d1ca56b9 100644 --- a/src/llamafactory/data/loader.py +++ b/src/llamafactory/data/loader.py @@ -170,7 +170,13 @@ def _get_preprocessed_dataset( return None preprocess_func, print_function = get_preprocess_and_print_func( - data_args, model_args, stage, template, tokenizer, processor, do_generate=(training_args.predict_with_generate and is_eval) + data_args, + model_args, + stage, + template, + tokenizer, + processor, + do_generate=(training_args.predict_with_generate and is_eval), ) column_names = list(next(iter(dataset)).keys()) kwargs = {} diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index e8890fcf1c..17d716c54a 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -76,7 +76,7 @@ def preprocess_pairwise_dataset( tokenizer: "PreTrainedTokenizer", processor: Optional["ProcessorMixin"], data_args: "DataArguments", - model_args: "ModelArguments" + model_args: "ModelArguments", ) -> Dict[str, List[List[int]]]: # build input pairs with format ` X`, `Y1 ` and `Y2 ` model_inputs = { @@ -87,7 +87,7 @@ def preprocess_pairwise_dataset( "rejected_attention_mask": [], "rejected_labels": [], } - if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: + if processor is not None and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"]: model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["chosen_token_type_ids"] = [] @@ -129,7 +129,7 @@ def preprocess_pairwise_dataset( model_inputs["rejected_input_ids"].append(rejected_input_ids) model_inputs["rejected_attention_mask"].append([1] * len(rejected_input_ids)) model_inputs["rejected_labels"].append(rejected_labels) - if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: + if processor is not None and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"]: model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, 
"image_seq_length"): # paligemma models model_inputs["chosen_token_type_ids"].append( diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index a70906e653..4729d3021e 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -105,7 +105,7 @@ def preprocess_supervised_dataset( # build inputs with format ` X Y ` and labels with format ` ... Y ` # for multiturn examples, we only mask the prompt part in each prompt-response pair. model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: + if processor is not None and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"]: model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"] = [] @@ -145,7 +145,7 @@ def preprocess_supervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: + if processor is not None and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"]: model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index f7dffed1a1..77d2c8a9f9 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ b/src/llamafactory/data/processors/unsupervised.py @@ -72,7 +72,7 @@ def preprocess_unsupervised_dataset( ) -> Dict[str, List[List[int]]]: # build inputs with format ` X` and labels with format `Y ` model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: + if processor is not None and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"]: model_inputs["pixel_values"] = [] if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"] = [] @@ -110,7 +110,7 @@ def preprocess_unsupervised_dataset( model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) - if processor is not None and model_args.visual_inputs_type in ["vision_tower","phi3v_like"]: + if processor is not None and model_args.visual_inputs_type in ["vision_tower", "phi3v_like"]: model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) if hasattr(processor, "image_seq_length"): # paligemma models model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor))