From 7175c695b7e6ac650ffcd9b5b45f7ef39a25e800 Mon Sep 17 00:00:00 2001 From: yaswanth Date: Sat, 25 Jan 2025 21:01:32 +0530 Subject: [PATCH 1/7] Iterative generation using input embeds --- src/transformers/generation/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 45558bd22a4e..1fd9da870217 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -383,7 +383,9 @@ def prepare_inputs_for_generation( # (we can't check exception 3 while compiling) if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if ( + if inputs_embeds is not None and input_ids.shape[1]==0: + inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] + elif ( inputs_embeds is not None # Exception 1 or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 ): @@ -393,9 +395,9 @@ def prepare_inputs_for_generation( # 3. Prepare base model inputs input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step for every prompt. if not self.config.is_encoder_decoder: - if inputs_embeds is not None and cache_position[0] == 0: + if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: model_inputs[input_ids_key] = None model_inputs["inputs_embeds"] = inputs_embeds else: From 5d6d37a67e36e177b9574c189bcc37bc32c639e7 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 5 Feb 2025 22:22:03 +0530 Subject: [PATCH 2/7] Add Janus model --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/janus.md | 55 ++ src/transformers/__init__.py | 18 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/janus/__init__.py | 27 + .../models/janus/configuration_janus.py | 358 ++++++++++ .../janus/convert_janus_weights_to_hf.py | 204 ++++++ .../models/janus/image_processing_janus.py | 284 ++++++++ .../models/janus/modular_janus.py | 0 .../models/janus/processing_janus.py | 99 +++ tests/models/janus/test_modeling_janus.py | 647 ++++++++++++++++++ 15 files changed, 1702 insertions(+) create mode 100644 docs/source/en/model_doc/janus.md create mode 100644 src/transformers/models/janus/__init__.py create mode 100644 src/transformers/models/janus/configuration_janus.py create mode 100644 src/transformers/models/janus/convert_janus_weights_to_hf.py create mode 100644 src/transformers/models/janus/image_processing_janus.py create mode 100644 src/transformers/models/janus/modular_janus.py create mode 100644 src/transformers/models/janus/processing_janus.py create mode 100644 tests/models/janus/test_modeling_janus.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 2a2cf4512af4..5bd70c3ee648 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -888,6 +888,8 @@ title: InstructBLIP - local: model_doc/instructblipvideo title: InstructBlipVideo + - local: model_doc/janus + title: Janus - local: model_doc/kosmos-2 title: KOSMOS-2 - local: model_doc/layoutlm diff --git a/docs/source/en/model_doc/janus.md b/docs/source/en/model_doc/janus.md new file mode 100644 index 
000000000000..a4f57b54082d --- /dev/null +++ b/docs/source/en/model_doc/janus.md @@ -0,0 +1,55 @@ + + +# Janus + +# Janus + +# Janus + +# Janus + +# Janus + +# Janus + +# Janus + +## Overview + +The Janus model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + + +## JanusConfig + +[[autodoc]] JanusConfig + +## JanusForConditionalGeneration + +[[autodoc]] JanusForConditionalGeneration + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ae92f21dcc2a..08eb48773919 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -558,6 +558,10 @@ "LlavaConfig", "LlavaProcessor", ], + "models.janus": [ + "JanusConfig", + + ], "models.llava_next": [ "LlavaNextConfig", "LlavaNextProcessor", @@ -2720,6 +2724,12 @@ "LlavaPreTrainedModel", ] ) + _import_structure["models.janus"].extend( + [ + "JanusForConditionalGeneration", + "JanusPreTrainedModel", + ] + ) _import_structure["models.llava_next"].extend( [ "LlavaNextForConditionalGeneration", @@ -5635,6 +5645,10 @@ LlavaConfig, LlavaProcessor, ) + from .models.janus import ( + JanusConfig, + + ) from .models.llava_next import ( LlavaNextConfig, LlavaNextProcessor, @@ -7580,6 +7594,10 @@ LlavaForConditionalGeneration, LlavaPreTrainedModel, ) + from .models.janus import ( + JanusForConditionalGeneration, + JanusPreTrainedModel, + ) from .models.llava_next import ( LlavaNextForConditionalGeneration, LlavaNextPreTrainedModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index f62d5d71672b..bfab93812215 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -143,6 +143,7 @@ lilt, llama, llava, + janus, llava_next, llava_next_video, llava_onevision, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 699e307ac1b6..071a5b0534f8 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -164,6 +164,7 @@ ("lilt", "LiltConfig"), ("llama", "LlamaConfig"), ("llava", "LlavaConfig"), + ("janus", "JanusConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), ("llava_onevision", "LlavaOnevisionConfig"), @@ -495,6 +496,7 @@ ("llama2", "Llama2"), ("llama3", "Llama3"), ("llava", "LLaVa"), + ("janus", "Janus"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), ("llava_onevision", "LLaVA-Onevision"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 3b023251e1d9..28f20b8e50be 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -342,6 +342,7 @@ ("idefics3", "Idefics3ForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), @@ -784,6 +785,7 @@ ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), ("llava_next", 
"LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), @@ -820,6 +822,7 @@ ("instructblip", "InstructBlipForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("mllama", "MllamaForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index f329d9e465e5..72b63e9b6d13 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -76,6 +76,7 @@ ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("llava", "LlavaProcessor"), + ("janus", "JanusProcessor"), ("llava_next", "LlavaNextProcessor"), ("llava_next_video", "LlavaNextVideoProcessor"), ("llava_onevision", "LlavaOnevisionProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 5ee4f612285f..3a49dd4b7880 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -271,6 +271,7 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("janus", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/janus/__init__.py b/src/transformers/models/janus/__init__.py new file mode 100644 index 000000000000..04340e3a59e5 --- /dev/null +++ b/src/transformers/models/janus/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_janus import * + from .modeling_janus import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/janus/configuration_janus.py b/src/transformers/models/janus/configuration_janus.py new file mode 100644 index 000000000000..70eb00e688b1 --- /dev/null +++ b/src/transformers/models/janus/configuration_janus.py @@ -0,0 +1,358 @@ +# coding=utf-8 +# Copyright 2025 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Janus model configuration""" +import copy +import os +from typing import Any, Dict, Optional, Union + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + +from ..auto import CONFIG_MAPPING, AutoConfig +from ...modeling_rope_utils import rope_config_validation + + +logger = logging.get_logger(__name__) + + +class JanusEncoderVisionConfig(PretrainedConfig): + """Encoder Vision config in this case its the SIGLIP model""" + + model_type = "siglip_vision_model" + base_config_key = "encoder_vision_config" + + def __init__( + self, + hidden_size=1024, + mlp_ratio=4.0, + projection_dim=1024, + num_hidden_layers=24, + num_attention_heads=16, + num_channels=3, + num_frames=2, + image_size=384, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=1e-6, + add_kv_bias=False, + attention_dropout=0.0, + drop_path_rate=0.0, + initializer_range=0.02, + initializer_factor=1.0, + logit_scale_init_value=None, + learnable_logit_scale=False, + select_feature = "same", + select_layer = -1, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.mlp_ratio = mlp_ratio + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.num_frames = num_frames + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.add_kv_bias = add_kv_bias + self.attention_dropout = attention_dropout + self.drop_path_rate = drop_path_rate + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.logit_scale_init_value = logit_scale_init_value + self.learnable_logit_scale = learnable_logit_scale + self.feature_size = image_size + self.intermediate_size = int(hidden_size * mlp_ratio) + +class JanusTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaMA-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. 
+ num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. 
If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. 
If None, it will default to hidden_size // num_attention_heads + + ```python + >>> from transformers import LlamaModel, LlamaConfig + + >>> # Initializing a LLaMA llama-7b style configuration + >>> configuration = LlamaConfig() + + >>> # Initializing a model from the llama-7b style configuration + >>> model = LlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llama" + base_config_key = "text_config" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `LlamaModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + head_dim=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + +class JanusDecoderVisionConfig(PretrainedConfig): + """A custom VQ config model""" + # TODO + def __init__(self): + pass + +class JanusConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`JanusForConditionalGeneration`]. It is used to instantiate an + Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Janus-9B. + + e.g. [janus-hf/janus-9b](https://huggingface.co/janus-hf/janus-9b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `JanusTextConfig`):
+            The config object or dictionary of the text backbone.
+        encoder_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `JanusEncoderVisionConfig`):
+            The config object or dictionary of the SigLIP-style vision encoder backbone.
+        decoder_vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `JanusDecoderVisionConfig`):
+            The config object or dictionary of the VQ vision decoder.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
+        vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features.
+        image_seq_length (`int`, *optional*, defaults to 576):
+            Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
+
+    Example:
+
+    ```python
+    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusEncoderVisionConfig, JanusTextConfig
+
+    >>> # Initializing a vision encoder config
+    >>> encoder_vision_config = JanusEncoderVisionConfig()
+
+    >>> # Initializing a Llama-style text config
+    >>> text_config = JanusTextConfig()
+
+    >>> # Initializing a Janus style configuration
+    >>> configuration = JanusConfig(text_config=text_config, encoder_vision_config=encoder_vision_config)
+
+    >>> # Initializing a model from the Janus style configuration
+    >>> model = JanusForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "janus"
+    sub_configs = {
+        "text_config": JanusTextConfig,
+        "encoder_vision_config": JanusEncoderVisionConfig,
+        "decoder_vision_config": JanusDecoderVisionConfig,
+    }
+
+    def __init__(self, text_config=None, encoder_vision_config=None, decoder_vision_config=None, **kwargs):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is None. Initializing with default JanusTextConfig values.")
+
+        if encoder_vision_config is None:
+            encoder_vision_config = {}
+            logger.info("`encoder_vision_config` is None. Initializing with default JanusEncoderVisionConfig values.")
+
+        if decoder_vision_config is None:
+            decoder_vision_config = {}
+            logger.info("`decoder_vision_config` is None. Initializing with default JanusDecoderVisionConfig values.")
+
+        self.text_config = JanusTextConfig(**text_config)
+        self.encoder_vision_config = JanusEncoderVisionConfig(**encoder_vision_config)
+        self.decoder_vision_config = JanusDecoderVisionConfig(**decoder_vision_config)
+
+__all__ = ["JanusDecoderVisionConfig", "JanusTextConfig", "JanusEncoderVisionConfig", "JanusConfig"]
diff --git a/src/transformers/models/janus/convert_janus_weights_to_hf.py b/src/transformers/models/janus/convert_janus_weights_to_hf.py
new file mode 100644
index 000000000000..f18d3d5a85aa
--- /dev/null
+++ b/src/transformers/models/janus/convert_janus_weights_to_hf.py
@@ -0,0 +1,204 @@
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import glob + +import torch +from huggingface_hub import file_exists, hf_hub_download, snapshot_download +from safetensors import safe_open + +from transformers import ( + AddedToken, + AutoConfig, + AutoImageProcessor, + AutoTokenizer, + JanusConfig, + JanusForConditionalGeneration, + LlavaProcessor, + SiglipVisionConfig, +) + + +EPILOG_TXT = """Example: + python transformers/src/transformers/models/janus/convert_janus_weights_to_hf.py --text_model_id lmsys/vicuna-7b-v1.5 --vision_model_id openai/clip-vit-large-patch14-336 --output_hub_path org/janus-v1.5-7b-conv --old_state_dict_id liuhaotian/janus-v1.5-7b + +Example for creating the old state dict file with Python: + + import torch + from janus.model.language_model.janus_llama import JanusLlamaForCausalLM + + # load model + kwargs = {"device_map": "auto", "torch_dtype": torch.float16} + model = JanusLlamaForCausalLM.from_pretrained("liuhaotian/janus-v1.5-7b", low_cpu_mem_usage=True, **kwargs) + + # load vision tower + model.get_vision_tower().load_model() + + # Save state dict + torch.save(model.state_dict(), "tmp/hf_models/janus-v1.5-7b/model_state_dict.bin") +""" + +KEYS_TO_MODIFY_MAPPING = { + "model.vision_tower.": "", + ".vision_resampler": "", # all lmms-lab models do avg pooling, so no vision_resampler + "model.mm_projector": "multi_modal_projector", + "model": "model.model", + "vision_model.model": "vision_model", + "lm_head": "language_model.lm_head", + "model.model": "language_model.model", + "multi_modal_projector.0": "multi_modal_projector.linear_1", + "multi_modal_projector.2": "multi_modal_projector.linear_2", +} + + +def load_original_state_dict(model_id): + directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) + + original_state_dict = {} + for path in glob.glob(f"{directory_path}/*"): + if path.endswith(".safetensors"): + with safe_open(path, framework="pt", device="cpu") as f: + for key in f.keys(): + original_state_dict[key] = f.get_tensor(key) + + # tied wieghts so lm.head is not saved. 
Let's clone to load state dict + if "lm_head.weight" not in original_state_dict: + original_state_dict["lm_head.weight"] = original_state_dict["model.embed_tokens.weight"].clone() + + if "model.image_newline" in original_state_dict: + # not used in the original implementation because "merge_type=flat" + del original_state_dict["model.image_newline"] + return original_state_dict + + +# used only for janus-interlave +# for ex: Qwen/Qwen1.5-0.5B-Chat google/siglip-so400m-patch14-384 lmms-lab/janus-next-interleave-qwen-0.5b +def convert_state_dict_to_hf(state_dict): + new_state_dict = {} + for key, value in state_dict.items(): + if key.endswith(".inv_freq"): + continue + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + new_state_dict[key] = value + return new_state_dict + + +def convert_janus_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): + torch.set_default_dtype(torch.float16) + text_config = AutoConfig.from_pretrained(text_model_id) + + tokenizer = AutoTokenizer.from_pretrained(text_model_id) + tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) + if "Qwen" not in text_model_id: # qwen already has a pad token + tokenizer.add_special_tokens({"pad_token": ""}) + + image_processor = AutoImageProcessor.from_pretrained(vision_model_id) + processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor) + + if "siglip" in vision_model_id: + vision_config = SiglipVisionConfig( + hidden_size=1152, + image_size=384, + intermediate_size=4304, + num_attention_heads=16, + num_hidden_layers=26, + patch_size=14, + vision_use_head=False, + ).to_dict() + else: + vision_config = None + + config = JanusConfig( + text_config=text_config, + vision_config=vision_config, + ) + + # llms-lab interleeave models do not use any selection startegy except for last hidden state + if "Qwen" in text_model_id: + config.image_token_index = 151646 + if "siglip" in vision_model_id: + config.vision_feature_select_strategy = "full" + config.vision_feature_layer = -1 + else: + config.pad_token_id = 32001 + config.image_token_index = 32000 + + with torch.device("meta"): + model = JanusForConditionalGeneration(config) + + # Some janus variants like microsoft/janus-med-v1.5-mistral-7b use safetensors to store weights + if file_exists(old_state_dict_id, "model_state_dict.bin"): + state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin") + state_dict = torch.load(state_dict_path, map_location="cpu", weights_only=True) + else: + state_dict = load_original_state_dict(old_state_dict_id) + + state_dict = convert_state_dict_to_hf(state_dict) + model.load_state_dict(state_dict, strict=True, assign=True) + + pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data + mu = torch.mean(pre_expansion_embeddings, dim=0).float() + n = pre_expansion_embeddings.size()[0] + sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n + dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma) + + # We add an image token so we resize the model and pad to 64 for performance reasons + pad_shape = 64 + vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape) + model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( + tuple( + (dist.sample() for _ in 
range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) + ), + dim=0, + ) + model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( + tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), + dim=0, + ) + + model.push_to_hub(output_hub_path) + processor.push_to_hub(output_hub_path) + + +def main(): + parser = argparse.ArgumentParser( + epilog=EPILOG_TXT, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--text_model_id", + help="Hub location of the text model", + ) + parser.add_argument( + "--vision_model_id", + help="Hub location of the vision model", + ) + parser.add_argument( + "--output_hub_path", + help="Location on the hub of the converted model", + ) + parser.add_argument( + "--old_state_dict_id", + help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", + ) + args = parser.parse_args() + convert_janus_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py new file mode 100644 index 000000000000..805ca300808d --- /dev/null +++ b/src/transformers/models/janus/image_processing_janus.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
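For context on the conversion script above: the rows appended by `resize_token_embeddings` are initialized by sampling from a Gaussian fitted to the existing embedding matrix, rather than left at their default initialization. A minimal, self-contained sketch of that trick with toy sizes (the dimensions below are illustrative, not the real checkpoint's):

```python
import torch

# Toy stand-in for the pre-expansion embedding matrix: 100 tokens, hidden size 16.
vocab_size, hidden_size, num_new_tokens = 100, 16, 2
pre_expansion_embeddings = torch.randn(vocab_size, hidden_size)

# Fit a Gaussian to the existing rows: mean vector and (scaled-down) covariance.
mu = torch.mean(pre_expansion_embeddings, dim=0).float()
sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / vocab_size
dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)

# Draw the new token rows from that distribution so they sit on the same scale
# as the trained embeddings, then append them to the matrix.
new_rows = torch.stack([dist.sample() for _ in range(num_new_tokens)], dim=0)
expanded = torch.cat([pre_expansion_embeddings, new_rows], dim=0)
print(expanded.shape)  # torch.Size([102, 16])
```

The `1e-5` scaling of the covariance keeps the sampled rows close to the embedding centroid, so newly added special tokens start out as "average" tokens instead of outliers.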
+"""Image processor class for Janus.""" + +from typing import Dict, List, Optional, Union +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from PIL import Image + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL +def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + +# Directly copied from siglip image processing file +class JanusImageProcessor(BaseImageProcessor): + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + image_size: int = 1024, + min_size: int = 14, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + self.do_resize = do_resize + self.image_size = image_size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + self.min_size = min_size + + if image_mean is None: + self.background_color = (127, 127, 127) + else: + self.background_color = tuple([int(x * 255) for x in image_mean]) + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC + def resize( + self, + image: np.ndarray, + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. 
Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + print(image.shape) + height, width, _ = image.shape + max_size = max(height,width) + output_size = [ + max(int(height / max_size * self.image_size), self.min_size), + max(int(width / max_size * self.image_size), self.min_size), + ] + + image = resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + return_numpy=False, + **kwargs, + ) + # expand and pad the images + image = expand2square(image, self.background_color) + image = to_numpy_array(image) + return image + + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: bool = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. 
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + # validate_preprocess_arguments( + # do_rescale=do_rescale, + # rescale_factor=rescale_factor, + # do_normalize=do_normalize, + # image_mean=image_mean, + # image_std=image_std, + # do_resize=do_resize, + # resample=resample, + # ) + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["JanusImageProcessor"]
diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/src/transformers/models/janus/processing_janus.py b/src/transformers/models/janus/processing_janus.py
new file mode 100644
index 000000000000..84bba54e6291
--- /dev/null
+++ b/src/transformers/models/janus/processing_janus.py
@@ -0,0 +1,99 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Janus.
+"""
+
+from typing import List, Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images
+from ...processing_utils import (
+    ImagesKwargs,
+    ProcessingKwargs,
+    ProcessorMixin,
+    TextKwargs,
+    Unpack,
+    _validate_images_text_input_order,
+)
+from ...tokenization_utils_base import (
+    AddedToken,
+    PreTokenizedInput,
+    TextInput,
+)
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+IMAGE_TOKEN = ""  # 576 image placeholder tokens per image
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_url
+def is_url(val) -> bool:
+    return isinstance(val, str) and val.startswith("http")
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
+def is_image_or_image_url(elem):
+    return is_url(elem) or is_valid_image(elem)
+
+
+def _is_str_or_image(elem):
+    return isinstance(elem, str) or is_image_or_image_url(elem)
+
+
+class JanusProcessor(ProcessorMixin):
+    r"""
+    Constructs a Janus processor which wraps a SigLIP image processor and a Llama tokenizer into a single processor.
+
+    [`JanusProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~JanusProcessor.__call__`] and [`~JanusProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`SiglipImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
+    image_processor_class = "SiglipImageProcessor"
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+        if not hasattr(image_processor, "image_seq_length"):
+            raise ValueError("Image processor is missing an `image_seq_length` attribute.")
+
+        self.image_seq_length = image_processor.image_seq_length
+
+        if not hasattr(tokenizer, "image_token"):
+            image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True)
+            tokens_to_add = {"additional_special_tokens": [image_token]}
+            tokenizer.add_special_tokens(tokens_to_add)
+            self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+        else:
+            self.image_token_id = tokenizer.image_token_id
+
+        tokenizer.add_bos_token = False
+        tokenizer.add_eos_token = False
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
\ No newline at end of file
diff --git a/tests/models/janus/test_modeling_janus.py b/tests/models/janus/test_modeling_janus.py
new file mode 100644
index 000000000000..cbfafd3268af
--- /dev/null
+++ b/tests/models/janus/test_modeling_janus.py
@@ -0,0 +1,647 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
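Since the processor above is still a draft, here is a rough sketch of the intended end-to-end usage, mirroring the integration tests that follow. The checkpoint id is taken from those tests; the `<image_placeholder>` token string and the exact prompt format are assumptions, as `IMAGE_TOKEN` is left unspecified in this draft.

```python
import requests
from PIL import Image

from transformers import AutoProcessor, JanusForConditionalGeneration

# Illustrative only: checkpoint id comes from the integration tests in this PR.
model_id = "deepseek-ai/Janus-Pro-1B"
processor = AutoProcessor.from_pretrained(model_id)
model = JanusForConditionalGeneration.from_pretrained(model_id)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
# "<image_placeholder>" is an assumed image token; the draft does not pin it down.
prompt = "USER: <image_placeholder>\nWhat is shown in this image? ASSISTANT:"

inputs = processor(images=image, text=prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=20)
print(processor.decode(output[0], skip_special_tokens=True))
```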
+"""Testing suite for the PyTorch Janus model.""" + +import unittest + +import requests +from parameterized import parameterized + +from transformers import ( + AutoProcessor, + AutoTokenizer, + JanusConfig, + JanusForConditionalGeneration, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + cleanup, + require_bitsandbytes, + require_torch, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + + +if is_vision_available(): + from PIL import Image + + +class JanusVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config={ + "model_type": "llama", + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 1, + }, + is_training=True, + vision_config={ + "image_size": 8, + "patch_size": 2, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.pad_token_id = text_config["pad_token_id"] + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = 3 + self.image_size = 336 + self.num_image_tokens = (self.vision_config["image_size"] // self.vision_config["patch_size"]) ** 2 + self.seq_length = seq_length + self.num_image_tokens + self.encoder_seq_length = self.seq_length + + def get_config(self): + return JanusConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + image_seq_length=self.num_image_tokens, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = 
self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 + attention_mask = input_ids.ne(1).to(torch_device) + input_ids[input_ids == config.image_token_index] = self.pad_token_id + input_ids[:, : self.num_image_tokens] = config.image_token_index + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + def create_and_check_janus_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask): + model = JanusForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + with torch.autocast(device_type="cuda", dtype=torch.float16): + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values.to(torch.bfloat16), + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + +@require_torch +class JanusForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `JanusForConditionalGeneration`. + """ + + all_model_classes = (JanusForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (JanusForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = JanusVisionText2TextModelTester(self) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=JanusConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + torch.testing.assert_close(out_embeds, out_ids) + + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. 
+ """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + @parameterized.expand( + [ + (-1,), + ([-1],), + ([-1, -2],), + ], + ) + def test_vision_feature_layers(self, vision_feature_layer): + """ + Test that we can use either one vision feature layer, or a list of + vision feature layers. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.vision_feature_layer = vision_feature_layer + + num_feature_layers = 1 if isinstance(vision_feature_layer, int) else len(vision_feature_layer) + hidden_size = config.vision_config.hidden_size + expected_features = hidden_size * num_feature_layers + + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + # We should have the right number of input features, + # and should be able to run a forward pass without exploding + assert model.multi_modal_projector.linear_1.in_features == expected_features + model(**input_dict) + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_compile_dynamic(self): + pass + + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_dispatch_on_flash(self): + pass + + @unittest.skip("FlashAttention only support fp16 and bf16 data type") + def test_flash_attn_2_fp32_ln(self): + pass + + @unittest.skip( + "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. 
Can be tested as part of LLM test" + ) + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + pass + + +@require_torch +class JanusForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("janus-hf/bakJanus-v1-hf") + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + @slow + @require_bitsandbytes + def test_small_model_integration_test(self): + # Let' s make sure we test the preprocessing to replace what is used + model = JanusForConditionalGeneration.from_pretrained("janus-hf/bakJanus-v1-hf", load_in_4bit=True) + + prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" + image_file = "https://janus-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20) + EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_single(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "deepseek-ai/Janus-Pro-1B" + + model = JanusForConditionalGeneration.from_pretrained("deepseek-ai/Janus-Pro-1B", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" + image_file = "https://janus-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + output = model.generate(**inputs, max_new_tokens=900, do_sample=False) + EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip + + self.assertEqual( + processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "deepseek-ai/Janus-Pro-1B" + + model = JanusForConditionalGeneration.from_pretrained("deepseek-ai/Janus-Pro-1B", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? 
ASSISTANT:", + "USER: \nWhat is this? ASSISTANT:", + ] + image1 = Image.open(requests.get("https://janus-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch(self): + # Let' s make sure we test the preprocessing to replace what is used + model = JanusForConditionalGeneration.from_pretrained("janus-hf/bakJanus-v1-hf", load_in_4bit=True) + # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://janus-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to( + torch_device + ) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = [ + 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', + 'USER: \nWhat is this?\nASSISTANT: Cats' + ] # fmt: skip + self.assertEqual( + self.processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched_regression(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "deepseek-ai/Janus-Pro-1B" + + # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) + model = JanusForConditionalGeneration.from_pretrained( + "deepseek-ai/Janus-Pro-1B", load_in_4bit=True, attn_implementation="eager" + ) + processor = AutoProcessor.from_pretrained(model_id, pad_token="") + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://janus-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True).to( + torch_device + ) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_torch + @require_vision + def test_batched_generation(self): + model = JanusForConditionalGeneration.from_pretrained("deepseek-ai/Janus-Pro-1B", load_in_4bit=True) + + processor = AutoProcessor.from_pretrained("deepseek-ai/Janus-Pro-1B") + + prompt1 = "\n\nUSER: What's the the difference of two images?\nASSISTANT:" + prompt2 = "\nUSER: Describe the image.\nASSISTANT:" + prompt3 = "\nUSER: Describe the image.\nASSISTANT:" + url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + image1 = Image.open(requests.get(url1, stream=True).raw) + image2 = Image.open(requests.get(url2, stream=True).raw) + + inputs = processor( + images=[image1, image2, image1, image2], + text=[prompt1, prompt2, prompt3], + return_tensors="pt", + padding=True, + ).to(torch_device) + + model = model.eval() + + EXPECTED_OUTPUT = [ + "\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", + "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", + "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. 
The llama is the", + ] + + generate_ids = model.generate(**inputs, max_new_tokens=20) + outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertEqual(outputs, EXPECTED_OUTPUT) + + def test_tokenizer_integration(self): + slow_tokenizer = AutoTokenizer.from_pretrained("liuhaotian/janus-v1.6-34b", use_fast=False) + slow_tokenizer.add_tokens("", True) + + fast_tokenizer = AutoTokenizer.from_pretrained( + "liuhaotian/janus-v1.6-34b", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + from_slow=True, + legacy=False, + ) + fast_tokenizer.add_tokens("", True) + + prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip + self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_generation_no_images(self): + model_id = "deepseek-ai/Janus-Pro-1B" + model = JanusForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + # Prepare inputs with no images + inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device) + + # Make sure that `generate` works + _ = model.generate(**inputs, max_new_tokens=20) + + @slow + @require_bitsandbytes + def test_generation_siglip_backbone(self): + model_id = "janus-hf/janus-interleave-qwen-0.5b-hf" + model = JanusForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + # check processing with expansion of inputs (w/o expansion should work with any backbone) + processor.vision_feature_select_strategy = "default" + processor.patch_size = 14 + + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor( + text="<|im_start|>user\n\nWhat are these?<|im_end|>\n<|im_start|>assistant", + images=raw_image, + return_tensors="pt", + ).to(torch_device, torch.float16) + + # Make sure that `generate` works + output = model.generate(**inputs, max_new_tokens=30) + + EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. 
The cat" + self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT) + + @slow + def test_pixtral(self): + model_id = "mistral-community/pixtral-12b" + model = JanusForConditionalGeneration.from_pretrained(model_id) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), + ] + PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" + + # image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(model.device) + generate_ids = model.generate(**inputs, max_new_tokens=500) + ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + print(ouptut) + + # fmt: off + EXPECTED_GENERATION = """ +Describe the images. +Certainly! Here are the descriptions of the images: + +1. **Image 1**: This image features a black dog with a glossy coat sitting on a wooden surface. The dog has a calm and attentive expression, looking directly at the camera. The wooden background has a rustic appearance with visible grain and texture. + +2. **Image 2**: This image captures a breathtaking view of a mountainous landscape. The mountains are rugged and covered with patches of green vegetation. The sky above is clear, and the scene conveys a sense of tranquility and natural beauty. + +3. **Image 3**: This image shows a beach scene during sunset. The waves are gently rolling onto the shore, and several people can be seen in the water, possibly surfing or swimming. The sky is painted with warm hues of orange and yellow, creating a serene and picturesque atmosphere. + +4. **Image 4**: This image depicts a narrow, winding path that cuts through a lush, green landscape. On either side of the path, there is dense grass and various trees, including a prominent tree with white blossoms. The sky is clear and blue, adding to the peaceful and inviting ambiance of the scene. + +These descriptions provide a detailed overview of the content and atmosphere of each image. +""" + # fmt: on + # check that both inputs are handled correctly and generate the same output + self.assertEqual(ouptut, EXPECTED_GENERATION) + + @slow + @require_bitsandbytes + def test_pixtral_4bit(self): + model_id = "mistral-community/pixtral-12b" + model = JanusForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + ] + PROMPT = "[INST][IMG][IMG]Describe the images.[/INST]" + + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to(torch_device, torch.float16) + generate_ids = model.generate(**inputs, max_new_tokens=50) + output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + EXPECTED_GENERATION = "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. 
The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador" # fmt: skip + self.assertEqual(output, EXPECTED_GENERATION) + + @slow + @require_bitsandbytes + def test_pixtral_batched(self): + model_id = "mistral-community/pixtral-12b" + model = JanusForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/500", stream=True).raw), + ] + PROMPT = [ + "[INST][IMG]What breed is the dog?[/INST]", + "[INST][IMG]What is shown in this image?[/INST]", + ] + + inputs = processor(text=PROMPT, images=IMG_URLS, padding=True, return_tensors="pt").to( + torch_device, torch.float16 + ) + generate_ids = model.generate(**inputs, max_new_tokens=50) + output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + + EXPECTED_GENERATION = [ + 'What breed is the dog?The dog in the image is a black Labrador Retriever.', + 'What is shown in this image?The image depicts a narrow, winding dirt path surrounded by lush greenery. The path is flanked by grass and shrubs on both sides. On the left side, there are tall trees and dense foliage, while on the right side, there' + ] # fmt: skip + self.assertEqual(output, EXPECTED_GENERATION) From fb1b57ec37a92a7405418bdfda1e253ca6d6bc95 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Wed, 5 Feb 2025 22:23:16 +0530 Subject: [PATCH 3/7] discard changes --- src/transformers/generation/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 1fd9da870217..45558bd22a4e 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -383,9 +383,7 @@ def prepare_inputs_for_generation( # (we can't check exception 3 while compiling) if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if inputs_embeds is not None and input_ids.shape[1]==0: - inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] - elif ( + if ( inputs_embeds is not None # Exception 1 or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 ): @@ -395,9 +393,9 @@ def prepare_inputs_for_generation( # 3. Prepare base model inputs input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step for every prompt. 
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if not self.config.is_encoder_decoder: - if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: + if inputs_embeds is not None and cache_position[0] == 0: model_inputs[input_ids_key] = None model_inputs["inputs_embeds"] = inputs_embeds else: From 730fcd767f5357059f4f8d19adfeb7459e54d976 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 11 Feb 2025 00:40:39 +0530 Subject: [PATCH 4/7] Janus imports --- src/transformers/__init__.py | 18 +++++++++--------- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 4 ++-- src/transformers/models/auto/modeling_auto.py | 6 +++--- .../models/auto/processing_auto.py | 2 +- .../models/auto/tokenization_auto.py | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 08eb48773919..749aa9def0ef 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -560,7 +560,8 @@ ], "models.janus": [ "JanusConfig", - + "JanusImageProcessor", + "JanusProcessor" ], "models.llava_next": [ "LlavaNextConfig", @@ -5613,6 +5614,9 @@ InstructBlipVideoVisionConfig, ) from .models.jamba import JambaConfig + from .models.janus import ( + JanusConfig, + ) from .models.jetmoe import JetMoeConfig from .models.kosmos2 import ( Kosmos2Config, @@ -5645,10 +5649,6 @@ LlavaConfig, LlavaProcessor, ) - from .models.janus import ( - JanusConfig, - - ) from .models.llava_next import ( LlavaNextConfig, LlavaNextProcessor, @@ -7529,6 +7529,10 @@ JambaModel, JambaPreTrainedModel, ) + from .models.janus import ( + JanusForConditionalGeneration, + JanusPreTrainedModel, + ) from .models.jetmoe import ( JetMoeForCausalLM, JetMoeForSequenceClassification, @@ -7594,10 +7598,6 @@ LlavaForConditionalGeneration, LlavaPreTrainedModel, ) - from .models.janus import ( - JanusForConditionalGeneration, - JanusPreTrainedModel, - ) from .models.llava_next import ( LlavaNextForConditionalGeneration, LlavaNextPreTrainedModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index bfab93812215..b9cebbba34bc 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -132,6 +132,7 @@ instructblip, instructblipvideo, jamba, + janus, jetmoe, kosmos2, layoutlm, @@ -143,7 +144,6 @@ lilt, llama, llava, - janus, llava_next, llava_next_video, llava_onevision, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 071a5b0534f8..a6c1ece2e2d3 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -153,6 +153,7 @@ ("instructblip", "InstructBlipConfig"), ("instructblipvideo", "InstructBlipVideoConfig"), ("jamba", "JambaConfig"), + ("janus", "JanusConfig"), ("jetmoe", "JetMoeConfig"), ("jukebox", "JukeboxConfig"), ("kosmos-2", "Kosmos2Config"), @@ -164,7 +165,6 @@ ("lilt", "LiltConfig"), ("llama", "LlamaConfig"), ("llava", "LlavaConfig"), - ("janus", "JanusConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), ("llava_onevision", "LlavaOnevisionConfig"), @@ -482,6 +482,7 @@ ("instructblip", "InstructBLIP"), ("instructblipvideo", "InstructBlipVideo"), ("jamba", "Jamba"), + ("janus", "Janus"), ("jetmoe", "JetMoe"), ("jukebox", "Jukebox"), ("kosmos-2", "KOSMOS-2"), @@ -496,7 +497,6 @@ ("llama2", "Llama2"), ("llama3", "Llama3"), ("llava", "LLaVa"), 
- ("janus", "Janus"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), ("llava_onevision", "LLaVA-Onevision"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 28f20b8e50be..260f322d42ff 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -340,9 +340,9 @@ ("idefics", "IdeficsForVisionText2Text"), ("idefics2", "Idefics2ForConditionalGeneration"), ("idefics3", "Idefics3ForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), - ("janus", "JanusForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), @@ -783,9 +783,9 @@ ("idefics3", "Idefics3ForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"), ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), - ("janus", "JanusForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), @@ -820,9 +820,9 @@ ("idefics2", "Idefics2ForConditionalGeneration"), ("idefics3", "Idefics3ForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), - ("janus", "JanusForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("mllama", "MllamaForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 72b63e9b6d13..17706265f560 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -72,11 +72,11 @@ ("idefics3", "Idefics3Processor"), ("instructblip", "InstructBlipProcessor"), ("instructblipvideo", "InstructBlipVideoProcessor"), + ("janus", "JanusProcessor"), ("kosmos-2", "Kosmos2Processor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("llava", "LlavaProcessor"), - ("janus", "JanusProcessor"), ("llava_next", "LlavaNextProcessor"), ("llava_next_video", "LlavaNextVideoProcessor"), ("llava_onevision", "LlavaOnevisionProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 3a49dd4b7880..0233645d27f3 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -242,6 +242,7 @@ "LlamaTokenizerFast" if is_tokenizers_available() else None, ), ), + ("janus", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ( "jetmoe", ( @@ -271,7 +272,6 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("janus", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next_video", ("LlamaTokenizer", 
"LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), From 0aab45ad4a033b00e94f8c0899285ae0987dd028 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 11 Feb 2025 00:42:29 +0530 Subject: [PATCH 5/7] Refactor config and processor --- .../models/janus/configuration_janus.py | 61 +++--- .../models/janus/image_processing_janus.py | 18 +- .../models/janus/processing_janus.py | 185 ++++++++++++++---- 3 files changed, 196 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/janus/configuration_janus.py b/src/transformers/models/janus/configuration_janus.py index 70eb00e688b1..deaaa85745f2 100644 --- a/src/transformers/models/janus/configuration_janus.py +++ b/src/transformers/models/janus/configuration_janus.py @@ -12,21 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. """Janus model configuration""" -import copy -import os -from typing import Any, Dict, Optional, Union from ...configuration_utils import PretrainedConfig -from ...utils import logging - -from ..auto import CONFIG_MAPPING, AutoConfig from ...modeling_rope_utils import rope_config_validation +from ...utils import logging logger = logging.get_logger(__name__) -class JanusEncoderVisionConfig(PretrainedConfig): +class JanusVisionEncoderConfig(PretrainedConfig): """Encoder Vision config in this case its the SIGLIP model""" model_type = "siglip_vision_model" @@ -37,23 +32,28 @@ def __init__( hidden_size=1024, mlp_ratio=4.0, projection_dim=1024, - num_hidden_layers=24, + num_hidden_layers=4, num_attention_heads=16, num_channels=3, - num_frames=2, image_size=384, - patch_size=14, + patch_size=16, hidden_act="gelu", layer_norm_eps=1e-6, - add_kv_bias=False, + qkv_bias=False, attention_dropout=0.0, drop_path_rate=0.0, initializer_range=0.02, initializer_factor=1.0, logit_scale_init_value=None, learnable_logit_scale=False, - select_feature = "same", - select_layer = -1, + select_feature="same", + select_layer=-1, + num_register_tokens=0, + hidden_dropout_rate=0, + projection_dropout=0, + use_qk_norm = False, + layerscale_value=None, + vision_use_head = True, **kwargs, ): super().__init__(**kwargs) @@ -64,20 +64,23 @@ def __init__( self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.num_channels = num_channels - self.num_frames = num_frames self.patch_size = patch_size self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.add_kv_bias = add_kv_bias + self.qkv_bias = qkv_bias self.attention_dropout = attention_dropout self.drop_path_rate = drop_path_rate self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - self.logit_scale_init_value = logit_scale_init_value - self.learnable_logit_scale = learnable_logit_scale - self.feature_size = image_size self.intermediate_size = int(hidden_size * mlp_ratio) + self.num_register_tokens = num_register_tokens + self.hidden_dropout_rate = hidden_dropout_rate + self.projection_dropout = projection_dropout + self.use_qk_norm = use_qk_norm + self.layerscale_value = layerscale_value + self.select_layer = select_layer + self.select_feature = select_feature + self.vision_use_head = vision_use_head + class JanusTextConfig(PretrainedConfig): r""" @@ -272,12 +275,15 @@ def __init__( **kwargs, ) + class JanusDecoderVisionConfig(PretrainedConfig): """A custom VQ config model""" + # TODO def __init__(self): 
pass + class JanusConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`JanusForConditionalGeneration`]. It is used to instantiate an @@ -334,9 +340,13 @@ class JanusConfig(PretrainedConfig): ```""" model_type = "janus" - sub_configs = {"text_config": JanusTextConfig, "encoder_vision_config": JanusEncoderVisionConfig, "decoder_vision_config": JanusDecoderVisionConfig} + sub_configs = { + "text_config": JanusTextConfig, + "encoder_vision_config": JanusVisionEncoderConfig, + "decoder_vision_config": JanusDecoderVisionConfig, + } - def __init__(self, text_config, encoder_vision_config,decoder_vision_config, **kwargs): + def __init__(self, text_config, encoder_vision_config, decoder_vision_config, **kwargs): super.__init__(**kwargs) if text_config is None: @@ -345,14 +355,15 @@ def __init__(self, text_config, encoder_vision_config,decoder_vision_config, **k if encoder_vision_config is None: encoder_vision_config = {} - logger.info("`encodr_vision_config` is None. Initializaing with default JanusEncoderVisionConfig values") + logger.info("`encodr_vision_config` is None. Initializaing with default JanusVisionEncoderConfig values") if decoder_vision_config is None: decoder_vision_config = {} logger.info("`text_config` is None. Initializaing with default JanusDecoderVisionConfig values") text_config = JanusTextConfig(**text_config) - encoder_vision_config = JanusEncoderVisionConfig(**encoder_vision_config) + encoder_vision_config = JanusVisionEncoderConfig(**encoder_vision_config) decoder_vision_config = JanusDecoderVisionConfig(**decoder_vision_config) -__all__ = ["JanusDecoderVisionConfig","JanusTextConfig","JanusEncoderVisionConfig","JanusConfig"] + +__all__ = ["JanusDecoderVisionConfig", "JanusTextConfig", "JanusVisionEncoderConfig", "JanusConfig"] diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index 805ca300808d..223de4397336 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -14,10 +14,12 @@ # limitations under the License. """Image processor class for Janus.""" -from typing import Dict, List, Optional, Union +from typing import List, Optional, Union + import numpy as np +from PIL import Image -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_processing_utils import BaseImageProcessor, BatchFeature from ...image_transforms import ( convert_to_rgb, resize, @@ -37,7 +39,6 @@ validate_preprocess_arguments, ) from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging -from PIL import Image logger = logging.get_logger(__name__) @@ -45,6 +46,8 @@ if is_vision_available(): import PIL + + def expand2square(pil_img, background_color): width, height = pil_img.size if width == height: @@ -58,6 +61,7 @@ def expand2square(pil_img, background_color): result.paste(pil_img, ((height - width) // 2, 0)) return result + # Directly copied from siglip image processing file class JanusImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] @@ -131,9 +135,8 @@ def resize( Returns: `np.ndarray`: The resized image. 
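+                The image is resized so that its longer edge matches `self.image_size` while keeping
+                the aspect ratio, and both output dimensions are clamped to at least `self.min_size`
+                (see the `output_size` computation below).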
""" - print(image.shape) height, width, _ = image.shape - max_size = max(height,width) + max_size = max(height, width) output_size = [ max(int(height / max_size * self.image_size), self.min_size), max(int(width / max_size * self.image_size), self.min_size), @@ -153,7 +156,6 @@ def resize( image = to_numpy_array(image) return image - @filter_out_non_signature_kwargs() def preprocess( self, @@ -254,12 +256,10 @@ def preprocess( if input_data_format is None: # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - print(input_data_format) #ideally should be channel first if do_resize: images = [ - self.resize(image=image, resample=resample, input_data_format=input_data_format) - for image in images + self.resize(image=image, resample=resample, input_data_format=input_data_format) for image in images ] if do_rescale: images = [ diff --git a/src/transformers/models/janus/processing_janus.py b/src/transformers/models/janus/processing_janus.py index 84bba54e6291..b24c4b432735 100644 --- a/src/transformers/models/janus/processing_janus.py +++ b/src/transformers/models/janus/processing_janus.py @@ -13,32 +13,42 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Processor class for PaliGemma. +Processor class for Janus. """ -from typing import List, Optional, Union - +from ...image_utils import is_valid_image from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images +from ...image_processing_utils import select_best_resolution +from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType, logging +from typing import TYPE_CHECKING, List, Optional, Union from ...processing_utils import ( - ImagesKwargs, - ProcessingKwargs, ProcessorMixin, - TextKwargs, - Unpack, - _validate_images_text_input_order, ) from ...tokenization_utils_base import ( AddedToken, - PreTokenizedInput, - TextInput, ) from ...utils import logging +import torch logger = logging.get_logger(__name__) -IMAGE_TOKEN = "" #576 image placeholder tokens +IMAGE_TOKEN = "" + +DEFAULT_SYSTEM_PROMPT = ( + "You are a helpful language and vision assistant. " + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.\n\n" + ) + +# messages = [{"role":"User", +# "content":[{'type':"text","text":"\nConvert the formula into latex code.\n"}]}, +# {"role": "Assistant", "content": " "}, +# ] +# Here as a hack I have added \n after user content but ideally chat template should add it # Copied from transformers.models.idefics2.processing_idefics2.is_url def is_url(val) -> bool: @@ -53,6 +63,15 @@ def is_image_or_image_url(elem): def _is_str_or_image(elem): return isinstance(elem, (str)) or is_image_or_image_url(elem) +class JanusProcessorKwargs(ProcessingKwargs, total=False): + # see processing_utils.ProcessingKwargs documentation for usage. + _defaults = { + "text_kwargs": { + "padding": False, + "return_tensors":"pt" + } + } + class JanusProcessor(ProcessorMixin): r""" Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor. 
@@ -69,31 +88,129 @@ class JanusProcessor(ProcessorMixin): in a chat into a tokenizable string. """ - attributes = ['image_processor','tokenizer'] - valid_kwargs = ['chat_template'] - image_processing_class = ["SiglipImageProcessor"] - tokenizer_class = ["LLamaTokenizer","LlamaTokenizerFast"] - - def __init__(self, image_processor, tokenizer, chat_template, **kwargs): + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] + image_processor_class = "JanusImageProcessor" + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + def __init__(self, image_processor, tokenizer, chat_template=None,use_default_system_prompt=True, **kwargs): if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") - if not hasattr(image_processor, "image_seq_length"): - raise ValueError("Image processor is missing an `image_seq_length` attribute.") - - self.image_seq_length = image_processor.image_seq_length - - if not hasattr(tokenizer, "image_token"): - image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True) - tokens_to_add = {"additional_special_tokens": [image_token]} - tokenizer.add_special_tokens(tokens_to_add) - self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) - else: - self.image_token_id = tokenizer.image_token_id - - tokenizer.add_bos_token = False - tokenizer.add_eos_token = False - super().__init__(image_processor, tokenizer, chat_template=chat_template) \ No newline at end of file + self.num_image_tokens = 10 # revert back to 576 or fetch from pre_processor config + self.image_start_token = "" + self.image_end_token = "" + self.use_default_system_prompt = use_default_system_prompt + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + images: ImageInput = None, + **kwargs: Unpack[JanusProcessorKwargs] + ) -> BatchFeature: + """Construct a Janus processor with JanusImage procesor and Llama text tokenizer""" + + output_kwargs = self._merge_kwargs(JanusProcessorKwargs,tokenizer_init_kwargs=self.tokenizer.init_kwargs,**kwargs) + + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + + data = {} + if text is not None: + if isinstance(text, str): + text = [text] + elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + # Replace the image token with explanded imaeg tokens. + prompt_strings = [] + one_img_tokens = self.image_start_token + (IMAGE_TOKEN * self.num_image_tokens) + self.image_end_token + for sample in text: + sample = sample.strip() + sample = sample.replace(IMAGE_TOKEN, one_img_tokens) + if self.use_default_system_prompt: + sample = DEFAULT_SYSTEM_PROMPT + sample + prompt_strings.append(sample) + + + data = self.tokenizer(prompt_strings,**output_kwargs['text_kwargs']) + + if images is not None: + # How to pass image kwargs and it returns the pixel values aso append it to the output. 
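+            # Note: `self.image_processor` returns a batched `pixel_values` tensor. The masks built
+            # further below flag which positions of `input_ids` hold image (and image-start) tokens
+            # (`images_seq_mask`) and which image embedding slots are actually used for each sample
+            # (`images_emb_mask`).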
+ data['pixel_values'] = self.image_processor(images=images,return_tensors="pt")['pixel_values'] + + input_ids = data["input_ids"] + batch_size, _ = input_ids.shape + + # Compute special tokens IDs + image_token_id = self.tokenizer.vocab.get(IMAGE_TOKEN) + image_start_id = self.tokenizer.vocab.get(self.image_start_token) + + # Compute image sequence mask + images_seq_mask = (input_ids == image_token_id) | (input_ids == image_start_id) + + # Compute image embedding mask dynamically + max_n_images = max(1,len(images)) + images_emb_mask = torch.zeros((batch_size, max_n_images, self.num_image_tokens + 1), dtype=torch.bool) + + for i in range(batch_size): + img_positions = (input_ids[i] == image_start_id).nonzero(as_tuple=True)[0] + for j, start_idx in enumerate(img_positions): + end_idx = start_idx + self.num_image_tokens + 1 # Account for + images_emb_mask[i, j, : min(end_idx - start_idx, self.num_image_tokens + 1)] = True + + # Process images if provided + if images is not None: + data["pixel_values"] = self.image_processor(images=images, return_tensors="pt")["pixel_values"] + + # Add masks to the output + data.update({ + "images_seq_mask": images_seq_mask, + "images_emb_mask": images_emb_mask + }) + + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode( + generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["JanusProcessor"] \ No newline at end of file From df0672139d88db3a7a20a2d5b3eed271121d18ba Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 11 Feb 2025 00:43:01 +0530 Subject: [PATCH 6/7] Added Vision tower of Janus --- .../models/janus/modeling_janus.py | 563 ++++++++++++++++++ .../models/janus/modular_janus.py | 486 +++++++++++++++ 2 files changed, 1049 insertions(+) create mode 100644 src/transformers/models/janus/modeling_janus.py diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py new file mode 100644 index 000000000000..5a64b0c2d68a --- /dev/null +++ b/src/transformers/models/janus/modeling_janus.py @@ -0,0 +1,563 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/janus/modular_janus.py. 
+# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_janus.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +from typing import Optional, Tuple, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from .configuration_janus import JanusVisionEncoderConfig + + +class PatchDropout(nn.Module): + """ + https://arxiv.org/abs/2212.00794 and https://arxiv.org/pdf/2208.07220 + """ + + return_indices: torch.jit.Final[bool] + + def __init__( + self, + prob: float = 0.5, + num_prefix_tokens: int = 1, + ordered: bool = False, + return_indices: bool = False, + ): + super().__init__() + assert 0 <= prob < 1.0 + self.prob = prob + self.num_prefix_tokens = num_prefix_tokens # exclude CLS token (or other prefix tokens) + self.ordered = ordered + self.return_indices = return_indices + + def forward(self, x) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]: + if not self.training or self.prob == 0.0: + if self.return_indices: + return x, None + return x + + if self.num_prefix_tokens: + prefix_tokens, x = x[:, : self.num_prefix_tokens], x[:, self.num_prefix_tokens :] + else: + prefix_tokens = None + + B = x.shape[0] + L = x.shape[1] + num_keep = max(1, int(L * (1.0 - self.prob))) + keep_indices = torch.argsort(torch.randn(B, L, device=x.device), dim=-1)[:, :num_keep] + if self.ordered: + # NOTE does not need to maintain patch order in typical transformer use, + # but possibly useful for debug / visualization + keep_indices = keep_indices.sort(dim=-1)[0] + x = x.gather(1, keep_indices.unsqueeze(-1).expand((-1, -1) + x.shape[2:])) + + if prefix_tokens is not None: + x = torch.cat((prefix_tokens, x), dim=1) + + if self.return_indices: + return x, keep_indices + return x + + +class JanusVisionEncoderPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." 
+ ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class JanusVisionEncoderEmbeddings(nn.Module): + def __init__( + self, + config: JanusVisionEncoderConfig, + use_special_tokens: bool, + ): + super().__init__() + + self.use_special_tokens = use_special_tokens + self.cls_token = nn.Parameter(torch.rand(1, 1, config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + # Currently using hidden_drop_rate instead of positional_dropout_rate, is it necessary? + self.dropout = nn.Dropout(config.hidden_dropout_rate) + self.patch_embeddings = JanusVisionEncoderPatchEmbeddings(config) + self.num_patches = self.patch_embeddings.num_patches + + num_prefix_tokens = config.num_register_tokens + 1 + pos_embed_len = self.num_patches + num_prefix_tokens if use_special_tokens else self.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, pos_embed_len, config.hidden_size) * 0.02) + + # Used to reduce computationality. + self.patch_dropout = ( + PatchDropout(config.drop_path_rate, num_prefix_tokens) if config.drop_path_rate else nn.Identity() + ) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + # Add CLS tokens + special_token_embeddings = [] + cls_token_embeddings = self.cls_token.expand((batch_size, -1, -1)) + special_token_embeddings.append(cls_token_embeddings) + + if self.register_tokens.shape[1]: + register_token_embeddings = self.register_tokens.expand((batch_size, -1, -1)) + special_token_embeddings.append(register_token_embeddings) + + if self.use_special_tokens and not self.cls_token and not self.register_tokens: + raise ValueError("You have passed to use special tokens but the CLS and Register Tokens are None.") + + if self.use_special_tokens: + embeddings = embeddings + self.position_embeddings + embeddings = torch.cat(special_token_embeddings + [embeddings], dim=1) + else: + # embeddings = torch.cat(special_token_embeddings+[embeddings], dim=1) + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + # Perform Patch dropout + embeddings = self.patch_dropout(embeddings) + return embeddings + + +# Todo: introduce compatiability for cache +class JanusVisionEncoderAttention(nn.Module): + """Attention Class for Janus Vision Encoder""" + + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + proj_dropout = config.projection_dropout + qk_norm = config.use_qk_norm + + # Split the weights manually and checkif getting correct output or not + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias) + self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim) + self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity() + + self.query_norm = nn.LayerNorm(self.embed_dim) if qk_norm else nn.Identity() + self.key_norm = nn.LayerNorm(self.embed_dim) if qk_norm else nn.Identity() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[torch.Tensor] = None, + ): + batch_size, seq_len, _ = hidden_states.size() + + qkv = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim) + query_states, key_states, value_states = qkv.unbind(2) + # batch numhead, seq len, head dim + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + query_states = self.query_norm(query_states) + key_states = self.key_norm(key_states) + + # Is it a bug or deliberate change? + query_states = query_states * self.scale + + # batch, num head,seqlen,seqlen + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Only apply attention dropout during training. + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, seq_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, seq_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, seq_len, self.head_dim): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, seq_len, self.head_dim)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim) + + output = self.projection_layer(attn_output) + output = self.projection_dropout(output) + + outputs = (output, attn_weights) if output_attentions else (output, None) + return outputs + + +class JanusVisionEncoderLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
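+    In practice, each sample's branch output is zeroed with probability `drop_prob` during training
+    and the surviving samples are rescaled by `1 / (1 - drop_prob)` so the expected value is preserved;
+    outside of training the input is returned unchanged.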
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class JanusVisionEncoderDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class JanusVisionEncoderMLP(nn.Module): + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] # Gelu act + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout1 = nn.Dropout(config.hidden_dropout_rate) + self.dropout2 = nn.Dropout(config.hidden_dropout_rate) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.dropout1(hidden_states) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout2(hidden_states) + return hidden_states + + +class JanusVisionEncoderLayer(nn.Module): + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.self_attn = JanusVisionEncoderAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + self.layer_scale1 = JanusVisionEncoderLayerScale(config) if config.layerscale_value else nn.Identity() + self.layer_scale2 = JanusVisionEncoderLayerScale(config) if config.layerscale_value else nn.Identity() + self.drop_path1 = ( + JanusVisionEncoderDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + self.drop_path2 = ( + JanusVisionEncoderDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + self.mlp = JanusVisionEncoderMLP(config) + + # Ignore copy + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + + # Pre-Norm before applying attention. + norm_hidden_states = self.layer_norm1(hidden_states) + attn_output, attn_weights = self.self_attn( + norm_hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + + # Apply DropPath & LayerScale to Attention Output and then residual connection. 
+ hidden_states = hidden_states + self.drop_path1(self.layer_scale1(attn_output)) + + hidden_states = self.layer_norm2(hidden_states) + mlp_output = self.mlp(hidden_states) + + hidden_states = hidden_states + self.drop_path2(self.layer_scale2(mlp_output)) + + return (hidden_states, attn_weights if output_attentions else None) + + +# copied from SiglipMultiheadAttentionPoolingHead +# class JanusAttentionPoolLatent(SiglipMultiheadAttentionPoolingHead): +# pass + + +class JanusAttentionPoolLatent(nn.Module): + """Hugging Face-compatible Attention Pooling with Manual QKV""" + + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + + self.latent_len = getattr(config, "latent_len", 1) + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.mlp_ratio = getattr(config, "mlp_ratio", 4.0) + self.scale = self.head_dim**-0.5 # Scaling factor for attention + + # Learnable latent query (probe) + self.latent = nn.Parameter(torch.zeros(1, self.latent_len, self.hidden_size)) + + # Linear layers for QKV projection + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size) + self.kv_proj = nn.Linear(self.hidden_size, self.hidden_size * 2) + self.proj = nn.Linear(self.hidden_size, self.hidden_size) + + # Normalization & MLP + self.norm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps) + self.mlp = JanusVisionEncoderMLP(config) + + self.proj_drop = nn.Dropout(getattr(config, "dropout", 0.0)) + + def forward(self, hidden_state): + batch_size, seq_len, _ = hidden_state.shape + + # Expand learnable latent tokens for batch + q_latent = self.latent.expand(batch_size, -1, -1) # (B, latent_len, hidden_size) + + # Compute Q projection from latent tokens + q = self.q_proj(q_latent) # (B, latent_len, hidden_size) + + # Compute combined KV projection + kv = self.kv_proj(hidden_state) # (B, seq_len, 2 * hidden_size) + k, v = kv.view(batch_size, seq_len, 2, self.num_heads, self.head_dim).unbind(2) + # Batch_sisxe, num_heads, seq_len, head_dim + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + # Reshape Q for multi-head attention (B, N, H) → (B, num_heads, N, head_dim) + q = q.view(batch_size, self.latent_len, self.num_heads, self.head_dim).transpose( + 1, 2 + ) # (B, num_heads, latent_len, head_dim) + + # Scaled dot-product attention (without `torch.nn.MultiheadAttention`) + attn_weights = (q @ k.transpose(2, 3)) * self.scale # (B, num_heads, latent_len, seq_len) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + x = attn_weights @ v # (B, num_heads, latent_len, head_dim) + + x = x.transpose(1, 2).reshape(batch_size, self.latent_len, self.hidden_size) + + x = self.proj(x) + x = self.proj_drop(x) + + # Residual connection + MLP + residual = x + x = self.norm(x) + x = residual + self.mlp(x) + + return x[:, 0] # Return first latent token (like CLS token) + + +# Copied from siglip encoder +class JanusVisionEncoder(nn.Module): + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([JanusVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of 
shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class JanusPreTrainedModel: + """An abstract class to load pretrained weigths""" + + pass + + +# Copied from siglip vision transformer +class JanusVisionEncoderTransformer(nn.Module): + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.embeddings = JanusVisionEncoderEmbeddings(config, use_special_tokens=False) + self.encoder = JanusVisionEncoder(config) + self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head + if self.use_head: + self.head = JanusAttentionPoolLatent(config) + # Won't be using as a standalone classifier head hence no num classes + + def forward( + self, + pixel_values, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + 
return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooler_output = self.head(hidden_states) + + if not return_dict: + return (last_hidden_state) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooler_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index e69de29bb2d1..2c946b6beb82 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -0,0 +1,486 @@ +import math +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from torch.nn.init import _calculate_fan_in_and_fan_out + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, + torch_int, +) + +from .configuration_janus import JanusVisionEncoderConfig +from ..vit.modeling_vit import ViTPatchEmbeddings +from ..dinov2_with_registers.modeling_dinov2_with_registers import Dinov2WithRegistersLayerScale, Dinov2WithRegistersDropPath +from ..siglip.modeling_siglip import SiglipEncoder, SiglipVisionTransformer, SiglipVisionModel, SiglipMultiheadAttentionPoolingHead + +class PatchDropout(nn.Module): + """ + https://arxiv.org/abs/2212.00794 and https://arxiv.org/pdf/2208.07220 + """ + return_indices: torch.jit.Final[bool] + + def __init__( + self, + prob: float = 0.5, + num_prefix_tokens: int = 1, + ordered: bool = False, + return_indices: bool = False, + ): + super().__init__() + assert 0 <= prob < 1. + self.prob = prob + self.num_prefix_tokens = num_prefix_tokens # exclude CLS token (or other prefix tokens) + self.ordered = ordered + self.return_indices = return_indices + + def forward(self, x) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]: + if not self.training or self.prob == 0.: + if self.return_indices: + return x, None + return x + + if self.num_prefix_tokens: + prefix_tokens, x = x[:, :self.num_prefix_tokens], x[:, self.num_prefix_tokens:] + else: + prefix_tokens = None + + B = x.shape[0] + L = x.shape[1] + num_keep = max(1, int(L * (1. 
- self.prob))) + keep_indices = torch.argsort(torch.randn(B, L, device=x.device), dim=-1)[:, :num_keep] + if self.ordered: + # NOTE does not need to maintain patch order in typical transformer use, + # but possibly useful for debug / visualization + keep_indices = keep_indices.sort(dim=-1)[0] + x = x.gather(1, keep_indices.unsqueeze(-1).expand((-1, -1) + x.shape[2:])) + + if prefix_tokens is not None: + x = torch.cat((prefix_tokens, x), dim=1) + + if self.return_indices: + return x, keep_indices + return x + +class JanusVisionEncoderPatchEmbeddings(ViTPatchEmbeddings): + pass + + +class JanusVisionEncoderEmbeddings(nn.Module): + def __init__(self, config:JanusVisionEncoderConfig, use_special_tokens: bool,): + super().__init__() + + self.use_special_tokens = use_special_tokens + self.cls_token = nn.Parameter(torch.rand(1,1,config.hidden_size)) + self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size)) + # Currently using hidden_drop_rate instead of positional_dropout_rate, is it necessary? + self.dropout = nn.Dropout(config.hidden_dropout_rate) + self.patch_embeddings = JanusVisionEncoderPatchEmbeddings(config) + self.num_patches = self.patch_embeddings.num_patches + + num_prefix_tokens = config.num_register_tokens + 1 + pos_embed_len = self.num_patches + num_prefix_tokens if use_special_tokens else self.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, pos_embed_len, config.hidden_size) * 0.02) + + # Used to reduce computationality. + self.patch_dropout = PatchDropout(config.drop_path_rate, num_prefix_tokens) if config.drop_path_rate else nn.Identity() + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + # Add CLS tokens + special_token_embeddings = [] + cls_token_embeddings = self.cls_token.expand((batch_size, -1,-1)) + special_token_embeddings.append(cls_token_embeddings) + + if self.register_tokens.shape[1]: + register_token_embeddings = self.register_tokens.expand((batch_size, -1,-1)) + special_token_embeddings.append(register_token_embeddings) + + if self.use_special_tokens and not self.cls_token and not self.register_tokens: + raise ValueError("You have passed to use special tokens but the CLS and Register Tokens are None.") + + if self.use_special_tokens: + embeddings = embeddings + self.position_embeddings + embeddings = torch.cat(special_token_embeddings+[embeddings], dim=1) + else: + # embeddings = torch.cat(special_token_embeddings+[embeddings], dim=1) + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + + # Perform Patch dropout + embeddings = self.patch_dropout(embeddings) + return embeddings + +# Todo: introduce compatiability for cache +class JanusVisionEncoderAttention(nn.Module): + """Attention Class for Janus Vision Encoder """ + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+        )
+        self.scale = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        proj_dropout = config.projection_dropout
+        qk_norm = config.use_qk_norm
+
+        # QKV is computed with a single fused linear projection and split into heads manually below.
+        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
+        self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)
+        self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity()
+
+        self.query_norm = nn.LayerNorm(self.embed_dim) if qk_norm else nn.Identity()
+        self.key_norm = nn.LayerNorm(self.embed_dim) if qk_norm else nn.Identity()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+    ):
+        batch_size, seq_len, _ = hidden_states.size()
+
+        qkv = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
+        query_states, key_states, value_states = qkv.unbind(2)
+        # (batch_size, num_heads, seq_len, head_dim)
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        query_states = self.query_norm(query_states)
+        key_states = self.key_norm(key_states)
+
+        # Queries are pre-scaled here, which is equivalent to scaling the attention scores.
+        query_states = query_states * self.scale
+
+        # (batch_size, num_heads, seq_len, seq_len)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+
+        # The attention mask must be added before the softmax to have any effect.
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, seq_len, seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, seq_len, seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        # Only apply attention dropout during training.
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (batch_size, self.num_heads, seq_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, seq_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)
+
+        output = self.projection_layer(attn_output)
+        output = self.projection_dropout(output)
+
+        outputs = (output, attn_weights) if output_attentions else (output, None)
+        return outputs
+
+
+class JanusVisionEncoderLayerScale(Dinov2WithRegistersLayerScale):
+    pass
+
+
+class JanusVisionEncoderDropPath(Dinov2WithRegistersDropPath):
+    pass
+
+
+class JanusVisionEncoderMLP(nn.Module):
+    def __init__(self, config: JanusVisionEncoderConfig):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]  # GELU activation
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout1 = nn.Dropout(config.hidden_dropout_rate)
+        self.dropout2 = nn.Dropout(config.hidden_dropout_rate)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.dropout1(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.dropout2(hidden_states)
+        return hidden_states
+
+
+class JanusVisionEncoderLayer(nn.Module):
+    def __init__(self, config:
JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.self_attn = JanusVisionEncoderAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim) + self.layer_norm2 = nn.LayerNorm(self.embed_dim) + + self.layer_scale1 = JanusVisionEncoderLayerScale(config) if config.layerscale_value else nn.Identity() + self.layer_scale2 = JanusVisionEncoderLayerScale(config) if config.layerscale_value else nn.Identity() + self.drop_path1 = ( + JanusVisionEncoderDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + self.drop_path2 = ( + JanusVisionEncoderDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + ) + self.mlp = JanusVisionEncoderMLP(config) + + # Ignore copy + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + + # Pre-Norm before applying attention. + norm_hidden_states = self.layer_norm1(hidden_states) + attn_output, attn_weights = self.self_attn( + norm_hidden_states,attention_mask=attention_mask, output_attentions=output_attentions + ) + + # Apply DropPath & LayerScale to Attention Output and then residual connection. + hidden_states = hidden_states + self.drop_path1(self.layer_scale1(attn_output)) + + hidden_states = self.layer_norm2(hidden_states) + mlp_output = self.mlp(hidden_states) + + hidden_states = hidden_states + self.drop_path2(self.layer_scale2(mlp_output)) + + return (hidden_states, attn_weights if output_attentions else None) + +# copied from SiglipMultiheadAttentionPoolingHead +# class JanusAttentionPoolLatent(SiglipMultiheadAttentionPoolingHead): +# pass + +class JanusAttentionPoolLatent(nn.Module): + """ Hugging Face-compatible Attention Pooling with Manual QKV """ + + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + + self.latent_len = getattr(config, "latent_len", 1) + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.mlp_ratio = getattr(config, "mlp_ratio", 4.0) + self.scale = self.head_dim ** -0.5 # Scaling factor for attention + + # Learnable latent query (probe) + self.latent = nn.Parameter(torch.zeros(1, self.latent_len, self.hidden_size)) + + # Linear layers for QKV projection + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size) + self.kv_proj = nn.Linear(self.hidden_size, self.hidden_size * 2) + self.proj = nn.Linear(self.hidden_size, self.hidden_size) + + # Normalization & MLP + self.norm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps) + self.mlp = JanusVisionEncoderMLP(config) + + self.proj_drop = nn.Dropout(getattr(config, "dropout", 0.0)) + + def forward(self, hidden_state): + batch_size, seq_len, _ = hidden_state.shape + + # Expand learnable latent tokens for batch + q_latent = self.latent.expand(batch_size, -1, -1) # (B, latent_len, hidden_size) + + # Compute Q projection from latent tokens + q = 
self.q_proj(q_latent) # (B, latent_len, hidden_size) + + # Compute combined KV projection + kv = self.kv_proj(hidden_state) # (B, seq_len, 2 * hidden_size) + k, v = kv.view(batch_size, seq_len, 2, self.num_heads, self.head_dim).unbind(2) + # Batch_sisxe, num_heads, seq_len, head_dim + k = k.transpose(1,2) + v = v.transpose(1,2) + + # Reshape Q for multi-head attention (B, N, H) → (B, num_heads, N, head_dim) + q = q.view(batch_size, self.latent_len, self.num_heads, self.head_dim).transpose(1, 2) # (B, num_heads, latent_len, head_dim) + + # Scaled dot-product attention (without `torch.nn.MultiheadAttention`) + attn_weights = (q @ k.transpose(2,3)) * self.scale # (B, num_heads, latent_len, seq_len) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + x = attn_weights @ v # (B, num_heads, latent_len, head_dim) + + x = x.transpose(1, 2).reshape(batch_size, self.latent_len, self.hidden_size) + + x = self.proj(x) + x = self.proj_drop(x) + + # Residual connection + MLP + residual = x + x = self.norm(x) + x = residual + self.mlp(x) + + return x[:, 0] # Return first latent token (like CLS token) + + +# Copied from siglip encoder +class JanusVisionEncoder(nn.Module): + + def __init__(self,config:JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([JanusVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class JanusPreTrainedModel(): + """An abstract class to load pretrained weigths""" + pass + +# Copied from siglip vision transformer +class JanusVisionEncoderTransformer(nn.Module): + def __init__(self, config: JanusVisionEncoderConfig): + super().__init__() + self.config = config + self.embeddings = JanusVisionEncoderEmbeddings(config,use_special_tokens=False) + self.encoder = JanusVisionEncoder(config) + self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head + if self.use_head: + self.head = JanusAttentionPoolLatent(config) + # Won't be using as a standalone classifier head hence no num classes + + def forward( + self, + pixel_values, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooler_output = self.head(hidden_states) + + + if not return_dict: + return (last_hidden_state) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooler_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) \ No newline at end of file From c92a32dd48686c599c9d360d013a58dfa9dc6b25 Mon Sep 17 00:00:00 2001 From: yaswant19 Date: Tue, 11 Feb 2025 00:44:54 +0530 Subject: [PATCH 7/7] Import Janus Image processor --- src/transformers/models/auto/image_processing_auto.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 
0c4bb9b9a7f7..cb3e926d7d42 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -158,6 +158,7 @@
         ("xclip", ("CLIPImageProcessor",)),
         ("yolos", ("YolosImageProcessor",)),
         ("zoedepth", ("ZoeDepthImageProcessor",)),
+        ("janus", ("JanusImageProcessor",)),
     ]
 )
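With the image processor wired into the auto mapping above, the pieces added across this series (config, processor, image processor, and JanusForConditionalGeneration) become loadable through the usual auto classes. The sketch below illustrates the intended end-to-end flow once the PR is merged; it is not part of the diff. The checkpoint id, the prompt string, and the exact processor keyword arguments are placeholders and assumptions, and the API may still change while the model is under review.

# Minimal usage sketch (checkpoint id and processor call signature are assumptions; not part of the diff).
import requests
import torch
from PIL import Image

from transformers import AutoImageProcessor, AutoProcessor, JanusForConditionalGeneration

checkpoint = "deepseek-community/Janus-Pro-1B"  # placeholder checkpoint id

# "janus" now resolves to JanusImageProcessor via IMAGE_PROCESSOR_MAPPING_NAMES.
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# The processor bundles the image processor with the tokenizer for multimodal prompts.
processor = AutoProcessor.from_pretrained(checkpoint)
model = JanusForConditionalGeneration.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(text="Describe this image.", images=image, return_tensors="pt")

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])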