-
Notifications
You must be signed in to change notification settings - Fork 5.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Question]: How to use VLMs from HuggingFace for Multimodal rag? #16056
Comments
To use HuggingFace models for multimodal retrieval-augmented generation (RAG) in your application, you can leverage the existing support for HuggingFace models in the LlamaIndex (`llama_index`) library.
This setup allows you to use HuggingFace models in a similar manner to how you used `OllamaMultiModal`. For more details, you can refer to the existing implementations of HuggingFace models in the `llama_index` codebase. |
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext
import qdrant_client
client = qdrant_client.QdrantClient(path="qdrant_mm_db")
text_store = QdrantVectorStore(
client=client, collection_name="text_collection"
)
image_store = QdrantVectorStore(
client=client, collection_name="image_collection"
)
storage_context = StorageContext.from_defaults(
vector_store=text_store, image_store=image_store
)
documents = SimpleDirectoryReader("imgs").load_data(show_progress=True)
index = MultiModalVectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
image_embed_model="local:openai/clip-vit-large-patch14"
)
Error log: No sentence-transformers model found with name openai/clip-vit-large-patch14. Creating a new one with MEAN pooling.
{
"name": "AttributeError",
"message": "'CLIPConfig' object has no attribute 'hidden_size'",
"stack": "---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[4], line 1
----> 1 index = MultiModalVectorStoreIndex.from_documents(
2 documents,
3 storage_context=storage_context,
4 image_embed_model=\"local:openai/clip-vit-large-patch14\"
5 )
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\llama_index\\core\\indices\\base.py:145, in BaseIndex.from_documents(cls, documents, storage_context, show_progress, callback_manager, transformations, service_context, **kwargs)
136 docstore.set_document_hash(doc.get_doc_id(), doc.hash)
138 nodes = run_transformations(
139 documents, # type: ignore
140 transformations,
141 show_progress=show_progress,
142 **kwargs,
143 )
--> 145 return cls(
146 nodes=nodes,
147 storage_context=storage_context,
148 callback_manager=callback_manager,
149 show_progress=show_progress,
150 transformations=transformations,
151 service_context=service_context,
152 **kwargs,
153 )
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\llama_index\\core\\indices\\multi_modal\\base.py:80, in MultiModalVectorStoreIndex.__init__(self, nodes, index_struct, embed_model, storage_context, use_async, store_nodes_override, show_progress, image_vector_store, image_embed_model, is_image_to_text, is_image_vector_store_empty, is_text_vector_store_empty, service_context, **kwargs)
56 def __init__(
57 self,
58 nodes: Optional[Sequence[BaseNode]] = None,
(...)
77 **kwargs: Any,
78 ) -> None:
79 \"\"\"Initialize params.\"\"\"
---> 80 image_embed_model = resolve_embed_model(
81 image_embed_model, callback_manager=kwargs.get(\"callback_manager\", None)
82 )
83 assert isinstance(image_embed_model, MultiModalEmbedding)
84 self._image_embed_model = image_embed_model
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\llama_index\\core\\embeddings\\utils.py:110, in resolve_embed_model(embed_model, callback_manager)
107 cache_folder = os.path.join(get_cache_dir(), \"models\")
108 os.makedirs(cache_folder, exist_ok=True)
--> 110 embed_model = HuggingFaceEmbedding(
111 model_name=model_name, cache_folder=cache_folder
112 )
113 except ImportError:
114 raise ImportError(
115 \"`llama-index-embeddings-huggingface` package not found, \"
116 \"please run `pip install llama-index-embeddings-huggingface`\"
117 )
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\llama_index\\embeddings\\huggingface\\base.py:150, in HuggingFaceEmbedding.__init__(self, model_name, tokenizer_name, pooling, max_length, query_instruction, text_instruction, normalize, model, tokenizer, embed_batch_size, cache_folder, trust_remote_code, device, callback_manager, parallel_process, target_devices, **model_kwargs)
147 if model_name is None:
148 raise ValueError(\"The `model_name` argument must be provided.\")
--> 150 model = SentenceTransformer(
151 model_name,
152 device=device,
153 cache_folder=cache_folder,
154 trust_remote_code=trust_remote_code,
155 prompts={
156 \"query\": query_instruction
157 or get_query_instruct_for_model_name(model_name),
158 \"text\": text_instruction
159 or get_text_instruct_for_model_name(model_name),
160 },
161 **model_kwargs,
162 )
163 if max_length:
164 model.max_seq_length = max_length
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\sentence_transformers\\SentenceTransformer.py:205, in SentenceTransformer.__init__(self, model_name_or_path, modules, device, prompts, default_prompt_name, cache_folder, trust_remote_code, revision, token, use_auth_token, truncate_dim)
197 modules = self._load_sbert_model(
198 model_name_or_path,
199 token=token,
(...)
202 trust_remote_code=trust_remote_code,
203 )
204 else:
--> 205 modules = self._load_auto_model(
206 model_name_or_path,
207 token=token,
208 cache_folder=cache_folder,
209 revision=revision,
210 trust_remote_code=trust_remote_code,
211 )
213 if modules is not None and not isinstance(modules, OrderedDict):
214 modules = OrderedDict([(str(idx), module) for idx, module in enumerate(modules)])
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\sentence_transformers\\SentenceTransformer.py:1203, in SentenceTransformer._load_auto_model(self, model_name_or_path, token, cache_folder, revision, trust_remote_code)
1192 logger.warning(
1193 \"No sentence-transformers model found with name {}. Creating a new one with MEAN pooling.\".format(
1194 model_name_or_path
1195 )
1196 )
1197 transformer_model = Transformer(
1198 model_name_or_path,
1199 cache_dir=cache_folder,
1200 model_args={\"token\": token, \"trust_remote_code\": trust_remote_code, \"revision\": revision},
1201 tokenizer_args={\"token\": token, \"trust_remote_code\": trust_remote_code, \"revision\": revision},
1202 )
-> 1203 pooling_model = Pooling(transformer_model.get_word_embedding_dimension(), \"mean\")
1204 return [transformer_model, pooling_model]
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\sentence_transformers\\models\\Transformer.py:114, in Transformer.get_word_embedding_dimension(self)
113 def get_word_embedding_dimension(self) -> int:
--> 114 return self.auto_model.config.hidden_size
File c:\\Users\\Cihan\\Desktop\\llamaindex\\lindex\\Lib\\site-packages\\transformers\\configuration_utils.py:202, in PretrainedConfig.__getattribute__(self, key)
200 if key != \"attribute_map\" and key in super().__getattribute__(\"attribute_map\"):
201 key = super().__getattribute__(\"attribute_map\")[key]
--> 202 return super().__getattribute__(key)
AttributeError: 'CLIPConfig' object has no attribute 'hidden_size'"
} |
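To resolve the `AttributeError: 'CLIPConfig' object has no attribute 'hidden_size'`, make sure the `image_embed_model` you pass resolves to a `MultiModalEmbedding`. In the provided code, the `"local:openai/clip-vit-large-patch14"` string is resolved by `resolve_embed_model` into a `HuggingFaceEmbedding` (loaded through sentence-transformers), which fails on the CLIP config. Here is the relevant part of the `MultiModalVectorStoreIndex` constructor:
def __init__(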
self,
nodes: Optional[Sequence[BaseNode]] = None,
index_struct: Optional[MultiModelIndexDict] = None,
embed_model: Optional[BaseEmbedding] = None,
storage_context: Optional[StorageContext] = None,
use_async: bool = False,
store_nodes_override: bool = False,
show_progress: bool = False,
# Image-related kwargs
image_vector_store: Optional[BasePydanticVectorStore] = None,
image_embed_model: EmbedType = "clip:ViT-B/32",
is_image_to_text: bool = False,
is_image_vector_store_empty: bool = False,
is_text_vector_store_empty: bool = False,
**kwargs: Any,
) -> None:
"""Initialize params."""
image_embed_model = resolve_embed_model(
image_embed_model, callback_manager=kwargs.get("callback_manager", None)
)
assert isinstance(image_embed_model, MultiModalEmbedding)
self._image_embed_model = image_embed_model
self._is_image_to_text = is_image_to_text
self._is_image_vector_store_empty = is_image_vector_store_empty
self._is_text_vector_store_empty = is_text_vector_store_empty
storage_context = storage_context or StorageContext.from_defaults()
if image_vector_store is not None:
if self.image_namespace not in storage_context.vector_stores:
storage_context.add_vector_store(
image_vector_store, self.image_namespace
)
else:
# overwrite image_store from storage_context
storage_context.vector_stores[self.image_namespace] = image_vector_store
if self.image_namespace not in storage_context.vector_stores:
storage_context.add_vector_store(SimpleVectorStore(), self.image_namespace)
self._image_vector_store = storage_context.vector_stores[self.image_namespace]
super().__init__(
nodes=nodes,
index_struct=index_struct,
embed_model=embed_model,
storage_context=storage_context,
show_progress=show_progress,
use_async=use_async,
store_nodes_override=store_nodes_override,
**kwargs,
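)
Ensure that the `image_embed_model` resolves to an instance of `MultiModalEmbedding`. The `ClipEmbedding` class is one such implementation:
class ClipEmbedding(MultiModalEmbedding):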
"""CLIP embedding models for encoding text and image for Multi-Modal purpose."""
embed_batch_size: int = Field(default=DEFAULT_EMBED_BATCH_SIZE, gt=0)
_clip: Any = PrivateAttr()
_model: Any = PrivateAttr()
_preprocess: Any = PrivateAttr()
_device: Any = PrivateAttr()
def __init__(
self,
*,
embed_batch_size: int = DEFAULT_EMBED_BATCH_SIZE,
model_name: str = DEFAULT_CLIP_MODEL,
**kwargs: Any,
):
"""Initializes the ClipEmbedding class."""
if embed_batch_size <= 0:
raise ValueError(f"Embed batch size {embed_batch_size} must be > 0.")
try:
import clip
import torch
except ImportError:
raise ImportError(
"ClipEmbedding requires `pip install git+https://github.com/openai/CLIP.git` and torch."
)
super().__init__(
embed_batch_size=embed_batch_size, model_name=model_name, **kwargs
)
self._device = "cuda" if torch.cuda.is_available() else "cpu"
if self.model_name not in AVAILABLE_CLIP_MODELS:
raise ValueError(
f"Model name {self.model_name} is not available in CLIP."
)
self._model, self._preprocess = clip.load(
self.model_name, device=self._device
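)
Ensure that your `image_embed_model` names a CLIP model listed in `AVAILABLE_CLIP_MODELS` (for example, the default `"clip:ViT-B/32"`); otherwise `ClipEmbedding` raises a `ValueError`. |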
@g-hano Hmm, we actually don't have a multi-modal HuggingFace LLM class yet. If you want to use open-source VLMs, you could look into using Ollama. Contributions are also welcome for a multi-modal HuggingFace class! |
I coded my own HuggingFaceMultiModal class that supports 4 different architectures and sent a PR |
Question Validation
Question
I want to use "Qwen/Qwen2-VL-2B-Instruct" in my multimodal RAG app. I tried OllamaMultiModal from llama_index.multi_modal_llms.ollama and it works fine. I want to use HuggingFace models as well.
The text was updated successfully, but these errors were encountered: