diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/Makefile b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/README.md b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/README.md new file mode 100644 index 0000000000000..4b698974d5ea3 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/README.md @@ -0,0 +1,85 @@ +# LlamaIndex Multi_Modal_Llms Integration: Huggingface + +This project integrates Hugging Face's multimodal language models into the LlamaIndex framework, enabling advanced multimodal capabilities for various AI applications. + +## Features + +- Seamless integration of Hugging Face multimodal models with LlamaIndex +- Support for multiple state-of-the-art vision-language models and their **finetunes**: + - [Qwen2 Vision](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d) + - [Florence2](https://huggingface.co/collections/microsoft/florence-6669f44df0d87d9c3bfb76de) + - [Phi-3.5 Vision](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) + - [PaLI-Gemma](https://huggingface.co/collections/google/paligemma-release-6643a9ffbf57de2ae0448dda) +- Easy-to-use interface for multimodal tasks like image captioning and visual question answering +- Configurable model parameters for fine-tuned performance + +--- + +## Author of this Integration [GitHub](https://github.com/g-hano) | [LinkedIn](https://www.linkedin.com/in/chanyalcin/) | [Email](mailto:mcihan.yalcin@outlook.com) + +## Installation + +```bash +pip install llama-index-multi-modal-llms-huggingface +``` + +Make sure to set your Hugging Face API token as an environment variable: + +```bash +export HF_TOKEN=your_huggingface_token_here +``` + +## Usage + +Here's a basic example of how to use the Hugging Face multimodal integration: + +```python +from llama_index.multi_modal_llms.huggingface import 
HuggingFaceMultiModal +from llama_index.core.schema import ImageDocument + +# Initialize the model +model = HuggingFaceMultiModal.from_model_name("Qwen/Qwen2-VL-2B-Instruct") + +# Prepare your image and prompt +image_document = ImageDocument(image_path="path/to/your/image.jpg") +prompt = "Describe this image in detail." + +# Generate a response +response = model.complete(prompt, image_documents=[image_document]) + +print(response.text) +``` + +You can also refer to this [Colab notebook](examples/huggingface_multimodal.ipynb). + +## Supported Models + +1. Qwen2VisionMultiModal +2. Florence2MultiModal +3. Phi35VisionMultiModal +4. PaliGemmaMultiModal + +Each model has its unique capabilities and can be selected based on your specific use case. + +## Configuration + +You can configure various parameters when initializing a model: + +```python +model = HuggingFaceMultiModal( + model_name="Qwen/Qwen2-VL-2B-Instruct", + device="cuda", # or "cpu" + torch_dtype=torch.float16, + max_new_tokens=100, + temperature=0.7, +) +``` + +## Limitations + +- Async completion, async chat, and async streaming are not supported yet for any of the models. +- Some models have specific requirements or limitations. Please refer to the individual model classes for details. 
+ +--- + +## Author of this Integration [GitHub](https://github.com/g-hano) | [LinkedIn](https://www.linkedin.com/in/chanyalcin/) | [Email](mailto:mcihan.yalcin@outlook.com) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/examples/huggingface_multimodal.ipynb b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/examples/huggingface_multimodal.ipynb new file mode 100644 index 0000000000000..dc5cb862b7242 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/examples/huggingface_multimodal.ipynb @@ -0,0 +1,1296 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](8.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for transformers (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!pip install git+https://github.com/huggingface/transformers -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: transformers\n", + "Version: 4.45.0.dev0\n", + "Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow\n", + "Home-page: https://github.com/huggingface/transformers\n", + "Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)\n", + "Author-email: transformers@huggingface.co\n", + "License: Apache 2.0 License\n", + "Location: /usr/local/lib/python3.10/dist-packages\n", + "Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm\n", + "Required-by: \n" + ] + } + ], + "source": [ + "!pip show transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m46.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m46.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m187.4/187.4 kB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m375.6/375.6 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.8/295.8 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m40.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.9/318.9 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!pip install llama-index -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting flash-attn\n", + " Downloading flash_attn-2.6.3.tar.gz (2.6 MB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/2.6 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r", + "\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m126.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m66.4 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from flash-attn) (2.4.1+cu121)\n", + "Requirement already satisfied: einops in /usr/local/lib/python3.10/dist-packages (from flash-attn) (0.8.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.16.0)\n", + "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (4.12.2)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (1.13.2)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.3)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.1.4)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2024.6.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->flash-attn) (2.1.5)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->flash-attn) (1.3.0)\n", + "Building wheels for collected packages: flash-attn\n", + " Building wheel for flash-attn (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24\n", + " Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826\n", + "Successfully built flash-attn\n", + "Installing collected packages: flash-attn\n", + "Successfully installed flash-attn-2.6.3\n" + ] + } + ], + "source": [ + "!pip install flash-attn --no-build-isolation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting timm\n", + " Downloading timm-1.0.9-py3-none-any.whl.metadata (42 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/42.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from timm) (2.4.1+cu121)\n", + "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from timm) (0.19.1+cu121)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from timm) (6.0.2)\n", + "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (from timm) (0.24.7)\n", + "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from timm) (0.4.5)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface_hub->timm) (3.16.1)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub->timm) (2024.6.1)\n", + "Requirement already satisfied: 
packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub->timm) (24.1)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface_hub->timm) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub->timm) (4.66.5)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub->timm) (4.12.2)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->timm) (1.13.2)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->timm) (3.3)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->timm) (3.1.4)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from torchvision->timm) (1.26.4)\n", + "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->timm) (10.4.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->timm) (2.1.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub->timm) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub->timm) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub->timm) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub->timm) (2024.8.30)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->timm) (1.3.0)\n", + "Downloading 
timm-1.0.9-py3-none-any.whl (2.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: timm\n", + "Successfully installed timm-1.0.9\n" + ] + } + ], + "source": [ + "!pip install timm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting qwen-vl-utils\n", + " Downloading qwen_vl_utils-0.0.5-py3-none-any.whl.metadata (3.5 kB)\n", + "Collecting av (from qwen-vl-utils)\n", + " Downloading av-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)\n", + "Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from qwen-vl-utils) (10.4.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from qwen-vl-utils) (2.32.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->qwen-vl-utils) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->qwen-vl-utils) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->qwen-vl-utils) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->qwen-vl-utils) (2024.8.30)\n", + "Downloading qwen_vl_utils-0.0.5-py3-none-any.whl (4.8 kB)\n", + "Downloading av-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m33.0/33.0 MB\u001b[0m \u001b[31m53.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: av, qwen-vl-utils\n", + "Successfully installed av-13.0.0 
qwen-vl-utils-0.0.5\n" + ] + } + ], + "source": [ + "!pip install qwen-vl-utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Contact Me\n", + "\n", + "[![Email](https://img.shields.io/badge/Email-mcihan.yalcin%40outlook.com-blue?style=flat&logo=microsoft-outlook&logoColor=white)](mailto:mcihan.yalcin@outlook.com)\n", + "[![LinkedIn](https://img.shields.io/badge/LinkedIn-ChanYalcin-blue?style=flat&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/chanyalcin)\n", + "

\n", + " \"Dancing\n", + "

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any, Dict, Sequence\n", + "from typing_extensions import override\n", + "from llama_index.core.base.llms.types import (\n", + " ChatMessage,\n", + " ChatResponse,\n", + " ChatResponseAsyncGen,\n", + " CompletionResponse,\n", + " CompletionResponseAsyncGen,\n", + ")\n", + "from llama_index.core.bridge.pydantic import Field, PrivateAttr\n", + "from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS\n", + "from llama_index.core.multi_modal_llms import MultiModalLLM, MultiModalLLMMetadata\n", + "from llama_index.core.schema import ImageDocument, ImageNode\n", + "import torch\n", + "from PIL import Image\n", + "from transformers import (\n", + " AutoProcessor,\n", + " AutoModelForCausalLM,\n", + " AutoConfig,\n", + " Qwen2VLForConditionalGeneration,\n", + " PaliGemmaForConditionalGeneration,\n", + ")\n", + "from qwen_vl_utils import (\n", + " process_vision_info,\n", + ") # We will need that in order to work with different image shapes\n", + "\n", + "DEFAULT_MULTIMODAL_MODEL = \"Qwen/Qwen2-VL-2B-Instruct\"\n", + "DEFAULT_REQUEST_TIMEOUT = 120.0\n", + "SUPPORTED_VLMS = [\n", + " \"Phi3VForCausalLM\",\n", + " \"Florence2ForConditionalGeneration\",\n", + " \"Qwen2VLForConditionalGeneration\",\n", + " \"PaliGemmaForConditionalGeneration\",\n", + "]\n", + "\n", + "\n", + "class HuggingFaceMultiModal(MultiModalLLM):\n", + " \"\"\"\n", + " This class provides a base implementation for interacting with HuggingFace multi-modal 
models.\n", + " It handles model initialization, input preparation, and text/image-based interaction.\n", + " \"\"\"\n", + "\n", + " model_name: str = Field(\n", + " description=\"The name of the Hugging Face multi-modal model to use.\"\n", + " )\n", + " device: str = Field(\n", + " default=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n", + " description=\"The device to run the model on.\",\n", + " )\n", + " torch_dtype: Any = Field(\n", + " default=torch.float16 if torch.cuda.is_available() else torch.float32,\n", + " description=\"The torch dtype to use.\",\n", + " )\n", + " trust_remote_code: bool = Field(\n", + " default=True, description=\"Whether to trust remote code when loading the model.\"\n", + " )\n", + " context_window: int = Field(\n", + " default=DEFAULT_CONTEXT_WINDOW,\n", + " description=\"The maximum number of context tokens for the model.\",\n", + " )\n", + " max_new_tokens: int = Field(\n", + " default=DEFAULT_NUM_OUTPUTS,\n", + " description=\"The maximum number of new tokens to generate.\",\n", + " )\n", + " temperature: float = Field(\n", + " default=0.0, description=\"The temperature to use for sampling.\"\n", + " )\n", + " additional_kwargs: Dict[str, Any] = Field(\n", + " default_factory=dict,\n", + " description=\"Additional kwargs for model initialization and generation.\",\n", + " )\n", + "\n", + " _model: Any = PrivateAttr()\n", + " _processor: Any = PrivateAttr()\n", + " _config: Any = PrivateAttr()\n", + "\n", + " def __init__(self, **kwargs: Any) -> None:\n", + " \"\"\"\n", + " Initializes the HuggingFace multi-modal model and processor based on the provided configuration.\n", + " \"\"\"\n", + " super().__init__(**kwargs)\n", + " try:\n", + " # Load model configuration\n", + " self._config = AutoConfig.from_pretrained(\n", + " self.model_name, trust_remote_code=True\n", + " )\n", + " architecture = self._config.architectures[0]\n", + " AutoModelClass = AutoModelForCausalLM # Default model class\n", + "\n", + " # Special cases 
for specific model architectures\n", + " if \"Qwen2VLForConditionalGeneration\" in architecture:\n", + " AutoModelClass = Qwen2VLForConditionalGeneration\n", + " if \"PaliGemmaForConditionalGeneration\" in architecture:\n", + " AutoModelClass = PaliGemmaForConditionalGeneration\n", + "\n", + " # Load the model based on the architecture\n", + " self._model = AutoModelClass.from_pretrained(\n", + " self.model_name,\n", + " device_map=self.device,\n", + " torch_dtype=self.torch_dtype,\n", + " trust_remote_code=self.trust_remote_code,\n", + " **self.additional_kwargs,\n", + " )\n", + " # Load the processor (for handling text and image inputs)\n", + " self._processor = AutoProcessor.from_pretrained(\n", + " self.model_name, trust_remote_code=self.trust_remote_code\n", + " )\n", + " except Exception as e:\n", + " raise ValueError(f\"Failed to initialize the model and processor: {str(e)}\")\n", + "\n", + " @classmethod\n", + " def class_name(cls) -> str:\n", + " \"\"\"Returns the class name for the model.\"\"\"\n", + " return \"HuggingFace_multi_modal_llm\"\n", + "\n", + " @property\n", + " def metadata(self) -> MultiModalLLMMetadata:\n", + " \"\"\"Multi Modal LLM metadata.\"\"\"\n", + " return MultiModalLLMMetadata(\n", + " context_window=self.context_window,\n", + " num_output=self.max_new_tokens,\n", + " model_name=self.model_name,\n", + " )\n", + "\n", + " # each unique model will override it\n", + " def _prepare_messages(\n", + " self, messages: Sequence[ChatMessage], image_documents: Sequence[ImageDocument]\n", + " ) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Abstract method: Prepares input messages and image documents for the model.\n", + " This must be overridden by subclasses.\n", + " \"\"\"\n", + " raise NotImplementedError\n", + "\n", + " # each unique model will override it\n", + " def _generate(self, prepared_inputs: Dict[str, Any]) -> str:\n", + " \"\"\"\n", + " Abstract method: Generates text based on the prepared inputs.\n", + " This must be overridden by 
subclasses.\n", + " \"\"\"\n", + " raise NotImplementedError\n", + "\n", + " # some models will override it, some won't\n", + " def complete(\n", + " self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any\n", + " ) -> CompletionResponse:\n", + " \"\"\"\n", + " Completes a task based on a text prompt and optional images.\n", + " The method prepares inputs and generates the corresponding text.\n", + " \"\"\"\n", + " prepared_inputs = self._prepare_messages(\n", + " [ChatMessage(role=\"user\", content=prompt)], image_documents\n", + " )\n", + " generated_text = self._generate(prepared_inputs)\n", + " return CompletionResponse(text=generated_text)\n", + "\n", + " # some models will override it, some won't\n", + " def chat(\n", + " self,\n", + " messages: Sequence[ChatMessage],\n", + " image_documents: Sequence[ImageDocument],\n", + " **kwargs: Any,\n", + " ) -> ChatResponse:\n", + " \"\"\"\n", + " Engages in a chat-style interaction by processing a sequence of messages and optional images.\n", + " \"\"\"\n", + " prepared_inputs = self._prepare_messages(messages, image_documents)\n", + " generated_text = self._generate(prepared_inputs)\n", + " return ChatResponse(\n", + " message=ChatMessage(role=\"assistant\", content=generated_text),\n", + " raw={\"model_output\": generated_text},\n", + " )\n", + "\n", + " async def astream_chat(\n", + " self, messages: Sequence[ChatMessage], **kwargs: Any\n", + " ) -> ChatResponseAsyncGen:\n", + " raise NotImplementedError(\n", + " \"HuggingFaceMultiModal does not support async streaming chat yet.\"\n", + " )\n", + "\n", + " async def astream_complete(\n", + " self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any\n", + " ) -> CompletionResponseAsyncGen:\n", + " raise NotImplementedError(\n", + " \"HuggingFaceMultiModal does not support async streaming completion yet.\"\n", + " )\n", + "\n", + " async def acomplete(\n", + " self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any\n", 
+ " ) -> CompletionResponse:\n", + " raise NotImplementedError(\n", + " \"HuggingFaceMultiModal does not support async completion yet.\"\n", + " )\n", + "\n", + " async def achat(\n", + " self, messages: Sequence[ChatMessage], **kwargs: Any\n", + " ) -> ChatResponse:\n", + " raise NotImplementedError(\n", + " \"HuggingFaceMultiModal does not support async chat yet.\"\n", + " )\n", + "\n", + " async def stream_complete(\n", + " self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any\n", + " ) -> CompletionResponse:\n", + " raise NotImplementedError(\n", + " \"HuggingFaceMultiModal does not support async completion yet.\"\n", + " )\n", + "\n", + " # we check the model architecture here\n", + " @classmethod\n", + " def from_model_name(cls, model_name: str, **kwargs: Any) -> \"HuggingFaceMultiModal\":\n", + " \"\"\"Checks the model architecture and initializes the model.\"\"\"\n", + " config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)\n", + " # we check the architecture because users would want to use their own finetuned versions of VLMs\n", + " architecture = config.architectures[0]\n", + "\n", + " if \"Phi3VForCausalLM\" in architecture:\n", + " return Phi35VisionMultiModal(model_name=model_name, **kwargs)\n", + " elif \"Florence2ForConditionalGeneration\" in architecture:\n", + " return Florence2MultiModal(model_name=model_name, **kwargs)\n", + " elif \"Qwen2VLForConditionalGeneration\" in architecture:\n", + " return Qwen2VisionMultiModal(model_name=model_name, **kwargs)\n", + " elif \"PaliGemmaForConditionalGeneration\" in architecture:\n", + " return PaliGemmaMultiModal(model_name=model_name, **kwargs)\n", + " else:\n", + " raise ValueError(\n", + " f\"Unsupported model architecture: {architecture}. 
\"\n", + " f\"We currently support: {', '.join(SUPPORTED_VLMS)}\"\n", + " )\n", + "\n", + "\n", + "class Qwen2VisionMultiModal(HuggingFaceMultiModal):\n", + " \"\"\"\n", + " A specific implementation for the Qwen2 multi-modal model.\n", + " Handles chat-style interactions that involve both text and images.\n", + " \"\"\"\n", + "\n", + " def _prepare_messages(\n", + " self, messages: Sequence[ChatMessage], image_documents: Sequence[ImageDocument]\n", + " ) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Prepares the input messages and images for Qwen2 models. Images are appended in a custom format.\n", + " \"\"\"\n", + " conversation = []\n", + " for img_doc in image_documents:\n", + " conversation.append(\n", + " {\"type\": \"image\", \"image\": img_doc.image_path}\n", + " ) # Append images to conversation\n", + " conversation.append(\n", + " {\"type\": \"text\", \"text\": messages[0].content}\n", + " ) # Add user text message\n", + "\n", + " messages = [\n", + " {\"role\": \"user\", \"content\": conversation}\n", + " ] # Wrap conversation in a user role\n", + "\n", + " # Apply a chat template to format the message with the processor\n", + " text_prompt = self._processor.tokenizer.apply_chat_template(\n", + " messages, tokenize=False, add_generation_prompt=True\n", + " )\n", + " image_inputs, _ = process_vision_info(messages)\n", + "\n", + " # Prepare the model inputs (text + images) and convert to tensor\n", + " inputs = self._processor(\n", + " text=[text_prompt], images=image_inputs, padding=True, return_tensors=\"pt\"\n", + " )\n", + " return inputs.to(self.device)\n", + "\n", + " def _generate(self, prepared_inputs: Dict[str, Any]) -> str:\n", + " \"\"\"\n", + " Generates text based on prepared inputs. 
The text is decoded from token IDs generated by the model.\n", + " \"\"\"\n", + " output_ids = self._model.generate(\n", + " **prepared_inputs, max_new_tokens=self.max_new_tokens\n", + " )\n", + " generated_ids = [\n", + " output_ids[len(input_ids) :]\n", + " for input_ids, output_ids in zip(prepared_inputs[\"input_ids\"], output_ids)\n", + " ]\n", + " output_text = self._processor.batch_decode(\n", + " generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True\n", + " )\n", + " return output_text[0]\n", + "\n", + " async def stream_chat(\n", + " self, messages: Sequence[ChatMessage], **kwargs: Any\n", + " ) -> ChatResponse:\n", + " raise NotImplementedError(\n", + " \"Qwen2VisionMultiModal does not support async streaming chat yet.\"\n", + " )\n", + "\n", + "\n", + "class Florence2MultiModal(HuggingFaceMultiModal):\n", + " \"\"\"\n", + " A specific implementation for the Florence2 multi-modal model.\n", + " Handles chat-style interactions that involve both text and images.\n", + " \"\"\"\n", + "\n", + " @override\n", + " def complete(\n", + " self, task: str, image_document: ImageDocument, **kwargs: Any\n", + " ) -> CompletionResponse:\n", + " if type(image_document) is list:\n", + " print(\n", + " f\"{self.model_name} can handle only one image. Will continue with the first image.\"\n", + " )\n", + " image_document = image_document[0]\n", + "\n", + " prepared_inputs = self._prepare_messages(task, image_document)\n", + " generated_text = self._generate(prepared_inputs)\n", + " return CompletionResponse(text=generated_text)\n", + "\n", + " @override\n", + " def chat(\n", + " self, task: str, image_document: ImageDocument, **kwargs: Any\n", + " ) -> ChatResponse:\n", + " if type(image_document) is list:\n", + " print(\n", + " f\"{self.model_name} can handle only one image. 
Will continue with the first image.\"\n", + " )\n", + " image_document = image_document[0]\n", + "\n", + " prepared_inputs = self._prepare_messages(task, image_document)\n", + " generated_text = self._generate(prepared_inputs)\n", + " return ChatResponse(\n", + " message=ChatMessage(role=\"assistant\", content=generated_text),\n", + " raw={\"model_output\": generated_text},\n", + " )\n", + "\n", + " # TODO: Florence2 works with task_prompts, not user prompts\n", + " # Task prompts are: '', '', ''\n", + " def _prepare_messages(\n", + " self, task: str, image_document: ImageDocument\n", + " ) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Prepares the input messages and images for Qwen2 models. Images are appended in a custom format.\n", + " \"\"\"\n", + " prompt = (\n", + " task.upper()\n", + " if task.upper()\n", + " in [\"\", \"\", \"\"]\n", + " else \"\"\n", + " )\n", + " images = Image.open(image_document.image_path)\n", + " inputs = self._processor(text=prompt, images=images, return_tensors=\"pt\").to(\n", + " self.device, self.torch_dtype\n", + " )\n", + " return {\n", + " \"prompt\": prompt,\n", + " \"inputs\": inputs,\n", + " \"image_size\": (images.width, images.height),\n", + " }\n", + "\n", + " def _generate(self, prepared_inputs: Dict[str, Any]) -> str:\n", + " \"\"\"\n", + " Generates text based on prepared inputs. 
The text is decoded from token IDs generated by the model.\n", + " \"\"\"\n", + " inputs = prepared_inputs[\"inputs\"]\n", + " image_size = prepared_inputs[\"image_size\"]\n", + " task = prepared_inputs[\"prompt\"]\n", + "\n", + " generated_ids = self._model.generate(\n", + " input_ids=inputs[\"input_ids\"],\n", + " pixel_values=inputs[\"pixel_values\"],\n", + " max_new_tokens=self.max_new_tokens,\n", + " num_beams=3,\n", + " do_sample=False,\n", + " )\n", + "\n", + " generated_text = self._processor.batch_decode(\n", + " generated_ids, skip_special_tokens=False\n", + " )[0]\n", + "\n", + " # Use image_size from prepared_inputs to avoid storing self.image\n", + " parsed_answer = self._processor.post_process_generation(\n", + " generated_text, task=task, image_size=image_size\n", + " )\n", + " return parsed_answer[task]\n", + "\n", + " async def stream_chat(\n", + " self, messages: Sequence[ChatMessage], **kwargs: Any\n", + " ) -> ChatResponse:\n", + " raise NotImplementedError(\n", + " \"Florence2MultiModal do not support async streaming chat yet.\"\n", + " )\n", + "\n", + "\n", + "class Phi35VisionMultiModal(HuggingFaceMultiModal):\n", + " \"\"\"\n", + " A specific implementation for the Phi3.5 multi-modal model.\n", + " Handles chat-style interactions that involve both text and images.\n", + " \"\"\"\n", + "\n", + " def _prepare_messages(\n", + " self, message: ChatMessage, image_documents: Sequence[ImageDocument]\n", + " ) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Prepares the input messages and images for Phi3.5 models. 
Images are appended in a custom format.\n", + " \"\"\"\n", + " images = [Image.open(img_doc.image_path) for img_doc in image_documents]\n", + " placeholder = \"\".join(f\"<|image_{i+1}|>\\n\" for i in range(len(images)))\n", + "\n", + " chat_messages = [{\"role\": message.role, \"content\": message.content}]\n", + " if images:\n", + " chat_messages[-1][\"content\"] = placeholder + chat_messages[-1][\"content\"]\n", + "\n", + " prompt = self._processor.tokenizer.apply_chat_template(\n", + " chat_messages, tokenize=False, add_generation_prompt=True\n", + " )\n", + " inputs = self._processor(prompt, images, return_tensors=\"pt\").to(self.device)\n", + " return inputs\n", + "\n", + " def _generate(self, prepared_inputs: Dict[str, Any]) -> str:\n", + " \"\"\"\n", + " Generates text based on prepared inputs. The text is decoded from token IDs generated by the model.\n", + " \"\"\"\n", + " generate_ids = self._model.generate(\n", + " **prepared_inputs,\n", + " eos_token_id=self._processor.tokenizer.eos_token_id,\n", + " max_new_tokens=self.max_new_tokens,\n", + " temperature=self.temperature,\n", + " do_sample=False,\n", + " )\n", + " generate_ids = generate_ids[:, prepared_inputs[\"input_ids\"].shape[1] :]\n", + " response = self._processor.batch_decode(\n", + " generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False\n", + " )[0]\n", + " return response\n", + "\n", + " async def stream_chat(\n", + " self, messages: Sequence[ChatMessage], **kwargs: Any\n", + " ) -> ChatResponse:\n", + " raise NotImplementedError(\n", + " \"Phi35VisionMultiModal does not support async streaming chat yet.\"\n", + " )\n", + "\n", + "\n", + "class PaliGemmaMultiModal(HuggingFaceMultiModal):\n", + " \"\"\"\n", + " A specific implementation for the PaliGemma multi-modal model.\n", + " Handles chat-style interactions that involve both text and images.\n", + " \"\"\"\n", + "\n", + " @override\n", + " def complete(\n", + " self, task: str, image_document: ImageDocument, 
**kwargs: Any\n", + " ) -> CompletionResponse:\n", + " if type(image_document) is list:\n", + " print(\n", + " f\"{self.model_name} can handle only one image. Will continue with the first image.\"\n", + " )\n", + " image_document = image_document[0]\n", + "\n", + " prepared_inputs = self._prepare_messages(task, image_document)\n", + " generated_text = self._generate(prepared_inputs)\n", + " return CompletionResponse(text=generated_text)\n", + "\n", + " @override\n", + " def chat(\n", + " self, task: str, image_document: ImageDocument, **kwargs: Any\n", + " ) -> ChatResponse:\n", + " if type(image_document) is list:\n", + " print(\n", + " f\"{self.model_name} can handle only one image. Will continue with the first image.\"\n", + " )\n", + " image_document = image_document[0]\n", + "\n", + " prepared_inputs = self._prepare_messages(task, image_document)\n", + " generated_text = self._generate(prepared_inputs)\n", + " return ChatResponse(\n", + " message=ChatMessage(role=\"assistant\", content=generated_text),\n", + " raw={\"model_output\": generated_text},\n", + " )\n", + "\n", + " def _prepare_messages(\n", + " self, messages: ChatMessage, image_document: ImageDocument\n", + " ) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Prepares the input messages and images for PaliGemma models. Images are appended in a custom format.\n", + " \"\"\"\n", + " images = Image.open(image_document.image_path)\n", + " inputs = self._processor(text=messages, images=images, return_tensors=\"pt\").to(\n", + " self.device\n", + " )\n", + " input_len = inputs[\"input_ids\"].shape[-1]\n", + " return {\"inputs\": inputs, \"input_len\": input_len}\n", + "\n", + " def _generate(self, prepared_inputs: Dict[str, Any]) -> str:\n", + " \"\"\"\n", + " Generates text based on prepared inputs. 
The text is decoded from token IDs generated by the model.\n", + " \"\"\"\n", + " input_len = prepared_inputs[\"input_len\"]\n", + " inputs = prepared_inputs[\"inputs\"]\n", + " generation = self._model.generate(**inputs, max_new_tokens=100, do_sample=False)\n", + " generation = generation[0][input_len:]\n", + " decoded = self._processor.decode(generation, skip_special_tokens=True)\n", + " return decoded\n", + "\n", + " async def stream_chat(\n", + " self, messages: Sequence[ChatMessage], **kwargs: Any\n", + " ) -> ChatResponse:\n", + " raise NotImplementedError(\n", + " \"PaliGemmaMultiModal does not support async streaming chat yet.\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-09-20 20:11:50-- https://docs.google.com/uc?export=download&id=1utu3iD9XEgR5Sb7PrbtMf1qw8T1WdNmF\n", + "Resolving docs.google.com (docs.google.com)... 74.125.68.101, 74.125.68.138, 74.125.68.139, ...\n", + "Connecting to docs.google.com (docs.google.com)|74.125.68.101|:443... connected.\n", + "HTTP request sent, awaiting response... 303 See Other\n", + "Location: https://drive.usercontent.google.com/download?id=1utu3iD9XEgR5Sb7PrbtMf1qw8T1WdNmF&export=download [following]\n", + "--2024-09-20 20:11:51-- https://drive.usercontent.google.com/download?id=1utu3iD9XEgR5Sb7PrbtMf1qw8T1WdNmF&export=download\n", + "Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 172.253.118.132, 2404:6800:4003:c00::84\n", + "Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|172.253.118.132|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 62818 (61K) [image/png]\n", + "Saving to: ‘./input_images/performance_spec.png’\n", + "\n", + "./input_images/perf 100%[===================>] 61.35K --.-KB/s in 0.008s \n", + "\n", + "2024-09-20 20:11:54 (7.56 MB/s) - ‘./input_images/performance_spec.png’ saved [62818/62818]\n", + "\n", + "--2024-09-20 20:11:54-- https://docs.google.com/uc?export=download&id=1dpUakWMqaXR4Jjn1kHuZfB0pAXvjn2-i\n", + "Resolving docs.google.com (docs.google.com)... 74.125.68.101, 74.125.68.138, 74.125.68.139, ...\n", + "Connecting to docs.google.com (docs.google.com)|74.125.68.101|:443... connected.\n", + "HTTP request sent, awaiting response... 303 See Other\n", + "Location: https://drive.usercontent.google.com/download?id=1dpUakWMqaXR4Jjn1kHuZfB0pAXvjn2-i&export=download [following]\n", + "--2024-09-20 20:11:54-- https://drive.usercontent.google.com/download?id=1dpUakWMqaXR4Jjn1kHuZfB0pAXvjn2-i&export=download\n", + "Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 172.253.118.132, 2404:6800:4003:c00::84\n", + "Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|172.253.118.132|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 128556 (126K) [image/png]\n", + "Saving to: ‘./input_images/price.png’\n", + "\n", + "./input_images/pric 100%[===================>] 125.54K --.-KB/s in 0.001s \n", + "\n", + "2024-09-20 20:11:56 (111 MB/s) - ‘./input_images/price.png’ saved [128556/128556]\n", + "\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "os.makedirs(\"./input_images\", exist_ok=True)\n", + "\n", + "!wget \"https://docs.google.com/uc?export=download&id=1utu3iD9XEgR5Sb7PrbtMf1qw8T1WdNmF\" -O ./input_images/performance_spec.png\n", + "!wget \"https://docs.google.com/uc?export=download&id=1dpUakWMqaXR4Jjn1kHuZfB0pAXvjn2-i\" -O ./input_images/price.png" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image.open(\"./input_images/performance_spec.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Image.open(\"./input_images/price.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}\n", + "Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}\n", + "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n", + "Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}\n", + "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. 
All other arguments will be removed in v4.46\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0e328b4568714752bfc62915d10ada39", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00 None: + """ + Initializes the HuggingFace multi-modal model and processor based on the provided configuration. + """ + super().__init__(**kwargs) + try: + # Load model configuration + self._config = AutoConfig.from_pretrained( + self.model_name, trust_remote_code=True + ) + architecture = self._config.architectures[0] + AutoModelClass = AutoModelForCausalLM # Default model class + + # Special cases for specific model architectures + if "Qwen2VLForConditionalGeneration" in architecture: + AutoModelClass = Qwen2VLForConditionalGeneration + if "PaliGemmaForConditionalGeneration" in architecture: + AutoModelClass = PaliGemmaForConditionalGeneration + + # Load the model based on the architecture + self._model = AutoModelClass.from_pretrained( + self.model_name, + device_map=self.device, + torch_dtype=self.torch_dtype, + trust_remote_code=self.trust_remote_code, + **self.additional_kwargs, + ) + # Load the processor (for handling text and image inputs) + self._processor = AutoProcessor.from_pretrained( + self.model_name, trust_remote_code=self.trust_remote_code + ) + except Exception as e: + raise ValueError(f"Failed to initialize the model and processor: {e!s}") + + @classmethod + def class_name(cls) -> str: + """Returns the class name for the model.""" + return "HuggingFace_multi_modal_llm" + + @property + def metadata(self) -> MultiModalLLMMetadata: + """Multi Modal LLM metadata.""" + return MultiModalLLMMetadata( + context_window=self.context_window, + num_output=self.max_new_tokens, + model_name=self.model_name, + ) + + # each unique model will override it + def _prepare_messages( + self, messages: Sequence[ChatMessage], image_documents: Sequence[ImageDocument] + ) -> Dict[str, Any]: + 
""" + Abstract method: Prepares input messages and image documents for the model. + This must be overridden by subclasses. + """ + raise NotImplementedError + + # each unique model will override it + def _generate(self, prepared_inputs: Dict[str, Any]) -> str: + """ + Abstract method: Generates text based on the prepared inputs. + This must be overridden by subclasses. + """ + raise NotImplementedError + + # some models will override it, some won't + def complete( + self, prompt: str, image_documents: Sequence[ImageDocument], **kwargs: Any + ) -> CompletionResponse: + """ + Completes a task based on a text prompt and optional images. + The method prepares inputs and generates the corresponding text. + """ + prepared_inputs = self._prepare_messages( + [ChatMessage(role="user", content=prompt)], image_documents + ) + generated_text = self._generate(prepared_inputs) + return CompletionResponse(text=generated_text) + + # some models will override it, some won't + def chat( + self, + messages: Sequence[ChatMessage], + image_documents: Sequence[ImageDocument], + **kwargs: Any, + ) -> ChatResponse: + """ + Engages in a chat-style interaction by processing a sequence of messages and optional images. + """ + prepared_inputs = self._prepare_messages(messages, image_documents) + generated_text = self._generate(prepared_inputs) + return ChatResponse( + message=ChatMessage(role="assistant", content=generated_text), + raw={"model_output": generated_text}, + ) + + async def astream_chat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponseAsyncGen: + raise NotImplementedError( + "HuggingFaceMultiModal does not support async streaming chat yet." + ) + + async def astream_complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponseAsyncGen: + raise NotImplementedError( + "HuggingFaceMultiModal does not support async streaming completion yet." 
+ ) + + async def acomplete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + raise NotImplementedError( + "HuggingFaceMultiModal does not support async completion yet." + ) + + async def achat( + self, messages: Sequence[ChatMessage], **kwargs: Any + ) -> ChatResponse: + raise NotImplementedError( + "HuggingFaceMultiModal does not support async chat yet." + ) + + async def stream_complete( + self, prompt: str, image_documents: Sequence[ImageNode], **kwargs: Any + ) -> CompletionResponse: + raise NotImplementedError( + "HuggingFaceMultiModal does not support async completion yet." + ) + + # we check the model architecture here + @classmethod + def from_model_name(cls, model_name: str, **kwargs: Any) -> "HuggingFaceMultiModal": + """Checks the model architecture and initializes the model.""" + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + # we check the architecture because users would want to use their own finetuned versions of VLMs + architecture = config.architectures[0] + + if "Phi3VForCausalLM" in architecture: + return Phi35VisionMultiModal(model_name=model_name, **kwargs) + elif "Florence2ForConditionalGeneration" in architecture: + return Florence2MultiModal(model_name=model_name, **kwargs) + elif "Qwen2VLForConditionalGeneration" in architecture: + return Qwen2VisionMultiModal(model_name=model_name, **kwargs) + elif "PaliGemmaForConditionalGeneration" in architecture: + return PaliGemmaMultiModal(model_name=model_name, **kwargs) + elif "MllamaForConditionalGeneration" in architecture: + return MllamaMultiModal(model_name=model_name, **kwargs) + else: + raise ValueError( + f"Unsupported model architecture: {architecture}. " + f"We currently support: {', '.join(SUPPORTED_VLMS)}" + ) + + +class Qwen2VisionMultiModal(HuggingFaceMultiModal): + """ + A specific implementation for the Qwen2 multi-modal model. + Handles chat-style interactions that involve both text and images. 
class Florence2MultiModal(HuggingFaceMultiModal):
    """
    A specific implementation for the Florence2 multi-modal model.

    Unlike the chat-style wrappers, Florence2 is driven by a task prompt and is
    given a single image per call.
    """

    @override
    def complete(
        self, task: str, image_documents: ImageDocument, **kwargs: Any
    ) -> CompletionResponse:
        """
        Run one Florence2 task on one image.

        Args:
            task: Task prompt; see ``_prepare_messages`` for the accepted values.
            image_documents: The image to process. If a list or tuple is passed,
                only its first element is used.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``CompletionResponse`` whose text is the parsed model answer.
        """
        prepared_inputs = self._prepare_messages(task, image_documents)
        generated_text = self._generate(prepared_inputs)
        return CompletionResponse(text=generated_text)

    @override
    def chat(
        self, task: str, image_documents: ImageDocument, **kwargs: Any
    ) -> ChatResponse:
        """
        Same as ``complete`` but wraps the answer in an assistant ``ChatResponse``.

        Args:
            task: Task prompt; see ``_prepare_messages`` for the accepted values.
            image_documents: The image to process. If a list or tuple is passed,
                only its first element is used.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``ChatResponse`` carrying the answer both as the assistant message
            and under ``raw["model_output"]``.
        """
        prepared_inputs = self._prepare_messages(task, image_documents)
        generated_text = self._generate(prepared_inputs)
        return ChatResponse(
            message=ChatMessage(role="assistant", content=generated_text),
            raw={"model_output": generated_text},
        )

    def _single_image(self, image_documents: Any) -> ImageDocument:
        """
        Return the one image document to use.

        A list or tuple is reduced to its first element, with a notice printed so
        the caller knows the remaining images are ignored.

        Raises:
            ValueError: If an empty list or tuple is given.
        """
        if not isinstance(image_documents, (list, tuple)):
            return image_documents
        if not image_documents:
            raise ValueError(
                f"{self.model_name} needs one image, but an empty sequence was given."
            )
        print(
            f"{self.model_name} can handle only one image. Will continue with the first image."
        )
        return image_documents[0]

    # TODO: Florence2 works with task_prompts, not user prompts
    # Task prompts are: '<CAPTION>', '<DETAILED_CAPTION>', '<MORE_DETAILED_CAPTION>'
    def _prepare_messages(
        self, task: str, image_documents: ImageDocument
    ) -> Dict[str, Any]:
        """
        Prepares the task prompt and image for Florence2 models.

        Args:
            task: Requested task prompt. It is upper-cased and used as-is when it
                is one of the supported caption prompts; anything else falls back
                to ``<DETAILED_CAPTION>``.
            image_documents: The image to process (or a list/tuple, of which only
                the first element is used).

        Returns:
            A dict with the resolved ``prompt``, the processor ``inputs`` moved to
            the configured device and dtype, and the ``image_size`` as
            ``(width, height)``.
        """
        image_document = self._single_image(image_documents)

        supported_tasks = (
            "<CAPTION>",
            "<DETAILED_CAPTION>",
            "<MORE_DETAILED_CAPTION>",
        )
        requested = task.upper()
        prompt = requested if requested in supported_tasks else "<DETAILED_CAPTION>"

        # Everything needed from the image is read inside the `with` block so the
        # underlying file is closed before returning.
        with Image.open(image_document.image_path) as image:
            inputs = self._processor(
                text=prompt, images=image, return_tensors="pt"
            ).to(self.device, self.torch_dtype)
            image_size = (image.width, image.height)

        return {
            "prompt": prompt,
            "inputs": inputs,
            "image_size": image_size,
        }

    def _generate(self, prepared_inputs: Dict[str, Any]) -> str:
        """
        Generates text based on prepared inputs.

        Args:
            prepared_inputs: The dict produced by ``_prepare_messages``.

        Returns:
            The entry of the post-processed answer stored under the task prompt.
        """
        inputs = prepared_inputs["inputs"]
        image_size = prepared_inputs["image_size"]
        task = prepared_inputs["prompt"]

        generated_ids = self._model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=self.max_new_tokens,
            num_beams=3,
            do_sample=False,
        )

        # Special tokens are kept in the decoded text that is handed to the
        # processor's post-processing step.
        generated_text = self._processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )[0]

        # Use image_size from prepared_inputs to avoid storing self.image
        parsed_answer = self._processor.post_process_generation(
            generated_text, task=task, image_size=image_size
        )
        return parsed_answer[task]

    async def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponse:
        """Not implemented for this wrapper; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "Florence2MultiModal do not support async streaming chat yet."
        )
class PaliGemmaMultiModal(HuggingFaceMultiModal):
    """
    A specific implementation for the PaliGemma multi-modal model.

    Takes a text prompt and a single image per call.
    """

    @override
    def complete(
        self, task: str, image_documents: ImageDocument, **kwargs: Any
    ) -> CompletionResponse:
        """
        Generate a completion for one prompt and one image.

        Args:
            task: Text prompt passed to the processor.
            image_documents: The image to process. If a list or tuple is passed,
                only its first element is used.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``CompletionResponse`` holding the decoded continuation.
        """
        prepared_inputs = self._prepare_messages(task, image_documents)
        generated_text = self._generate(prepared_inputs)
        return CompletionResponse(text=generated_text)

    @override
    def chat(
        self, task: str, image_documents: ImageDocument, **kwargs: Any
    ) -> ChatResponse:
        """
        Same as ``complete`` but wraps the answer in an assistant ``ChatResponse``.

        Args:
            task: Text prompt passed to the processor.
            image_documents: The image to process. If a list or tuple is passed,
                only its first element is used.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A ``ChatResponse`` carrying the answer both as the assistant message
            and under ``raw["model_output"]``.
        """
        prepared_inputs = self._prepare_messages(task, image_documents)
        generated_text = self._generate(prepared_inputs)
        return ChatResponse(
            message=ChatMessage(role="assistant", content=generated_text),
            raw={"model_output": generated_text},
        )

    def _single_image(self, image_documents: Any) -> ImageDocument:
        """
        Return the one image document to use.

        A list or tuple is reduced to its first element, with a notice printed so
        the caller knows the remaining images are ignored.

        Raises:
            ValueError: If an empty list or tuple is given.
        """
        if not isinstance(image_documents, (list, tuple)):
            return image_documents
        if not image_documents:
            raise ValueError(
                f"{self.model_name} needs one image, but an empty sequence was given."
            )
        print(
            f"{self.model_name} can handle only one image. Will continue with the first image."
        )
        return image_documents[0]

    def _prepare_messages(
        self, messages: str, image_documents: ImageDocument
    ) -> Dict[str, Any]:
        """
        Prepares the text prompt and image for PaliGemma models.

        Args:
            messages: The text prompt; ``complete`` and ``chat`` pass their
                ``task`` string here and it is forwarded to the processor as-is.
            image_documents: The image to process (or a list/tuple, of which only
                the first element is used).

        Returns:
            A dict with the processor ``inputs`` moved to the configured device
            and ``input_len``, the prompt length in tokens.
        """
        image_document = self._single_image(image_documents)

        # The processor call happens inside the `with` block so the underlying
        # file is closed before returning.
        with Image.open(image_document.image_path) as image:
            inputs = self._processor(
                text=messages, images=image, return_tensors="pt"
            ).to(self.device)

        input_len = inputs["input_ids"].shape[-1]
        return {"inputs": inputs, "input_len": input_len}

    def _generate(self, prepared_inputs: Dict[str, Any]) -> str:
        """
        Generates text based on prepared inputs.

        Args:
            prepared_inputs: The dict produced by ``_prepare_messages``.

        Returns:
            The decoded continuation, without the prompt tokens.
        """
        input_len = prepared_inputs["input_len"]
        inputs = prepared_inputs["inputs"]
        # Honour the configured generation budget, as the other wrappers do,
        # instead of a fixed token count.
        generation = self._model.generate(
            **inputs, max_new_tokens=self.max_new_tokens, do_sample=False
        )
        # Drop the first `input_len` positions (the prompt) and keep only the
        # newly generated tokens of the first sequence.
        generation = generation[0][input_len:]
        return self._processor.decode(generation, skip_special_tokens=True)

    async def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponse:
        """Not implemented for this wrapper; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "PaliGemmaMultiModal does not support async streaming chat yet."
        )
+ """ + output_ids = self._model.generate( + **prepared_inputs, max_new_tokens=self.max_new_tokens + ) + generated_ids = [ + output_ids[len(input_ids) :] + for input_ids, output_ids in zip(prepared_inputs["input_ids"], output_ids) + ] + output_text = self._processor.batch_decode( + generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + return output_text[0] diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml new file mode 100644 index 0000000000000..2e8fd449783a3 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["M.Cihan Yalçın mcihan.yalcin@outlook.com"] +description = "llama-index multi_modal_llms HuggingFace integration by [Cihan Yalçın](https://www.linkedin.com/in/chanyalcin/)" +license = "MIT" +name = "llama-index-multi-modal-llms-huggingface" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.0" +transformers = {extras = ["torch"], version = "^4.0"} +qwen-vl-utils = ">=0.0.8" +torchvision = "^0.19.1" +Pillow = "^10.0.0" + +[tool.poetry.group.dev.dependencies] +black = {extras = ["jupyter"], 
version = "<=23.9.1,>=23.7.0"} +codespell = {extras = ["toml"], version = ">=v2.2.6"} +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +torchvision = "0.19.1" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/BUILD b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/BUILD new file mode 100644 index 0000000000000..8f6770c328f42 --- /dev/null +++ b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/BUILD @@ -0,0 +1,3 @@ +python_tests( + dependencies=["llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface:poetry#torchvision"], +) diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__init__.py b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__pycache__/__init__.cpython-312.pyc b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000..f7ea4b320e027 Binary files /dev/null and b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__pycache__/__init__.cpython-312.pyc differ diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__pycache__/test_multi_modal_llms_huggingface.cpython-312-pytest-8.3.3.pyc 
b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__pycache__/test_multi_modal_llms_huggingface.cpython-312-pytest-8.3.3.pyc new file mode 100644 index 0000000000000..a16440aa7e9bd Binary files /dev/null and b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/__pycache__/test_multi_modal_llms_huggingface.cpython-312-pytest-8.3.3.pyc differ diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/2dogs.jpg b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/2dogs.jpg new file mode 100644 index 0000000000000..53767beb159b1 Binary files /dev/null and b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/2dogs.jpg differ diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/5cats.jpg b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/5cats.jpg new file mode 100644 index 0000000000000..c3bd4d4b88055 Binary files /dev/null and b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/5cats.jpg differ diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/girl_rabbit.jpg b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/girl_rabbit.jpg new file mode 100644 index 0000000000000..8d1c2c27451e3 Binary files /dev/null and b/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/girl_rabbit.jpg differ diff --git a/llama-index-integrations/multi_modal_llms/llama-index-multi-modal-llms-huggingface/tests/test_images/man_read.jpg 
def test_class():
    """HuggingFaceMultiModal must list MultiModalLLM, matched by class name, in its MRO."""
    mro_names = {klass.__name__ for klass in HuggingFaceMultiModal.__mro__}
    assert MultiModalLLM.__name__ in mro_names