From 0955829366943d87ccd1f5e0594befcb500be204 Mon Sep 17 00:00:00 2001
From: ivy-lv11
Date: Thu, 23 May 2024 14:24:18 +0800
Subject: [PATCH] update

---
 docs/docs/examples/llm/ipex_llm_gpu.ipynb | 297 ++++++------------
 .../examples/README.md | 81 +----
 .../examples/basic.py | 89 ++++++
 .../examples/more_data_type.py | 2 +-
 .../llama-index-llms-ipex-llm/examples/rag.py | 270 ----------------
 5 files changed, 201 insertions(+), 538 deletions(-)
 create mode 100644 llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/basic.py
 delete mode 100644 llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/rag.py

diff --git a/docs/docs/examples/llm/ipex_llm_gpu.ipynb b/docs/docs/examples/llm/ipex_llm_gpu.ipynb
index 847f9e0380e42..13f9a0b391ea6 100644
--- a/docs/docs/examples/llm/ipex_llm_gpu.ipynb
+++ b/docs/docs/examples/llm/ipex_llm_gpu.ipynb
@@ -4,61 +4,75 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# IPEX-LLM \n",
-    "\n",
+    "# IPEX-LLM\n",
     "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency.\n",
     "\n",
-    "This example goes over how to use LlamaIndex to interact with [`ipex-llm`](https://github.com/intel-analytics/ipex-llm/) for text generation and chat on GPU. \n",
+    "This example goes over how to use LlamaIndex to interact with [`ipex-llm`](https://github.com/intel-analytics/ipex-llm/) for text generation and chat on Intel GPU. \n",
     "\n",
-    "For more examples and usage, refer to [Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Install `llama-index-llms-ipex-llm`. This will also install `ipex-llm` and its dependencies."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "> **Note**\n",
+    ">\n",
+    "> You can refer to [here](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples) for full examples of `IpexLLM`. Please note that to run on Intel GPU, you should specify `-d 'xpu'` as a command argument when running the examples.\n",
+    "\n",
+    "## Install Prerequisites\n",
+    "To benefit from IPEX-LLM on Intel GPUs, there are several prerequisite steps for tools installation and environment preparation.\n",
+    "\n",
+    "If you are a Windows user, visit the [Install IPEX-LLM on Windows with Intel GPU Guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_windows_gpu.html), and follow [**Install Prerequisites**](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_windows_gpu.html#install-prerequisites) to update GPU driver (optional) and install Conda.\n",
+    "\n",
+    "If you are a Linux user, visit the [Install IPEX-LLM on Linux with Intel GPU](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_linux_gpu.html), and follow [**Install Prerequisites**](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_linux_gpu.html#install-prerequisites) to install GPU driver, Intel® oneAPI Base Toolkit 2024.0, and Conda.\n",
+    "\n",
+    "## Install `llama-index-llms-ipex-llm`\n",
+    "\n",
+    "After installing the prerequisites, you should have created a conda environment with all of them set up. Activate your conda environment and install `llama-index-llms-ipex-llm` as follows:\n",
     "\n",
     "```bash\n",
+    "conda activate \n",
+    "\n",
     "pip install llama-index-llms-ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this example we'll use [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) model for demostration. It requires updating `transformers` and `tokenizers` packages."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "```\n",
+    "This step will also install `ipex-llm` and its dependencies.\n",
+    "\n",
+    "> **Note**\n",
+    ">\n",
+    "> You can also use `https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/` as the `extra-index-url`.\n",
+    "\n",
+    "\n",
+    "## Runtime Configuration\n",
+    "\n",
+    "For optimal performance, it is recommended to set several environment variables based on your device:\n",
+    "\n",
+    "### For Windows Users with Intel Core Ultra integrated GPU\n",
+    "\n",
+    "In Anaconda Prompt:\n",
+    "\n",
+    "```\n",
+    "set SYCL_CACHE_PERSISTENT=1\n",
+    "set BIGDL_LLM_XMX_DISABLED=1\n",
+    "```\n",
+    "\n",
+    "### For Linux Users with Intel Arc A-Series GPU\n",
+    "\n",
     "```bash\n",
-    "pip install -U transformers==4.37.0 tokenizers==0.15.2\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Before loading the Zephyr model, you'll need to define `completion_to_prompt` and `messages_to_prompt` for formatting prompts. This is essential for preparing inputs that the model can interpret accurately."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "# Configure oneAPI environment variables. Required step for APT or offline installed oneAPI.\n",
+    "# Skip this step for PIP-installed oneAPI since the environment has already been configured in LD_LIBRARY_PATH.\n",
+    "source /opt/intel/oneapi/setvars.sh\n",
+    "\n",
+    "# Recommended Environment Variables for optimal performance\n",
+    "export USE_XETLA=OFF\n",
+    "export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1\n",
+    "export SYCL_CACHE_PERSISTENT=1\n",
+    "```\n",
+    "\n",
+    "> **Note**\n",
+    ">\n",
+    "> For the first time that each model runs on Intel iGPU/Intel Arc A300-Series or Pro A60, it may take several minutes to compile.\n",
+    ">\n",
+    "> For other GPU types, please refer to [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#runtime-configuration) for Windows users, and [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#id5) for Linux users.\n",
+    "\n",
+    "## `IpexLLM`\n",
+    "\n",
+    "Setting `device_map=\"xpu\"` when initializing `IpexLLM` will put the model on Intel GPU and benefit from IPEX-LLM optimizations:\n",
+    "\n",
+    "```python\n",
     "# Transform a string into input zephyr-specific input\n",
     "def completion_to_prompt(completion):\n",
     "    return f\"<|system|>\\n\\n<|user|>\\n{completion}\\n<|assistant|>\\n\"\n",
@@ -82,29 +96,7 @@
     "    # add final assistant prompt\n",
     "    prompt = prompt + \"<|assistant|>\\n\"\n",
     "\n",
-    "    return prompt"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Basic Usage\n",
-    "\n",
-    "Load the Zephyr model locally using IpexLLM using `IpexLLM.from_model_id`. It will load the model directly in its Huggingface format and convert it automatically to low-bit format for inference. Use `device_map` to load the model to xpu. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import warnings\n",
-    "\n",
-    "warnings.filterwarnings(\n",
-    "    \"ignore\", category=UserWarning, message=\".*padding_mask.*\"\n",
-    ")\n",
+    "    return prompt\n",
     "\n",
     "from llama_index.llms.ipex_llm import IpexLLM\n",
     "\n",
@@ -117,133 +109,51 @@
     "    completion_to_prompt=completion_to_prompt,\n",
     "    messages_to_prompt=messages_to_prompt,\n",
     "    device_map=\"xpu\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now you can proceed to use the loaded model for text completion and interactive chat. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Text Completion"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    ")\n",
+    "```\n",
+    "\n",
+    "> Please note that in this example we'll use [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) model for demonstration. It requires updating `transformers` and `tokenizers` packages.\n",
+    "> ```bash\n",
+    "> pip install -U transformers==4.37.0 tokenizers==0.15.2\n",
+    "> ```\n",
+    "\n",
+    "You can then conduct the completion task or chat task as normal:\n",
+    "\n",
+    "```python\n",
+    "print(\"----------------- Complete ------------------\")\n",
     "completion_response = llm.complete(\"Once upon a time, \")\n",
-    "print(completion_response.text)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Streaming Text Completion"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "print(completion_response.text)\n",
+    "print(\"----------------- Stream Complete ------------------\")\n",
     "response_iter = llm.stream_complete(\"Once upon a time, there's a little girl\")\n",
     "for response in response_iter:\n",
-    "    print(response.delta, end=\"\", flush=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Chat"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    print(response.delta, end=\"\", flush=True)\n",
+    "print(\"----------------- Chat ------------------\")\n",
     "from llama_index.core.llms import ChatMessage\n",
     "\n",
     "message = ChatMessage(role=\"user\", content=\"Explain Big Bang Theory briefly\")\n",
     "resp = llm.chat([message])\n",
-    "print(resp)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Streaming Chat"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "print(resp)\n",
+    "print(\"----------------- Stream Chat ------------------\")\n",
     "message = ChatMessage(role=\"user\", content=\"What is AI?\")\n",
     "resp = llm.stream_chat([message], max_tokens=256)\n",
     "for r in resp:\n",
-    "    print(r.delta, end=\"\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Save/Load Low-bit Model\n",
-    "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To save the low-bit model, use `save_low_bit` as follows."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    print(r.delta, end=\"\")\n",
+    "```\n",
+    "\n",
+    "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step. \n",
+    "\n",
+    "To save the low-bit model, use `save_low_bit` as follows. Then load the model from the saved low-bit model path, again using `device_map` to load it to xpu. \n",
+    "> Note that the saved path for the low-bit model only includes the model itself but not the tokenizers. If you wish to have everything in one place, you will need to manually download or copy the tokenizer files from the original model's directory to the location where the low-bit model is saved.\n",
+    "\n",
+    "Try stream completion using the loaded low-bit model. \n",
+    "```python\n",
     "saved_lowbit_model_path = (\n",
     "    \"./zephyr-7b-alpha-low-bit\"  # path to save low-bit model\n",
     ")\n",
     "\n",
     "llm._model.save_low_bit(saved_lowbit_model_path)\n",
-    "del llm"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Load the model from saved lowbit model path as follows. Also use `device_map` to load the model to xpu. \n",
-    "> Note that the saved path for the low-bit model only includes the model itself but not the tokenizers. If you wish to have everything in one place, you will need to manually download or copy the tokenizer files from the original model's directory to the location where the low-bit model is saved."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "del llm\n",
+    "\n",
     "llm_lowbit = IpexLLM.from_model_id_low_bit(\n",
     "    model_name=saved_lowbit_model_path,\n",
     "    tokenizer_name=\"HuggingFaceH4/zephyr-7b-alpha\",\n",
@@ -253,25 +163,12 @@
     "    completion_to_prompt=completion_to_prompt,\n",
     "    generate_kwargs={\"do_sample\": False},\n",
     "    device_map=\"xpu\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Try stream completion using the loaded low-bit model. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    ")\n",
+    "\n",
     "response_iter = llm_lowbit.stream_complete(\"What is Large Language Model?\")\n",
     "for response in response_iter:\n",
-    "    print(response.delta, end=\"\", flush=True)"
+    "    print(response.delta, end=\"\", flush=True)\n",
+    "```"
    ]
   }
  ],
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md
index 6a3d8b37c32d9..aed5569641438 100644
--- a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/README.md
@@ -22,6 +22,20 @@ pip install llama-index-llms-ipex-llm[xpu] --extra-index-url https://pytorch-ext
 
 ## List of Examples
 
+### Basic Example
+
+The example [basic.py](./basic.py) shows how to run `IpexLLM` on Intel CPU or GPU and conduct tasks such as text completion. Run the example as follows:
+
+```bash
+python basic.py -m -d -t -q
+```
+
+> Please note that in this example we'll use [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) model for demonstration. It requires updating `transformers` and `tokenizers` packages.
+>
+> ```bash
+> pip install -U transformers==4.37.0 tokenizers==0.15.2
+> ```
+
 ### More Data Types Example
 
 By default, `IpexLLM` loads the model in int4 format. To load a model in different data formats like `sym_int5`, `sym_int8`, etc., you can use the `load_in_low_bit` option in `IpexLLM`. To load a model on different device like `cpu` or `xpu`, you can use the `device_map` option in `IpexLLM`.
@@ -34,70 +48,3 @@ python more_data_type.py -m -t -l
 
 Note: If you're using [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) model in this example, it is recommended to use transformers version
 > <=4.34.
-
-### RAG Example
-
-We use llama-index-ipex-llm to build a Retrieval-Augment-Generation (RAG) pipeline.
- -- **Database Setup (using PostgreSQL)**: - - - Linux - - - Installation: - ```bash - sudo apt-get install postgresql-client - sudo apt-get install postgresql - ``` - - Initialization: - - Switch to the **postgres** user and launch **psql** console - - ```bash - sudo su - postgres - psql - ``` - - Then, create a new user role: - - ```bash - CREATE ROLE WITH LOGIN PASSWORD ''; - ALTER ROLE SUPERUSER; - ``` - - - Windows - - click `Download the installer` in [PostgreSQL](https://www.postgresql.org/download/windows/). - - Run the downloaded installation package as administrator, then click `next` continuously. - - Open PowerShell: - ```bash - cd C:\Program Files\PostgreSQL\14\bin - ``` - The exact path will vary depending on your PostgreSQL location. - - Then in PowerShell: - ```bash - .\psql -U postgres - ``` - Input the password you set in the previous installation. If PowerShell shows `postgres=#`, it indicates a successful connection. - - Create a new user role: - ```bash - CREATE ROLE WITH LOGIN PASSWORD ''; - ALTER ROLE SUPERUSER; - ``` - -- **Pgvector Installation**: - - - Linux - - Follow installation instructions on [pgvector's GitHub](https://github.com/pgvector/pgvector) and refer to the [installation notes](https://github.com/pgvector/pgvector#installation-notes) for additional help. - - Windows - - It is recommended to use [pgvector for Windows](https://github.com/pgvector/pgvector?tab=readme-ov-file#windows) instead of others (such as conda-force) to avoid potential errors. Some steps may require running as administrator. - -- **Data Preparation**: Download the Llama2 paper and save it as `data/llama2.pdf`, which serves as the default source file for retrieval. - ```bash - mkdir data - wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf" - ``` - -The example [rag.py](./rag.py) shows how to use RAG pipeline. 
Run the example as following:
-
-```bash
-python rag.py -m -q -u -p -e -n -t -x
-```
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/basic.py b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/basic.py
new file mode 100644
index 0000000000000..0b5e147978089
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/basic.py
@@ -0,0 +1,89 @@
+# Transform a string into input zephyr-specific input
+def completion_to_prompt(completion):
+    return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n"
+
+
+# Transform a list of chat messages into zephyr-specific input
+def messages_to_prompt(messages):
+    prompt = ""
+    for message in messages:
+        if message.role == "system":
+            prompt += f"<|system|>\n{message.content}\n"
+        elif message.role == "user":
+            prompt += f"<|user|>\n{message.content}\n"
+        elif message.role == "assistant":
+            prompt += f"<|assistant|>\n{message.content}\n"
+
+    # ensure we start with a system prompt, insert blank if needed
+    if not prompt.startswith("<|system|>\n"):
+        prompt = "<|system|>\n\n" + prompt
+
+    # add final assistant prompt
+    prompt = prompt + "<|assistant|>\n"
+
+    return prompt
+
+
+from llama_index.llms.ipex_llm import IpexLLM
+import argparse
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="IpexLLM Basic Usage Example")
+    parser.add_argument(
+        "--model-name",
+        "-m",
+        type=str,
+        default="HuggingFaceH4/zephyr-7b-alpha",
+        help="The huggingface repo id for the LLM model to be downloaded"
+        ", or the path to the huggingface checkpoint folder",
+    )
+    parser.add_argument(
+        "--device",
+        "-d",
+        type=str,
+        default="cpu",
+        choices=["cpu", "xpu"],
+        help="The device (Intel CPU or Intel GPU) the LLM runs on",
+    )
+    parser.add_argument(
+        "--query",
+        "-q",
+        type=str,
+        default="What is IPEX-LLM?",
+        help="The sentence used to query the LLM",
+    )
+
+    args = parser.parse_args()
+    model_name = args.model_name
+    device = args.device
+    query = args.query
+
+    llm = IpexLLM.from_model_id(
+        model_name=model_name,
+        tokenizer_name=model_name,
+        context_window=512,
+        max_new_tokens=128,
+        generate_kwargs={"do_sample": False},
+        completion_to_prompt=completion_to_prompt,
+        messages_to_prompt=messages_to_prompt,
+        device_map=device,
+    )
+
+    print("----------------- Complete ------------------")
+    completion_response = llm.complete(query)
+    print(completion_response.text)
+    print("----------------- Stream Complete ------------------")
+    response_iter = llm.stream_complete(query)
+    for response in response_iter:
+        print(response.delta, end="", flush=True)
+    print("----------------- Chat ------------------")
+    from llama_index.core.llms import ChatMessage
+
+    message = ChatMessage(role="user", content=query)
+    resp = llm.chat([message])
+    print(resp)
+    print("----------------- Stream Chat ------------------")
+    message = ChatMessage(role="user", content=query)
+    resp = llm.stream_chat([message], max_tokens=256)
+    for r in resp:
+        print(r.delta, end="")
diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/more_data_type.py b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/more_data_type.py
index 71a7b9539ecd3..8486e231f6dd2 100644
--- a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/more_data_type.py
+++ b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/more_data_type.py
@@ -52,7 +52,7 @@ def completion_to_prompt(completion):
         "-d",
         type=str,
         default="xpu",
-        choices=["cpu", "xpu",
"auto"], + choices=["cpu", "xpu"], help="The device the model will run on.", ) diff --git a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/rag.py b/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/rag.py deleted file mode 100644 index 31a66705fe3e3..0000000000000 --- a/llama-index-integrations/llms/llama-index-llms-ipex-llm/examples/rag.py +++ /dev/null @@ -1,270 +0,0 @@ -import torch -from llama_index.embeddings.huggingface import HuggingFaceEmbedding -from sqlalchemy import make_url -from llama_index.vector_stores.postgres import PGVectorStore -import psycopg2 -from pathlib import Path -from llama_index.readers.file import PyMuPDFReader -from llama_index.core.schema import NodeWithScore -from typing import Optional -from llama_index.core.query_engine import RetrieverQueryEngine -from llama_index.core import QueryBundle -from llama_index.core.retrievers import BaseRetriever -from typing import Any, List -from llama_index.core.node_parser import SentenceSplitter -from llama_index.core.vector_stores import VectorStoreQuery -import argparse - - -def load_vector_database(username, password): - db_name = "example_db" - host = "localhost" - password = password - port = "5432" - user = username - # conn = psycopg2.connect(connection_string) - conn = psycopg2.connect( - dbname="postgres", - host=host, - password=password, - port=port, - user=user, - ) - conn.autocommit = True - - with conn.cursor() as c: - c.execute(f"DROP DATABASE IF EXISTS {db_name}") - c.execute(f"CREATE DATABASE {db_name}") - - vector_store = PGVectorStore.from_params( - database=db_name, - host=host, - password=password, - port=port, - user=user, - table_name="llama2_paper", - embed_dim=384, # openai embedding dimension - ) - return vector_store - - -def load_data(data_path): - loader = PyMuPDFReader() - documents = loader.load(file_path=data_path) - - text_parser = SentenceSplitter( - chunk_size=1024, - # separator=" ", - ) - text_chunks = [] - # maintain relationship with source doc index, to help inject doc metadata in (3) - doc_idxs = [] - for doc_idx, doc in enumerate(documents): - cur_text_chunks = text_parser.split_text(doc.text) - text_chunks.extend(cur_text_chunks) - doc_idxs.extend([doc_idx] * len(cur_text_chunks)) - - from llama_index.core.schema import TextNode - - nodes = [] - for idx, text_chunk in enumerate(text_chunks): - node = TextNode( - text=text_chunk, - ) - src_doc = documents[doc_idxs[idx]] - node.metadata = src_doc.metadata - nodes.append(node) - return nodes - - -class VectorDBRetriever(BaseRetriever): - """Retriever over a postgres vector store.""" - - def __init__( - self, - vector_store: PGVectorStore, - embed_model: Any, - query_mode: str = "default", - similarity_top_k: int = 2, - ) -> None: - """Init params.""" - self._vector_store = vector_store - self._embed_model = embed_model - self._query_mode = query_mode - self._similarity_top_k = similarity_top_k - super().__init__() - - def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]: - """Retrieve.""" - query_embedding = self._embed_model.get_query_embedding(query_bundle.query_str) - vector_store_query = VectorStoreQuery( - query_embedding=query_embedding, - similarity_top_k=self._similarity_top_k, - mode=self._query_mode, - ) - query_result = self._vector_store.query(vector_store_query) - - nodes_with_scores = [] - for index, node in enumerate(query_result.nodes): - score: Optional[float] = None - if query_result.similarities is not None: - score = query_result.similarities[index] - 
nodes_with_scores.append(NodeWithScore(node=node, score=score)) - - return nodes_with_scores - - -def completion_to_prompt(completion): - return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" - - -# Transform a list of chat messages into zephyr-specific input -def messages_to_prompt(messages): - prompt = "" - for message in messages: - if message.role == "system": - prompt += f"<|system|>\n{message.content}\n" - elif message.role == "user": - prompt += f"<|user|>\n{message.content}\n" - elif message.role == "assistant": - prompt += f"<|assistant|>\n{message.content}\n" - - # ensure we start with a system prompt, insert blank if needed - if not prompt.startswith("<|system|>\n"): - prompt = "<|system|>\n\n" + prompt - - # add final assistant prompt - prompt = prompt + "<|assistant|>\n" - - return prompt - - -def main(args): - embed_model = HuggingFaceEmbedding(model_name=args.embedding_model_path) - - # Use custom LLM in BigDL - from ipex_llm.llamaindex.llms import IpexLLM - - llm = IpexLLM.from_model_id( - model_name=args.model_path, - tokenizer_name=args.tokenizer_path, - context_window=512, - max_new_tokens=args.n_predict, - generate_kwargs={"temperature": 0.7, "do_sample": False}, - model_kwargs={}, - messages_to_prompt=messages_to_prompt, - completion_to_prompt=completion_to_prompt, - device_map=args.device, - ) - - vector_store = load_vector_database(username=args.user, password=args.password) - nodes = load_data(data_path=args.data) - for node in nodes: - node_embedding = embed_model.get_text_embedding( - node.get_content(metadata_mode="all") - ) - node.embedding = node_embedding - - vector_store.add(nodes) - - # query_str = "Can you tell me about the key concepts for safety finetuning" - query_str = "Explain about the training data for Llama 2" - query_embedding = embed_model.get_query_embedding(query_str) - # construct vector store query - - query_mode = "default" - # query_mode = "sparse" - # query_mode = "hybrid" - - vector_store_query = VectorStoreQuery( - query_embedding=query_embedding, similarity_top_k=2, mode=query_mode - ) - # returns a VectorStoreQueryResult - query_result = vector_store.query(vector_store_query) - # print("Retrieval Results: ") - # print(query_result.nodes[0].get_content()) - - nodes_with_scores = [] - for index, node in enumerate(query_result.nodes): - score: Optional[float] = None - if query_result.similarities is not None: - score = query_result.similarities[index] - nodes_with_scores.append(NodeWithScore(node=node, score=score)) - - retriever = VectorDBRetriever( - vector_store, embed_model, query_mode="default", similarity_top_k=1 - ) - - query_engine = RetrieverQueryEngine.from_args(retriever, llm=llm) - - # query_str = "How does Llama 2 perform compared to other open-source models?" 
- query_str = args.question - response = query_engine.query(query_str) - - print("------------RESPONSE GENERATION---------------------") - print(str(response)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="LlamaIndex BigdlLLM Example") - parser.add_argument( - "-m", - "--model-path", - type=str, - required=True, - help="the path to transformers model", - ) - parser.add_argument( - "-q", - "--question", - type=str, - default="How does Llama 2 perform compared to other open-source models?", - help="question you want to ask.", - ) - parser.add_argument( - "-d", - "--data", - type=str, - default="./data/llama2.pdf", - help="the data used during retrieval", - ) - parser.add_argument( - "-u", - "--user", - type=str, - required=True, - help="user name in the database postgres", - ) - parser.add_argument( - "-p", - "--password", - type=str, - required=True, - help="the password of the user in the database", - ) - parser.add_argument( - "-e", - "--embedding-model-path", - default="BAAI/bge-small-en", - help="the path to embedding model path", - ) - parser.add_argument( - "-n", "--n-predict", type=int, default=32, help="max number of predict tokens" - ) - parser.add_argument( - "-t", - "--tokenizer-path", - type=str, - required=True, - help="the path to transformers tokenizer", - ) - parser.add_argument( - "-x", - "--device", - type=str, - default="xpu", - help="device to load the model and inference", - ) - args = parser.parse_args() - - main(args)