From e9b721c517cd9f84cb9771a1991843059675069f Mon Sep 17 00:00:00 2001 From: John Watson Date: Tue, 3 Dec 2024 13:23:19 -0800 Subject: [PATCH] Visualization, Powerpoint, Misc. Cleanup (#49) * wip on visualizing dataset contents and vectors * remove docling * Make datasets represent individasiual docs * hi lite * make the hover point size bigger for stronger highlighting * highlight even stronger! * refactor and add a question entry box to show where the question lies in the vector space * a bit of refactoring and fix unused imports * force mypy into happiness * Update release version to dev-testing * fix types * disable input while loading * mark loading while viz is loading * clear up some warnings * make the active tab sticky on the ds management page * small refactoring * small refactor * "wip on visualization cleanup" * "now we're thinking with UMAP tooltips" * "styling" * "viz layout, continued." * "now we're thinking with dependency hell" * "add dependencies" * add the last dependency needed for powerpoint parsing * add other powerpoints * turn on the no-kb query toggle * tidy up the chunk metadata handling * use models for models * update uv.lock from main * delete log file --------- Co-authored-by: Michael Liu Co-authored-by: actions-user Co-authored-by: Elijah Williams --- llm-service/app/ai/indexing/index.py | 4 + llm-service/app/ai/indexing/readers/pptx.py | 59 +++++++ llm-service/app/ai/vector_stores/qdrant.py | 36 ++++- .../app/ai/vector_stores/vector_store.py | 4 + .../app/routers/index/data_source/__init__.py | 17 ++ llm-service/app/services/evaluators.py | 14 +- llm-service/app/services/models.py | 3 +- llm-service/pyproject.toml | 5 + llm-service/uv.lock | 149 ++++++++++++++++++ scripts/release_version.txt | 2 +- ui/package.json | 3 + ui/pnpm-lock.yaml | 42 +++++ ui/src/api/dataSourceApi.ts | 44 ++++++ ui/src/api/ragQueryApi.ts | 9 +- ui/src/api/utils.ts | 2 + ui/src/pages/DataSources/Tabs.tsx | 46 +++++- .../DataSourceVisualization.tsx | 127 +++++++++++++++ .../VisualizationTab/VectorGraph.tsx | 149 ++++++++++++++++++ .../ChatOutput/Sources/MetaData.tsx | 81 ++++++++++ .../ChatOutput/Sources/SourceCard.tsx | 17 +- .../FooterComponents/RagChatQueryInput.tsx | 1 - 21 files changed, 775 insertions(+), 39 deletions(-) create mode 100644 llm-service/app/ai/indexing/readers/pptx.py create mode 100644 ui/src/pages/DataSources/VisualizationTab/DataSourceVisualization.tsx create mode 100644 ui/src/pages/DataSources/VisualizationTab/VectorGraph.tsx create mode 100644 ui/src/pages/RagChatTab/ChatOutput/Sources/MetaData.tsx diff --git a/llm-service/app/ai/indexing/index.py b/llm-service/app/ai/indexing/index.py index 8c76fa5a..7dd98048 100644 --- a/llm-service/app/ai/indexing/index.py +++ b/llm-service/app/ai/indexing/index.py @@ -55,6 +55,7 @@ from .readers.json import JSONReader from .readers.simple_file import SimpleFileReader from .readers.pdf import PDFReader +from .readers.pptx import PptxReader logger = logging.getLogger(__name__) @@ -63,6 +64,9 @@ ".txt": SimpleFileReader, ".md": SimpleFileReader, ".docx": DocxReader, + ".pptx": PptxReader, + ".pptm": PptxReader, + ".ppt": PptxReader, ".csv": CSVReader, ".json": JSONReader, } diff --git a/llm-service/app/ai/indexing/readers/pptx.py b/llm-service/app/ai/indexing/readers/pptx.py new file mode 100644 index 00000000..eed8d9b7 --- /dev/null +++ b/llm-service/app/ai/indexing/readers/pptx.py @@ -0,0 +1,59 @@ +# +# CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) +# (C) Cloudera, Inc. 2024 +# All rights reserved. +# +# Applicable Open Source License: Apache 2.0 +# +# NOTE: Cloudera open source products are modular software products +# made up of hundreds of individual components, each of which was +# individually copyrighted. Each Cloudera open source product is a +# collective work under U.S. Copyright Law. Your license to use the +# collective work is as provided in your written agreement with +# Cloudera. Used apart from the collective work, this file is +# licensed for your use pursuant to the open source license +# identified above. +# +# This code is provided to you pursuant a written agreement with +# (i) Cloudera, Inc. or (ii) a third-party authorized to distribute +# this code. If you do not have a written agreement with Cloudera nor +# with an authorized and properly licensed third party, you do not +# have any rights to access nor to use this code. +# +# Absent a written agreement with Cloudera, Inc. ("Cloudera") to the +# contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY +# KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED +# WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO +# IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, +# AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS +# ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE +# OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR +# CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES +# RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF +# BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF +# DATA. +# + +from pathlib import Path +from typing import Any, List + +from llama_index.core.schema import TextNode +from llama_index.readers.file import PptxReader as LlamaIndexPptxReader + +from .base_reader import BaseReader + + +class PptxReader(BaseReader): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.inner = LlamaIndexPptxReader() + + def load_chunks(self, file_path: Path) -> List[TextNode]: + documents = self.inner.load_data(file_path) + assert len(documents) == 1 + document = documents[0] + document.id_ = self.document_id + self._add_document_metadata(document, file_path) + return self._chunks_in_document(document) diff --git a/llm-service/app/ai/vector_stores/qdrant.py b/llm-service/app/ai/vector_stores/qdrant.py index 9b8d6041..74a00bca 100644 --- a/llm-service/app/ai/vector_stores/qdrant.py +++ b/llm-service/app/ai/vector_stores/qdrant.py @@ -37,7 +37,8 @@ # import os -from typing import Optional +from typing import Optional, Any +import umap import qdrant_client from llama_index.core.indices import VectorStoreIndex @@ -45,10 +46,10 @@ from llama_index.vector_stores.qdrant import ( QdrantVectorStore as LlamaIndexQdrantVectorStore, ) -from qdrant_client.http.models import CountResult +from qdrant_client.http.models import CountResult, Record -from ...services import models from .vector_store import VectorStore +from ...services import models def new_qdrant_client() -> qdrant_client.QdrantClient: @@ -60,20 +61,20 @@ def new_qdrant_client() -> qdrant_client.QdrantClient: class QdrantVectorStore(VectorStore): @staticmethod def for_chunks( - data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None + data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None ) -> "QdrantVectorStore": return QdrantVectorStore(table_name=f"index_{data_source_id}", client=client) @staticmethod def for_summaries( - data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None + data_source_id: int, client: Optional[qdrant_client.QdrantClient] = None ) -> "QdrantVectorStore": return QdrantVectorStore( table_name=f"summary_index_{data_source_id}", client=client ) def __init__( - self, table_name: str, client: Optional[qdrant_client.QdrantClient] = None + self, table_name: str, client: Optional[qdrant_client.QdrantClient] = None ): self.client = client or new_qdrant_client() self.table_name = table_name @@ -105,3 +106,26 @@ def exists(self) -> bool: def llama_vector_store(self) -> BasePydanticVectorStore: vector_store = LlamaIndexQdrantVectorStore(self.table_name, self.client) return vector_store + + def visualize(self, user_query: Optional[str] = None) -> list[tuple[tuple[float, float], str]]: + records: list[Record] + records, _ = self.client.scroll(self.table_name, limit=5000, with_vectors=True) + + if user_query: + embedding_model = models.get_embedding_model() + user_query_vector = embedding_model.get_query_embedding(user_query) + records.append(Record(vector=user_query_vector, id="abc123", payload={"file_name": "USER_QUERY"})) + + record: Record + filenames = [] + for record in records: + payload: dict[str, Any] | None = record.payload + if payload: + filenames.append(payload.get("file_name")) + + reducer = umap.UMAP() + embeddings = [record.vector for record in records] + reduced_embeddings = reducer.fit_transform(embeddings) + + # todo: figure out how to satisfy mypy on this line + return [(tuple(coordinate), filenames[i]) for i, coordinate in enumerate(reduced_embeddings.tolist())] # type: ignore diff --git a/llm-service/app/ai/vector_stores/vector_store.py b/llm-service/app/ai/vector_stores/vector_store.py index 9cb472ac..c7eb9459 100644 --- a/llm-service/app/ai/vector_stores/vector_store.py +++ b/llm-service/app/ai/vector_stores/vector_store.py @@ -66,3 +66,7 @@ def llama_vector_store(self) -> BasePydanticVectorStore: @abstractmethod def exists(self) -> bool: """Does the vector store exist?""" + + @abstractmethod + def visualize(self, user_query: Optional[str] = None) -> list[tuple[tuple[float,float], str]]: + """get a 2-d visualization of the vectors in the store""" diff --git a/llm-service/app/routers/index/data_source/__init__.py b/llm-service/app/routers/index/data_source/__init__.py index d3428870..ed985aac 100644 --- a/llm-service/app/routers/index/data_source/__init__.py +++ b/llm-service/app/routers/index/data_source/__init__.py @@ -99,6 +99,23 @@ def chunk_contents(self, chunk_id: str) -> ChunkContentsResponse: metadata=node.metadata, ) + + @router.get("/visualize") + @exceptions.propagates + def visualize(self) -> list[tuple[tuple[float,float], str]]: + return self.chunks_vector_store.visualize() + + + class VisualizationRequest(BaseModel): + user_query: str + + + @router.post("/visualize") + @exceptions.propagates + def visualize_with_query(self, request: VisualizationRequest) -> list[tuple[tuple[float,float], str]]: + return self.chunks_vector_store.visualize(request.user_query) + + @router.delete( "/", summary="Deletes the data source from the index.", response_model=None ) diff --git a/llm-service/app/services/evaluators.py b/llm-service/app/services/evaluators.py index 12516949..8ea5eba0 100644 --- a/llm-service/app/services/evaluators.py +++ b/llm-service/app/services/evaluators.py @@ -39,21 +39,15 @@ from llama_index.core.base.response.schema import Response from llama_index.core.chat_engine.types import AgentChatResponse from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator -from llama_index.llms.bedrock import Bedrock -from .llama_utils import completion_to_prompt, messages_to_prompt +from ..services import models def evaluate_response( - query: str, - chat_response: AgentChatResponse, + query: str, + chat_response: AgentChatResponse, ) -> tuple[float, float]: - evaluator_llm = Bedrock( - model="meta.llama3-8b-instruct-v1:0", - context_size=128000, - messages_to_prompt=messages_to_prompt, - completion_to_prompt=completion_to_prompt, - ) + evaluator_llm = models.get_llm("meta.llama3-8b-instruct-v1:0") relevancy_evaluator = RelevancyEvaluator(llm=evaluator_llm) relevance = relevancy_evaluator.evaluate_response( diff --git a/llm-service/app/services/models.py b/llm-service/app/services/models.py index 236e4d70..b1510ae2 100644 --- a/llm-service/app/services/models.py +++ b/llm-service/app/services/models.py @@ -44,6 +44,7 @@ from llama_index.core.llms import LLM from llama_index.embeddings.bedrock import BedrockEmbedding from llama_index.llms.bedrock import Bedrock +from llama_index.llms.bedrock.utils import BEDROCK_FOUNDATION_LLMS from .caii import get_caii_embedding_models, get_caii_llm_models from .caii import get_embedding_model as caii_embedding @@ -67,7 +68,7 @@ def get_llm(model_name: str = None) -> LLM: ) return Bedrock( model=model_name, - context_size=128000, + context_size=BEDROCK_FOUNDATION_LLMS.get(model_name, 8192), messages_to_prompt=messages_to_prompt, completion_to_prompt=completion_to_prompt, ) diff --git a/llm-service/pyproject.toml b/llm-service/pyproject.toml index da8e803e..dcbe8d7c 100644 --- a/llm-service/pyproject.toml +++ b/llm-service/pyproject.toml @@ -21,6 +21,11 @@ dependencies = [ "docx2txt>=0.8", "pandas>=2.2.3", "fastapi-utils>=0.8.0", + "umap-learn>=0.5.7", + "python-pptx>=1.0.2", + "torch>=2.5.1", + "pillow>=10.4.0", + "transformers>=4.46.3", "docling>=2.7.0", ] requires-python = "==3.10.*" diff --git a/llm-service/uv.lock b/llm-service/uv.lock index 78556a7d..f644907c 100644 --- a/llm-service/uv.lock +++ b/llm-service/uv.lock @@ -1085,8 +1085,13 @@ dependencies = [ { name = "llama-index-readers-file" }, { name = "llama-index-vector-stores-qdrant" }, { name = "pandas" }, + { name = "pillow" }, { name = "pydantic" }, { name = "pydantic-settings" }, + { name = "python-pptx" }, + { name = "torch" }, + { name = "transformers" }, + { name = "umap-learn" }, ] [package.dev-dependencies] @@ -1114,8 +1119,13 @@ requires-dist = [ { name = "llama-index-readers-file", specifier = "==0.1.33" }, { name = "llama-index-vector-stores-qdrant", specifier = "==0.2.17" }, { name = "pandas", specifier = ">=2.2.3" }, + { name = "pillow", specifier = ">=10.4.0" }, { name = "pydantic", specifier = "==2.8.2" }, { name = "pydantic-settings", specifier = "==2.3.4" }, + { name = "python-pptx", specifier = ">=1.0.2" }, + { name = "torch", specifier = ">=2.5.1" }, + { name = "transformers", specifier = ">=4.46.3" }, + { name = "umap-learn", specifier = ">=0.5.7" }, ] [package.metadata.requires-dev] @@ -1127,6 +1137,19 @@ dev = [ { name = "ruff", specifier = ">=0.7.4" }, ] +[[package]] +name = "llvmlite" +version = "0.43.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9f/3d/f513755f285db51ab363a53e898b85562e950f79a2e6767a364530c2f645/llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5", size = 157069 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/23/ff/6ca7e98998b573b4bd6566f15c35e5c8bea829663a6df0c7aa55ab559da9/llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761", size = 31064408 }, + { url = "https://files.pythonhosted.org/packages/ca/5c/a27f9257f86f0cda3f764ff21d9f4217b9f6a0d45e7a39ecfa7905f524ce/llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc", size = 28793153 }, + { url = "https://files.pythonhosted.org/packages/7e/3c/4410f670ad0a911227ea2ecfcba9f672a77cf1924df5280c4562032ec32d/llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead", size = 42857276 }, + { url = "https://files.pythonhosted.org/packages/c6/21/2ffbab5714e72f2483207b4a1de79b2eecd9debbf666ff4e7067bcc5c134/llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a", size = 43871781 }, + { url = "https://files.pythonhosted.org/packages/f2/26/b5478037c453554a61625ef1125f7e12bb1429ae11c6376f47beba9b0179/llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed", size = 28123487 }, +] + [[package]] name = "lxml" version = "5.3.0" @@ -1379,6 +1402,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl", hash = "sha256:4fa26829c5b00715afe3061398a8989dc643b92ce7dd93fb4585a70930d168a1", size = 1505442 }, ] +[[package]] +name = "numba" +version = "0.60.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "llvmlite" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/93/2849300a9184775ba274aba6f82f303343669b0592b7bb0849ea713dabb0/numba-0.60.0.tar.gz", hash = "sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16", size = 2702171 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/cf/baa13a7e3556d73d9e38021e6d6aa4aeb30d8b94545aa8b70d0f24a1ccc4/numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651", size = 2647627 }, + { url = "https://files.pythonhosted.org/packages/ac/ba/4b57fa498564457c3cc9fc9e570a6b08e6086c74220f24baaf04e54b995f/numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b", size = 2650322 }, + { url = "https://files.pythonhosted.org/packages/28/98/7ea97ee75870a54f938a8c70f7e0be4495ba5349c5f9db09d467c4a5d5b7/numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781", size = 3407390 }, + { url = "https://files.pythonhosted.org/packages/79/58/cb4ac5b8f7ec64200460aef1fed88258fb872ceef504ab1f989d2ff0f684/numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e", size = 3699694 }, + { url = "https://files.pythonhosted.org/packages/1c/b0/c61a93ca947d12233ff45de506ddbf52af3f752066a0b8be4d27426e16da/numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198", size = 2687030 }, +] + [[package]] name = "numpy" version = "1.26.4" @@ -1828,6 +1868,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", size = 1205513 }, ] +[[package]] +name = "pynndescent" +version = "0.5.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "llvmlite" }, + { name = "numba" }, + { name = "scikit-learn" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/58/560a4db5eb3794d922fe55804b10326534ded3d971e1933c1eef91193f5e/pynndescent-0.5.13.tar.gz", hash = "sha256:d74254c0ee0a1eeec84597d5fe89fedcf778593eeabe32c2f97412934a9800fb", size = 2975955 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/53/d23a97e0a2c690d40b165d1062e2c4ccc796be458a1ce59f6ba030434663/pynndescent-0.5.13-py3-none-any.whl", hash = "sha256:69aabb8f394bc631b6ac475a1c7f3994c54adf3f51cd63b2730fefba5771b949", size = 56850 }, +] + [[package]] name = "pypdf" version = "4.3.1" @@ -2190,6 +2246,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/66/05/7957af15543b8c9799209506df4660cba7afc4cf94bfb60513827e96bed6/s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e", size = 83175 }, ] +[[package]] +name = "safetensors" +version = "0.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cb/46/a1c56ed856c6ac3b1a8b37abe5be0cac53219367af1331e721b04d122577/safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310", size = 65702 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/10/0798ec2c8704c2d172620d8a3725bed92cdd75516357b1a3e64d4229ea4e/safetensors-0.4.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a63eaccd22243c67e4f2b1c3e258b257effc4acd78f3b9d397edc8cf8f1298a7", size = 392312 }, + { url = "https://files.pythonhosted.org/packages/2b/9e/9648d8dbb485c40a4a0212b7537626ae440b48156cc74601ca0b7a7615e0/safetensors-0.4.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:23fc9b4ec7b602915cbb4ec1a7c1ad96d2743c322f20ab709e2c35d1b66dad27", size = 381858 }, + { url = "https://files.pythonhosted.org/packages/8b/67/49556aeacc00df353767ed31d68b492fecf38c3f664c52692e4d92aa0032/safetensors-0.4.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6885016f34bef80ea1085b7e99b3c1f92cb1be78a49839203060f67b40aee761", size = 441382 }, + { url = "https://files.pythonhosted.org/packages/5d/ce/e9f4869a37bb11229e6cdb4e73a6ef23b4f360eee9dca5f7e40982779704/safetensors-0.4.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:133620f443450429322f238fda74d512c4008621227fccf2f8cf4a76206fea7c", size = 439001 }, + { url = "https://files.pythonhosted.org/packages/a0/27/aee8cf031b89c34caf83194ec6b7f2eed28d053fff8b6da6d00c85c56035/safetensors-0.4.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4fb3e0609ec12d2a77e882f07cced530b8262027f64b75d399f1504ffec0ba56", size = 478026 }, + { url = "https://files.pythonhosted.org/packages/da/33/1d9fc4805c623636e7d460f28eec92ebd1856f7a552df8eb78398a1ef4de/safetensors-0.4.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d0f1dd769f064adc33831f5e97ad07babbd728427f98e3e1db6902e369122737", size = 495545 }, + { url = "https://files.pythonhosted.org/packages/b9/df/6f766b56690709d22e83836e4067a1109a7d84ea152a6deb5692743a2805/safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6d156bdb26732feada84f9388a9f135528c1ef5b05fae153da365ad4319c4c5", size = 435016 }, + { url = "https://files.pythonhosted.org/packages/90/fa/7bc3f18086201b1e55a42c88b822ae197d0158e12c54cd45c887305f1b7e/safetensors-0.4.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9e347d77e2c77eb7624400ccd09bed69d35c0332f417ce8c048d404a096c593b", size = 456273 }, + { url = "https://files.pythonhosted.org/packages/3e/59/2ae50150d37a65c1c5f01aec74dc737707b8bbecdc76307e5a1a12c8a376/safetensors-0.4.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9f556eea3aec1d3d955403159fe2123ddd68e880f83954ee9b4a3f2e15e716b6", size = 619669 }, + { url = "https://files.pythonhosted.org/packages/fe/43/10f0bb597aef62c9c154152e265057089f3c729bdd980e6c32c3ec2407a4/safetensors-0.4.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9483f42be3b6bc8ff77dd67302de8ae411c4db39f7224dec66b0eb95822e4163", size = 605212 }, + { url = "https://files.pythonhosted.org/packages/7c/75/ede6887ea0ceaba55730988bfc7668dc147a8758f907fa6db26fbb681b8e/safetensors-0.4.5-cp310-none-win32.whl", hash = "sha256:7389129c03fadd1ccc37fd1ebbc773f2b031483b04700923c3511d2a939252cc", size = 272652 }, + { url = "https://files.pythonhosted.org/packages/ba/f0/919c72a9eef843781e652d0650f2819039943e69b69d5af2d0451a23edc3/safetensors-0.4.5-cp310-none-win_amd64.whl", hash = "sha256:e98ef5524f8b6620c8cdef97220c0b6a5c1cef69852fcd2f174bb96c2bb316b1", size = 285879 }, + { url = "https://files.pythonhosted.org/packages/cf/ff/037ae4c0ee32db496669365e66079b6329906c6814722b159aa700e67208/safetensors-0.4.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fdadf66b5a22ceb645d5435a0be7a0292ce59648ca1d46b352f13cff3ea80410", size = 392951 }, + { url = "https://files.pythonhosted.org/packages/f1/d6/6621e16b35bf83ae099eaab07338f04991a26c9aa43879d05f19f35e149c/safetensors-0.4.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d42ffd4c2259f31832cb17ff866c111684c87bd930892a1ba53fed28370c918c", size = 383417 }, + { url = "https://files.pythonhosted.org/packages/ae/88/3068e1bb16f5e9f9068901de3cf7b3db270b9bfe6e7d51d4b55c1da0425d/safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd8a1f6d2063a92cd04145c7fd9e31a1c7d85fbec20113a14b487563fdbc0597", size = 442311 }, + { url = "https://files.pythonhosted.org/packages/f7/15/a2bb77ebbaa76b61ec2e9f731fe4db7f9473fd855d881957c51b3a168892/safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:951d2fcf1817f4fb0ef0b48f6696688a4e852a95922a042b3f96aaa67eedc920", size = 436678 }, + { url = "https://files.pythonhosted.org/packages/ec/79/9608c4546cdbfe3860dd7aa59e3562c9289113398b1a0bd89b68ce0a9d41/safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ac85d9a8c1af0e3132371d9f2d134695a06a96993c2e2f0bbe25debb9e3f67a", size = 457316 }, + { url = "https://files.pythonhosted.org/packages/0f/23/b17b483f2857835962ad33e38014efd4911791187e177bc23b057d35bee8/safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e3cec4a29eb7fe8da0b1c7988bc3828183080439dd559f720414450de076fcab", size = 620565 }, + { url = "https://files.pythonhosted.org/packages/19/46/5d11dc300feaad285c2f1bd784ff3f689f5e0ab6be49aaf568f3a77019eb/safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f", size = 606660 }, +] + [[package]] name = "scikit-image" version = "0.24.0" @@ -2213,6 +2296,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/89/3fcd68d034db5d29c974e964d03deec9d0fbf9410ff0a0b95efff70947f6/scikit_image-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:7ac7913b028b8aa780ffae85922894a69e33d1c0bf270ea1774f382fe8bf95e7", size = 12864601 }, ] +[[package]] +name = "scikit-learn" +version = "1.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numpy" }, + { name = "scipy" }, + { name = "threadpoolctl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/37/59/44985a2bdc95c74e34fef3d10cb5d93ce13b0e2a7baefffe1b53853b502d/scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d", size = 7001680 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/89/be41419b4bec629a4691183a5eb1796f91252a13a5ffa243fd958cad7e91/scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6", size = 12106070 }, + { url = "https://files.pythonhosted.org/packages/bf/e0/3b6d777d375f3b685f433c93384cdb724fb078e1dc8f8ff0950467e56c30/scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0", size = 10971758 }, + { url = "https://files.pythonhosted.org/packages/7b/31/eb7dd56c371640753953277de11356c46a3149bfeebb3d7dcd90b993715a/scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540", size = 12500080 }, + { url = "https://files.pythonhosted.org/packages/4c/1e/a7c7357e704459c7d56a18df4a0bf08669442d1f8878cc0864beccd6306a/scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8", size = 13347241 }, + { url = "https://files.pythonhosted.org/packages/48/76/154ebda6794faf0b0f3ccb1b5cd9a19f0a63cb9e1f3d2c61b6114002677b/scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113", size = 11000477 }, +] + [[package]] name = "scipy" version = "1.14.1" @@ -2371,6 +2473,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/3f/8ba87d9e287b9d385a02a7114ddcef61b26f86411e121c9003eb509a1773/tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687", size = 28165 }, ] +[[package]] +name = "threadpoolctl" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/55/b5148dcbf72f5cde221f8bfe3b6a540da7aa1842f6b491ad979a6c8b84af/threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107", size = 41936 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/2c/ffbf7a134b9ab11a67b0cf0726453cedd9c5043a4fe7a35d1cefa9a1bcfb/threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467", size = 18414 }, +] + [[package]] name = "tifffile" version = "2024.9.20" @@ -2500,6 +2611,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2b/78/57043611a16c655c8350b4c01b8d6abfb38cc2acb475238b62c2146186d7/tqdm-4.67.0-py3-none-any.whl", hash = "sha256:0cd8af9d56911acab92182e88d763100d4788bdf421d251616040cc4d44863be", size = 78590 }, ] +[[package]] +name = "transformers" +version = "4.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock" }, + { name = "huggingface-hub" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "regex" }, + { name = "requests" }, + { name = "safetensors" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/37/5a/58f96c83e566f907ae39f16d4401bbefd8bb85c60bd1e6a95c419752ab90/transformers-4.46.3.tar.gz", hash = "sha256:8ee4b3ae943fe33e82afff8e837f4b052058b07ca9be3cb5b729ed31295f72cc", size = 8627944 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/51/b87caa939fedf307496e4dbf412f4b909af3d9ca8b189fc3b65c1faa456f/transformers-4.46.3-py3-none-any.whl", hash = "sha256:a12ef6f52841fd190a3e5602145b542d03507222f2c64ebb7ee92e8788093aef", size = 10034536 }, +] + [[package]] name = "triton" version = "3.1.0" @@ -2581,6 +2713,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bf/ecd14d3cf6127f8a990b01f0ad20e257f5619a555f47d707c57d39934894/ujson-5.10.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:baed37ea46d756aca2955e99525cc02d9181de67f25515c468856c38d52b5f3b", size = 42224 }, ] +[[package]] +name = "umap-learn" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numba" }, + { name = "numpy" }, + { name = "pynndescent" }, + { name = "scikit-learn" }, + { name = "scipy" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6f/d4/9ed627905f7993349671283b3c5bf2d9f543ef79229fa1c7e01324eb900c/umap-learn-0.5.7.tar.gz", hash = "sha256:b2a97973e4c6ffcebf241100a8de589a4c84126a832ab40f296c6d9fcc5eb19e", size = 92680 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/8f/671c0e1f2572ba625cbcc1faeba9435e00330c3d6962858711445cf1e817/umap_learn-0.5.7-py3-none-any.whl", hash = "sha256:6a7e0be2facfa365a5ed6588447102bdbef32a0ef449535c25c97ea7e680073c", size = 88815 }, +] + [[package]] name = "urllib3" version = "2.2.3" diff --git a/scripts/release_version.txt b/scripts/release_version.txt index 706203d3..32150ea1 100644 --- a/scripts/release_version.txt +++ b/scripts/release_version.txt @@ -1 +1 @@ -export RELEASE_TAG=1.3.0-release +export RELEASE_TAG=dev-testing diff --git a/ui/package.json b/ui/package.json index 8c7d5552..7c9cf7e7 100644 --- a/ui/package.json +++ b/ui/package.json @@ -20,9 +20,12 @@ "@tanstack/react-query-devtools": "^5.59.20", "@tanstack/react-router": "^1.81.4", "antd": "^5.22.0", + "chart.js": "^4.4.6", + "chartjs-plugin-datalabels": "^2.2.0", "date-fns": "^4.1.0", "lodash": "^4.17.21", "react": "^18.3.1", + "react-chartjs-2": "^5.2.0", "react-dom": "^18.3.1" }, "devDependencies": { diff --git a/ui/pnpm-lock.yaml b/ui/pnpm-lock.yaml index 894ea04c..7e03b55e 100644 --- a/ui/pnpm-lock.yaml +++ b/ui/pnpm-lock.yaml @@ -23,6 +23,12 @@ importers: antd: specifier: ^5.22.0 version: 5.22.0(date-fns@4.1.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + chart.js: + specifier: ^4.4.6 + version: 4.4.6 + chartjs-plugin-datalabels: + specifier: ^2.2.0 + version: 2.2.0(chart.js@4.4.6) date-fns: specifier: ^4.1.0 version: 4.1.0 @@ -32,6 +38,9 @@ importers: react: specifier: ^18.3.1 version: 18.3.1 + react-chartjs-2: + specifier: ^5.2.0 + version: 5.2.0(chart.js@4.4.6)(react@18.3.1) react-dom: specifier: ^18.3.1 version: 18.3.1(react@18.3.1) @@ -655,6 +664,9 @@ packages: '@jridgewell/trace-mapping@0.3.25': resolution: {integrity: sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==} + '@kurkle/color@0.3.4': + resolution: {integrity: sha512-M5UknZPHRu3DEDWoipU6sE8PdkZ6Z/S+v4dD+Ke8IaNlpdSQah50lz1KtcFBa2vsdOnwbbnxJwVM4wty6udA5w==} + '@nodelib/fs.scandir@2.1.5': resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==} engines: {node: '>= 8'} @@ -1311,6 +1323,15 @@ packages: resolution: {integrity: sha512-dLitG79d+GV1Nb/VYcCDFivJeK1hiukt9QjRNVOsUtTy1rR1YJsmpGGTZ3qJos+uw7WmWF4wUwBd9jxjocFC2w==} engines: {node: ^12.17.0 || ^14.13 || >=16.0.0} + chart.js@4.4.6: + resolution: {integrity: sha512-8Y406zevUPbbIBA/HRk33khEmQPk5+cxeflWE/2rx1NJsjVWMPw/9mSP9rxHP5eqi6LNoPBVMfZHxbwLSgldYA==} + engines: {pnpm: '>=8'} + + chartjs-plugin-datalabels@2.2.0: + resolution: {integrity: sha512-14ZU30lH7n89oq+A4bWaJPnAG8a7ZTk7dKf48YAzMvJjQtjrgg5Dpk9f+LbjCF6bpx3RAGTeL13IXpKQYyRvlw==} + peerDependencies: + chart.js: '>=3.0.0' + check-error@2.1.1: resolution: {integrity: sha512-OAlb+T7V4Op9OwdkjmguYRqncdlx5JiofwOAUkmTF+jNdHwzTaTs4sRAGpzLF3oOz5xAyDGrPgeIDFQmDOTiJw==} engines: {node: '>= 16'} @@ -2560,6 +2581,12 @@ packages: react: '>=16.9.0' react-dom: '>=16.9.0' + react-chartjs-2@5.2.0: + resolution: {integrity: sha512-98iN5aguJyVSxp5U3CblRLH67J8gkfyGNbiK3c+l1QI/G4irHMPQw44aEPmjVag+YKTyQ260NcF82GTQ3bdscA==} + peerDependencies: + chart.js: ^4.1.1 + react: ^16.8.0 || ^17.0.0 || ^18.0.0 + react-dom@18.3.1: resolution: {integrity: sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==} peerDependencies: @@ -3520,6 +3547,8 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.0 + '@kurkle/color@0.3.4': {} + '@nodelib/fs.scandir@2.1.5': dependencies: '@nodelib/fs.stat': 2.0.5 @@ -4310,6 +4339,14 @@ snapshots: chalk@5.3.0: {} + chart.js@4.4.6: + dependencies: + '@kurkle/color': 0.3.4 + + chartjs-plugin-datalabels@2.2.0(chart.js@4.4.6): + dependencies: + chart.js: 4.4.6 + check-error@2.1.1: {} chokidar@3.6.0: @@ -5759,6 +5796,11 @@ snapshots: react: 18.3.1 react-dom: 18.3.1(react@18.3.1) + react-chartjs-2@5.2.0(chart.js@4.4.6)(react@18.3.1): + dependencies: + chart.js: 4.4.6 + react: 18.3.1 + react-dom@18.3.1(react@18.3.1): dependencies: loose-envify: 1.4.0 diff --git a/ui/src/api/dataSourceApi.ts b/ui/src/api/dataSourceApi.ts index 0fb30617..2fbe04b6 100644 --- a/ui/src/api/dataSourceApi.ts +++ b/ui/src/api/dataSourceApi.ts @@ -40,6 +40,7 @@ import { queryOptions, useMutation, useQuery } from "@tanstack/react-query"; import { deleteRequest, getRequest, + llmServicePath, MutationKeys, paths, postRequest, @@ -74,6 +75,8 @@ export type DataSourceType = DataSourceBaseType & { documentCount: number; }; +export type Point2d = [[number, number], string]; + export const useCreateDataSourceMutation = ({ onSuccess, onError, @@ -154,6 +157,47 @@ const getDataSourceByIdQuery = async ( return await getRequest(`${ragPath}/${paths.dataSources}/${dataSourceId}`); }; +export const getVisualizeDataSource = (dataSourceId: string) => { + return queryOptions({ + queryKey: [QueryKeys.getVisualizeDataSource, { dataSourceId }], + queryFn: () => getVisualizeDataSourceQuery(dataSourceId), + }); +}; + +const getVisualizeDataSourceQuery = async ( + dataSourceId: string, +): Promise => { + return await getRequest( + `${llmServicePath}/data_sources/${dataSourceId}/visualize`, + ); +}; + +export const useVisualizeDataSourceWithUserQuery = ({ + onSuccess, + onError, +}: UseMutationType) => { + return useMutation({ + mutationKey: [MutationKeys.visualizeDataSourceWithUserQuery], + mutationFn: visualizeDataSourceWithUserQuery, + onSuccess, + onError, + }); +}; + +export interface VisualizationRequest { + dataSourceId: string; + userQuery: string; +} + +const visualizeDataSourceWithUserQuery = async ( + request: VisualizationRequest, +): Promise => { + return await postRequest( + `${llmServicePath}/data_sources/${request.dataSourceId}/visualize`, + { user_query: request.userQuery }, + ); +}; + export const getCdfConfigQuery = async ( dataSourceId: string, ): Promise => { diff --git a/ui/src/api/ragQueryApi.ts b/ui/src/api/ragQueryApi.ts index 5e8d88e9..4a109145 100644 --- a/ui/src/api/ragQueryApi.ts +++ b/ui/src/api/ragQueryApi.ts @@ -94,9 +94,14 @@ const suggestQuestionsQuery = async ( ); }; -interface ChunkContentsResponse { +export interface ChunkMetadata { + row_number?: number; + page_label?: string; +} + +export interface ChunkContentsResponse { text: string; - metadata: Record; + metadata: ChunkMetadata; } interface ChunkContentsRequest { diff --git a/ui/src/api/utils.ts b/ui/src/api/utils.ts index 40bbe4a5..bf3ed9de 100644 --- a/ui/src/api/utils.ts +++ b/ui/src/api/utils.ts @@ -67,6 +67,7 @@ export enum MutationKeys { "updateAmp" = "updateAmp", "testLlmModel" = "testLlmModel", "testEmbeddingModel" = "testEmbeddingModel", + "visualizeDataSourceWithUserQuery" = "visualizeDataSourceWithUserQuery", } export enum QueryKeys { @@ -83,6 +84,7 @@ export enum QueryKeys { "getLlmModels" = "getLlmModels", "getEmbeddingModels" = "getEmbeddingModels", "getModelSource" = "getModelSource", + "getVisualizeDataSource" = "getVisualizeDataSource", } export const commonHeaders = { diff --git a/ui/src/pages/DataSources/Tabs.tsx b/ui/src/pages/DataSources/Tabs.tsx index b22a92af..10468a5b 100644 --- a/ui/src/pages/DataSources/Tabs.tsx +++ b/ui/src/pages/DataSources/Tabs.tsx @@ -40,29 +40,67 @@ import { Flex, Tabs, TabsProps } from "antd"; import FileManagement from "pages/DataSources/ManageTab/FileManagement.tsx"; import IndexSettings from "pages/DataSources/IndexSettingsTab/IndexSettings.tsx"; import DataSourceConnections from "pages/DataSources/DataSourceConnectionsTab/DataSourceConnections.tsx"; +import "chart.js/auto"; +import DataSourceVisualization from "pages/DataSources/VisualizationTab/DataSourceVisualization.tsx"; +import { useLocation, useNavigate } from "@tanstack/react-router"; +import { useEffect } from "react"; export const tabItems: TabsProps["items"] = [ { - key: "1", + key: "manage", label: "Manage", children: , }, { - key: "2", + key: "settings", label: "Index Settings", children: , }, { - key: "3", + key: "connections", label: "Connections", children: , }, + { + key: "visualize", + label: "Visualize", + children: , + }, ]; const DataSourcesTabs = () => { + const navigate = useNavigate(); + const location = useLocation(); + + const handleNav = (key: string) => { + navigate({ hash: key }).catch((reason: unknown) => { + console.error(reason); + }); + }; + + useEffect(() => { + if (location.hash) { + const tabsIncludeHash = tabItems.find( + (item) => item.key === location.hash, + ); + + if (!tabsIncludeHash) { + handleNav("manage"); + } + } + }, [location.hash, tabItems, navigate]); + return ( - + { + handleNav(key); + }} + /> ); }; diff --git a/ui/src/pages/DataSources/VisualizationTab/DataSourceVisualization.tsx b/ui/src/pages/DataSources/VisualizationTab/DataSourceVisualization.tsx new file mode 100644 index 00000000..723c1b19 --- /dev/null +++ b/ui/src/pages/DataSources/VisualizationTab/DataSourceVisualization.tsx @@ -0,0 +1,127 @@ +/******************************************************************************* + * CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) + * (C) Cloudera, Inc. 2024 + * All rights reserved. + * + * Applicable Open Source License: Apache 2.0 + * + * NOTE: Cloudera open source products are modular software products + * made up of hundreds of individual components, each of which was + * individually copyrighted. Each Cloudera open source product is a + * collective work under U.S. Copyright Law. Your license to use the + * collective work is as provided in your written agreement with + * Cloudera. Used apart from the collective work, this file is + * licensed for your use pursuant to the open source license + * identified above. + * + * This code is provided to you pursuant a written agreement with + * (i) Cloudera, Inc. or (ii) a third-party authorized to distribute + * this code. If you do not have a written agreement with Cloudera nor + * with an authorized and properly licensed third party, you do not + * have any rights to access nor to use this code. + * + * Absent a written agreement with Cloudera, Inc. ("Cloudera") to the + * contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY + * KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED + * WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO + * IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, + * AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS + * ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE + * OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR + * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES + * RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF + * BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF + * DATA. + ******************************************************************************/ + +import { useParams } from "@tanstack/react-router"; +import { useEffect, useState } from "react"; +import { + getVisualizeDataSource, + Point2d, + useVisualizeDataSourceWithUserQuery, +} from "src/api/dataSourceApi.ts"; +import { useQuery } from "@tanstack/react-query"; +import messageQueue from "src/utils/messageQueue.ts"; +import { Flex, Input, Tooltip, Typography } from "antd"; +import VectorGraph from "pages/DataSources/VisualizationTab/VectorGraph.tsx"; +import { QuestionCircleOutlined } from "@ant-design/icons"; + +const DataSourceVisualization = () => { + const dataSourceId = useParams({ + from: "/_layout/data/_layout-datasources/$dataSourceId", + }).dataSourceId; + const [userInput, setUserInput] = useState(""); + const [vectorData, setVectorData] = useState([]); + + const { data, isPending } = useQuery(getVisualizeDataSource(dataSourceId)); + + useEffect(() => { + if (data) { + setVectorData(data); + } + }, [data]); + + const questionMutation = useVisualizeDataSourceWithUserQuery({ + onSuccess: (result) => { + setVectorData(result); + }, + onError: (res: Error) => { + messageQueue.error(res.toString()); + }, + }); + + const handleQuestion = (question: string) => { + questionMutation.mutate({ + userQuery: question, + dataSourceId: dataSourceId.toString(), + }); + }; + const loading = + isPending || questionMutation.isPending || vectorData.length === 0; + + return ( + + + + 2d Chunk Vector Projection{" "} + + + + + + + + + { + setUserInput(e.target.value); + }} + onKeyDown={(e) => { + if (e.key === "Enter") { + handleQuestion(userInput); + } + }} + /> + + ); +}; + +export default DataSourceVisualization; diff --git a/ui/src/pages/DataSources/VisualizationTab/VectorGraph.tsx b/ui/src/pages/DataSources/VisualizationTab/VectorGraph.tsx new file mode 100644 index 00000000..559a08ce --- /dev/null +++ b/ui/src/pages/DataSources/VisualizationTab/VectorGraph.tsx @@ -0,0 +1,149 @@ +/******************************************************************************* + * CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) + * (C) Cloudera, Inc. 2024 + * All rights reserved. + * + * Applicable Open Source License: Apache 2.0 + * + * NOTE: Cloudera open source products are modular software products + * made up of hundreds of individual components, each of which was + * individually copyrighted. Each Cloudera open source product is a + * collective work under U.S. Copyright Law. Your license to use the + * collective work is as provided in your written agreement with + * Cloudera. Used apart from the collective work, this file is + * licensed for your use pursuant to the open source license + * identified above. + * + * This code is provided to you pursuant a written agreement with + * (i) Cloudera, Inc. or (ii) a third-party authorized to distribute + * this code. If you do not have a written agreement with Cloudera nor + * with an authorized and properly licensed third party, you do not + * have any rights to access nor to use this code. + * + * Absent a written agreement with Cloudera, Inc. ("Cloudera") to the + * contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY + * KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED + * WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO + * IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, + * AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS + * ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE + * OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR + * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES + * RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF + * BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF + * DATA. + ******************************************************************************/ + +import { Point2d } from "src/api/dataSourceApi.ts"; +import { Scatter } from "react-chartjs-2"; +import { ChartDataset, Point } from "chart.js"; +import { Skeleton } from "antd"; + +type DataSets = ChartDataset<"scatter", (number | Point | null)[]>[]; + +const colors = [ + "rgba(255, 99, 132)", + "rgba(54, 162, 235)", + "rgba(255, 206, 86)", + "rgba(75, 192, 192)", + "rgba(153, 102, 255)", + "rgba(255, 159, 64)", + "rgba(199, 199, 199)", + "rgba(83, 102, 255)", + "rgba(255, 99, 255)", + "rgba(99, 255, 132)", + "rgba(255, 99, 71)", + "rgba(60, 179, 113)", + "rgba(123, 104, 238)", + "rgba(255, 215, 0)", + "rgba(0, 191, 255)", + "rgba(255, 69, 0)", + "rgba(138, 43, 226)", + "rgba(0, 255, 127)", + "rgba(70, 130, 180)", + "rgba(255, 20, 147)", +]; + +const hashStringToIndex = (str: string): number => { + let hash = 0; + for (let i = 0; i < str.length; i++) { + hash = (hash << 5) - hash + str.charCodeAt(i); + hash |= 0; // Convert to 32bit integer + } + return Math.abs(hash % colors.length); +}; + +const VectorGraph = ({ + rawData, + userInput, + loading, +}: { + rawData: Point2d[]; + userInput: string; + loading: boolean; +}) => { + const points: Record = {}; + + rawData.forEach((d: Point2d) => { + if (d[1] in points) { + points[d[1]].push({ x: d[0][0], y: d[0][1] }); + } else { + points[d[1]] = [{ x: d[0][0], y: d[0][1] }]; + } + }); + + const pickColor = (label: string) => colors[hashStringToIndex(label)]; + + const vizDatasets: DataSets = Object.entries(points).map( + ([label, points]) => { + const userQuery = label === "USER_QUERY"; + const color = pickColor(label); + return { + label: userQuery ? `Query: ${userInput}` : label, + data: points, + backgroundColor: userQuery ? "lightgray" : color, + borderColor: userQuery ? "black" : color, + borderWidth: 1, + pointStyle: userQuery ? "circle" : "circle", + pointRadius: userQuery ? 15 : 3, + pointHoverRadius: userQuery ? 15 : 8, + pointHoverBackgroundColor: "black", + }; + }, + ); + + if (loading) { + return ; + } + + return ( + "", + }, + }, + }, + interaction: { mode: "dataset" }, + }} + /> + ); +}; + +export default VectorGraph; diff --git a/ui/src/pages/RagChatTab/ChatOutput/Sources/MetaData.tsx b/ui/src/pages/RagChatTab/ChatOutput/Sources/MetaData.tsx new file mode 100644 index 00000000..04917038 --- /dev/null +++ b/ui/src/pages/RagChatTab/ChatOutput/Sources/MetaData.tsx @@ -0,0 +1,81 @@ +/******************************************************************************* + * CLOUDERA APPLIED MACHINE LEARNING PROTOTYPE (AMP) + * (C) Cloudera, Inc. 2024 + * All rights reserved. + * + * Applicable Open Source License: Apache 2.0 + * + * NOTE: Cloudera open source products are modular software products + * made up of hundreds of individual components, each of which was + * individually copyrighted. Each Cloudera open source product is a + * collective work under U.S. Copyright Law. Your license to use the + * collective work is as provided in your written agreement with + * Cloudera. Used apart from the collective work, this file is + * licensed for your use pursuant to the open source license + * identified above. + * + * This code is provided to you pursuant a written agreement with + * (i) Cloudera, Inc. or (ii) a third-party authorized to distribute + * this code. If you do not have a written agreement with Cloudera nor + * with an authorized and properly licensed third party, you do not + * have any rights to access nor to use this code. + * + * Absent a written agreement with Cloudera, Inc. ("Cloudera") to the + * contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY + * KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED + * WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO + * IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU, + * AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS + * ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE + * OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR + * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES + * RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF + * BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF + * DATA. + ******************************************************************************/ +import { Typography } from "antd"; +import { ChunkContentsResponse } from "src/api/ragQueryApi.ts"; + +const MetaData = ({ + metadata, +}: { + metadata: ChunkContentsResponse["metadata"]; +}) => { + const MetaDataItem = ({ + label, + value, + }: { + label: string; + value: string | number | undefined; + }) => ( + <> + {value && ( + + {label}: {value} + + )} + + ); + + const hasMetadata = metadata.row_number ?? metadata.page_label; + + return ( + <> + + Metadata + + {hasMetadata ? ( + <> + + + + ) : ( + N/A + )} + + ); +}; + +export default MetaData; diff --git a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceCard.tsx b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceCard.tsx index 8da7b0f7..74b83a49 100644 --- a/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceCard.tsx +++ b/ui/src/pages/RagChatTab/ChatOutput/Sources/SourceCard.tsx @@ -54,6 +54,7 @@ import { useGetChunkContents } from "src/api/ragQueryApi.ts"; import { useGetDocumentSummary } from "src/api/summaryApi.ts"; import DocumentationIcon from "src/cuix/icons/DocumentationIcon"; import { cdlGray600 } from "src/cuix/variables.ts"; +import MetaData from "pages/RagChatTab/ChatOutput/Sources/MetaData.tsx"; export const SourceCard = ({ source }: { source: SourceNode }) => { const { dataSourceId } = useContext(RagChatContext); @@ -99,7 +100,7 @@ export const SourceCard = ({ source }: { source: SourceNode }) => { - Generated document summary: + Generated document summary {documentSummary.data ?? "No summary available"} @@ -132,19 +133,7 @@ export const SourceCard = ({ source }: { source: SourceNode }) => { > {chunkContents.data.text} - - Metadata - - {chunkContents.data.metadata.row_number && ( - - Row number: {chunkContents.data.metadata.row_number} - - )} - {chunkContents.data.metadata.page_label && ( - - Page label: {chunkContents.data.metadata.page_label} - - )} + ) )} diff --git a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx index d2e6cb1e..901b8b1a 100644 --- a/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx +++ b/ui/src/pages/RagChatTab/FooterComponents/RagChatQueryInput.tsx @@ -135,7 +135,6 @@ const RagChatQueryInput = () => { checkedChildren={} value={!queryConfiguration.exclude_knowledge_base} onChange={handleExcludeKnowledgeBase} - style={{ display: "none" }} // note: disabled for now, until UX is ready /> }