-
Notifications
You must be signed in to change notification settings - Fork 143
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into embedding_reorg
- Loading branch information
Showing
35 changed files
with
1,580 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
__pycache__ | ||
*.egg-info/ | ||
.DS_Store | ||
.idea/ | ||
.venv/ | ||
build/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from abc import ABC, abstractmethod | ||
|
||
|
||
class OpeaComponent(ABC):
    """Base class for all components in GenAIComps.

    It provides a unified interface and foundational attributes that every
    derived component inherits and extends. Subclasses must implement
    ``check_health`` and ``invoke``; the class cannot be instantiated directly.

    Attributes:
        name (str): The name of the component.
        type (str): The type of the component (e.g., 'retriever', 'embedding', 'reranking', 'llm', etc.).
        description (str): A brief description of the component's functionality.
        config (dict): A dictionary containing configuration parameters for the component.
    """

    # NOTE: the ``type`` parameter shadows the builtin inside ``__init__`` only;
    # the name is kept because callers may pass it as a keyword argument.
    def __init__(self, name: str, type: str, description: str, config: dict = None):
        """Initialize an OpeaComponent instance with the provided attributes.

        Args:
            name (str): The name of the component.
            type (str): The type of the component.
            description (str): A brief description of the component.
            config (dict, optional): Configuration parameters for the component.
                Defaults to an empty dictionary.
        """
        self.name = name
        self.type = type
        self.description = description
        # A fresh dict per instance avoids sharing one mutable default
        # between components. A dict passed by the caller is stored as-is.
        self.config = config if config is not None else {}

    def get_meta(self) -> dict:
        """Retrieve metadata about the component.

        Returns:
            dict: The component's name, type, description, and configuration.
                The ``config`` entry is the component's own dictionary, not a copy.
        """
        return {
            "name": self.name,
            "type": self.type,
            "description": self.description,
            "config": self.config,
        }

    def update_config(self, key: str, value):
        """Update a configuration parameter for the component.

        Args:
            key (str): The configuration parameter's key.
            value: The new value for the configuration parameter.
        """
        self.config[key] = value

    @abstractmethod
    def check_health(self) -> bool:
        """Check the health of the component.

        Returns:
            bool: True if the component is healthy, False otherwise.
        """
        pass

    @abstractmethod
    def invoke(self, *args, **kwargs):
        """Invoke service accessing using the component.

        Args:
            *args: Positional arguments.
            **kwargs: Keyword arguments.

        Returns:
            Any: The result of the service accessing.
        """
        pass

    def __repr__(self):
        """Provide a string representation of the component for debugging and logging.

        Returns:
            str: A string representation of the OpeaComponent instance.
        """
        return f"OpeaComponent(name={self.name}, type={self.type}, description={self.description})"
|
||
|
||
class OpeaComponentController(ABC):
    """Base class for managing multiple components of the same type.

    It provides a unified interface for routing tasks, registering components,
    and dynamically discovering available components.

    Attributes:
        components (dict): Registered components keyed by their ``name``.
        active_component: The component that ``invoke`` delegates to, or None
            until ``discover_and_activate`` has selected one.
    """

    def __init__(self):
        """Initialize the controller with an empty component registry."""
        self.components = {}
        self.active_component = None

    def register(self, component):
        """Register an OpeaComponent instance with the controller.

        Args:
            component (OpeaComponent): An instance of a subclass of OpeaComponent to be managed.

        Raises:
            ValueError: If a component with the same name is already registered.
        """
        if component.name in self.components:
            raise ValueError(f"Component '{component.name}' is already registered.")
        self.components[component.name] = component

    def discover_and_activate(self):
        """Discover healthy components and activate one.

        Components are probed in registration order (dict insertion order), so
        when several are healthy the first registered one is activated.

        Raises:
            RuntimeError: If no registered component reports itself healthy.
        """
        for component in self.components.values():
            if component.check_health():
                self.active_component = component
                print(f"Activated component: {component.name}")
                return
        raise RuntimeError("No healthy components available.")

    def invoke(self, *args, **kwargs):
        """Invoke service accessing using the active component.

        Args:
            *args: Positional arguments.
            **kwargs: Keyword arguments.

        Returns:
            Any: The result of the service accessing.

        Raises:
            RuntimeError: If no active component is set.
        """
        # Compare against None explicitly: a component that happens to be
        # falsy (e.g. defines __bool__ or __len__) is still a valid target.
        if self.active_component is None:
            raise RuntimeError("No active component. Call 'discover_and_activate' first.")
        return self.active_component.invoke(*args, **kwargs)

    def list_components(self):
        """List all registered components.

        Returns:
            list: A new list of the component names that are currently
                registered, in registration order.
        """
        # Return a real list (as documented) rather than a live dict view, so
        # the result is a stable snapshot and prints cleanly in __repr__.
        return list(self.components)

    def __repr__(self):
        """Provide a string representation of the controller and its registered components.

        Returns:
            str: A string representation of the OpeaComponentController instance.
        """
        return f"OpeaComponentController(registered_components={self.list_components()})"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# Image for the Elasticsearch dataprep microservice (LangChain variant).
FROM python:3.11-slim

ENV LANG=C.UTF-8

# Build target; "cpu" pulls the CPU-only PyTorch wheels below.
ARG ARCH="cpu"

# System packages needed by the Python requirements. The apt package lists are
# removed in the same layer so they do not end up in the final image.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    default-jre \
    libgl1-mesa-glx \
    libjemalloc-dev && \
    rm -rf /var/lib/apt/lists/*

# Run the service as an unprivileged user.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

# ARCH is quoted so that an empty build argument does not break the test expression.
RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ "${ARCH}" = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/elasticsearch/langchain/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

# Switch to root only to create the upload directory and hand it to "user".
USER root

RUN mkdir -p /home/user/comps/dataprep/elasticsearch/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/elasticsearch/langchain/uploaded_files

USER user

WORKDIR /home/user/comps/dataprep/elasticsearch/langchain

ENTRYPOINT ["python", "prepare_doc_elasticsearch.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
# Dataprep Microservice with Elasticsearch | ||
|
||
## 🚀1. Start Microservice with Python (Option 1) | ||
|
||
### 1.1 Install Requirements | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
### 1.2 Setup Environment Variables | ||
|
||
```bash | ||
export ES_CONNECTION_STRING=http://localhost:9200 | ||
export INDEX_NAME=${your_index_name} | ||
``` | ||
|
||
### 1.3 Start Elasticsearch | ||
|
||
Please refer to this [readme](../../../vectorstores/elasticsearch/README.md). | ||
|
||
### 1.4 Start Document Preparation Microservice for Elasticsearch with Python Script | ||
|
||
Start the document preparation microservice for Elasticsearch with the command below. | ||
|
||
```bash | ||
python prepare_doc_elasticsearch.py | ||
``` | ||
|
||
## 🚀2. Start Microservice with Docker (Option 2) | ||
|
||
### 2.1 Start Elasticsearch | ||
|
||
Please refer to this [readme](../../../vectorstores/elasticsearch/README.md). | ||
|
||
### 2.2 Setup Environment Variables | ||
|
||
```bash | ||
export ES_CONNECTION_STRING=http://localhost:9200 | ||
export INDEX_NAME=${your_index_name} | ||
``` | ||
|
||
### 2.3 Build Docker Image | ||
|
||
```bash | ||
cd GenAIComps | ||
docker build -t opea/dataprep-elasticsearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/elasticsearch/langchain/Dockerfile . | ||
``` | ||
|
||
### 2.4 Run Docker with CLI (Option A) | ||
|
||
```bash | ||
docker run --name="dataprep-elasticsearch" -p 6011:6011 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ES_CONNECTION_STRING=$ES_CONNECTION_STRING -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT opea/dataprep-elasticsearch:latest | ||
``` | ||
|
||
### 2.5 Run with Docker Compose (Option B) | ||
|
||
```bash | ||
cd comps/dataprep/elasticsearch/langchain | ||
docker compose -f docker-compose-dataprep-elastic.yaml up -d | ||
``` | ||
|
||
## 🚀3. Consume Microservice | ||
|
||
### 3.1 Consume Upload API | ||
|
||
Once the document preparation microservice for Elasticsearch is started, use the command below to invoke the | ||
microservice, which converts the document to embeddings and saves them to the database. | ||
|
||
```bash | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"path":"/path/to/document"}' \ | ||
http://localhost:6011/v1/dataprep | ||
``` | ||
|
||
### 3.2 Consume get_file API | ||
|
||
To get uploaded file structures, use the following command: | ||
|
||
```bash | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
http://localhost:6011/v1/dataprep/get_file | ||
``` | ||
|
||
Then you will get the response JSON like this: | ||
|
||
```json | ||
[ | ||
{ | ||
"name": "uploaded_file_1.txt", | ||
"id": "uploaded_file_1.txt", | ||
"type": "File", | ||
"parent": "" | ||
}, | ||
{ | ||
"name": "uploaded_file_2.txt", | ||
"id": "uploaded_file_2.txt", | ||
"type": "File", | ||
"parent": "" | ||
} | ||
] | ||
``` | ||
|
||
### 3.3 Consume delete_file API | ||
|
||
To delete uploaded file/link, use the following command. | ||
|
||
The `file_path` here should be the `id` returned by the `/v1/dataprep/get_file` API. | ||
|
||
```bash | ||
# delete link | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"file_path": "https://www.ces.tech/.txt"}' \ | ||
http://localhost:6011/v1/dataprep/delete_file | ||
|
||
# delete file | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"file_path": "uploaded_file_1.txt"}' \ | ||
http://localhost:6011/v1/dataprep/delete_file | ||
|
||
# delete all files and links | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"file_path": "all"}' \ | ||
http://localhost:6011/v1/dataprep/delete_file | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 |
Oops, something went wrong.