Skip to content

Commit

Permalink
Merge branch 'main' into embedding_reorg
Browse files Browse the repository at this point in the history
  • Loading branch information
ZePan110 authored Dec 10, 2024
2 parents cd4cf5f + c409ef9 commit 88b6578
Show file tree
Hide file tree
Showing 35 changed files with 1,580 additions and 19 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/dataprep-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,7 @@ services:
build:
dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile
image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
dataprep-elasticsearch:
build:
dockerfile: comps/dataprep/elasticsearch/langchain/Dockerfile
image: ${REGISTRY:-opea}/dataprep-elasticsearch:${TAG:-latest}
4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/retrievers-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,7 @@ services:
build:
dockerfile: comps/retrievers/neo4j/llama_index/Dockerfile
image: ${REGISTRY:-opea}/retriever-neo4j-llamaindex:${TAG:-latest}
retriever-elasticsearch:
build:
dockerfile: comps/retrievers/elasticsearch/langchain/Dockerfile
image: ${REGISTRY:-opea}/retriever-elasticsearch:${TAG:-latest}
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
__pycache__
*.egg-info/
.DS_Store
.idea/
.venv/
build/
3 changes: 3 additions & 0 deletions comps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@
# Telemetry
from comps.cores.telemetry.opea_telemetry import opea_telemetry

# Common
from comps.cores.common.component import OpeaComponent, OpeaComponentController

# Statistics
from comps.cores.mega.base_statistics import statistics_dict, register_statistics

Expand Down
2 changes: 2 additions & 0 deletions comps/cores/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
155 changes: 155 additions & 0 deletions comps/cores/common/component.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod


class OpeaComponent(ABC):
    """Base class for all components in GenAIComps.

    It provides a unified interface and the foundational attributes that every
    derived component inherits and extends. Subclasses must implement
    ``check_health`` and ``invoke``.

    Attributes:
        name (str): The name of the component.
        type (str): The type of the component (e.g., 'retriever', 'embedding', 'reranking', 'llm', etc.).
        description (str): A brief description of the component's functionality.
        config (dict): A dictionary containing configuration parameters for the component.
    """

    def __init__(self, name: str, type: str, description: str, config: dict = None):
        """Initializes an OpeaComponent instance with the provided attributes.

        Args:
            name (str): The name of the component.
            type (str): The type of the component.
            description (str): A brief description of the component.
            config (dict, optional): Configuration parameters for the component.
                A shallow copy is stored, so later ``update_config`` calls do not
                modify the dictionary passed in. Defaults to an empty dictionary.
        """
        self.name = name
        self.type = type
        self.description = description
        # Copy instead of aliasing: update_config() mutates self.config in place
        # and must not leak those writes into a dict the caller still owns.
        self.config = dict(config) if config is not None else {}

    def get_meta(self) -> dict:
        """Retrieves metadata about the component.

        Returns:
            dict: A dictionary with the keys ``name``, ``type``, ``description``
                and ``config``. The ``config`` value is the component's own
                configuration dictionary, not a copy.
        """
        return {
            "name": self.name,
            "type": self.type,
            "description": self.description,
            "config": self.config,
        }

    def update_config(self, key: str, value) -> None:
        """Updates a configuration parameter for the component.

        Args:
            key (str): The configuration parameter's key.
            value: The new value for the configuration parameter.
        """
        self.config[key] = value

    @abstractmethod
    def check_health(self) -> bool:
        """Checks the health of the component.

        Returns:
            bool: True if the component is healthy, False otherwise.
        """
        pass

    @abstractmethod
    def invoke(self, *args, **kwargs):
        """Invokes the service through this component.

        Args:
            *args: Positional arguments.
            **kwargs: Keyword arguments.

        Returns:
            Any: The result of the service call.
        """
        pass

    def __repr__(self) -> str:
        """Provides a string representation of the component for debugging and logging.

        Returns:
            str: The concrete class name followed by the component's name, type
                and description.
        """
        # Use the runtime class so subclasses are not all reported as "OpeaComponent".
        return f"{self.__class__.__name__}(name={self.name}, type={self.type}, description={self.description})"


class OpeaComponentController(ABC):
    """Base class for managing multiple components of the same type.

    It provides a unified interface for registering components, discovering a
    healthy one, and routing calls to the currently active component.

    Attributes:
        components (dict): Registered components keyed by their ``name``.
        active_component: The component that ``invoke`` delegates to, or None
            until ``discover_and_activate`` has selected one.
    """

    def __init__(self):
        """Initializes the controller with an empty registry and no active component."""
        self.components = {}
        self.active_component = None

    def register(self, component) -> None:
        """Registers an OpeaComponent instance to the controller.

        Args:
            component (OpeaComponent): An instance of a subclass of OpeaComponent to be managed.

        Raises:
            ValueError: If a component with the same name is already registered.
        """
        if component.name in self.components:
            raise ValueError(f"Component '{component.name}' is already registered.")
        self.components[component.name] = component

    def discover_and_activate(self) -> None:
        """Discovers healthy components and activates one.

        Components are checked in registration order and the first one whose
        ``check_health()`` returns a truthy value becomes the active component.

        Raises:
            RuntimeError: If no registered component reports healthy.
        """
        for component in self.components.values():
            if component.check_health():
                self.active_component = component
                print(f"Activated component: {component.name}")
                return
        raise RuntimeError("No healthy components available.")

    def invoke(self, *args, **kwargs):
        """Invokes the service through the active component.

        Args:
            *args: Positional arguments forwarded to the active component.
            **kwargs: Keyword arguments forwarded to the active component.

        Returns:
            Any: Whatever the active component's ``invoke`` returns.

        Raises:
            RuntimeError: If no active component is set.
        """
        # Compare against None rather than relying on truthiness: a component
        # that defines __len__/__bool__ may be falsy while still being active.
        if self.active_component is None:
            raise RuntimeError("No active component. Call 'discover_and_activate' first.")
        return self.active_component.invoke(*args, **kwargs)

    def list_components(self) -> list:
        """Lists all registered components.

        Returns:
            list: The names of the registered components, in registration order.
        """
        # Return a real list (as documented) instead of a live dict_keys view.
        return list(self.components)

    def __repr__(self) -> str:
        """Provides a string representation of the controller and its registered components.

        Returns:
            str: The concrete class name followed by the list of registered component names.
        """
        return f"{self.__class__.__name__}(registered_components={self.list_components()})"
4 changes: 2 additions & 2 deletions comps/cores/mega/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,8 @@ def generate():
)
token_start = time.time()
else:
yield chunk
token_start = self.metrics.token_update(token_start, is_first)
yield chunk
is_first = False
self.metrics.request_update(req_start)
self.metrics.pending_update(False)
Expand Down Expand Up @@ -306,7 +306,7 @@ def token_generator(self, sentence: str, token_start: float, is_first: bool, is_
suffix = "\n\n"
tokens = re.findall(r"\s?\S+\s?", sentence, re.UNICODE)
for token in tokens:
yield prefix + repr(token.replace("\\n", "\n").encode("utf-8")) + suffix
token_start = self.metrics.token_update(token_start, is_first)
yield prefix + repr(token.replace("\\n", "\n").encode("utf-8")) + suffix
if is_last:
yield "data: [DONE]\n\n"
38 changes: 38 additions & 0 deletions comps/dataprep/elasticsearch/langchain/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Base image for the Elasticsearch dataprep microservice.
FROM python:3.11-slim

ENV LANG=C.UTF-8

# Build-time switch; the default "cpu" selects the CPU-only PyTorch wheels below.
ARG ARCH="cpu"

# System packages, installed before switching to the non-root user.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
default-jre \
libgl1-mesa-glx \
libjemalloc-dev

# Create the account "user" with a home directory it owns.
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

USER user

# Copy the comps package from the build context into the user's home.
COPY comps /home/user/comps

# Upgrade pip/setuptools, install torch/torchvision from the CPU wheel index
# when ARCH is "cpu", then install this service's Python requirements.
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
pip install --no-cache-dir -r /home/user/comps/dataprep/elasticsearch/langchain/requirements.txt

# Put /home/user on the import path so the copied `comps` package is importable.
ENV PYTHONPATH=$PYTHONPATH:/home/user

# Switch back to root only to create the upload directory and give it to "user".
USER root

RUN mkdir -p /home/user/comps/dataprep/elasticsearch/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/elasticsearch/langchain/uploaded_files

USER user

WORKDIR /home/user/comps/dataprep/elasticsearch/langchain

# Start the service script located in the working directory.
ENTRYPOINT ["python", "prepare_doc_elasticsearch.py"]
130 changes: 130 additions & 0 deletions comps/dataprep/elasticsearch/langchain/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Dataprep Microservice with Elasticsearch

## 🚀1. Start Microservice with Python (Option 1)

### 1.1 Install Requirements

```bash
pip install -r requirements.txt
```

### 1.2 Setup Environment Variables

```bash
export ES_CONNECTION_STRING=http://localhost:9200
export INDEX_NAME=${your_index_name}
```

### 1.3 Start Elasticsearch

Please refer to this [readme](../../../vectorstores/elasticsearch/README.md).

### 1.4 Start Document Preparation Microservice for Elasticsearch with Python Script

Start the document preparation microservice for Elasticsearch with the command below.

```bash
python prepare_doc_elasticsearch.py
```

## 🚀2. Start Microservice with Docker (Option 2)

### 2.1 Start Elasticsearch

Please refer to this [readme](../../../vectorstores/elasticsearch/README.md).

### 2.2 Setup Environment Variables

```bash
export ES_CONNECTION_STRING=http://localhost:9200
export INDEX_NAME=${your_index_name}
```

### 2.3 Build Docker Image

```bash
cd GenAIComps
docker build -t opea/dataprep-elasticsearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/elasticsearch/langchain/Dockerfile .
```

### 2.4 Run Docker with CLI (Option A)

```bash
docker run --name="dataprep-elasticsearch" -p 6011:6011 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ES_CONNECTION_STRING=$ES_CONNECTION_STRING -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT opea/dataprep-elasticsearch:latest
```

### 2.5 Run with Docker Compose (Option B)

```bash
cd comps/dataprep/elasticsearch/langchain
docker compose -f docker-compose-dataprep-elastic.yaml up -d
```

## 🚀3. Consume Microservice

### 3.1 Consume Upload API

Once the document preparation microservice for Elasticsearch is started, use the command below to invoke the
microservice, which converts the document to embeddings and saves them to the database.

```bash
curl -X POST \
-H "Content-Type: application/json" \
-d '{"path":"/path/to/document"}' \
http://localhost:6011/v1/dataprep
```

### 3.2 Consume get_file API

To get uploaded file structures, use the following command:

```bash
curl -X POST \
-H "Content-Type: application/json" \
http://localhost:6011/v1/dataprep/get_file
```

Then you will get the response JSON like this:

```json
[
{
"name": "uploaded_file_1.txt",
"id": "uploaded_file_1.txt",
"type": "File",
"parent": ""
},
{
"name": "uploaded_file_2.txt",
"id": "uploaded_file_2.txt",
"type": "File",
"parent": ""
}
]
```

### 3.3 Consume delete_file API

To delete uploaded file/link, use the following command.

The `file_path` here should be the `id` returned by the `/v1/dataprep/get_file` API.

```bash
# delete link
curl -X POST \
-H "Content-Type: application/json" \
-d '{"file_path": "https://www.ces.tech/.txt"}' \
http://localhost:6011/v1/dataprep/delete_file

# delete file
curl -X POST \
-H "Content-Type: application/json" \
-d '{"file_path": "uploaded_file_1.txt"}' \
http://localhost:6011/v1/dataprep/delete_file

# delete all files and links
curl -X POST \
-H "Content-Type: application/json" \
-d '{"file_path": "all"}' \
http://localhost:6011/v1/dataprep/delete_file
```
2 changes: 2 additions & 0 deletions comps/dataprep/elasticsearch/langchain/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
Loading

0 comments on commit 88b6578

Please sign in to comment.