diff --git a/.github/ISSUE_TEMPLATE/1_bug_template.yml b/.github/ISSUE_TEMPLATE/1_bug_template.yml index 5dac75c52..87855979c 100644 --- a/.github/ISSUE_TEMPLATE/1_bug_template.yml +++ b/.github/ISSUE_TEMPLATE/1_bug_template.yml @@ -128,7 +128,7 @@ body: required: false - - type: file + - type: textarea id: attachments attributes: label: Attachments diff --git a/.github/ISSUE_TEMPLATE/2_feaure_template.yml b/.github/ISSUE_TEMPLATE/2_feature_template.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/2_feaure_template.yml rename to .github/ISSUE_TEMPLATE/2_feature_template.yml diff --git a/.github/workflows/_comps-workflow.yml b/.github/workflows/_comps-workflow.yml index bbb4ce12d..12db04eb7 100644 --- a/.github/workflows/_comps-workflow.yml +++ b/.github/workflows/_comps-workflow.yml @@ -63,7 +63,7 @@ jobs: git clone https://github.com/vllm-project/vllm.git vllm-openvino cd ./vllm-openvino && git checkout v0.6.1 && git rev-parse HEAD && cd ../ fi - if [[ $(grep -c "vllm-hpu:" ${docker_compose_yml}) != 0 ]]; then + if [[ $(grep -c "vllm-gaudi:" ${docker_compose_yml}) != 0 ]]; then git clone https://github.com/HabanaAI/vllm-fork.git vllm-fork cd vllm-fork && git checkout 3c39626 && cd ../ fi @@ -74,10 +74,17 @@ jobs: mode: ${{ inputs.mode }} run: | build_list=$(bash ${{ github.workspace }}/.github/workflows/scripts/get_cicd_list.sh "${mode}" ${docker_compose_path}) - echo "build_list=${build_list}" >> $GITHUB_OUTPUT + echo "${build_list}" + if [ -z "${build_list}" ]; then + echo "empty=true" >> $GITHUB_OUTPUT + echo "${{ inputs.service }} have no ${mode} part." + else + echo "empty=false" >> $GITHUB_OUTPUT + echo "build_list=${build_list}" >> $GITHUB_OUTPUT + fi - name: Build Image - if: ${{ fromJSON(inputs.build) && steps.get-yaml-path.outputs.file_exists == 'true' }} + if: ${{ fromJSON(inputs.build) && steps.get-yaml-path.outputs.file_exists == 'true' && steps.get-build-list.outputs.empty == 'false' }} uses: opea-project/validation/actions/image-build@main with: work_dir: ${{ github.workspace }} diff --git a/.github/workflows/_run-docker-compose.yml b/.github/workflows/_run-docker-compose.yml index 976923cb9..cc49f5cb9 100644 --- a/.github/workflows/_run-docker-compose.yml +++ b/.github/workflows/_run-docker-compose.yml @@ -54,9 +54,8 @@ jobs: cd ${{ github.workspace }}/tests test_cases=$(find . -type f -name "test_${service_l}*.sh") for script in $test_cases; do - echo $script - if echo "$script" | grep -q "on"; then - hardware=$(echo $script | cut -d'/' -f3 | cut -d'.' -f1 | awk -F'on_' '{print $2}') + if echo "$script" | grep -q "_on"; then + hardware=$(echo $script | cut -d'/' -f3 | cut -d'.' -f1 | awk -F'_on_' '{print $2}') else hardware="intel_cpu" fi @@ -93,14 +92,15 @@ jobs: GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }} GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} PINECONE_KEY: ${{ secrets.PINECONE_KEY }} + PREDICTIONGUARD_API_KEY: ${{ secrets.PREDICTIONGUARD_API_KEY }} service: ${{ inputs.service }} test_case: ${{ matrix.test_case }} hardware: ${{ matrix.hardware }} run: | cd ${{ github.workspace }}/tests - service=$(echo "${test_case}" | cut -d'_' -f2- |cut -d'.' -f1) + service=$(echo "${test_case}" | cut -d'/' -f3 | cut -d'_' -f2- |cut -d'.' 
-f1) echo "service=${service}" >> $GITHUB_ENV - if [ -f ${test_case} ]; then timeout 30m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi + if [ -f ${test_case} ]; then timeout 60m bash ${test_case}; else echo "Test script {${test_case}} not found, skip test!"; fi - name: Clean up container if: cancelled() || failure() diff --git a/.github/workflows/docker/compose/dataprep-compose.yaml b/.github/workflows/docker/compose/dataprep-compose.yaml index 7908e8c26..f2ef1b330 100644 --- a/.github/workflows/docker/compose/dataprep-compose.yaml +++ b/.github/workflows/docker/compose/dataprep-compose.yaml @@ -7,6 +7,10 @@ services: build: dockerfile: comps/dataprep/redis/langchain/Dockerfile image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} + dataprep-opensearch: + build: + dockerfile: comps/dataprep/opensearch/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-opensearch:${TAG:-latest} dataprep-qdrant: build: dockerfile: comps/dataprep/qdrant/langchain/Dockerfile diff --git a/.github/workflows/docker/compose/llms-compose.yaml b/.github/workflows/docker/compose/llms-compose.yaml index 91fbb46d4..984d59e9d 100644 --- a/.github/workflows/docker/compose/llms-compose.yaml +++ b/.github/workflows/docker/compose/llms-compose.yaml @@ -36,12 +36,12 @@ services: context: vllm-openvino dockerfile: Dockerfile.openvino image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest} - vllm-hpu: + vllm-gaudi: build: context: vllm-fork dockerfile: Dockerfile.hpu shm_size: '128g' - image: ${REGISTRY:-opea}/vllm-hpu:${TAG:-latest} + image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} vllm-arc: build: dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_gpu diff --git a/.github/workflows/docker/compose/lvms-compose.yaml b/.github/workflows/docker/compose/lvms-compose.yaml index e8ed56a2c..6e89db0e3 100644 --- a/.github/workflows/docker/compose/lvms-compose.yaml +++ b/.github/workflows/docker/compose/lvms-compose.yaml @@ -23,10 +23,10 @@ services: build: dockerfile: comps/lvms/llava/Dockerfile image: ${REGISTRY:-opea}/lvm-llava-svc:${TAG:-latest} - llava-hpu: + llava-gaudi: build: dockerfile: comps/lvms/llava/dependency/Dockerfile.intel_hpu - image: ${REGISTRY:-opea}/llava-hpu:${TAG:-latest} + image: ${REGISTRY:-opea}/llava-gaudi:${TAG:-latest} lvm-predictionguard: build: dockerfile: comps/lvms/predictionguard/Dockerfile diff --git a/.github/workflows/docker/compose/retrievers-compose.yaml b/.github/workflows/docker/compose/retrievers-compose.yaml index 7b89ce9bf..9d4db850e 100644 --- a/.github/workflows/docker/compose/retrievers-compose.yaml +++ b/.github/workflows/docker/compose/retrievers-compose.yaml @@ -7,6 +7,10 @@ services: build: dockerfile: comps/retrievers/redis/langchain/Dockerfile image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} + retriever-opensearch: + build: + dockerfile: comps/retrievers/opensearch/langchain/Dockerfile + image: ${REGISTRY:-opea}/retriever-opensearch:${TAG:-latest} retriever-qdrant: build: dockerfile: comps/retrievers/qdrant/haystack/Dockerfile diff --git a/.github/workflows/docker/compose/texttosql-compose.yaml b/.github/workflows/docker/compose/texttosql-compose.yaml index a059742d9..56c7b41fb 100644 --- a/.github/workflows/docker/compose/texttosql-compose.yaml +++ b/.github/workflows/docker/compose/texttosql-compose.yaml @@ -3,7 +3,7 @@ # this file should be run in the root of the repo services: - texttosql-langchain: + texttosql: build: dockerfile: comps/texttosql/langchain/Dockerfile image: 
${REGISTRY:-opea}/texttosql:${TAG:-latest} diff --git a/.github/workflows/pr-dockerfile-path-scan.yaml b/.github/workflows/pr-dockerfile-path-scan.yaml index ece4bead4..1df0dfaeb 100644 --- a/.github/workflows/pr-dockerfile-path-scan.yaml +++ b/.github/workflows/pr-dockerfile-path-scan.yaml @@ -67,12 +67,6 @@ jobs: if [ -n "$Dockerfiles" ]; then for Dockerfile in $Dockerfiles; do service=$(echo "$Dockerfile" | awk -F '/' '{print $2}') - if grep -q "$Dockerfile" ../GenAIExamples/**/*build.yaml*; then - mode="" #CI - else - mode="-cd" #CD - fi - yaml_file=${{github.workspace}}/.github/workflows/docker/compose/"$service"-compose if ! grep -q "$Dockerfile" "$yaml_file"*yaml; then echo "AR: Update $Dockerfile to .github/workflows/docker/compose/"$service"-compose.yaml. The yaml is used for release images build." diff --git a/comps/agent/langchain/README.md b/comps/agent/langchain/README.md index 2ff934f6c..585ff5d96 100644 --- a/comps/agent/langchain/README.md +++ b/comps/agent/langchain/README.md @@ -93,10 +93,10 @@ export vllm_volume=${YOUR_LOCAL_DIR_FOR_MODELS} # build vLLM image git clone https://github.com/HabanaAI/vllm-fork.git cd ./vllm-fork -docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy +docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy # TGI serving -docker run -d --runtime=habana --rm --name "comps-vllm-gaudi-service" -p 8080:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-hpu:latest --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral +docker run -d --runtime=habana --rm --name "comps-vllm-gaudi-service" -p 8080:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral # check status docker logs comps-vllm-gaudi-service diff --git a/comps/animation/wav2lip/dependency/entrypoint.sh b/comps/animation/wav2lip/dependency/entrypoint.sh index 1a2270357..1004b3594 100644 --- a/comps/animation/wav2lip/dependency/entrypoint.sh +++ b/comps/animation/wav2lip/dependency/entrypoint.sh @@ -11,11 +11,11 @@ else fi # Download model weights -wget https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth -O Wav2Lip/face_detection/detection/sfd/s3fd.pth +wget --no-verbose https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth -O Wav2Lip/face_detection/detection/sfd/s3fd.pth mkdir -p Wav2Lip/checkpoints -wget "https://iiitaphyd-my.sharepoint.com/:f:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?download=1" -O Wav2Lip/checkpoints/wav2lip.pth -wget "https://iiitaphyd-my.sharepoint.com/:f:/g/personal/radrabha_m_research_iiit_ac_in/EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA?download=1" -O Wav2Lip/checkpoints/wav2lip_gan.pth -wget 
https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth -P gfpgan/experiments/pretrained_models +wget --no-verbose "https://iiitaphyd-my.sharepoint.com/:f:/g/personal/radrabha_m_research_iiit_ac_in/Eb3LEzbfuKlJiR600lQWRxgBIY27JZg80f7V9jtMfbNDaQ?download=1" -O Wav2Lip/checkpoints/wav2lip.pth +wget --no-verbose "https://iiitaphyd-my.sharepoint.com/:f:/g/personal/radrabha_m_research_iiit_ac_in/EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA?download=1" -O Wav2Lip/checkpoints/wav2lip_gan.pth +wget --no-verbose https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth -P gfpgan/experiments/pretrained_models echo "Face Detector, Wav2Lip, GFPGAN weights downloaded." # Environment variables diff --git a/comps/cores/mega/gateway.py b/comps/cores/mega/gateway.py index 1aa1a0e1a..29642eea5 100644 --- a/comps/cores/mega/gateway.py +++ b/comps/cores/mega/gateway.py @@ -220,7 +220,11 @@ async def handle_request(self, request: Request): repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, chat_template=chat_request.chat_template if chat_request.chat_template else None, - model=chat_request.model if chat_request.model else None, + model=( + chat_request.model + if chat_request.model + else os.getenv("MODEL_ID") if os.getenv("MODEL_ID") else "Intel/neural-chat-7b-v3-3" + ), ) retriever_parameters = RetrieverParms( search_type=chat_request.search_type if chat_request.search_type else "similarity", @@ -769,7 +773,7 @@ def __init__(self, megaservice, host="0.0.0.0", port=8889): host, port, str(MegaServiceEndpoint.RETRIEVALTOOL), - Union[TextDoc, EmbeddingRequest, ChatCompletionRequest], + Union[TextDoc, ChatCompletionRequest], Union[RerankedDoc, LLMParamsDoc], ) @@ -785,7 +789,7 @@ def parser_input(data, TypeClass, key): data = await request.json() query = None - for key, TypeClass in zip(["text", "input", "messages"], [TextDoc, EmbeddingRequest, ChatCompletionRequest]): + for key, TypeClass in zip(["text", "messages"], [TextDoc, ChatCompletionRequest]): query, chat_request = parser_input(data, TypeClass, key) if query is not None: break diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index d8d469ffb..75cab6df5 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -155,7 +155,7 @@ class ChatCompletionRequest(BaseModel): List[Dict[str, str]], List[Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]], ] - model: Optional[str] = "Intel/neural-chat-7b-v3-3" + model: Optional[str] = None frequency_penalty: Optional[float] = 0.0 logit_bias: Optional[Dict[str, float]] = None logprobs: Optional[bool] = False diff --git a/comps/dataprep/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/milvus/langchain/prepare_doc_milvus.py index 3def86f81..a6014b621 100644 --- a/comps/dataprep/milvus/langchain/prepare_doc_milvus.py +++ b/comps/dataprep/milvus/langchain/prepare_doc_milvus.py @@ -30,7 +30,7 @@ encode_filename, get_separators, get_tables_result, - parse_html, + parse_html_new, remove_folder_with_ignore, save_content_to_local_disk, ) @@ -39,17 +39,16 @@ logflag = os.getenv("LOGFLAG", False) # workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py -# from utils import document_loader, get_tables_result, parse_html index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}} partition_field_name = "filename" upload_folder = "./uploaded_files/" +milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}" class 
MosecEmbeddings(OpenAIEmbeddings): def _get_len_safe_embeddings( self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None ) -> List[List[float]]: - _chunk_size = chunk_size or self.chunk_size batched_embeddings: List[List[float]] = [] response = self.client.create(input=texts, **self._invocation_params) if not isinstance(response, dict): @@ -93,7 +92,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List): batch_docs, embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": milvus_uri}, partition_key_field=partition_field_name, ) except Exception as e: @@ -211,7 +210,7 @@ async def ingest_documents( my_milvus = Milvus( embedding_function=embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": milvus_uri}, index_params=index_params, auto_id=True, ) @@ -318,7 +317,7 @@ async def ingest_documents( ) save_path = upload_folder + encoded_link + ".txt" - content = parse_html([link])[0][0] + content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap) await save_content_to_local_disk(save_path, content) ingest_data_to_milvus( DocPath( @@ -347,7 +346,7 @@ async def rag_get_file_structure(): my_milvus = Milvus( embedding_function=embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": milvus_uri}, index_params=index_params, auto_id=True, ) @@ -405,7 +404,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)): my_milvus = Milvus( embedding_function=embeddings, collection_name=COLLECTION_NAME, - connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}, + connection_args={"uri": milvus_uri}, index_params=index_params, auto_id=True, ) diff --git a/comps/dataprep/milvus/langchain/requirements.txt b/comps/dataprep/milvus/langchain/requirements.txt index 85ba3e972..611c95a15 100644 --- a/comps/dataprep/milvus/langchain/requirements.txt +++ b/comps/dataprep/milvus/langchain/requirements.txt @@ -4,6 +4,7 @@ docarray[full] docx2txt easyocr fastapi +html2text huggingface_hub langchain langchain-community diff --git a/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py b/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py index 198b61048..a7ece023f 100644 --- a/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py +++ b/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py @@ -48,7 +48,7 @@ encode_filename, get_separators, get_tables_result, - parse_html, + parse_html_new, save_content_to_local_disk, ) @@ -654,7 +654,7 @@ async def ingest_documents( for link in link_list: encoded_link = encode_filename(link) save_path = upload_folder + encoded_link + ".txt" - content = parse_html([link])[0][0] + content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap) try: await save_content_to_local_disk(save_path, content) index = ingest_data_to_neo4j( diff --git a/comps/dataprep/neo4j/llama_index/requirements.txt b/comps/dataprep/neo4j/llama_index/requirements.txt index fc5f7b8d6..c183ecf3d 100644 --- a/comps/dataprep/neo4j/llama_index/requirements.txt +++ b/comps/dataprep/neo4j/llama_index/requirements.txt @@ -6,6 +6,7 @@ easyocr fastapi future graspologic +html2text huggingface_hub ipython langchain diff --git a/comps/dataprep/opensearch/README.md b/comps/dataprep/opensearch/README.md new file mode 100644 index 000000000..a4067b7ea --- /dev/null +++ b/comps/dataprep/opensearch/README.md @@ 
-0,0 +1,253 @@
+# Dataprep Microservice with OpenSearch
+
+The dataprep microservice for text input provided here is implemented with the `Langchain` framework.
+
+## 🚀1. Start Microservice with Python (Option 1)
+
+### 1.1 Install Requirements
+
+- Option 1: Install the single-process version (for processing up to 10 files)
+
+```bash
+apt update
+apt install default-jre tesseract-ocr libtesseract-dev poppler-utils -y
+# for langchain
+cd langchain
+pip install -r requirements.txt
+```
+
+### 1.2 Start OpenSearch Stack Server
+
+Please refer to this [readme](../../vectorstores/opensearch/README.md).
+
+### 1.3 Setup Environment Variables
+
+```bash
+export your_ip=$(hostname -I | awk '{print $1}')
+export OPENSEARCH_URL="http://${your_ip}:9200"
+export INDEX_NAME=${your_index_name}
+export PYTHONPATH=${path_to_comps}
+```
+
+### 1.4 Start Embedding Service
+
+First, you need to start a TEI service.
+
+```bash
+your_port=6006
+model="BAAI/bge-base-en-v1.5"
+docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
+```
+
+Then test your TEI service using the following command:
+
+```bash
+curl localhost:$your_port/embed \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?"}' \
+    -H 'Content-Type: application/json'
+```
+
+After checking that it works, set up environment variables.
+
+```bash
+export TEI_ENDPOINT="http://localhost:$your_port"
+```
+
+### 1.5 Start Document Preparation Microservice for OpenSearch with Python Script
+
+Start the document preparation microservice for OpenSearch with the command below.
+
+- Option 1: Start the single-process version (for processing up to 10 files)
+
+```bash
+cd langchain
+python prepare_doc_opensearch.py
+```
+
+## 🚀2. Start Microservice with Docker (Option 2)
+
+### 2.1 Start OpenSearch Stack Server
+
+Please refer to this [readme](../../vectorstores/opensearch/README.md).
+
+### 2.2 Setup Environment Variables
+
+```bash
+export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
+export TEI_ENDPOINT="http://${your_ip}:6006"
+export OPENSEARCH_URL="http://${your_ip}:9200"
+export INDEX_NAME=${your_index_name}
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+```
+
+### 2.3 Build Docker Image
+
+- Build the docker image with langchain
+
+- Option 1: Build the single-process version (for processing up to 10 files)
+
+```bash
+cd ../../
+docker build -t opea/dataprep-opensearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/opensearch/langchain/Dockerfile .
+```
+
+### 2.4 Run Docker with CLI (Option A)
+
+- Option 1: Start the single-process version (for processing up to 10 files)
+
+```bash
+docker run -d --name="dataprep-opensearch-server" -p 6007:6007 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e OPENSEARCH_URL=$OPENSEARCH_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/dataprep-opensearch:latest
+```
+
+### 2.5 Run with Docker Compose (Option B - deprecated, will move to GenAIExamples in the future)
+
+```bash
+# for langchain
+cd comps/dataprep/opensearch/langchain
+# common command
+docker compose -f docker-compose-dataprep-opensearch.yaml up -d
+```
+
+## 🚀3. Check Microservice Status
+
+```bash
+docker container logs -f dataprep-opensearch-server
+```
+
+## 🚀4. Consume Microservice
+
+### 4.1 Consume Upload API
+
+Once the document preparation microservice for OpenSearch is started, users can use the command below to invoke the microservice, which converts a document into embeddings and saves them to the database.
+
+Make sure the file path after `files=@` is correct.
+
+- Single file upload
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    http://localhost:6007/v1/dataprep
+```
+
+You can specify chunk_size and chunk_overlap by the following command.
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    -F "chunk_size=1500" \
+    -F "chunk_overlap=100" \
+    http://localhost:6007/v1/dataprep
+```
+
+We support table extraction from PDF documents. You can specify process_table and table_strategy by the following command. "table_strategy" refers to the strategy used to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
+
+Note: If you specify "table_strategy=llm", you should first start a TGI service; please refer to sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./your_file.pdf" \
+    -F "process_table=true" \
+    -F "table_strategy=hq" \
+    http://localhost:6007/v1/dataprep
+```
+
+- Multiple file upload
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./file1.txt" \
+    -F "files=@./file2.txt" \
+    -F "files=@./file3.txt" \
+    http://localhost:6007/v1/dataprep
+```
+
+- Links upload (not supported for llama_index now)
+
+```bash
+curl -X POST \
+    -F 'link_list=["https://www.ces.tech/"]' \
+    http://localhost:6007/v1/dataprep
+```
+
+or
+
+```python
+import requests
+import json
+
+proxies = {"http": ""}
+url = "http://localhost:6007/v1/dataprep"
+urls = [
+    "https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4"
+]
+payload = {"link_list": json.dumps(urls)}
+
+try:
+    resp = requests.post(url=url, data=payload, proxies=proxies)
+    print(resp.text)
+    resp.raise_for_status()  # Raise an exception for unsuccessful HTTP status codes
+    print("Request successful!")
+except requests.exceptions.RequestException as e:
+    print("An error occurred:", e)
+```
+
+### 4.2 Consume get_file API
+
+To get the structure of uploaded files, use the following command:
+
+```bash
+curl -X POST \
+    -H "Content-Type: application/json" \
+    http://localhost:6007/v1/dataprep/get_file
+```
+
+Then you will get a response JSON like this:
+
+```json
+[
+  {
+    "name": "uploaded_file_1.txt",
+    "id": "uploaded_file_1.txt",
+    "type": "File",
+    "parent": ""
+  },
+  {
+    "name": "uploaded_file_2.txt",
+    "id": "uploaded_file_2.txt",
+    "type": "File",
+    "parent": ""
+  }
+]
+```
+
+### 4.3 Consume delete_file API
+
+To delete an uploaded file or link, use the following command.
+
+The `file_path` here should be the `id` obtained from the `/v1/dataprep/get_file` API.
+ +```bash +# delete link +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "https://www.ces.tech/.txt"}' \ + http://localhost:6007/v1/dataprep/delete_file + +# delete file +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "uploaded_file_1.txt"}' \ + http://localhost:6007/v1/dataprep/delete_file + +# delete all files and links +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ + http://localhost:6007/v1/dataprep/delete_file +``` diff --git a/comps/dataprep/opensearch/langchain/Dockerfile b/comps/dataprep/opensearch/langchain/Dockerfile new file mode 100644 index 000000000..f29a753bc --- /dev/null +++ b/comps/dataprep/opensearch/langchain/Dockerfile @@ -0,0 +1,42 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev \ + libreoffice \ + poppler-utils \ + tesseract-ocr + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/opensearch/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/opensearch/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/opensearch/langchain/uploaded_files + +USER user + +WORKDIR /home/user/comps/dataprep/opensearch/langchain + +ENTRYPOINT ["python", "prepare_doc_opensearch.py"] + diff --git a/comps/dataprep/opensearch/langchain/__init__.py b/comps/dataprep/opensearch/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/opensearch/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/opensearch/langchain/config.py b/comps/dataprep/opensearch/langchain/config.py new file mode 100644 index 000000000..49e9e65a5 --- /dev/null +++ b/comps/dataprep/opensearch/langchain/config.py @@ -0,0 +1,60 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# OpenSearch Connection Information +OPENSEARCH_HOST = os.getenv("OPENSEARCH_HOST", "localhost") +OPENSEARCH_PORT = int(os.getenv("OPENSEARCH_PORT", 9200)) +OPENSEARCH_INITIAL_ADMIN_PASSWORD = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD", "StRoNgOpEa0)") + + +def get_boolean_env_var(var_name, default_value=False): + """Retrieve the boolean value of an environment variable. + + Args: + var_name (str): The name of the environment variable to retrieve. + default_value (bool): The default value to return if the variable + is not found. + + Returns: + bool: The value of the environment variable, interpreted as a boolean. 
+ """ + true_values = {"true", "1", "t", "y", "yes"} + false_values = {"false", "0", "f", "n", "no"} + + # Retrieve the environment variable's value + value = os.getenv(var_name, "").lower() + + # Decide the boolean value based on the content of the string + if value in true_values: + return True + elif value in false_values: + return False + else: + return default_value + + +def format_opensearch_conn_from_env(): + opensearch_url = os.getenv("OPENSEARCH_URL", None) + if opensearch_url: + return opensearch_url + else: + using_ssl = get_boolean_env_var("OPENSEARCH_SSL", False) + start = "https://" if using_ssl else "http://" + + return start + f"{OPENSEARCH_HOST}:{OPENSEARCH_PORT}" + + +OPENSEARCH_URL = format_opensearch_conn_from_env() + +# Vector Index Configuration +INDEX_NAME = os.getenv("INDEX_NAME", "rag-opensearch") +KEY_INDEX_NAME = os.getenv("KEY_INDEX_NAME", "file-keys") + +TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", 600)) + +SEARCH_BATCH_SIZE = int(os.getenv("SEARCH_BATCH_SIZE", 10)) diff --git a/comps/dataprep/opensearch/langchain/docker-compose-dataprep-opensearch.yaml b/comps/dataprep/opensearch/langchain/docker-compose-dataprep-opensearch.yaml new file mode 100644 index 000000000..7699bee1c --- /dev/null +++ b/comps/dataprep/opensearch/langchain/docker-compose-dataprep-opensearch.yaml @@ -0,0 +1,65 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + opensearch-vector-db: + image: opensearchproject/opensearch:latest + container_name: opensearch-vector-db + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch-vector-db + - discovery.seed_hosts=opensearch-vector-db + - cluster.initial_master_nodes=opensearch-vector-db + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and later + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems + hard: 65536 + ports: + - 9200:9200 + - 9600:9600 # required for Performance Analyzer + networks: + - opensearch-net + security_opt: + - no-new-privileges:true + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6060:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate + dataprep-opensearch: + image: opea/dataprep-opensearch:latest + container_name: dataprep-opensearch-server + ports: + - 6007:6007 + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + OPENSEARCH_URL: ${OPENSEARCH_URL} + INDEX_NAME: ${INDEX_NAME} + TEI_ENDPOINT: ${TEI_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + restart: unless-stopped + security_opt: + - no-new-privileges:true + +networks: + default: + driver: bridge + opensearch-net: diff --git a/comps/dataprep/opensearch/langchain/prepare_doc_opensearch.py b/comps/dataprep/opensearch/langchain/prepare_doc_opensearch.py new file mode 100644 index 000000000..10c9f8353 --- /dev/null +++ 
b/comps/dataprep/opensearch/langchain/prepare_doc_opensearch.py @@ -0,0 +1,471 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from pathlib import Path +from typing import List, Optional, Union + +from config import ( + EMBED_MODEL, + INDEX_NAME, + KEY_INDEX_NAME, + OPENSEARCH_INITIAL_ADMIN_PASSWORD, + OPENSEARCH_URL, + SEARCH_BATCH_SIZE, +) +from fastapi import Body, File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.embeddings import HuggingFaceBgeEmbeddings +from langchain_community.vectorstores import OpenSearchVectorSearch +from langchain_huggingface import HuggingFaceEndpointEmbeddings +from langchain_text_splitters import HTMLHeaderTextSplitter + +# from pyspark import SparkConf, SparkContext +from opensearchpy import OpenSearch, helpers + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + create_upload_folder, + document_loader, + encode_filename, + format_search_results, + get_separators, + get_tables_result, + parse_html, + remove_folder_with_ignore, + save_content_to_local_disk, +) + +logger = CustomLogger("prepare_doc_opensearch") +logflag = os.getenv("LOGFLAG", False) + +upload_folder = "./uploaded_files/" +tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") +if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) +else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) +auth = ("admin", OPENSEARCH_INITIAL_ADMIN_PASSWORD) +opensearch_client = OpenSearchVectorSearch( + opensearch_url=OPENSEARCH_URL, + index_name=INDEX_NAME, + embedding_function=embeddings, + http_auth=auth, + use_ssl=True, + verify_certs=False, + ssl_assert_hostname=False, + ssl_show_warn=False, +) + + +def check_index_existence(client, index_name): + if logflag: + logger.info(f"[ check index existence ] checking {client}") + try: + exists = client.index_exists(index_name) + exists = False if exists is None else exists + if exists: + if logflag: + logger.info(f"[ check index existence ] index of client exists: {client}") + else: + if logflag: + logger.info("[ check index existence ] index does not exist") + return exists + except Exception as e: + if logflag: + logger.info(f"[ check index existence ] error checking index for client: {e}") + return False + + +def create_index(client, index_name: str = KEY_INDEX_NAME): + if logflag: + logger.info(f"[ create index ] creating index {index_name}") + try: + index_body = { + "mappings": { + "properties": { + "file_name": {"type": "text"}, + "key_ids": {"type": "text"}, + } + } + } + + # Create the index + client.client.indices.create(index_name, body=index_body) + + if logflag: + logger.info(f"[ create index ] index {index_name} successfully created") + return True + except Exception as e: + if logflag: + logger.info(f"[ create index ] fail to create index {index_name}: {e}") + return False + + +def store_by_id(client, key, value): + if logflag: + logger.info(f"[ store by id ] storing ids of {key}") + try: + client.client.index( + index=KEY_INDEX_NAME, body={"file_name": f"file:${key}", "key_ids:": value}, id="file:" + key, refresh=True + ) + if logflag: + logger.info(f"[ store by id ] store document success. 
id: file:{key}") + except Exception as e: + if logflag: + logger.info(f"[ store by id ] fail to store document file:{key}: {e}") + return False + return True + + +def search_by_id(client, doc_id): + if logflag: + logger.info(f"[ search by id ] searching docs of {doc_id}") + try: + result = client.client.get(index=KEY_INDEX_NAME, id=doc_id) + if result["found"]: + if logflag: + logger.info(f"[ search by id ] search success of {doc_id}: {result}") + return result + return None + except Exception as e: + if logflag: + logger.info(f"[ search by id ] fail to search docs of {doc_id}: {e}") + return None + + +def drop_index(client, index_name): + if logflag: + logger.info(f"[ drop index ] dropping index {index_name}") + try: + client.client.indices.delete(index=index_name) + if logflag: + logger.info(f"[ drop index ] index {index_name} deleted") + except Exception as e: + if logflag: + logger.info(f"[ drop index ] index {index_name} delete failed: {e}") + return False + return True + + +def delete_by_id(client, doc_id): + try: + response = client.client.delete(index=KEY_INDEX_NAME, id=doc_id) + if response["result"] == "deleted": + if logflag: + logger.info(f"[ delete by id ] delete id success: {doc_id}") + return True + else: + if logflag: + logger.info(f"[ delete by id ] delete id failed: {doc_id}") + return False + except Exception as e: + if logflag: + logger.info(f"[ delete by id ] fail to delete ids {doc_id}: {e}") + return False + + +def ingest_chunks_to_opensearch(file_name: str, chunks: List): + if logflag: + logger.info(f"[ ingest chunks ] file name: {file_name}") + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + + file_ids = [] + for i in range(0, num_chunks, batch_size): + if logflag: + logger.info(f"[ ingest chunks ] Current batch: {i}") + batch_chunks = chunks[i : i + batch_size] + + keys = opensearch_client.add_texts(texts=batch_chunks, metadatas=[{"source": file_name} for _ in batch_chunks]) + if logflag: + logger.info(f"[ ingest chunks ] keys: {keys}") + file_ids.extend(keys) + if logflag: + logger.info(f"[ ingest chunks ] Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + + # store file_ids into index file-keys + if not check_index_existence(opensearch_client, KEY_INDEX_NAME): + assert create_index(opensearch_client) + + try: + assert store_by_id(opensearch_client, key=file_name, value="#".join(file_ids)) + except Exception as e: + if logflag: + logger.info(f"[ ingest chunks ] {e}. 
Fail to store chunks of file {file_name}.") + raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.") + return True + + +def ingest_data_to_opensearch(doc_path: DocPath): + """Ingest document to OpenSearch.""" + path = doc_path.path + if logflag: + logger.info(f"[ ingest data ] Parsing document {path}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + if logflag: + logger.info("[ ingest data ] file content loaded") + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + ### Specially processing for the table content in PDFs + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + if logflag: + logger.info(f"[ ingest data ] Done preprocessing. Created {len(chunks)} chunks of the given file.") + + file_name = doc_path.path.split("/")[-1] + return ingest_chunks_to_opensearch(file_name, chunks) + + +def search_all_documents(index_name, offset, search_batch_size): + try: + response = opensearch_client.client.search( + index=index_name, + body={ + "query": {"match_all": {}}, + "from": offset, # Starting position + "size": search_batch_size, # Number of results to return + }, + ) + # Get total number of matching documents + total_hits = response["hits"]["total"]["value"] + # Get the documents from the current batch + documents = response["hits"]["hits"] + + return {"total_hits": total_hits, "documents": documents} + + except Exception as e: + print(f"Error performing search: {e}") + return None + + +@register_microservice(name="opea_service@prepare_doc_opensearch", endpoint="/v1/dataprep", host="0.0.0.0", port=6007) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"[ upload ] files:{files}") + logger.info(f"[ upload ] link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + + for file in files: + encode_file = encode_filename(file.filename) + doc_id = "file:" + encode_file + if logflag: + logger.info(f"[ upload ] processing file {doc_id}") + + # check whether the file already exists + key_ids = None + try: + document = search_by_id(opensearch_client, doc_id) + if document: + if logflag: + logger.info(f"[ upload ] File {file.filename} already exists.") + key_ids = document["_id"] + except Exception as e: + logger.info(f"[ upload ] File {file.filename} does not exist.") + if key_ids: + raise HTTPException( + status_code=400, detail=f"Uploaded file {file.filename} already exists. Please change file name." 
+ ) + + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_opensearch( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"[ upload ] Successfully saved file {save_path}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail=f"Link_list {link_list} should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + doc_id = "file:" + encoded_link + ".txt" + if logflag: + logger.info(f"[ upload ] processing link {doc_id}") + + # check whether the link file already exists + key_ids = None + try: + document = search_by_id(opensearch_client, doc_id) + if document: + if logflag: + logger.info(f"[ upload ] Link {link} already exists.") + key_ids = document["_id"] + except Exception as e: + logger.info(f"[ upload ] Link {link} does not exist. Keep storing.") + if key_ids: + raise HTTPException( + status_code=400, detail=f"Uploaded link {link} already exists. Please change another link." + ) + + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + await save_content_to_local_disk(save_path, content) + ingest_data_to_opensearch( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + if logflag: + logger.info(f"[ upload ] Successfully saved link list {link_list}") + return {"status": 200, "message": "Data preparation succeeded"} + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +@register_microservice( + name="opea_service@prepare_doc_opensearch", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6007 +) +async def rag_get_file_structure(): + if logflag: + logger.info("[ get ] start to get file structure") + + offset = 0 + file_list = [] + + # check index existence + res = check_index_existence(opensearch_client, KEY_INDEX_NAME) + if not res: + if logflag: + logger.info(f"[ get ] index {KEY_INDEX_NAME} does not exist") + return file_list + + while True: + response = search_all_documents(KEY_INDEX_NAME, offset, SEARCH_BATCH_SIZE) + # no doc retrieved + if len(response) < 2: + break + + def format_opensearch_results(response, file_list): + for document in response["documents"]: + file_id = document["_id"] + file_list.append({"name": file_id, "id": file_id, "type": "File", "parent": ""}) + + file_list = format_opensearch_results(response, file_list) + offset += SEARCH_BATCH_SIZE + # last batch + if (len(response) - 1) // 2 < SEARCH_BATCH_SIZE: + break + if logflag: + logger.info(f"[get] final file_list: {file_list}") + return file_list + + +@register_microservice( + name="opea_service@prepare_doc_opensearch", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6007 +) +async def delete_single_file(file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - specific file path (e.g. 
/path/to/file.txt) + - "all": delete all files uploaded + """ + + # delete all uploaded files + if file_path == "all": + if logflag: + logger.info("[ delete ] delete all files") + + # drop index KEY_INDEX_NAME + if check_index_existence(opensearch_client, KEY_INDEX_NAME): + try: + assert drop_index(index_name=KEY_INDEX_NAME) + except Exception as e: + if logflag: + logger.info(f"[ delete ] {e}. Fail to drop index {KEY_INDEX_NAME}.") + raise HTTPException(status_code=500, detail=f"Fail to drop index {KEY_INDEX_NAME}.") + else: + logger.info(f"[ delete ] Index {KEY_INDEX_NAME} does not exits.") + + # drop index INDEX_NAME + if check_index_existence(opensearch_client, INDEX_NAME): + try: + assert drop_index(index_name=INDEX_NAME) + except Exception as e: + if logflag: + logger.info(f"[ delete ] {e}. Fail to drop index {INDEX_NAME}.") + raise HTTPException(status_code=500, detail=f"Fail to drop index {INDEX_NAME}.") + else: + if logflag: + logger.info(f"[ delete ] Index {INDEX_NAME} does not exits.") + + # delete files on local disk + try: + remove_folder_with_ignore(upload_folder) + except Exception as e: + if logflag: + logger.info(f"[ delete ] {e}. Fail to delete {upload_folder}.") + raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}.") + + if logflag: + logger.info("[ delete ] successfully delete all files.") + create_upload_folder(upload_folder) + if logflag: + logger.info({"status": True}) + return {"status": True} + else: + raise HTTPException(status_code=404, detail="Single file deletion is not implemented yet") + + +if __name__ == "__main__": + create_upload_folder(upload_folder) + opea_microservices["opea_service@prepare_doc_opensearch"].start() diff --git a/comps/dataprep/opensearch/langchain/requirements.txt b/comps/dataprep/opensearch/langchain/requirements.txt new file mode 100644 index 000000000..fa242973e --- /dev/null +++ b/comps/dataprep/opensearch/langchain/requirements.txt @@ -0,0 +1,30 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-text-splitters +langchain_huggingface +markdown +numpy +opensearch-py +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +pyspark +pytesseract +python-bidi +python-docx +python-pptx +sentence_transformers +shortuuid +unstructured[all-docs] +uvicorn diff --git a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py index 1331f3772..78f9e3eea 100644 --- a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py +++ b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py @@ -21,7 +21,7 @@ encode_filename, get_file_structure, get_separators, - parse_html, + parse_html_new, remove_folder_with_ignore, save_content_to_local_disk, ) @@ -158,7 +158,7 @@ async def ingest_link_to_pgvector(link_list: List[str]): for link in link_list: texts = [] - content = parse_html([link])[0][0] + content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP) if logflag: logger.info(f"[ ingest link ] link: {link} content: {content}") encoded_link = encode_filename(link) diff --git a/comps/dataprep/pgvector/langchain/requirements.txt b/comps/dataprep/pgvector/langchain/requirements.txt index 5235cd5ff..ab3d19db4 100644 --- a/comps/dataprep/pgvector/langchain/requirements.txt +++ b/comps/dataprep/pgvector/langchain/requirements.txt @@ -4,6 +4,7 @@ docarray[full] docx2txt easyocr fastapi +html2text 
huggingface_hub langchain langchain-community diff --git a/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py b/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py index 9bb5c35ff..aa24e44b1 100644 --- a/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py +++ b/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py @@ -24,7 +24,7 @@ get_file_structure, get_separators, get_tables_result, - parse_html, + parse_html_new, remove_folder_with_ignore, save_content_to_local_disk, ) @@ -158,7 +158,7 @@ def ingest_data_to_pinecone(doc_path: DocPath): pc = Pinecone(api_key=PINECONE_API_KEY) -async def ingest_link_to_pinecone(link_list: List[str]): +async def ingest_link_to_pinecone(link_list: List[str], chunk_size, chunk_overlap): # Create embedding obj if tei_embedding_endpoint: # create embeddings using TEI endpoint service @@ -178,7 +178,7 @@ async def ingest_link_to_pinecone(link_list: List[str]): # save link contents and doc_ids one by one for link in link_list: - content = parse_html([link])[0][0] + content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap) if logflag: logger.info(f"[ ingest link ] link: {link} content: {content}") encoded_link = encode_filename(link) @@ -239,7 +239,7 @@ async def ingest_documents( link_list = json.loads(link_list) # Parse JSON string to list if not isinstance(link_list, list): raise HTTPException(status_code=400, detail="link_list should be a list.") - await ingest_link_to_pinecone(link_list) + await ingest_link_to_pinecone(link_list, chunk_size, chunk_overlap) result = {"status": 200, "message": "Data preparation succeeded"} if logflag: logger.info(f"Successfully saved link list {link_list}") diff --git a/comps/dataprep/pinecone/langchain/requirements.txt b/comps/dataprep/pinecone/langchain/requirements.txt index 80f81bd5e..27bbac44b 100644 --- a/comps/dataprep/pinecone/langchain/requirements.txt +++ b/comps/dataprep/pinecone/langchain/requirements.txt @@ -4,6 +4,7 @@ docarray[full] docx2txt easyocr fastapi +html2text huggingface_hub langchain langchain-community diff --git a/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py b/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py index a97987817..80678e98e 100644 --- a/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py +++ b/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py @@ -19,7 +19,7 @@ encode_filename, get_separators, get_tables_result, - parse_html, + parse_html_new, save_content_to_local_disk, ) @@ -149,7 +149,7 @@ async def ingest_documents( for link in link_list: encoded_link = encode_filename(link) save_path = upload_folder + encoded_link + ".txt" - content = parse_html([link])[0][0] + content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap) try: await save_content_to_local_disk(save_path, content) ingest_data_to_qdrant( diff --git a/comps/dataprep/qdrant/langchain/requirements.txt b/comps/dataprep/qdrant/langchain/requirements.txt index f505af163..8f92c8ca8 100644 --- a/comps/dataprep/qdrant/langchain/requirements.txt +++ b/comps/dataprep/qdrant/langchain/requirements.txt @@ -4,6 +4,7 @@ docarray[full] docx2txt easyocr fastapi +html2text huggingface_hub langchain langchain-community diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py index 6902117dc..ae69a28fc 100644 --- a/comps/dataprep/redis/langchain/prepare_doc_redis.py +++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py @@ -26,7 +26,7 @@ format_search_results, get_separators, 
get_tables_result, - parse_html, + parse_html_new, remove_folder_with_ignore, save_content_to_local_disk, ) @@ -320,7 +320,7 @@ async def ingest_documents( ) save_path = upload_folder + encoded_link + ".txt" - content = parse_html([link])[0][0] + content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap) await save_content_to_local_disk(save_path, content) ingest_data_to_redis( DocPath( diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt index 8c3b116fa..43ff2f93b 100644 --- a/comps/dataprep/redis/langchain/requirements.txt +++ b/comps/dataprep/redis/langchain/requirements.txt @@ -4,6 +4,7 @@ docarray[full] docx2txt easyocr fastapi +html2text huggingface_hub langchain langchain-community diff --git a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py index d5ec731ba..2af834cac 100644 --- a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py +++ b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py @@ -48,7 +48,7 @@ encode_filename, get_file_structure, get_separators, - parse_html, + parse_html_new, remove_folder_with_ignore, save_content_to_local_disk, timeout, @@ -255,7 +255,7 @@ def ingest_link_to_redis(link_list: List[str], enable_ray=False, num_cpus=20): link_list = [str(f) for f in link_list] def _parse_html(link): - data = parse_html([link]) + data = parse_html_new([link], chunk_size=1500, chunk_overlap=100) return data[0][0] if enable_ray: diff --git a/comps/dataprep/redis/langchain_ray/requirements.txt b/comps/dataprep/redis/langchain_ray/requirements.txt index 0237109e7..853304542 100644 --- a/comps/dataprep/redis/langchain_ray/requirements.txt +++ b/comps/dataprep/redis/langchain_ray/requirements.txt @@ -4,6 +4,7 @@ docarray[full] docx2txt easyocr fastapi +html2text huggingface_hub langchain langchain-community diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py index 910bca343..cf104017f 100644 --- a/comps/dataprep/utils.py +++ b/comps/dataprep/utils.py @@ -620,6 +620,28 @@ def parse_html(input): return chucks +def load_html_content(links, chunk_size=1500, chunk_overlap=50): + from langchain.text_splitter import RecursiveCharacterTextSplitter + from langchain_community.document_loaders import AsyncHtmlLoader + from langchain_community.document_transformers import Html2TextTransformer + + loader = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True) + docs = loader.load() + html2text = Html2TextTransformer() + docs = list(html2text.transform_documents(docs)) + text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + docs = text_splitter.split_documents(docs) + return docs + + +def parse_html_new(input, chunk_size, chunk_overlap): + docs = load_html_content(input, chunk_size, chunk_overlap) + html_content = "" + for doc in docs: + html_content += doc.page_content + "\n" + return html_content + + def get_tables_result(pdf_path, table_strategy): """Extract tables information from pdf file.""" if table_strategy == "fast": diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py index d45373f00..a50a95853 100644 --- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py +++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py @@ -19,7 +19,7 @@ encode_filename, get_separators, get_tables_result, - parse_html, + parse_html_new, save_content_to_local_disk, ) @@ -143,7 +143,7 @@ async def 
ingest_documents( # check whether the link file already exists save_path = upload_folder + encoded_link + ".txt" - content = parse_html([link])[0][0] + content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap) await save_content_to_local_disk(save_path, content) ingest_data_to_vdms( DocPath( diff --git a/comps/dataprep/vdms/langchain/requirements.txt b/comps/dataprep/vdms/langchain/requirements.txt index 88b2c033a..96fac9215 100644 --- a/comps/dataprep/vdms/langchain/requirements.txt +++ b/comps/dataprep/vdms/langchain/requirements.txt @@ -6,6 +6,7 @@ docx2txt easyocr einops fastapi +html2text huggingface_hub langchain langchain-community @@ -17,10 +18,12 @@ numpy opencv-python opentelemetry-api opentelemetry-exporter-otlp +opentelemetry-proto==1.23.0 opentelemetry-sdk pandas Pillow prometheus-fastapi-instrumentator +protobuf==4.24.2 pymupdf pyspark pytesseract @@ -35,4 +38,4 @@ typing tzlocal unstructured[all-docs]==0.11.5 uvicorn -vdms +vdms>=0.0.20 diff --git a/comps/dataprep/vdms/multimodal_langchain/requirements.txt b/comps/dataprep/vdms/multimodal_langchain/requirements.txt index f6044266c..773f91289 100644 --- a/comps/dataprep/vdms/multimodal_langchain/requirements.txt +++ b/comps/dataprep/vdms/multimodal_langchain/requirements.txt @@ -17,10 +17,12 @@ numpy opencv-python opentelemetry-api opentelemetry-exporter-otlp +opentelemetry-proto==1.23.0 opentelemetry-sdk pandas Pillow prometheus-fastapi-instrumentator +protobuf==4.24.2 pymupdf pyspark python-bidi==0.4.2 @@ -34,4 +36,4 @@ typing tzlocal unstructured[all-docs]==0.11.5 uvicorn -vdms +vdms>=0.0.20 diff --git a/comps/llms/faq-generation/vllm/langchain/README.md b/comps/llms/faq-generation/vllm/langchain/README.md index b04cfc9d0..132521c4e 100644 --- a/comps/llms/faq-generation/vllm/langchain/README.md +++ b/comps/llms/faq-generation/vllm/langchain/README.md @@ -35,7 +35,7 @@ You can choose one as needed. 
### 1.3 Run Docker with CLI (Option A) ```bash -docker run -d -p 8008:80 -v ./data:/data --name vllm-service --shm-size 1g opea/vllm:hpu --model-id ${LLM_MODEL_ID} +docker run -d -p 8008:80 -v ./data:/data --name vllm-service --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID} ``` ```bash diff --git a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml index d0a00af5b..8ed64dd97 100644 --- a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm:hpu + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/comps/llms/summarization/vllm/langchain/README.md b/comps/llms/summarization/vllm/langchain/README.md index bdb8f9beb..dafc6e109 100644 --- a/comps/llms/summarization/vllm/langchain/README.md +++ b/comps/llms/summarization/vllm/langchain/README.md @@ -18,7 +18,7 @@ pip install -r requirements.txt ```bash export HF_TOKEN=${your_hf_api_token} export LLM_MODEL_ID=${your_hf_llm_model} -docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm:hpu --model-id ${LLM_MODEL_ID} +docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID} ``` ### 1.3 Verify the vLLM Service diff --git a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml b/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml index 8cc13e318..b93fd8030 100644 --- a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm:hpu + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh index aa189df0c..bcbf20c4a 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh @@ -38,7 +38,7 @@ if [ "$hw_mode" = "hpu" ]; then git clone https://github.com/HabanaAI/vllm-fork.git cd ./vllm-fork/ git checkout 3c39626 - docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy cd .. 
rm -rf vllm-fork else diff --git a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh index 421112b68..6f6a7d211 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh @@ -38,7 +38,7 @@ volume=$PWD/data # Build the Docker run command based on hardware mode if [ "$hw_mode" = "hpu" ]; then - docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-hpu:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture + docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture else docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80 fi diff --git a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml index cb0dc2216..e817c9f35 100644 --- a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm-hpu:latest + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh index 7bd162954..c94dd7237 100644 --- a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh +++ b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh @@ -33,7 +33,7 @@ if [ "$hw_mode" = "hpu" ]; then git clone https://github.com/HabanaAI/vllm-fork.git cd ./vllm-fork/ git checkout 3c39626 - docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy cd .. 
rm -rf vllm-fork else diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh index 300d8a551..d3363aa40 100644 --- a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh @@ -38,7 +38,7 @@ volume=$PWD/data # Build the Docker run command based on hardware mode if [ "$hw_mode" = "hpu" ]; then - docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm-hpu:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture + docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm-gaudi:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture else docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80 fi diff --git a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml index e6ccae55f..eeed7d19a 100644 --- a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml +++ b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm-hpu:latest + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/comps/reranks/tei/reranking_tei.py b/comps/reranks/tei/reranking_tei.py index daae461da..682346f6d 100644 --- a/comps/reranks/tei/reranking_tei.py +++ b/comps/reranks/tei/reranking_tei.py @@ -41,8 +41,8 @@ endpoint="/v1/reranking", host="0.0.0.0", port=8000, - input_datatype=SearchedDoc, - output_datatype=LLMParamsDoc, + input_datatype=Union[SearchedDoc, RerankingRequest, ChatCompletionRequest], + output_datatype=Union[LLMParamsDoc, RerankingResponse, ChatCompletionRequest], ) @register_statistics(names=["opea_service@reranking_tei"]) async def reranking( diff --git a/comps/retrievers/opensearch/data/nke-10k-2023.pdf b/comps/retrievers/opensearch/data/nke-10k-2023.pdf new file mode 100644 index 000000000..6ade8863e Binary files /dev/null and b/comps/retrievers/opensearch/data/nke-10k-2023.pdf differ diff --git a/comps/retrievers/opensearch/langchain/Dockerfile b/comps/retrievers/opensearch/langchain/Dockerfile new file mode 100644 index 000000000..038b5d6bc --- /dev/null +++ b/comps/retrievers/opensearch/langchain/Dockerfile @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ARG ARCH="cpu" + +RUN apt-get update -y && 
    apt-get install -y --no-install-recommends --fix-missing \
+    libgl1-mesa-glx \
+    libjemalloc-dev
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+COPY comps /home/user/comps
+
+USER user
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
+    pip install --no-cache-dir -r /home/user/comps/retrievers/opensearch/langchain/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+WORKDIR /home/user/comps/retrievers/opensearch/langchain
+
+ENTRYPOINT ["python", "retriever_opensearch.py"]
diff --git a/comps/retrievers/opensearch/langchain/README.md b/comps/retrievers/opensearch/langchain/README.md
new file mode 100644
index 000000000..487f8e7d5
--- /dev/null
+++ b/comps/retrievers/opensearch/langchain/README.md
@@ -0,0 +1,144 @@
+# Retriever Microservice
+
+This retriever microservice is a highly efficient search service designed for handling and retrieving embedding vectors. It receives an embedding vector as input and conducts a similarity search against vectors stored in a vector database. Users must specify the VectorDB's URL and the index name, and the service searches within that index to find the documents with the highest similarity to the input vector.
+
+The service primarily utilizes similarity measures in vector space to rapidly retrieve contextually similar documents. The vector-based retrieval approach is particularly suited for handling large datasets, offering fast and accurate search results that significantly enhance the efficiency and quality of information retrieval.
+
+Overall, this microservice provides robust backend support for applications requiring efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial.
+
+## 🚀1. Start Microservice with Python (Option 1)
+
+To start the retriever microservice, you must first install the required Python packages.
+
+### 1.1 Install Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+### 1.2 Start TEI Service
+
+```bash
+model=BAAI/bge-base-en-v1.5
+volume=$PWD/data
+docker run -d -p 6060:80 -v $volume:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
+```
+
+### 1.3 Verify the TEI Service
+
+Health check the embedding service with:
+
+```bash
+curl 127.0.0.1:6060/embed \
+  -X POST \
+  -d '{"inputs":"What is Deep Learning?"}' \
+  -H 'Content-Type: application/json'
+```
+
+### 1.4 Setup VectorDB Service
+
+You need to set up your own VectorDB service (OpenSearch in this example) and ingest your knowledge documents into the vector database.
+
+For OpenSearch, you can start a Docker container by following the instructions in the OpenSearch vectorstores [README.md](../../../vectorstores/opensearch/README.md).
+
+### 1.5 Start Retriever Service
+
+```bash
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060"
+python retriever_opensearch.py
+```
+
+## 🚀2. Start Microservice with Docker (Option 2)
+
+### 2.1 Setup Environment Variables
+
+```bash
+export RETRIEVE_MODEL_ID="BAAI/bge-base-en-v1.5"
+export OPENSEARCH_URL="http://${your_ip}:9200"
+export INDEX_NAME=${your_index_name}
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060"
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
+export OPENSEARCH_INITIAL_ADMIN_PASSWORD=${your_opensearch_initial_admin_password}
+```
+
+### 2.2 Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/retriever-opensearch-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/opensearch/langchain/Dockerfile .
+```
+
+To start a docker container, you have two options:
+
+- A. Run Docker with CLI
+- B. Run Docker with Docker Compose
+
+You can choose one as needed.
+
+### 2.3 Run Docker with CLI (Option A)
+
+```bash
+docker run -d --name="retriever-opensearch-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e OPENSEARCH_URL=$OPENSEARCH_URL -e INDEX_NAME=$INDEX_NAME -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/retriever-opensearch-server:latest
+```
+
+### 2.4 Run Docker with Docker Compose (Option B)
+
+```bash
+docker compose -f docker_compose_retriever.yaml up -d
+```
+
+## 🚀3. Consume Retriever Service
+
+### 3.1 Check Service Status
+
+```bash
+curl http://localhost:7000/v1/health_check \
+  -X GET \
+  -H 'Content-Type: application/json'
+```
+
+### 3.2 Consume Retriever Service
+
+To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python.
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://${your_ip}:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
+  -H 'Content-Type: application/json'
+```
+
+You can also set the search parameters for the retriever, as shown in the examples below.
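The `search_type` values accepted here map onto the LangChain `OpenSearchVectorSearch` calls made in `retriever_opensearch.py` later in this patch. The following is a rough, illustrative sketch of that dispatch, not the service code itself; the function name, parameters, and defaults are placeholders:

```python
# Illustrative sketch: how search_type is dispatched to LangChain's
# OpenSearchVectorSearch, mirroring the logic in retriever_opensearch.py.
from typing import List

from langchain_community.vectorstores import OpenSearchVectorSearch


async def search(vector_db: OpenSearchVectorSearch, embedding: List[float], query: str,
                 search_type: str = "similarity", k: int = 4, **kwargs):
    if search_type == "similarity":
        # Plain k-NN search against the stored vectors.
        return await vector_db.asimilarity_search_by_vector(embedding=embedding, k=k)
    if search_type == "similarity_distance_threshold":
        # distance_threshold is required for this mode.
        return await vector_db.asimilarity_search_by_vector(
            embedding=embedding, k=k, distance_threshold=kwargs["distance_threshold"]
        )
    if search_type == "similarity_score_threshold":
        # Keep only documents whose relevance score clears the threshold.
        docs_and_scores = await vector_db.asimilarity_search_with_relevance_scores(
            query=query, k=k, score_threshold=kwargs.get("score_threshold", 0.2)
        )
        return [doc for doc, _ in docs_and_scores]
    if search_type == "mmr":
        # Maximal marginal relevance: trade off similarity against diversity.
        return await vector_db.amax_marginal_relevance_search(
            query=query, k=k, fetch_k=kwargs.get("fetch_k", 20), lambda_mult=kwargs.get("lambda_mult", 0.5)
        )
    raise ValueError(f"{search_type} not valid")
```

The curl examples that follow exercise each of these modes through the `/v1/retrieval` endpoint.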
+ +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/opensearch/langchain/__init__.py b/comps/retrievers/opensearch/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/retrievers/opensearch/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/opensearch/langchain/docker_compose_retriever.yaml b/comps/retrievers/opensearch/langchain/docker_compose_retriever.yaml new file mode 100644 index 000000000..653e413a3 --- /dev/null +++ b/comps/retrievers/opensearch/langchain/docker_compose_retriever.yaml @@ -0,0 +1,36 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + tei_xeon_service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 + container_name: tei-xeon_server + ports: + - "6060:80" + volumes: + - "./data:/data" + shm_size: 1g + command: --model-id ${RETRIEVE_MODEL_ID} + retriever: + image: opea/retriever-opensearch-server + container_name: retriever-opensearch-server + ports: + - "7000:7000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + OPENSEARCH_URL: ${OPENSEARCH_URL} + INDEX_NAME: ${INDEX_NAME} + TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + restart: unless-stopped + security_opt: + - no-new-privileges:true + +networks: + default: + driver: bridge diff --git a/comps/retrievers/opensearch/langchain/opensearch_config.py b/comps/retrievers/opensearch/langchain/opensearch_config.py new file mode 100644 index 000000000..32273d1de --- /dev/null +++ b/comps/retrievers/opensearch/langchain/opensearch_config.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + + +def get_boolean_env_var(var_name, default_value=False): + """Retrieve the boolean value of an 
environment variable. + + Args: + var_name (str): The name of the environment variable to retrieve. + default_value (bool): The default value to return if the variable + is not found. + + Returns: + bool: The value of the environment variable, interpreted as a boolean. + """ + true_values = {"true", "1", "t", "y", "yes"} + false_values = {"false", "0", "f", "n", "no"} + + # Retrieve the environment variable's value + value = os.getenv(var_name, "").lower() + + # Decide the boolean value based on the content of the string + if value in true_values: + return True + elif value in false_values: + return False + else: + return default_value + + +# Whether or not to enable langchain debugging +DEBUG = get_boolean_env_var("DEBUG", False) +# Set DEBUG env var to "true" if you wish to enable LC debugging module +if DEBUG: + import langchain + + langchain.debug = True + + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + + +# OpenSearch Connection Information +OPENSEARCH_HOST = os.getenv("OPENSEARCH_HOST", "localhost") +OPENSEARCH_PORT = int(os.getenv("OPENSEARCH_PORT", 9200)) +OPENSEARCH_INITIAL_ADMIN_PASSWORD = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD", "StRoNgOpEa0)") + + +def format_opensearch_conn_from_env(): + opensearch_url = os.getenv("OPENSEARCH_URL", None) + if opensearch_url: + return opensearch_url + else: + using_ssl = get_boolean_env_var("OPENSEARCH_SSL", False) + start = "https://" if using_ssl else "http://" + + return start + f"{OPENSEARCH_HOST}:{OPENSEARCH_PORT}" + + +OPENSEARCH_URL = format_opensearch_conn_from_env() + +# Vector Index Configuration +INDEX_NAME = os.getenv("INDEX_NAME", "rag-opensearch") + + +current_file_path = os.path.abspath(__file__) +parent_dir = os.path.dirname(current_file_path) diff --git a/comps/retrievers/opensearch/langchain/requirements.txt b/comps/retrievers/opensearch/langchain/requirements.txt new file mode 100644 index 000000000..5690118bb --- /dev/null +++ b/comps/retrievers/opensearch/langchain/requirements.txt @@ -0,0 +1,16 @@ +docarray[full] +easyocr +fastapi +langchain_community +langchain_huggingface +numpy +opensearch-py +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +pydantic +pymupdf +sentence_transformers +shortuuid +uvicorn diff --git a/comps/retrievers/opensearch/langchain/retriever_opensearch.py b/comps/retrievers/opensearch/langchain/retriever_opensearch.py new file mode 100644 index 000000000..c570cb6db --- /dev/null +++ b/comps/retrievers/opensearch/langchain/retriever_opensearch.py @@ -0,0 +1,162 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import time +from typing import Callable, List, Union + +import numpy as np +from langchain_community.embeddings import HuggingFaceBgeEmbeddings +from langchain_community.vectorstores import OpenSearchVectorSearch +from langchain_huggingface import HuggingFaceEndpointEmbeddings +from opensearch_config import EMBED_MODEL, INDEX_NAME, OPENSEARCH_INITIAL_ADMIN_PASSWORD, OPENSEARCH_URL +from pydantic import conlist + +from comps import ( + CustomLogger, + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + RetrievalRequest, + RetrievalResponse, + RetrievalResponseData, +) + +logger = CustomLogger("retriever_opensearch") +logflag = os.getenv("LOGFLAG", False) + +tei_embedding_endpoint = 
os.getenv("TEI_EMBEDDING_ENDPOINT", None) + + +async def search_all_embeddings_vectors( + embeddings: Union[conlist(float, min_length=0), List[conlist(float, min_length=0)]], func: Callable, *args, **kwargs +): + try: + if not isinstance(embeddings, np.ndarray): + embeddings = np.array(embeddings) + + if not np.issubdtype(embeddings.dtype, np.floating): + raise ValueError("All embeddings values must be floating point numbers") + + if embeddings.ndim == 1: + return await func(embedding=embeddings, *args, **kwargs) + elif embeddings.ndim == 2: + responses = [] + for emb in embeddings: + response = await func(embedding=emb, *args, **kwargs) + responses.extend(response) + return responses + else: + raise ValueError("Embeddings must be one or two dimensional") + except Exception as e: + raise ValueError(f"Embedding data is not valid: {e}") + + +@register_microservice( + name="opea_service@retriever_opensearch", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@register_statistics(names=["opea_service@retriever_opensearch"]) +async def retrieve( + input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest] +) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]: + if logflag: + logger.info(input) + start = time.time() + + # Check if the index exists and has documents + doc_count = 0 + + index_exists = vector_db.client.indices.exists(index=INDEX_NAME) + if index_exists: + doc_count = vector_db.client.count(index=INDEX_NAME)["count"] + if (not index_exists) or doc_count == 0: + search_res = [] + else: + if isinstance(input, EmbedDoc): + query = input.text + else: + # for RetrievalRequest, ChatCompletionRequest + query = input.input + # if the OpenSearch index has data, perform the search + if input.search_type == "similarity": + search_res = await search_all_embeddings_vectors( + embeddings=input.embedding, + func=vector_db.asimilarity_search_by_vector, + k=input.k, + ) + elif input.search_type == "similarity_distance_threshold": + if input.distance_threshold is None: + raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") + search_res = await search_all_embeddings_vectors( + embeddings=input.embedding, + func=vector_db.asimilarity_search_by_vector, + k=input.k, + distance_threshold=input.distance_threshold, + ) + elif input.search_type == "similarity_score_threshold": + doc_and_similarities = await vector_db.asimilarity_search_with_relevance_scores( + query=input.text, k=input.k, score_threshold=input.score_threshold + ) + search_res = [doc for doc, _ in doc_and_similarities] + elif input.search_type == "mmr": + search_res = await vector_db.amax_marginal_relevance_search( + query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult + ) + else: + raise ValueError(f"{input.search_type} not valid") + + # return different response format + retrieved_docs = [] + if isinstance(input, EmbedDoc): + for r in search_res: + retrieved_docs.append(TextDoc(text=r.page_content)) + result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) + else: + for r in search_res: + retrieved_docs.append(RetrievalResponseData(text=r.page_content, metadata=r.metadata)) + if isinstance(input, RetrievalRequest): + result = RetrievalResponse(retrieved_docs=retrieved_docs) + elif isinstance(input, ChatCompletionRequest): + input.retrieved_docs = retrieved_docs + input.documents = [doc.text for doc in retrieved_docs] + result = input + + 
statistics_dict["opea_service@retriever_opensearch"].append_latency(time.time() - start, None) + if logflag: + logger.info(result) + return result + + +if __name__ == "__main__": + # Create vectorstore + if tei_embedding_endpoint: + # create embeddings using TEI endpoint service + embeddings = HuggingFaceEndpointEmbeddings(model=tei_embedding_endpoint) + else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + auth = ("admin", OPENSEARCH_INITIAL_ADMIN_PASSWORD) + vector_db = OpenSearchVectorSearch( + opensearch_url=OPENSEARCH_URL, + index_name=INDEX_NAME, + embedding_function=embeddings, + http_auth=auth, + use_ssl=True, + verify_certs=False, + ssl_assert_hostname=False, + ssl_show_warn=False, + ) + opea_microservices["opea_service@retriever_opensearch"].start() diff --git a/comps/retrievers/redis/langchain/retriever_redis.py b/comps/retrievers/redis/langchain/retriever_redis.py index d46e792f0..ada07d236 100644 --- a/comps/retrievers/redis/langchain/retriever_redis.py +++ b/comps/retrievers/redis/langchain/retriever_redis.py @@ -23,6 +23,7 @@ ) from comps.cores.proto.api_protocol import ( ChatCompletionRequest, + EmbeddingResponse, RetrievalRequest, RetrievalResponse, RetrievalResponseData, @@ -54,12 +55,25 @@ async def retrieve( else: if isinstance(input, EmbedDoc): query = input.text + embedding_data_input = input.embedding else: # for RetrievalRequest, ChatCompletionRequest query = input.input + if isinstance(input.embedding, EmbeddingResponse): + embeddings = input.embedding.data + embedding_data_input = [] + for emb in embeddings: + # each emb is EmbeddingResponseData + # print("Embedding data: ", emb.embedding) + # print("Embedding data length: ",len(emb.embedding)) + embedding_data_input.append(emb.embedding) + # print("All Embedding data length: ",len(embedding_data_input)) + else: + embedding_data_input = input.embedding + # if the Redis index has data, perform the search if input.search_type == "similarity": - search_res = await vector_db.asimilarity_search_by_vector(embedding=input.embedding, k=input.k) + search_res = await vector_db.asimilarity_search_by_vector(embedding=embedding_data_input, k=input.k) elif input.search_type == "similarity_distance_threshold": if input.distance_threshold is None: raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") diff --git a/comps/retrievers/vdms/langchain/requirements.txt b/comps/retrievers/vdms/langchain/requirements.txt index 9138aea2e..44d80c13b 100644 --- a/comps/retrievers/vdms/langchain/requirements.txt +++ b/comps/retrievers/vdms/langchain/requirements.txt @@ -7,10 +7,12 @@ langchain-core langchain-huggingface opentelemetry-api opentelemetry-exporter-otlp +opentelemetry-proto==1.23.0 opentelemetry-sdk prometheus-fastapi-instrumentator +protobuf==4.24.2 pymupdf sentence_transformers shortuuid uvicorn -vdms +vdms>=0.0.20 diff --git a/comps/vectorstores/opensearch/README.md b/comps/vectorstores/opensearch/README.md new file mode 100644 index 000000000..f784d7296 --- /dev/null +++ b/comps/vectorstores/opensearch/README.md @@ -0,0 +1,35 @@ +# Start Opensearch server + +## Prerequisites + +1. Install docker +1. Install docker compose (if not already installed) + 1. `sudo curl -L https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose` + 2. `sudo chmod +x /usr/local/bin/docker-compose` + +## Instructions + +### 1. 
Set admin password as environment variable + +OpenSearch version 2.12 and later require a custom admin password to be set. Following [these guidelines](https://opensearch.org/docs/latest/security/configuration/demo-configuration/#setting-up-a-custom-admin-password), set the admin password as an environment variable to be used by the `docker-compose-opensearch.yml` file like `export OPENSEARCH_INITIAL_ADMIN_PASSWORD=_some_admin_password` in the terminal before starting the docker containers. + +### 2. Start the cluster + +`docker-compose -f docker-compose-opensearch.yml up` + +## Troubleshooting + +### "java.nio.file.FileSystemNotFoundException: null" error + +1. Make sure to grant read permissions to your local data volume folders + 1. `sudo chown -R instance_user:instance_user ./opensearch-data1` + 2. `sudo chown -R instance_user:instance_user ./opensearch-data2` + 1. Replace `instance_user` with the login user (i.e. ec2-user, ssm-user, or your local user name) +2. Try increasing the virtual max memory map count + 1. `sudo sysctl -w vm.max_map_count=262144` + +### OpenSearch Dashboards container errors + +1. Make sure to grant read permission to the `opensearch_dashboards.yml` file +1. `sudo chown -R instance_user:instance_user ./opensearch_dashboards.yml` + 1. Replace `instance_user` with the login user (i.e. ec2-user, ssm-user, or your local user name) diff --git a/comps/vectorstores/opensearch/__init__.py b/comps/vectorstores/opensearch/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/vectorstores/opensearch/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/vectorstores/opensearch/docker-compose-opensearch.yml b/comps/vectorstores/opensearch/docker-compose-opensearch.yml new file mode 100644 index 000000000..1769850e6 --- /dev/null +++ b/comps/vectorstores/opensearch/docker-compose-opensearch.yml @@ -0,0 +1,81 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: '3' +services: + opensearch-node1: + image: opensearchproject/opensearch:latest + container_name: opensearch-node1 + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch-node1 + - discovery.seed_hosts=opensearch-node1,opensearch-node2 + - cluster.initial_master_nodes=opensearch-node1,opensearch-node2 + - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM + - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and later + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems + hard: 65536 + volumes: + - ./opensearch-data1:/var/lib/opensearch/data + ports: + - 9200:9200 + - 9600:9600 # required for Performance Analyzer + networks: + - opensearch-net + security_opt: + - no-new-privileges:true + opensearch-node2: + image: opensearchproject/opensearch:latest + container_name: opensearch-node2 + environment: + - cluster.name=opensearch-cluster + - node.name=opensearch-node2 + - discovery.seed_hosts=opensearch-node1,opensearch-node2 + - cluster.initial_master_nodes=opensearch-node1,opensearch-node2 + - bootstrap.memory_lock=true + - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" + - 
OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_INITIAL_ADMIN_PASSWORD} # Sets the demo admin user password when using demo configuration, required for OpenSearch 2.12 and later + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 + hard: 65536 + volumes: + - ./opensearch-data2:/var/lib/opensearch/data + networks: + - opensearch-net + security_opt: + - no-new-privileges:true + opensearch-dashboards: + image: opensearchproject/opensearch-dashboards:latest + volumes: + - ./opensearch_dashboards.yml:/usr/share/opensearch-dashboards/config/opensearch_dashboards.yml + container_name: opensearch-dashboards + ports: + - 5601:5601 + expose: + - "5601" + environment: + OPENSEARCH_HOSTS: '["https://opensearch-node1:9200","https://opensearch-node2:9200"]' # must be a string with no spaces when specified as an environment variable + networks: + - opensearch-net + security_opt: + - no-new-privileges:true + depends_on: + - opensearch-node1 + - opensearch-node2 + +volumes: + opensearch-data1: + opensearch-data2: + +networks: + opensearch-net: diff --git a/comps/vectorstores/opensearch/opensearch_dashboards.yml b/comps/vectorstores/opensearch/opensearch_dashboards.yml new file mode 100644 index 000000000..f6d43e6ed --- /dev/null +++ b/comps/vectorstores/opensearch/opensearch_dashboards.yml @@ -0,0 +1,210 @@ +--- +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 + +# Description: +# Default configuration for OpenSearch Dashboards + +# OpenSearch Dashboards is served by a back end server. This setting specifies the port to use. +# server.port: 5601 + +# Specifies the address to which the OpenSearch Dashboards server will bind. IP addresses and host names are both valid values. +# The default is 'localhost', which usually means remote machines will not be able to connect. +# To allow connections from remote users, set this parameter to a non-loopback address. +# server.host: "localhost" + +# Enables you to specify a path to mount OpenSearch Dashboards at if you are running behind a proxy. +# Use the `server.rewriteBasePath` setting to tell OpenSearch Dashboards if it should remove the basePath +# from requests it receives, and to prevent a deprecation warning at startup. +# This setting cannot end in a slash. +# server.basePath: "" + +# Specifies whether OpenSearch Dashboards should rewrite requests that are prefixed with +# `server.basePath` or require that they are rewritten by your reverse proxy. +# server.rewriteBasePath: false + +# The maximum payload size in bytes for incoming server requests. +# server.maxPayloadBytes: 1048576 + +# The OpenSearch Dashboards server's name. This is used for display purposes. +# server.name: "your-hostname" + +# The URLs of the OpenSearch instances to use for all your queries. +# opensearch.hosts: ["http://localhost:9200"] + +# OpenSearch Dashboards uses an index in OpenSearch to store saved searches, visualizations and +# dashboards. OpenSearch Dashboards creates a new index if the index doesn't already exist. +# opensearchDashboards.index: ".opensearch_dashboards" + +# The default application to load. +# opensearchDashboards.defaultAppId: "home" + +# Setting for an optimized healthcheck that only uses the local OpenSearch node to do Dashboards healthcheck. +# This settings should be used for large clusters or for clusters with ingest heavy nodes. +# It allows Dashboards to only healthcheck using the local OpenSearch node rather than fan out requests across all nodes. 
+# +# It requires the user to create an OpenSearch node attribute with the same name as the value used in the setting +# This node attribute should assign all nodes of the same cluster an integer value that increments with each new cluster that is spun up +# e.g. in opensearch.yml file you would set the value to a setting using node.attr.cluster_id: +# Should only be enabled if there is a corresponding node attribute created in your OpenSearch config that matches the value here +# opensearch.optimizedHealthcheckId: "cluster_id" + +# If your OpenSearch is protected with basic authentication, these settings provide +# the username and password that the OpenSearch Dashboards server uses to perform maintenance on the OpenSearch Dashboards +# index at startup. Your OpenSearch Dashboards users still need to authenticate with OpenSearch, which +# is proxied through the OpenSearch Dashboards server. +# opensearch.username: "opensearch_dashboards_system" +# opensearch.password: "pass" + +# Enables SSL and paths to the PEM-format SSL certificate and SSL key files, respectively. +# These settings enable SSL for outgoing requests from the OpenSearch Dashboards server to the browser. +# server.ssl.enabled: false +# server.ssl.certificate: /path/to/your/server.crt +# server.ssl.key: /path/to/your/server.key + +# Optional settings that provide the paths to the PEM-format SSL certificate and key files. +# These files are used to verify the identity of OpenSearch Dashboards to OpenSearch and are required when +# xpack.security.http.ssl.client_authentication in OpenSearch is set to required. +# opensearch.ssl.certificate: /path/to/your/client.crt +# opensearch.ssl.key: /path/to/your/client.key + +# Optional setting that enables you to specify a path to the PEM file for the certificate +# authority for your OpenSearch instance. +# opensearch.ssl.certificateAuthorities: [ "/path/to/your/CA.pem" ] + +# To disregard the validity of SSL certificates, change this setting's value to 'none'. +# opensearch.ssl.verificationMode: full + +# Time in milliseconds to wait for OpenSearch to respond to pings. Defaults to the value of +# the opensearch.requestTimeout setting. +# opensearch.pingTimeout: 1500 + +# Time in milliseconds to wait for responses from the back end or OpenSearch. This value +# must be a positive integer. +# opensearch.requestTimeout: 30000 + +# List of OpenSearch Dashboards client-side headers to send to OpenSearch. To send *no* client-side +# headers, set this value to [] (an empty list). +# opensearch.requestHeadersWhitelist: [ authorization ] + +# Header names and values that are sent to OpenSearch. Any custom headers cannot be overwritten +# by client-side headers, regardless of the opensearch.requestHeadersWhitelist configuration. +# opensearch.customHeaders: {} + +# Time in milliseconds for OpenSearch to wait for responses from shards. Set to 0 to disable. +# opensearch.shardTimeout: 30000 + +# Logs queries sent to OpenSearch. Requires logging.verbose set to true. +# opensearch.logQueries: false + +# Specifies the path where OpenSearch Dashboards creates the process ID file. +# pid.file: /var/run/opensearchDashboards.pid + +# Enables you to specify a file where OpenSearch Dashboards stores log output. +# logging.dest: stdout + +# Set the value of this setting to true to suppress all logging output. +# logging.silent: false + +# Set the value of this setting to true to suppress all logging output other than error messages. 
+# logging.quiet: false + +# Set the value of this setting to true to log all events, including system usage information +# and all requests. +# logging.verbose: false + +# Set the interval in milliseconds to sample system and process performance +# metrics. Minimum is 100ms. Defaults to 5000. +# ops.interval: 5000 + +# Specifies locale to be used for all localizable strings, dates and number formats. +# Supported languages are the following: English - en , by default , Chinese - zh-CN . +# i18n.locale: "en" + +# Set the allowlist to check input graphite Url. Allowlist is the default check list. +# vis_type_timeline.graphiteAllowedUrls: ['https://www.hostedgraphite.com/UID/ACCESS_KEY/graphite'] + +# Set the blocklist to check input graphite Url. Blocklist is an IP list. +# Below is an example for reference +# vis_type_timeline.graphiteBlockedIPs: [ +# //Loopback +# '127.0.0.0/8', +# '::1/128', +# //Link-local Address for IPv6 +# 'fe80::/10', +# //Private IP address for IPv4 +# '10.0.0.0/8', +# '172.16.0.0/12', +# '192.168.0.0/16', +# //Unique local address (ULA) +# 'fc00::/7', +# //Reserved IP address +# '0.0.0.0/8', +# '100.64.0.0/10', +# '192.0.0.0/24', +# '192.0.2.0/24', +# '198.18.0.0/15', +# '192.88.99.0/24', +# '198.51.100.0/24', +# '203.0.113.0/24', +# '224.0.0.0/4', +# '240.0.0.0/4', +# '255.255.255.255/32', +# '::/128', +# '2001:db8::/32', +# 'ff00::/8', +# ] +# vis_type_timeline.graphiteBlockedIPs: [] + +# opensearchDashboards.branding: +# logo: +# defaultUrl: "" +# darkModeUrl: "" +# mark: +# defaultUrl: "" +# darkModeUrl: "" +# loadingLogo: +# defaultUrl: "" +# darkModeUrl: "" +# faviconUrl: "" +# applicationTitle: "" + +# Set the value of this setting to true to capture region blocked warnings and errors +# for your map rendering services. +# map.showRegionBlockedWarning: false% + +# Set the value of this setting to false to suppress search usage telemetry +# for reducing the load of OpenSearch cluster. +# data.search.usageTelemetry.enabled: false + +# 2.4 renames 'wizard.enabled: false' to 'vis_builder.enabled: false' +# Set the value of this setting to false to disable VisBuilder +# functionality in Visualization. +# vis_builder.enabled: false + +# 2.4 New Experimental Feature +# Set the value of this setting to true to enable the experimental multiple data source +# support feature. Use with caution. +# data_source.enabled: false +# Set the value of these settings to customize crypto materials to encryption saved credentials +# in data sources. 
+# data_source.encryption.wrappingKeyName: 'changeme' +# data_source.encryption.wrappingKeyNamespace: 'changeme' +# data_source.encryption.wrappingKey: [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] + +# 2.6 New ML Commons Dashboards Experimental Feature +# Set the value of this setting to true to enable the experimental ml commons dashboards +ml_commons_dashboards.enabled: true + +opensearch.hosts: ["https://localhost:9200"] +opensearch.ssl.verificationMode: none +opensearch.username: kibanaserver +opensearch.password: kibanaserver +opensearch.requestHeadersWhitelist: [authorization, securitytenant] + +opensearch_security.multitenancy.enabled: true +opensearch_security.multitenancy.tenants.preferred: [Private, Global] +opensearch_security.readonly_mode.roles: [kibana_read_only] +# Use this setting if you are running opensearch-dashboards without https +opensearch_security.cookie.secure: false +server.host: '0.0.0.0' diff --git a/comps/version.py b/comps/version.py index 3f8ffef29..488897838 100644 --- a/comps/version.py +++ b/comps/version.py @@ -3,4 +3,4 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -__version__ = "1.0" +__version__ = "1.1" diff --git a/tests/agent/test_agent_langchain_on_intel_hpu.sh b/tests/agent/test_agent_langchain_on_intel_hpu.sh index 14eb874ae..9ba25228a 100644 --- a/tests/agent/test_agent_langchain_on_intel_hpu.sh +++ b/tests/agent/test_agent_langchain_on_intel_hpu.sh @@ -51,12 +51,12 @@ function build_vllm_docker_images() { git clone https://github.com/HabanaAI/vllm-fork.git fi cd ./vllm-fork - docker build -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy if [ $? 
-ne 0 ]; then - echo "opea/vllm-hpu:comps failed" + echo "opea/vllm-gaudi:comps failed" exit 1 else - echo "opea/vllm-hpu:comps successful" + echo "opea/vllm-gaudi:comps successful" fi } @@ -88,7 +88,7 @@ function start_vllm_service() { #single card echo "start vllm gaudi service" echo "**************model is $model**************" - docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-hpu:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 + docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 sleep 5s echo "Waiting vllm gaudi ready" n=0 @@ -115,7 +115,7 @@ function start_vllm_auto_tool_choice_service() { #single card echo "start vllm gaudi service" echo "**************auto_tool model is $model**************" - docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-hpu:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser ${model_parser} + docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser ${model_parser} sleep 5s echo "Waiting vllm gaudi ready" n=0 diff --git a/tests/dataprep/test_dataprep_opensearch_langchain.sh b/tests/dataprep/test_dataprep_opensearch_langchain.sh new file mode 100755 index 000000000..57e5c95ff --- /dev/null +++ b/tests/dataprep/test_dataprep_opensearch_langchain.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +dataprep_service_port="6007" +OPENSEARCH_INITIAL_ADMIN_PASSWORD="StRoNgOpEa0)" + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build -t opea/dataprep-opensearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/opensearch/langchain/Dockerfile . 
+ if [ $? -ne 0 ]; then + echo "opea/dataprep-opensearch built fail" + exit 1 + else + echo "opea/dataprep-opensearch built successful" + fi +} + +function start_service() { + # Start OpenSearch vector db container + sudo sysctl -w vm.max_map_count=262144 + docker run -d \ + --name test-comps-dataprep-opensearch-langchain \ + -e cluster.name=opensearch-cluster \ + -e node.name=opensearch-vector-db \ + -e discovery.seed_hosts=opensearch-vector-db \ + -e cluster.initial_master_nodes=opensearch-vector-db \ + -e bootstrap.memory_lock=true \ + -e "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \ + -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \ + --ulimit memlock=-1:-1 \ + --ulimit nofile=65536:65536 \ + -p 9200:9200 \ + -p 9600:9600 \ + --ipc=host \ + opensearchproject/opensearch:latest + + # Start OpenSearch dataprep container + OPENSEARCH_URL="http://${ip_address}:9200" + echo $(OPENSEARCH_URL) + INDEX_NAME="file-index" + docker run -d \ + --name test-comps-dataprep-opensearch-langchain-server \ + -p 6007:6007 \ + -e https_proxy=$https_proxy \ + -e http_proxy=$http_proxy \ + -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \ + -e OPENSEARCH_URL=$OPENSEARCH_URL \ + -e INDEX_NAME=$INDEX_NAME \ + --ipc=host \ + opea/dataprep-opensearch:latest + + sleep 2m +} + +function validate_microservice() { + cd $LOG_PATH + + # test OpenSearch cluster health endpoint + curl -X GET "https://localhost:9200/_cluster/health?pretty" -ku admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD + + # test /v1/dataprep upload file + URL="http://${ip_address}:$dataprep_service_port/v1/dataprep" + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - upload - file" + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + + # test /v1/dataprep upload link + URL="http://${ip_address}:$dataprep_service_port/v1/dataprep" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - upload - link" + + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_upload_link.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + # test /v1/dataprep/get_file + URL="http://${ip_address}:$dataprep_service_port/v1/dataprep/get_file" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - get" + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + if [[ "$RESPONSE_BODY" -ne "null" ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_file.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + # test /v1/dataprep/delete_file + URL="http://${ip_address}:$dataprep_service_port/v1/dataprep/delete_file" + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "dataprep_file.txt"}' -H 'Content-Type: application/json' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + SERVICE_NAME="dataprep - del" + + # check response status + if [ "$HTTP_STATUS" -ne "404" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 404. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_del.log + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 404. Checking content..." + fi + # check response body + if [[ "$RESPONSE_BODY" != *'{"detail":"Single file deletion is not implemented yet"}'* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs test-comps-dataprep-opensearch-langchain-server >> ${LOG_PATH}/dataprep_del.log + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi +} + +function stop_service() { + cid=$(docker ps -aq --filter "name=test-comps-dataprep-opensearch-langchain*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + +} + +function main() { + stop_service + + build_docker_images + start_service + + validate_microservice + + stop_service + # echo y | docker system prune +} + +main diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh index 073ee5736..b1fd41e9a 100644 --- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh @@ -13,12 +13,12 @@ function build_docker_images() { git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork/ git checkout 3c39626 - docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g . + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . if [ $? -ne 0 ]; then - echo "opea/vllm-hpu built fail" + echo "opea/vllm-gaudi built fail" exit 1 else - echo "opea/vllm-hpu built successful" + echo "opea/vllm-gaudi built successful" fi ## Build OPEA microservice docker @@ -47,7 +47,7 @@ function start_service() { --cap-add=sys_nice \ --ipc=host \ -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ - opea/vllm-hpu:comps \ + opea/vllm-gaudi:comps \ --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 export vLLM_ENDPOINT="http://${ip_address}:${port_number}" diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh index 62626508a..3019d6c08 100644 --- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh @@ -13,12 +13,12 @@ function build_docker_images() { git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork/ git checkout 3c39626 - docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g . + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . if [ $? -ne 0 ]; then - echo "opea/vllm-hpu built fail" + echo "opea/vllm-gaudi built fail" exit 1 else - echo "opea/vllm-hpu built successful" + echo "opea/vllm-gaudi built successful" fi ## Build OPEA microservice docker @@ -47,7 +47,7 @@ function start_service() { --cap-add=sys_nice \ --ipc=host \ -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ - opea/vllm-hpu:comps \ + opea/vllm-gaudi:comps \ --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 export vLLM_ENDPOINT="http://${ip_address}:${port_number}" diff --git a/tests/retrievers/test_retrievers_opensearch_langchain.sh b/tests/retrievers/test_retrievers_opensearch_langchain.sh new file mode 100755 index 000000000..8657cc91e --- /dev/null +++ b/tests/retrievers/test_retrievers_opensearch_langchain.sh @@ -0,0 +1,114 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +retriever_port="7000" +OPENSEARCH_INITIAL_ADMIN_PASSWORD="StRoNgOpEa0)" + +function build_docker_images() { + cd $WORKPATH + docker build -t opea/retriever-opensearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/opensearch/langchain/Dockerfile . + if [ $? 
-ne 0 ]; then + echo "opea/retriever-opensearch built fail" + exit 1 + else + echo "opea/retriever-opensearch built successful" + fi +} + +function start_service() { + # Start OpenSearch vector db container + sudo sysctl -w vm.max_map_count=262144 + docker run -d \ + --name test-comps-retriever-opensearch \ + -e cluster.name=opensearch-cluster \ + -e node.name=opensearch-vector-db \ + -e discovery.seed_hosts=opensearch-vector-db \ + -e cluster.initial_master_nodes=opensearch-vector-db \ + -e bootstrap.memory_lock=true \ + -e "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" \ + -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \ + --ulimit memlock=-1:-1 \ + --ulimit nofile=65536:65536 \ + -p 9200:9200 \ + -p 9600:9600 \ + --ipc=host \ + opensearchproject/opensearch:latest + + # tei endpoint + tei_endpoint=6060 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-opensearch-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --ipc=host + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + # Start OpenSearch retriever container + OPENSEARCH_URL="http://${ip_address}:9200" + INDEX_NAME="file-index" + docker run -d \ + --name test-comps-retriever-opensearch-server \ + -p 7000:7000 \ + -e https_proxy=$https_proxy \ + -e http_proxy=$http_proxy \ + -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \ + -e OPENSEARCH_URL=$OPENSEARCH_URL \ + -e INDEX_NAME=$INDEX_NAME \ + -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} \ + --ipc=host \ + opea/retriever-opensearch:latest + + sleep 2m +} + +function validate_microservice() { + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + URL="http://${ip_address}:$retriever_port/v1/retrieval" + + test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' -k -u admin:$OPENSEARCH_INITIAL_ADMIN_PASSWORD "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content is as expected." + else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-opensearch-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-opensearch-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-retriever-opensearch-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-opensearch-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi +} + +function stop_service() { + cid=$(docker ps -aq --filter "name=test-comps-retriever-opensearch*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + +} + +function main() { + stop_service + + build_docker_images + start_service + + validate_microservice + + stop_service + # echo y | docker system prune +} + +main
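Beyond the shell-based tests above, a client can exercise the new retriever endpoint directly. The following minimal Python sketch is not part of the patch; it assumes the `requests` package is installed and that the retriever is listening on port 7000, as configured in the compose files and test scripts in this patch:

```python
# Minimal sketch: query the OpenSearch retriever microservice with a mock embedding,
# mirroring the curl examples in the retriever README above.
import random

import requests

# Mock 768-dimensional embedding (matches BAAI/bge-base-en-v1.5 output size).
embedding = [random.uniform(-1, 1) for _ in range(768)]

payload = {
    "text": "What is the revenue of Nike in 2023?",
    "embedding": embedding,
    "search_type": "similarity",
    "k": 4,
}

resp = requests.post("http://localhost:7000/v1/retrieval", json=payload, timeout=60)
resp.raise_for_status()

# The service returns a SearchedDoc-style payload; each retrieved doc carries its text.
for doc in resp.json().get("retrieved_docs", []):
    print(doc["text"][:80])
```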