Merge pull request #291 from truefoundry/main

Main -> Release
truefoundry · Jul 30, 2024 · 2519c51 · 2519c51
2 parents 22cacb0 + 72d8e9d
commit 2519c51
Show file tree

Hide file tree

Showing 24 changed files with 313 additions and 654 deletions.
diff --git a/.gitignore b/.gitignore
@@ -26,6 +26,7 @@ qdrant_storage/
 .truefoundry
 infinity/
 volumes/
+user_data/
 pgdata/
 *.bak
 models_config.yaml
diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -9,22 +9,13 @@ RUN python3 -m venv /virtualenvs/venv/
 # Copy requirements.txt
 COPY backend/requirements.txt /tmp/requirements.txt
 COPY backend/vectordb.requirements.txt /tmp/vectordb.requirements.txt
-COPY backend/parsers.requirements.txt /tmp/parsers.requirements.txt
 
 # Install Python packages
 RUN python3 -m pip install -U pip setuptools wheel && \
     python3 -m pip install --use-pep517 --no-cache-dir -r /tmp/requirements.txt
 
 ENV LD_LIBRARY_PATH=/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cublas/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cudnn/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cufft/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/curand/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cusolver/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/cusparse/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/nccl/lib:/virtualenvs/venv/lib/python3.11/site-packages/nvidia/nvtx/lib:/virtualenvs/venv/lib/python3.11/site-packages/torch/lib/:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
 
-ARG ADD_PYTORCH=0
-# Install torch
-RUN if [ "${ADD_PYTORCH}" = "1" ]; then pip install "torch==2.2.2+cu121" --extra-index-url https://download.pytorch.org/whl/cu121; fi
-
-# Install Parser packages
-ARG ADD_PARSER=0
-RUN if [ "${ADD_PARSER}" = "1" ]; then python3 -m pip install --use-pep517 --no-cache-dir -r /tmp/parsers.requirements.txt; fi
-
 # Install VectorDB packages
 ARG ADD_VECTORDB=0
 RUN if [ "${ADD_VECTORDB}" = "1" ]; then python3 -m pip install --use-pep517 --no-cache-dir -r /tmp/vectordb.requirements.txt; fi
@@ -40,6 +31,12 @@ ENV MODELS_CONFIG_PATH=${MODELS_CONFIG_PATH}
 ARG INFINITY_API_KEY
 ENV INFINITY_API_KEY=${INFINITY_API_KEY}
 
+ARG UNSTRUCTURED_IO_URL
+ENV UNSTRUCTURED_IO_URL=${UNSTRUCTURED_IO_URL}
+
+ARG UNSTRUCTURED_IO_API_KEY
+ENV UNSTRUCTURED_IO_API_KEY=${UNSTRUCTURED_IO_API_KEY}
+
 # Copy the project files
 COPY . /app
 

diff --git a/backend/indexer/indexer.py b/backend/indexer/indexer.py
@@ -234,7 +234,6 @@ async def ingest_data_points(
             file_extension=loaded_data_point.file_extension,
             parsers_map=inputs.parser_config.parser_map,
             max_chunk_size=inputs.parser_config.chunk_size,
-            chunk_overlap=inputs.parser_config.chunk_overlap,
             additional_config=inputs.parser_config.additional_config,
         )
         if parser is None:
@@ -370,10 +369,8 @@ async def ingest_data(request: IngestDataToCollectionDto):
                 )
                 created_data_ingestion_run.status = DataIngestionRunStatus.COMPLETED
             else:
-                if not settings.JOB_FQN or not settings.JOB_COMPONENT_NAME:
-                    logger.error(
-                        "Job FQN and Job Component Name are required to trigger the job"
-                    )
+                if not settings.JOB_FQN:
+                    logger.error("Job FQN is required to trigger the job")
                     raise HTTPException(
                         status_code=500,
                         detail="Job FQN and Job Component Name are required to trigger the job",
@@ -391,7 +388,6 @@ async def ingest_data(request: IngestDataToCollectionDto):
                 )
                 trigger_job(
                     application_fqn=settings.JOB_FQN,
-                    component_name=settings.JOB_COMPONENT_NAME,
                     params={
                         "collection_name": collection.name,
                         "data_source_fqn": associated_data_source.data_source_fqn,

diff --git a/backend/modules/metadata_store/prismastore.py b/backend/modules/metadata_store/prismastore.py
@@ -379,7 +379,7 @@ async def adelete_data_source(self, data_source_fqn: str):
                 data_source_uri = data_source.uri
                 # data_source_uri is of the form: `/app/user_data/folder_name`
                 folder_name = data_source_uri.split("/")[-1]
-                folder_path = os.path.join("/app/user_data", folder_name)
+                folder_path = os.path.join(settings.LOCAL_DATA_DIRECTORY, folder_name)
                 logger.info(
                     f"Deleting folder: {folder_path}, path exists: {os.path.exists(folder_path)}"
                 )

diff --git a/backend/modules/parsers/__init__.py b/backend/modules/parsers/__init__.py
@@ -1,15 +1,7 @@
-from backend.modules.parsers.codeparser import CodeParser
-from backend.modules.parsers.markdownparser import MarkdownParser
 from backend.modules.parsers.multimodalparser import MultiModalParser
 from backend.modules.parsers.parser import register_parser
-from backend.modules.parsers.pdfparser_fast import PdfParserUsingPyMuPDF
-from backend.modules.parsers.tablepdfparser import PdfTableParser
-from backend.modules.parsers.textparser import TextParser
+from backend.modules.parsers.unstructured_io import UnstructuredIoParser
 
 # The order of registry defines the order of precedence
-register_parser("MarkdownParser", MarkdownParser)
-register_parser("TextParser", TextParser)
-register_parser("PdfParserFast", PdfParserUsingPyMuPDF)
+register_parser("UnstructuredIoParser", UnstructuredIoParser)
 register_parser("MultiModalParser", MultiModalParser)
-register_parser("CodeParser", CodeParser)
-register_parser("PdfTableParser", PdfTableParser)
diff --git a/backend/modules/parsers/codeparser.py b/backend/modules/parsers/codeparser.py
diff --git a/backend/modules/parsers/markdownparser.py b/backend/modules/parsers/markdownparser.py