From af3493539cfbfa2b1455de6124290137cce6f801 Mon Sep 17 00:00:00 2001 From: Seungwoo hong <1100974+hongsw@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:39:57 +0900 Subject: [PATCH] Feature/hongsw/671 dockerfile Add Dockerfile and Docker configuration for AutoRAG production environment (#763) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ✨ feat: Add Dockerfile for setting up Python environment and running application tests. * 🔧 chore: update .gitignore to exclude .DS_Store file. * 🔧 chore: organize Dockerfile into separate stages for base, test, and production operations, improving readability and maintainability * 📝 docs: Add AutoRAG Docker guide to README.md * 📝 docs: add instructions for running AutoRAG from Dockerfile * ✨ feat: Introduce .dockerignore and docker-compose.yml Added .dockerignore file with a comprehensive list of patterns to ignore in Docker builds, covering compiled files, various packages, distributions, and tools. Also included a docker-compose.yml file defining services for the autorag application, specifying build context, volumes, environment variables like HF_HOME and OPENAI_API_KEY, and a command for evaluation with specific paths and configurations. * docs(source/install.md): update instructions for running AutoRAG with Docker, including building a production-ready Docker image and running the container with required configurations and data paths. Add information on mounting directories, setting `HF_HOME` variable, using Docker Compose, manual access for debugging, and additional notes for ensuring directory presence and managing dynamic paths in CI/CD pipelines. * 🚑 fix: Update Docker Compose command to use 'validate' instead of 'evaluate' * ✨ feat: Add configuration for Tutorial Step 1 in AutoRAG project - Added a configuration file for Tutorial Step 1 in the `projects/tutorial_1` directory.
This configuration includes node lines for retrieving and post-retrieval activities, defining various nodes with strategies, metrics, and modules, such as vectordb, bm25, hybrid_rrf, prompt_maker, and generator. Specific details on embeddings, weights, prompts, and models are provided. This commit enhances the AutoRAG project by introducing a detailed configuration for Tutorial Step 1, crucial for accurate question answering and passage generation tasks. * 🔧 chore: Organize Dockerfile instructions and update README with evaluation steps - Rearranged Dockerfile to optimize the order of commands. - Added a new step to copy requirements.txt before installing Python dependencies. - Added another step to install project-specific requirements and packages. - Updated README with detailed instructions for running evaluation, validation, dashboard, and web applications. - Included notes about costs and potential time consumption during the evaluation step. * 🚑 fix: Update docker command in README.md for autorag project to include trial path * 🚑 fix: Update docker command in README.md for autorag project to include trial path * add new line at projects/tutorial_1.yaml * docs(source/install.md): update autorag image reference to autoraghq/autorag:all for evaluation process * delete docker-compose description at README.md --------- Co-authored-by: jeffrey Co-authored-by: Jeffrey (Dongkyu) Kim --- .dockerignore | 164 ++++++++++++++++++++++++++++++++ .gitignore | 3 + Dockerfile | 45 +++++++++ README.md | 69 ++++++++++++++ docker-compose.yml | 15 +++ docs/source/install.md | 72 ++++++++++++++ projects/tutorial_1/config.yaml | 37 +++++++ 7 files changed, 405 insertions(+) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 docker-compose.yml create mode 100644 projects/tutorial_1/config.yaml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..6ec7d4887 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,164 @@ +#
Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ +pytest.ini +.DS_Store +projects/tutorial_1 +!projects/tutorial_1/config.yaml diff --git a/.gitignore b/.gitignore index ece4997f0..6ec7d4887 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,6 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
.idea/ pytest.ini +.DS_Store +projects/tutorial_1 +!projects/tutorial_1/config.yaml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..59270409f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,45 @@ +# Base stage: Install dependencies +FROM python:3.10-slim AS base + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y \ + build-essential \ + gcc \ + libssl-dev \ + poppler-utils \ + tesseract-ocr \ + tesseract-ocr-eng \ + tesseract-ocr-kor && \ + rm -rf /var/lib/apt/lists/* + +# Set work directory +WORKDIR /usr/src/app + +# Install Python dependencies +RUN pip install --upgrade pip setuptools setuptools-scm +COPY requirements.txt /usr/src/app/requirements.txt + +RUN pip install -r requirements.txt + +# Copy project files +COPY . /usr/src/app +RUN pip install -e ./ + +# Test stage: Run tests if CI=true +FROM base AS test + +# Install testing dependencies +RUN pip install pytest pytest-xdist + +# Run tests if CI is set to true +RUN pytest -o log_cli=true --log-cli-level=INFO -n auto tests + +# Production stage: Create final image for production +FROM base AS production + +COPY projects /usr/src/app/projects + +# Set the entrypoint for the production application +ENTRYPOINT ["python", "-m", "autorag.cli"] +# ENTRYPOINT ["bash"] diff --git a/README.md b/README.md index cbcf7754f..88b61edb5 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ You can see on [YouTube](https://youtu.be/2ojK8xjyXAU?feature=shared) # Index - [Quick Install](#quick-install) +- [🐳 AutoRAG Docker Guide](#-autorag-docker-guide) - [Data Creation](#data-creation) - [Parsing](#1-parsing) - [Chunking](#2-chunking) @@ -187,6 +188,74 @@ initial_qa.to_parquet('./qa.parquet', './corpus.parquet') ![rag_opt_gif](https://github.com/user-attachments/assets/55bd09cd-8420-4f6d-bc7d-0a66af288317) +## 🐳 AutoRAG Docker Guide + +This guide provides a quick overview of building and running the AutoRAG Docker container for production, with instructions on setting up 
the environment for evaluation using your configuration and data paths. + +### 🚀 Running AutoRAG with Docker + +#### 1. Download the dataset for [Tutorial Step 1](https://colab.research.google.com/drive/19OEQXO_pHN6gnn2WdfPd4hjnS-4GurVd?usp=sharing) +```bash +python sample_dataset/eli5/load_eli5_dataset.py --save_path projects/tutorial_1 +``` + +#### 2. Run `evaluate` +> **Note**: This step may take a long time to complete and involves OpenAI API calls, which may cost approximately $0.30. + +```bash +docker run --rm -it \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -v $(pwd)/projects:/usr/src/app/projects \ + -e OPENAI_API_KEY=${OPENAI_API_KEY} \ + autoraghq/autorag:all evaluate \ + --config /usr/src/app/projects/tutorial_1/config.yaml \ + --qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet \ + --corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet \ + --project_dir /usr/src/app/projects/tutorial_1/ +``` + + +#### 3. Run `validate` +```bash +docker run --rm -it \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -v $(pwd)/projects:/usr/src/app/projects \ + -e OPENAI_API_KEY=${OPENAI_API_KEY} \ + autoraghq/autorag:all validate \ + --config /usr/src/app/projects/tutorial_1/config.yaml \ + --qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet \ + --corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet +``` + + +#### 4. Run `dashboard` +```bash +docker run --rm -it \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -v $(pwd)/projects:/usr/src/app/projects \ + -e OPENAI_API_KEY=${OPENAI_API_KEY} \ + -p 8502:8502 \ + autoraghq/autorag:all dashboard \ + --trial_dir /usr/src/app/projects/tutorial_1/0 +``` + + +#### 5. 
Run `run_web` +```bash +docker run --rm -it \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -v $(pwd)/projects:/usr/src/app/projects \ + -e OPENAI_API_KEY=${OPENAI_API_KEY} \ + -p 8501:8501 \ + autoraghq/autorag:all run_web --trial_path ./projects/tutorial_1/0 +``` + +#### Key Points: +- **`-v ~/.cache/huggingface:/root/.cache/huggingface`**: Mounts the host machine’s Hugging Face cache to `/root/.cache/huggingface` in the container, enabling access to pre-downloaded models. +- **`-e OPENAI_API_KEY=${OPENAI_API_KEY}`**: Passes the `OPENAI_API_KEY` from your host environment into the container. + +For more detailed instructions, refer to the [Docker Installation Guide](./docs/source/install.md#1-build-the-docker-image). + ## Quick Start ### 1. Set YAML File diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 000000000..e60ecc81f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,15 @@ +services: + validate: + image: autoraghq/autorag:all + command: > + validate + --config /usr/src/app/projects/tutorial_1/config.yaml + --qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet + --corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet + environment: + - OPENAI_API_KEY=${OPENAI_API_KEY} + volumes: + - ~/.cache/huggingface:/root/.cache/huggingface + - ./projects:/usr/src/app/projects + stdin_open: true + tty: true diff --git a/docs/source/install.md b/docs/source/install.md index 55ebf0715..573e55029 100644 --- a/docs/source/install.md +++ b/docs/source/install.md @@ -138,3 +138,75 @@ python -m pytest -n auto After this, please check out our documentation for contributors. We are writing this documentation for contributors, so please wait for a while. + + +## Run AutoRAG with 🐳 Docker + +To run AutoRAG using Docker, follow these steps: + +### 1. Build the Docker Image + +```bash +docker build --target production -t autorag:prod . 
+``` + +This command will build the production-ready Docker image, using only the `production` stage defined in the `Dockerfile`. + +### 2. Run the Docker Container + +Run the container with the following command: + +```bash +docker run --rm -it \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -v $(pwd)/sample_config:/usr/src/app/sample_config \ + -v $(pwd)/projects:/usr/src/app/projects \ + autoraghq/autorag:all evaluate \ + --config /usr/src/app/sample_config/rag/simple/simple_openai.yaml \ + --qa_data_path /usr/src/app/projects/test01/qa_validation.parquet \ + --corpus_data_path /usr/src/app/projects/test01/corpus.parquet \ + --project_dir /usr/src/app/projects/test01 +``` + +#### Explanation: +- **`-v ~/.cache/huggingface:/root/.cache/huggingface`**: Mounts the host's Hugging Face cache to the container, allowing it to access pre-downloaded models. +- **`-v $(pwd)/sample_config:/usr/src/app/sample_config`**: Mounts the local `sample_config` directory to the container. +- **`-v $(pwd)/projects:/usr/src/app/projects`**: Mounts the local `projects` directory to the container. +- **`autoraghq/autorag:all evaluate`**: Executes the `evaluate` command inside the `autoraghq/autorag:all` container. +- **`--config`, `--qa_data_path`, `--corpus_data_path`, `--project_dir`**: Specifies paths to the configuration file, QA dataset, corpus data, and project directory. + +### 3. 
Using a Custom Cache Directory with `HF_HOME` + +Alternatively, you can mount the Hugging Face cache to a custom location inside the container and set the `HF_HOME` environment variable: + +```bash +docker run --rm -it \ + -v ~/.cache/huggingface:/cache/huggingface \ + -v $(pwd)/sample_config:/usr/src/app/sample_config \ + -v $(pwd)/projects:/usr/src/app/projects \ + -e HF_HOME=/cache/huggingface \ + autoraghq/autorag:all evaluate \ + --config /usr/src/app/sample_config/rag/simple/simple_openai.yaml \ + --qa_data_path /usr/src/app/projects/test01/qa_validation.parquet \ + --corpus_data_path /usr/src/app/projects/test01/corpus.parquet \ + --project_dir /usr/src/app/projects/test01 +``` + +#### Explanation: +- **`-v ~/.cache/huggingface:/cache/huggingface`**: Mounts the host's Hugging Face cache to `/cache/huggingface` inside the container. +- **`-e HF_HOME=/cache/huggingface`**: Sets the `HF_HOME` environment variable to point to the mounted cache directory. + +### 4. Debugging and Manual Access + +To manually access the container for debugging or testing, start a Bash shell: + +```bash +docker run --rm -it --entrypoint /bin/bash autoraghq/autorag:all +``` + +This command allows you to explore the container’s filesystem, run commands manually, or inspect logs for troubleshooting. + +## Additional Notes + +- Ensure that the necessary directories (`sample_config` and `projects`) are present in the host system. +- If running in a CI/CD pipeline, consider using environment variables or `.env` files to manage API keys and paths dynamically. 
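As a sketch of the `.env` approach mentioned in the notes above (the file name and variable values here are illustrative, not part of this patch), Docker's built-in `--env-file` flag can load the variables at `docker run` time instead of repeating `-e` flags:

```bash
# .env (illustrative -- keep this file out of version control)
OPENAI_API_KEY=your-openai-key
HF_HOME=/cache/huggingface

# Pass every variable defined in .env into the container:
docker run --rm -it \
  --env-file .env \
  -v ~/.cache/huggingface:/cache/huggingface \
  -v $(pwd)/projects:/usr/src/app/projects \
  autoraghq/autorag:all validate \
  --config /usr/src/app/projects/tutorial_1/config.yaml \
  --qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet \
  --corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet
```

This keeps secrets out of shell history and lets CI/CD pipelines swap in a different `.env` per environment.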
diff --git a/projects/tutorial_1/config.yaml b/projects/tutorial_1/config.yaml new file mode 100644 index 000000000..44dd26c2e --- /dev/null +++ b/projects/tutorial_1/config.yaml @@ -0,0 +1,37 @@ + +node_lines: +- node_line_name: retrieve_node_line + nodes: + - node_type: retrieval + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_ndcg, retrieval_mrr] + top_k: 3 + modules: + - module_type: vectordb + embedding_model: openai + - module_type: bm25 + - module_type: hybrid_rrf + weight_range: (4,80) +- node_line_name: post_retrieve_node_line + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + modules: + - module_type: fstring + prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : " + - node_type: generator + strategy: + metrics: + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + modules: + - module_type: openai_llm + llm: gpt-4o-mini + batch: 16 # If you have low tier at OpenAI, decrease this.
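Because a config like the one above drives a potentially long and costly evaluation, it can be worth sanity-checking its node structure before launching a run. A minimal plain-Python sketch follows; the `config` dict is an abridged, hand-written stand-in for the structure a YAML loader would produce from `projects/tutorial_1/config.yaml`, not the full file:

```python
# Abridged stand-in for the parsed tutorial config (illustrative only).
config = {
    "node_lines": [
        {
            "node_line_name": "retrieve_node_line",
            "nodes": [
                {
                    "node_type": "retrieval",
                    "strategy": {"metrics": ["retrieval_f1", "retrieval_recall"], "top_k": 3},
                    "modules": [
                        {"module_type": "vectordb", "embedding_model": "openai"},
                        {"module_type": "bm25"},
                    ],
                }
            ],
        },
        {
            "node_line_name": "post_retrieve_node_line",
            "nodes": [{"node_type": "prompt_maker"}, {"node_type": "generator"}],
        },
    ]
}

# Flatten out every node_type declared across all node lines.
node_types = [
    node["node_type"]
    for line in config["node_lines"]
    for node in line["nodes"]
]
print(node_types)  # ['retrieval', 'prompt_maker', 'generator']
```

A check like this catches a misspelled `node_type` or a missing node line before any OpenAI API calls are made.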