
Commit

Merge pull request #100 from BharatSahAIyak/dev
Dev -> Main v0.4.5
sooraj1002 authored Jun 19, 2024
2 parents ead3717 + 6699129 commit f598e6c
Showing 59 changed files with 676 additions and 229 deletions.
113 changes: 69 additions & 44 deletions .github/workflows/test.yaml
@@ -4,56 +4,81 @@ on: [push, pull_request]

jobs:
test:

runs-on: ubuntu-latest

strategy:
matrix:
python-version: ['3.10']

steps:
- uses: actions/checkout@v4

- name: Install poetry
run: pip install poetry

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: 'poetry'

- run: poetry install
- run: poetry show --latest
- run: poetry run pytest --cov --cov-report xml

- name: Coveralls
uses: coverallsapp/github-action@v2
with:
github-token: ${{ secrets.COVERALLS_REPO_TOKEN }}
file: coverage.xml
flag-name: python-${{ matrix.python-version }}

test-mac:

runs-on: macos-latest
strategy:
matrix:
python-version: ['3.10']

steps:
- uses: actions/checkout@v4

- name: Install poetry
run: pip install poetry

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: 'poetry'

- run: poetry install
- run: poetry show --latest
- run: poetry run pytest --cov
- name: Install poetry
run: pip install poetry

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Cache poetry dependencies
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: |
${{ runner.os }}-poetry-
- run: poetry install
- run: poetry show --latest
- run: poetry run pytest --cov --cov-report xml

- name: Coveralls
uses: coverallsapp/github-action@v2
with:
github-token: ${{ secrets.COVERALLS_REPO_TOKEN }}
file: coverage.xml
flag-name: python-${{ matrix.python-version }}

# test-mac:
# runs-on: macos-latest
# strategy:
# matrix:
# python-version: ['3.10']

# steps:
# - uses: actions/checkout@v4

# - name: Install Poetry
# run: |
# curl -sSL https://install.python-poetry.org | python -
# echo "$HOME/.local/bin" >> $GITHUB_PATH

# - name: Set up Python ${{ matrix.python-version }}
# uses: actions/setup-python@v4
# with:
# python-version: ${{ matrix.python-version }}
# cache: 'poetry'

# - name: Install Faiss
# run: |
# brew install cmake libomp openblas
# git clone https://github.com/facebookresearch/faiss.git
# cd faiss
# cmake -B build -DFAISS_ENABLE_GPU=ON -DFAISS_ENABLE_PYTHON=ON -DCMAKE_BUILD_TYPE=Release
# make -C build -j
# cd build/faiss/python
# python setup.py install
# shell: bash

# - name: Install Poetry
# run: pip install poetry

# - name: Install Project Dependencies
# run: poetry install

# - name: Show Latest Poetry Packages
# run: poetry show --latest

# - name: Run Tests
# run: poetry run pytest --cov
2 changes: 2 additions & 0 deletions Dockerfile
@@ -6,6 +6,8 @@ RUN pip install poetry==1.6.0 && poetry config virtualenvs.create false

COPY pyproject.toml poetry.lock ./

RUN apt-get update -qq && apt-get install ffmpeg -y

# RUN poetry install

COPY . .
2 changes: 1 addition & 1 deletion autotune/settings.py
@@ -31,7 +31,7 @@
SECRET_KEY = os.getenv("DJANGO_SECRET_KEY")

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = os.getenv("DEBUG")
DEBUG = True

# ALLOWED_HOSTS = []

70 changes: 70 additions & 0 deletions docs/AUTOTUNE.md
@@ -0,0 +1,70 @@
# INTRODUCTION

## Entities in the system

### WORKFLOWS

Every action taken by a user in autotune is mapped to a workflow. Autotune has two broad functions which are housed in the same place: `Synthetic Data Generation` and `Model Training`. These are built as two separate functions, with interoperability provided by autotune.
Based on this, there are two types of workflows in autotune: `training` and `complete`. A `complete` workflow indicates that the entire process, from data generation to training, is performed in autotune. A `training` workflow can be used to perform a subset of the operations of a complete workflow.
In a training workflow, the user can provide a HuggingFace dataset for training or fine-tuning a model.

Autotune assumes that a given user will have only one workflow for training a given model type, such as `Text Classification`, `Named Entity Recognition`, etc.
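The distinction between the two workflow types can be sketched as follows. This is a hypothetical illustration, not autotune's actual model code: the class and field names (`Workflow`, `model_type`, `needs_data_generation`) are invented for clarity.

```python
from dataclasses import dataclass
from enum import Enum


class WorkflowType(Enum):
    TRAINING = "training"  # user supplies a HuggingFace dataset; only training runs
    COMPLETE = "complete"  # data generation and training both happen in autotune


@dataclass
class Workflow:
    user_id: str
    model_type: str  # e.g. "Text Classification"
    workflow_type: WorkflowType

    def needs_data_generation(self) -> bool:
        # Only complete workflows generate synthetic data before training.
        return self.workflow_type is WorkflowType.COMPLETE


wf = Workflow("user-1", "Text Classification", WorkflowType.TRAINING)
print(wf.needs_data_generation())  # False: a training workflow skips generation
```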

### CONFIG

Configs are re-usable components which provide metadata and other fixed aspects of a workflow.

The config items which can be stored are:

- temperature: OpenAI sampling temperature used in dataset generation.
- system_prompt: system prompt which is passed to the OpenAI API.
- user_prompt_template: a template with replaceable values according to workflow needs.
- schema_example: a sample JSON document which the generated data should follow. Dynamic models of any structure can be created, with validation performed by dynamically created pydantic models.
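A minimal sketch of how such a config could be used: the template is filled with workflow-specific values, and a pydantic model is derived dynamically from the schema example to validate generated records. The config values, field names, and model name here are illustrative, not autotune's actual defaults.

```python
from string import Template

from pydantic import create_model

# An illustrative config following the items listed above.
config = {
    "temperature": 0.7,
    "system_prompt": "You generate labelled training examples.",
    "user_prompt_template": "Generate $count examples about $topic.",
    "schema_example": {"text": "Example sentence", "label": "positive"},
}

# Fill the replaceable values in the user prompt template.
prompt = Template(config["user_prompt_template"]).substitute(count=5, topic="billing")

# Derive a pydantic model dynamically from the schema example: each key becomes
# a required field whose type is inferred from the example value.
fields = {key: (type(value), ...) for key, value in config["schema_example"].items()}
GeneratedRecord = create_model("GeneratedRecord", **fields)

# A generated record is validated against the dynamic model.
record = GeneratedRecord(text="Refund processed", label="positive")
print(prompt)
```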

### TASKS

### TRAINING

## Development Journey

## Models Supported

- Text Classification
- Colbert training
- Force Alignment

# SETUP

## API specifications

There are two versions of the APIs, with the same core functionality across both.

### POST /v1/workflow/config

- REQUEST:

- RESPONSE:

### POST /v1/workflow/create

- REQUEST:

- RESPONSE:

### POST /v1/workflow/iterate/<UUID>

- REQUEST:

- RESPONSE:

### POST /v1/workflow/generate/<UUID>

- REQUEST:

- RESPONSE:

### POST /v1/workflow/status/<WORKFLOW_ID>

- REQUEST:

- RESPONSE:
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion 01old/tasks/train.py → old/tasks/train.py
2 changes: 1 addition & 1 deletion 01old/tasks/train.py → old/tasks/train.py
@@ -6,7 +6,7 @@
from datasets import load_dataset
from huggingface_hub import HfApi, login

from utils import CeleryProgressCallback, get_task_class
from old.utils import CeleryProgressCallback, get_task_class


def train_model(celery, req, api_key):
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -6,8 +6,8 @@

import pytest

from models import GenerationAndCommitRequest
from tasks.data_fetcher import DataFetcher
from old.models import GenerationAndCommitRequest
from old.tasks.data_fetcher import DataFetcher

from .fixtures import REDIS_DATA

@@ -78,7 +78,7 @@ async def test_initialization_from_redis():

# We only care about the data key here
mock_redis.hgetall = AsyncMock(return_value={"data": "[]"})
with patch("utils.get_data", mock_get_data):
with patch("old.utils.get_data", mock_get_data):
fetcher = DataFetcher(
GENERATION_AND_COMMIT_REQUEST, "openai_key", mock_redis, "task_id"
)
@@ -103,7 +103,7 @@ async def test_fetch_and_update():
mock_redis.hset = AsyncMock()
mock_redis.hgetall = AsyncMock(return_value={})

with patch("utils.get_data", mock_get_data):
with patch("old.utils.get_data", mock_get_data):
fetcher = DataFetcher(
GENERATION_AND_COMMIT_REQUEST, "openai_key", mock_redis, "task_id"
)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
51 changes: 38 additions & 13 deletions workflow/align_tasks.py
@@ -10,27 +10,52 @@
from django_pandas.io import read_frame
from huggingface_hub import CommitOperationAdd, HfApi, login
from transformers import TrainerCallback
from celery.utils.log import get_task_logger

from workflow.models import Task
from workflow.force_alignment.alignment import ForceAligner

logger=get_task_logger(__name__)


@shared_task(bind=True, max_retries=settings.CELERY_MAX_RETRIES, retry_backoff=True)
def align_task(self,req_data):

logger.info('Starting align_task with request_data: %s', req_data)

task_id=self.request.id
task=Task.objects.get(id=task_id)
task.status="ALIGNING"
try:
task=Task.objects.get(id=task_id)
task.status="ALIGNING"
task.save()

logger.info('Task %s status set to ALIGNING', task_id)

dataset=req_data["dataset"]
if "time_duration" in req_data:
time_duration=req_data["time_duration"]
else:
time_duration=None

api_key=settings.HUGGING_FACE_TOKEN
alignment_object=ForceAligner()

logger.info('Aligning dataset')
alignment_object.align_dataset(dataset,alignment_duration=time_duration)

logger.info('Pushing aligned audios to hugging-face at path: %s',req_data["save_path"])

task.status="PUSHING"
task.save()
alignment_object.push_to_hub(req_data["save_path"],api_key)

logger.info('Task %s status set to PUSHING',task_id)

except Exception as e:
logger.error('An error occurred: %s', str(e))

task.status='COMPLETE'
task.save()
dataset=req_data["dataset"]
if "time_duration" in req_data:
time_duration=req_data["time_duration"]
else:
time_duration=None

api_key=settings.HUGGING_FACE_TOKEN
alignment_object=ForceAligner()
alignment_object.align_dataset(dataset,alignment_duration=time_duration)
alignment_object.push_to_hub(req_data["save_path"],api_key)
task.status="PUSHING"
task.save()


3 changes: 3 additions & 0 deletions workflow/models.py
@@ -20,6 +20,9 @@ def default_split():


LLM_MODELS = [
"gpt-4-turbo",
"gpt-4-turbo-preview",
"gpt-4o",
"gpt-4-0125-preview",
"gpt-4-1106-preview",
"gpt-4-vision-preview",
