alphafind_training local package, set up automated tests #10

Merged
merged 4 commits on Oct 2, 2024
29 changes: 25 additions & 4 deletions .github/workflows/ci.yml
@@ -4,10 +4,7 @@ on:
push:
# Sequence of patterns matched against refs/heads
branches:
# Push events on main and dev branch
- main
- ui-tests
# Sequence of patterns matched against refs/tags
- '**'
tags: '*'

jobs:
@@ -47,3 +44,27 @@ jobs:

- name: Docker build training, backend, and frontend
run: ./run.sh

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r training/requirements.txt
pip install pytest pytest-cov
cd training && pip install -e .

- name: Run tests with pytest and generate coverage
run: |
cd /home/runner/work/AlphaFind/AlphaFind/training
export PYTHONPATH=$PYTHONPATH:$(pwd)
pytest -v --cov=. --cov-report=xml --cov-report=term-missing

- name: Upload coverage report
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: training/coverage.xml
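
The workflow now installs the training requirements, installs `alphafind_training` as an editable package, and runs `pytest` with coverage from the `training` directory. The test files added by this PR are not visible in this excerpt; the snippet below is only a minimal sketch of the kind of test such a setup would collect (the path `training/tests/test_package.py` is assumed).

```python
# Hypothetical smoke test, e.g. training/tests/test_package.py.
# The real tests added by this PR are not shown in this excerpt.
from alphafind_training import cluster, utils


def test_create_kmeans_is_importable():
    # `pip install -e .` in the CI job makes the package importable
    # without relying on PYTHONPATH hacks.
    assert callable(cluster.create_kmeans)


def test_utils_exposes_helpers_used_by_cluster():
    # These helpers are imported by alphafind_training/cluster.py in this PR.
    for name in ("file_exists", "dir_exists", "load_dataset", "load_pickle"):
        assert hasattr(utils, name)
```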
5 changes: 4 additions & 1 deletion training/.gitignore
@@ -10,4 +10,7 @@ kubectl
data/bucket-*
data/embedding.pkl
data/kmeans.idx
models/
models/
.coverage
coverage.xml
*.egg-info
2 changes: 2 additions & 0 deletions training/Dockerfile
@@ -17,5 +17,7 @@ RUN pip install -r /var/requirements.txt && rm -rf ~/.cache

COPY . /training
WORKDIR /training
RUN pip install -e .
RUN chmod +x /training/run.sh

CMD [ "/bin/bash", "/training/run.sh" ]
Empty file.
63 changes: 63 additions & 0 deletions training/alphafind_training/cluster.py
@@ -0,0 +1,63 @@
import logging

import numpy as np
import torch
from alphafind_training.clustering import run_clustering
from alphafind_training.utils import dir_exists, file_exists, load_dataset, load_pickle

LOG = logging.getLogger(__name__)

torch.manual_seed(2023)
np.random.seed(2023)


def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iterations=10):
"""
Function for clustering the embeddings using K-Means.

Args:
input_path (str): Path to the embeddings pickle file or directory of pickle files
output_path (str): Path to the output K-Means file
n_clusters (int): Number of clusters (default: 2)
sample_size (int): Size of the sample (default: 108)
n_iterations (int): Number of k-means iterations (default: 10)

Returns:
None
"""
assert file_exists(input_path) or dir_exists(input_path), 'Input file or directory does not exist'

LOG.info('Loading embeddings')
if dir_exists(input_path) and not file_exists(input_path):
embeddings, _ = load_dataset(input_path, sample_size, shuffle=True)
else:
embeddings = load_pickle(input_path)

assert embeddings.shape[0] >= sample_size, 'Sample size must not exceed the number of embeddings'

LOG.info(f'Loaded embeddings of shape: {embeddings.shape}')
LOG.info(f'Running clustering, result k-means object will be saved to: {output_path}')

run_clustering(
output_path,
embeddings.values,
sample_size,
n_clusters,
n_iterations,
)


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(description="Cluster embeddings using K-Means")
parser.add_argument(
'--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files'
)
parser.add_argument('--output', type=str, required=True, help='Path to the output K-Means file')
parser.add_argument('--n-clusters', type=int, default=2, help='Number of clusters')
parser.add_argument('--sample-size', type=int, default=108, help='Size of the sample')
parser.add_argument('--n-iterations', type=int, default=10, help='Number of k-means iterations')
args = parser.parse_args()

create_kmeans(args.input, args.output, args.n_clusters, args.sample_size, args.n_iterations)
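
`create_kmeans` can also be called directly from Python rather than through the CLI above. A minimal usage sketch, assuming the pickle and index paths that appear elsewhere in this PR (`data/embedding.pkl`, `data/kmeans.idx` in `training/.gitignore`):

```python
# Usage sketch for create_kmeans; the paths mirror entries seen in
# training/.gitignore and are illustrative, not mandated by the API.
import logging

from alphafind_training.cluster import create_kmeans

logging.basicConfig(level=logging.INFO)

# Fit K-Means on a sample of 108 embeddings and write the index to disk.
create_kmeans(
    input_path='./data/embedding.pkl',  # pickle file or directory of pickles
    output_path='./data/kmeans.idx',
    n_clusters=2,
    sample_size=108,
    n_iterations=10,
)
```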
@@ -2,8 +2,7 @@

import faiss
import numpy as np

from utils import measure_memory_usage, measure_runtime
from alphafind_training.utils import measure_memory_usage, measure_runtime

np.random.seed(2023)

@@ -7,9 +7,8 @@
import numpy as np
import pandas as pd
import torch
from model import LIDatasetPredict, load_model
from tqdm import tqdm
from utils import (
from alphafind_training.model import LIDatasetPredict, load_model
from alphafind_training.utils import (
create_dir,
dir_exists,
file_exists,
@@ -19,6 +18,7 @@
save_pickle,
save_predictions,
)
from tqdm import tqdm

torch.manual_seed(2023)
np.random.seed(2023)
@@ -114,6 +114,64 @@ def assign_proteins_to_buckets(config):
LOG.info(f'Saved predictions per class in `{config.output_predictions}`')


def create_buckets(
output_chunks, output_predictions, input_path, model_dir_path, output_bucket_path, chunk_size=1000000
):
"""
Create buckets for protein IDs based on model predictions.

Args:
output_chunks (str): Path to a folder where temporary (per class + per slice) predictions will be saved.
output_predictions (str): Path to a folder where the per bucket objects will be saved.
input_path (str): Path to the dataset.
model_dir_path (str): Path to the model.
output_bucket_path (str): Path to output bucket data.
chunk_size (int): Chunk size for processing data.

Returns:
None
"""
assert output_chunks is not None
assert output_predictions is not None

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(model_dir_path)

if not any([f.endswith('.pt') for f in listdir(model_dir_path)]):
model_dir_path = load_newest_file_in_dir(model_dir_path)

args = argparse.Namespace(
output_chunks=output_chunks,
output_predictions=output_predictions,
input=input_path,
model_dir_path=model_dir_path,
output_bucket_path=output_bucket_path,
chunk_size=chunk_size,
)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(input_path)

create_dir(output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{output_bucket_path}`')
for f in tqdm(listdir(output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{output_predictions}/{f}'))]
save_pickle(f'{output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{output_bucket_path}`')

LOG.info(f'Removing temporary files in `{output_chunks}`, `{output_predictions}`')
remove_dir(output_chunks)
remove_dir(output_predictions)

LOG.info('Done')


'''
The script loads a model and assigns protein IDs to buckets based on the model's predictions.

@@ -134,73 +192,34 @@ def assign_proteins_to_buckets(config):
--model-dir-path "./data/models/"
'''
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create buckets for protein IDs based on model predictions")
parser.add_argument(
'--output-chunks',
type=str,
default=('./data/chunks'),
help=(
'Path to a folder where temporary (per class + per slice) '
'predictions will be saved (without the / at the end)'
),
default='./data/chunks',
help='Path to a folder where temporary (per class + per slice) predictions will be saved (without the / at the end)',
)
parser.add_argument(
'--output-predictions',
type=str,
default=('./data/overall'),
default='./data/overall',
help='Path to a folder where the per bucket objects will be saved (without the / at the end)',
)
parser.add_argument('--input', type=str, default='./data/embeddings', help='Path to the dataset')
parser.add_argument('--model-dir-path', type=str, default='./data/models/', help='Path to the model')
parser.add_argument(
'--input',
type=str,
default='./data/embeddings',
help='Path to the dataset',
)
parser.add_argument(
'--model-dir-path',
type=str,
default=('./data/models/'),
help='Path to the model',
)
parser.add_argument(
'--output-bucket-path',
type=str,
default='./data/bucket-data/',
help='path to output bucket data',
'--output-bucket-path', type=str, default='./data/bucket-data/', help='path to output bucket data'
)
parser.add_argument('--chunk-size', type=int, default=1000000, help='Chunk size')

args = parser.parse_args()

assert args.output_chunks is not None
assert args.output_predictions is not None

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(args.model_dir_path)

if not any([f.endswith('.pt') for f in listdir(args.model_dir_path)]):
args.model_dir_path = load_newest_file_in_dir(args.model_dir_path)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(args.input)

create_dir(args.output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{args.output_bucket_path}`')
for f in tqdm(listdir(args.output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{args.output_predictions}/{f}'))]
save_pickle(f'{args.output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{args.output_bucket_path}`')

LOG.info(f'Removing temporary files in `{args.output_chunks}`, `{args.output_predictions}`')
remove_dir(args.output_chunks)
remove_dir(args.output_predictions)

LOG.info('Done')
create_buckets(
args.output_chunks,
args.output_predictions,
args.input,
args.model_dir_path,
args.output_bucket_path,
args.chunk_size,
)
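
Equivalently, the refactored `create_buckets` can be driven programmatically with the same values as the CLI defaults. The import path below is an assumption (the file name of this module is not visible in this excerpt); the argument values mirror the argparse defaults above.

```python
# Programmatic call mirroring the CLI defaults. The module name
# `alphafind_training.create_buckets` is assumed, not confirmed by the diff.
import logging

from alphafind_training.create_buckets import create_buckets

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s',
)

create_buckets(
    output_chunks='./data/chunks',        # temporary per-class/per-slice predictions
    output_predictions='./data/overall',  # per-bucket ID lists, removed at the end
    input_path='./data/embeddings',
    model_dir_path='./data/models/',
    output_bucket_path='./data/bucket-data/',
    chunk_size=1000000,
)
```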
@@ -19,24 +19,30 @@
DST_THRESHOLD = 20.0


def run(cif_path, output_path, granularity):
def create_embedding(input_path, output_path, granularity):
"""Calculate all protein descriptors

Args:
cif_path (str): path to CIF
output_path (str): output file
input_path (str or Path): path to CIF directory
output_path (str or Path): output file path
granularity (int): granularity of the descriptors

Returns:
None
"""
proteins = os.listdir(cif_path)
proteins = [file for file in proteins if file.endswith(".cif")]
input_path = Path(input_path)
output_path = Path(output_path)

proteins = [file for file in os.listdir(input_path) if file.endswith(".cif")]
LOG.info(f'Found {len(proteins)} proteins to create the embedding for')

with Pool() as pool:
results = []
data = []
index = []

for protein in proteins:
result = pool.apply_async(process_protein, (cif_path / protein, granularity))
result = pool.apply_async(process_protein, (input_path / protein, granularity))
results.append(result)

LOG.info("Processing started")
@@ -46,7 +52,7 @@ def run(cif_path, output_path, granularity):
]
index = [n for sublist in [result.get()['index'] for result in results] for n in sublist]
df = pd.DataFrame(index=index, data=data)
df.to_pickle(Path(output_path))
df.to_pickle(output_path)
t = time() - t
LOG.info(f'Processing took {t:.1f} seconds')
LOG.info(f'Output saved to {output_path}')
@@ -194,17 +200,17 @@ def remap(n, min_, max_):
python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create protein descriptors from CIF files")
parser.add_argument("--input", type=str, required=True, help="Path to the directory containing CIF files")
parser.add_argument("--output", type=str, required=True, help="Path to the output file")
parser.add_argument(
"--granularity", type=int, required=False, default=10, help="How detailed should the descriptor be"
)
parser.add_argument("--granularity", type=int, default=10, help="How detailed should the descriptor be")

args = parser.parse_args()

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

input_path = Path(args.input)
output_path = Path(args.output)
assert input_path.exists()
assert input_path.exists(), f"Input path {input_path} does not exist"

run(input_path, output_path, args.granularity)
create_embedding(input_path, output_path, args.granularity)
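
As with the other entry points, the renamed `create_embedding` is now callable from Python. A short usage sketch, using the same paths as the CLI example in the module docstring (`./data/cifs`, `./data/embedding.pkl`); the import path is assumed since the file name is not shown here.

```python
# Usage sketch for create_embedding; module path assumed from the package layout.
import logging

from alphafind_training.create_embedding import create_embedding

logging.basicConfig(level=logging.INFO)

# Scan ./data/cifs for *.cif files, compute descriptors in a multiprocessing
# Pool, and pickle the resulting DataFrame to ./data/embedding.pkl.
create_embedding(
    input_path='./data/cifs',
    output_path='./data/embedding.pkl',
    granularity=10,
)
```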