alphafind_training local package, set up automated tests #10

Merged
merged 4 commits on Oct 2, 2024
29 changes: 25 additions & 4 deletions .github/workflows/ci.yml
@@ -4,10 +4,7 @@ on:
push:
# Sequence of patterns matched against refs/heads
branches:
# Push events on main and dev branch
- main
- ui-tests
# Sequence of patterns matched against refs/tags
- '**'
tags: '*'

jobs:
@@ -47,3 +44,27 @@ jobs:

- name: Docker build training, backend, and frontend
run: ./run.sh

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r training/requirements.txt
pip install pytest pytest-cov
cd training && pip install -e .

- name: Run tests with pytest and generate coverage
run: |
cd /home/runner/work/AlphaFind/AlphaFind/training
export PYTHONPATH=$PYTHONPATH:$(pwd)
pytest -v --cov=. --cov-report=xml --cov-report=term-missing

- name: Upload coverage report
uses: actions/upload-artifact@v4
with:
name: coverage-report
path: training/coverage.xml
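
The workflow now installs the training requirements, installs `alphafind_training` as an editable package, and runs `pytest` with coverage from the `training` directory. The test files added by this PR are not visible in this excerpt; the snippet below is only a minimal sketch of the kind of test such a setup would collect (the path `training/tests/test_package.py` is assumed).

```python
# Hypothetical smoke test, e.g. training/tests/test_package.py.
# The real tests added by this PR are not shown in this excerpt.
from alphafind_training import cluster, utils


def test_create_kmeans_is_importable():
    # `pip install -e .` in the CI job makes the package importable
    # without relying on PYTHONPATH hacks.
    assert callable(cluster.create_kmeans)


def test_utils_exposes_helpers_used_by_cluster():
    # These helpers are imported by alphafind_training/cluster.py in this PR.
    for name in ("file_exists", "dir_exists", "load_dataset", "load_pickle"):
        assert hasattr(utils, name)
```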
5 changes: 4 additions & 1 deletion training/.gitignore
@@ -10,4 +10,7 @@ kubectl
data/bucket-*
data/embedding.pkl
data/kmeans.idx
models/
models/
.coverage
coverage.xml
*.egg-info
2 changes: 2 additions & 0 deletions training/Dockerfile
@@ -17,5 +17,7 @@ RUN pip install -r /var/requirements.txt && rm -rf ~/.cache

COPY . /training
WORKDIR /training
RUN pip install -e .
RUN chmod +x /training/run.sh

CMD [ "/bin/bash", "/training/run.sh" ]
Empty file.
63 changes: 63 additions & 0 deletions training/alphafind_training/cluster.py
@@ -0,0 +1,63 @@
import logging

import numpy as np
import torch
from alphafind_training.clustering import run_clustering
from alphafind_training.utils import dir_exists, file_exists, load_dataset, load_pickle

LOG = logging.getLogger(__name__)

torch.manual_seed(2023)
np.random.seed(2023)


def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iterations=10):
"""
Function for clustering the embeddings using K-Means.

Args:
input_path (str): Path to the embeddings pickle file or directory of pickle files
output_path (str): Path to the output K-Means file
n_clusters (int): Number of clusters (default: 2)
sample_size (int): Size of the sample (default: 108)
n_iterations (int): Number of k-means iterations (default: 10)

Returns:
None
"""
assert file_exists(input_path) or dir_exists(input_path), 'Input file or directory does not exist'

LOG.info('Loading embeddings')
if dir_exists(input_path) and not file_exists(input_path):
embeddings, _ = load_dataset(input_path, sample_size, shuffle=True)
else:
embeddings = load_pickle(input_path)

assert embeddings.shape[0] >= sample_size, 'Sample size must not exceed the number of embeddings'

LOG.info(f'Loaded embeddings of shape: {embeddings.shape}')
LOG.info(f'Running clustering, result k-means object will be saved to: {output_path}')

run_clustering(
output_path,
embeddings.values,
sample_size,
n_clusters,
n_iterations,
)


if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(description="Cluster embeddings using K-Means")
parser.add_argument(
'--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files'
)
parser.add_argument('--output', type=str, required=True, help='Path to the output K-Means file')
parser.add_argument('--n-clusters', type=int, default=2, help='Number of clusters')
parser.add_argument('--sample-size', type=int, default=108, help='Size of the sample')
parser.add_argument('--n-iterations', type=int, default=10, help='Number of k-means iterations')
args = parser.parse_args()

create_kmeans(args.input, args.output, args.n_clusters, args.sample_size, args.n_iterations)
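
`create_kmeans` can also be called directly from Python rather than through the CLI above. A minimal usage sketch, assuming the pickle and index paths that appear elsewhere in this PR (`data/embedding.pkl`, `data/kmeans.idx` in `training/.gitignore`):

```python
# Usage sketch for create_kmeans; the paths mirror entries seen in
# training/.gitignore and are illustrative, not mandated by the API.
import logging

from alphafind_training.cluster import create_kmeans

logging.basicConfig(level=logging.INFO)

# Fit K-Means on a sample of 108 embeddings and write the index to disk.
create_kmeans(
    input_path='./data/embedding.pkl',  # pickle file or directory of pickles
    output_path='./data/kmeans.idx',
    n_clusters=2,
    sample_size=108,
    n_iterations=10,
)
```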
@@ -2,8 +2,7 @@

import faiss
import numpy as np

from utils import measure_memory_usage, measure_runtime
from alphafind_training.utils import measure_memory_usage, measure_runtime

np.random.seed(2023)

@@ -7,9 +7,8 @@
import numpy as np
import pandas as pd
import torch
from model import LIDatasetPredict, load_model
from tqdm import tqdm
from utils import (
from alphafind_training.model import LIDatasetPredict, load_model
from alphafind_training.utils import (
create_dir,
dir_exists,
file_exists,
@@ -19,6 +18,7 @@
save_pickle,
save_predictions,
)
from tqdm import tqdm

torch.manual_seed(2023)
np.random.seed(2023)
@@ -114,6 +114,64 @@ def assign_proteins_to_buckets(config):
LOG.info(f'Saved predictions per class in `{config.output_predictions}`')


def create_buckets(
output_chunks, output_predictions, input_path, model_dir_path, output_bucket_path, chunk_size=1000000
):
"""
Create buckets for protein IDs based on model predictions.

Args:
output_chunks (str): Path to a folder where temporary (per class + per slice) predictions will be saved.
output_predictions (str): Path to a folder where the per bucket objects will be saved.
input_path (str): Path to the dataset.
model_dir_path (str): Path to the model.
output_bucket_path (str): Path to output bucket data.
chunk_size (int): Chunk size for processing data.

Returns:
None
"""
assert output_chunks is not None
assert output_predictions is not None

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(model_dir_path)

if not any([f.endswith('.pt') for f in listdir(model_dir_path)]):
model_dir_path = load_newest_file_in_dir(model_dir_path)

args = argparse.Namespace(
output_chunks=output_chunks,
output_predictions=output_predictions,
input=input_path,
model_dir_path=model_dir_path,
output_bucket_path=output_bucket_path,
chunk_size=chunk_size,
)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(input_path)

create_dir(output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{output_bucket_path}`')
for f in tqdm(listdir(output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{output_predictions}/{f}'))]
save_pickle(f'{output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{output_bucket_path}`')

LOG.info(f'Removing temporary files in `{output_chunks}`, `{output_predictions}`')
remove_dir(output_chunks)
remove_dir(output_predictions)

LOG.info('Done')


'''
The script loads a model and assigns protein IDs to buckets based on the model's predictions.

@@ -134,73 +192,34 @@ def assign_proteins_to_buckets(config):
--model-dir-path "./data/models/"
'''
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create buckets for protein IDs based on model predictions")
parser.add_argument(
'--output-chunks',
type=str,
default=('./data/chunks'),
help=(
'Path to a folder where temporary (per class + per slice) '
'predictions will be saved (without the / at the end)'
),
default='./data/chunks',
help='Path to a folder where temporary (per class + per slice) predictions will be saved (without the / at the end)',
)
parser.add_argument(
'--output-predictions',
type=str,
default=('./data/overall'),
default='./data/overall',
help='Path to a folder where the per bucket objects will be saved (without the / at the end)',
)
parser.add_argument('--input', type=str, default='./data/embeddings', help='Path to the dataset')
parser.add_argument('--model-dir-path', type=str, default='./data/models/', help='Path to the model')
parser.add_argument(
'--input',
type=str,
default='./data/embeddings',
help='Path to the dataset',
)
parser.add_argument(
'--model-dir-path',
type=str,
default=('./data/models/'),
help='Path to the model',
)
parser.add_argument(
'--output-bucket-path',
type=str,
default='./data/bucket-data/',
help='path to output bucket data',
'--output-bucket-path', type=str, default='./data/bucket-data/', help='path to output bucket data'
)
parser.add_argument('--chunk-size', type=int, default=1000000, help='Chunk size')

args = parser.parse_args()

assert args.output_chunks is not None
assert args.output_predictions is not None

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

LOG.info('Saving predictions per chunk and class')

# the dir can be models/<dirs> or <specific-model-dir>/checkpoint.pt
files = listdir(args.model_dir_path)

if not any([f.endswith('.pt') for f in listdir(args.model_dir_path)]):
args.model_dir_path = load_newest_file_in_dir(args.model_dir_path)

assign_proteins_to_buckets(args)

LOG.info('Loading all data')
df = load_all_embeddings(args.input)

create_dir(args.output_bucket_path)

LOG.info(f'Saving predictions per bucket in `{args.output_bucket_path}`')
for f in tqdm(listdir(args.output_predictions)):
data_subset = df[df.index.isin(load_pickle(f'{args.output_predictions}/{f}'))]
save_pickle(f'{args.output_bucket_path}/{f}', data_subset)

LOG.info(f'Saved predictions per bucket in `{args.output_bucket_path}`')

LOG.info(f'Removing temporary files in `{args.output_chunks}`, `{args.output_predictions}`')
remove_dir(args.output_chunks)
remove_dir(args.output_predictions)

LOG.info('Done')
create_buckets(
args.output_chunks,
args.output_predictions,
args.input,
args.model_dir_path,
args.output_bucket_path,
args.chunk_size,
)
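
Equivalently, the refactored `create_buckets` can be driven programmatically with the same values as the CLI defaults. The import path below is an assumption (the file name of this module is not visible in this excerpt); the argument values mirror the argparse defaults above.

```python
# Programmatic call mirroring the CLI defaults. The module name
# `alphafind_training.create_buckets` is assumed, not confirmed by the diff.
import logging

from alphafind_training.create_buckets import create_buckets

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s',
)

create_buckets(
    output_chunks='./data/chunks',        # temporary per-class/per-slice predictions
    output_predictions='./data/overall',  # per-bucket ID lists, removed at the end
    input_path='./data/embeddings',
    model_dir_path='./data/models/',
    output_bucket_path='./data/bucket-data/',
    chunk_size=1000000,
)
```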
@@ -19,24 +19,30 @@
DST_THRESHOLD = 20.0


def run(cif_path, output_path, granularity):
def create_embedding(input_path, output_path, granularity):
"""Calculate all protein descriptors

Args:
cif_path (str): path to CIF
output_path (str): output file
input_path (str or Path): path to CIF directory
output_path (str or Path): output file path
granularity (int): granularity of the descriptors

Returns:
None
"""
proteins = os.listdir(cif_path)
proteins = [file for file in proteins if file.endswith(".cif")]
input_path = Path(input_path)
output_path = Path(output_path)

proteins = [file for file in os.listdir(input_path) if file.endswith(".cif")]
LOG.info(f'Found {len(proteins)} proteins to create the embedding for')

with Pool() as pool:
results = []
data = []
index = []

for protein in proteins:
result = pool.apply_async(process_protein, (cif_path / protein, granularity))
result = pool.apply_async(process_protein, (input_path / protein, granularity))
results.append(result)

LOG.info("Processing started")
@@ -46,7 +52,7 @@ def run(cif_path, output_path, granularity):
]
index = [n for sublist in [result.get()['index'] for result in results] for n in sublist]
df = pd.DataFrame(index=index, data=data)
df.to_pickle(Path(output_path))
df.to_pickle(output_path)
t = time() - t
LOG.info(f'Processing took {t:.1f} seconds')
LOG.info(f'Output saved to {output_path}')
@@ -194,17 +200,17 @@ def remap(n, min_, max_):
python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(description="Create protein descriptors from CIF files")
parser.add_argument("--input", type=str, required=True, help="Path to the directory containing CIF files")
parser.add_argument("--output", type=str, required=True, help="Path to the output file")
parser.add_argument(
"--granularity", type=int, required=False, default=10, help="How detailed should the descriptor be"
)
parser.add_argument("--granularity", type=int, default=10, help="How detailed should the descriptor be")

args = parser.parse_args()

logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s')

input_path = Path(args.input)
output_path = Path(args.output)
assert input_path.exists()
assert input_path.exists(), f"Input path {input_path} does not exist"

run(input_path, output_path, args.granularity)
create_embedding(input_path, output_path, args.granularity)
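
As with the other entry points, the renamed `create_embedding` is now callable from Python. A short usage sketch, using the same paths as the CLI example in the module docstring (`./data/cifs`, `./data/embedding.pkl`); the import path is assumed since the file name is not shown here.

```python
# Usage sketch for create_embedding; module path assumed from the package layout.
import logging

from alphafind_training.create_embedding import create_embedding

logging.basicConfig(level=logging.INFO)

# Scan ./data/cifs for *.cif files, compute descriptors in a multiprocessing
# Pool, and pickle the resulting DataFrame to ./data/embedding.pkl.
create_embedding(
    input_path='./data/cifs',
    output_path='./data/embedding.pkl',
    granularity=10,
)
```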