diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c567a5e..118a175 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,20 +37,16 @@ jobs: python -m pip install --upgrade pip pip install -r training/requirements.txt pip install pytest pytest-cov + cd training && pip install -e . - name: Run tests with pytest and generate coverage run: | - cd training - pytest --cov=. --cov-report=xml --cov-report=term-missing + cd /home/runner/work/AlphaFind/AlphaFind/training + export PYTHONPATH=$PYTHONPATH:$(pwd) + pytest -v --cov=. --cov-report=xml --cov-report=term-missing - name: Upload coverage report uses: actions/upload-artifact@v4 with: name: coverage-report path: training/coverage.xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - with: - file: ./training/coverage.xml - fail_ci_if_error: true diff --git a/training/.gitignore b/training/.gitignore index 723add1..8e6b0a4 100644 --- a/training/.gitignore +++ b/training/.gitignore @@ -13,3 +13,4 @@ data/kmeans.idx models/ .coverage coverage.xml +*.egg-info \ No newline at end of file diff --git a/training/Dockerfile b/training/Dockerfile index 8591b7c..2375fbf 100644 --- a/training/Dockerfile +++ b/training/Dockerfile @@ -17,5 +17,7 @@ RUN pip install -r /var/requirements.txt && rm -rf ~/.cache COPY . /training WORKDIR /training +RUN pip install -e . RUN chmod +x /training/run.sh + CMD [ "/bin/bash", "/training/run.sh" ] \ No newline at end of file diff --git a/training/alphafind_training/__init__.py b/training/alphafind_training/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/training/alphafind_training/cluster.py b/training/alphafind_training/cluster.py new file mode 100644 index 0000000..7eda5e4 --- /dev/null +++ b/training/alphafind_training/cluster.py @@ -0,0 +1,63 @@ +import logging + +import numpy as np +import torch +from alphafind_training.clustering import run_clustering +from alphafind_training.utils import dir_exists, file_exists, load_dataset, load_pickle + +LOG = logging.getLogger(__name__) + +torch.manual_seed(2023) +np.random.seed(2023) + + +def create_kmeans(input_path, output_path, n_clusters=2, sample_size=108, n_iterations=10): + """ + Function for clustering the embeddings using K-Means. 
+ + Args: + input_path (str): Path to the embeddings pickle file or directory of pickle files + output_path (str): Path to the output K-Means file + n_clusters (int): Number of clusters (default: 2) + sample_size (int): Size of the sample (default: 108) + n_iterations (int): Number of k-means iterations (default: 10) + + Returns: + None + """ + assert file_exists(input_path) or dir_exists(input_path), 'Input file or directory does not exist' + + LOG.info('Loading embeddings') + if dir_exists(input_path) and not file_exists(input_path): + embeddings, _ = load_dataset(input_path, sample_size, shuffle=True) + else: + embeddings = load_pickle(input_path) + + assert embeddings.shape[0] >= sample_size, 'Sample size must be smaller than the number of embeddings' + + LOG.info(f'Loaded embeddings of shape: {embeddings.shape}') + LOG.info(f'Running clustering, result k-means object will be saved to: {output_path}') + + run_clustering( + output_path, + embeddings.values, + sample_size, + n_clusters, + n_iterations, + ) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description="Cluster embeddings using K-Means") + parser.add_argument( + '--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files' + ) + parser.add_argument('--output', type=str, required=True, help='Path to the output K-Means file') + parser.add_argument('--n-clusters', type=int, default=2, help='Number of clusters') + parser.add_argument('--sample-size', type=int, default=108, help='Size of the sample') + parser.add_argument('--n-iterations', type=int, default=10, help='Number of k-means iterations') + args = parser.parse_args() + + create_kmeans(args.input, args.output, args.n_clusters, args.sample_size, args.n_iterations) diff --git a/training/clustering.py b/training/alphafind_training/clustering.py similarity index 95% rename from training/clustering.py rename to training/alphafind_training/clustering.py index 1d045a7..f5d7af9 100644 --- a/training/clustering.py +++ b/training/alphafind_training/clustering.py @@ -2,8 +2,7 @@ import faiss import numpy as np - -from utils import measure_memory_usage, measure_runtime +from alphafind_training.utils import measure_memory_usage, measure_runtime np.random.seed(2023) diff --git a/training/create_buckets.py b/training/alphafind_training/create_buckets.py similarity index 62% rename from training/create_buckets.py rename to training/alphafind_training/create_buckets.py index 619744c..28cea6f 100644 --- a/training/create_buckets.py +++ b/training/alphafind_training/create_buckets.py @@ -7,10 +7,8 @@ import numpy as np import pandas as pd import torch -from tqdm import tqdm - -from model import LIDatasetPredict, load_model -from utils import ( +from alphafind_training.model import LIDatasetPredict, load_model +from alphafind_training.utils import ( create_dir, dir_exists, file_exists, @@ -20,6 +18,7 @@ save_pickle, save_predictions, ) +from tqdm import tqdm torch.manual_seed(2023) np.random.seed(2023) @@ -42,16 +41,23 @@ def load_all_embeddings(path): def parse_model_params(model_path): LOG.info(f'Parsing out model params from model path: {model_path}') pattern = r'model-(\w+)--.*?n_classes-(\d+)(?:--.*?dimensionality-(\d+))?' 
+ + if model_path is None: + model = 'MLP' + dimensionality = DEFAULT_DIMENSIONALITY + n_classes = 2 + LOG.info(f'Parsed out model={model}, dimensionality={dimensionality}, n_classes={n_classes}') + return model, dimensionality, n_classes + match = re.search(pattern, model_path, re.MULTILINE) - # new model format if match and len(match.groups()) == 3: - model = match.group(1) - n_classes = int(match.group(2)) - dimensionality = match.group(3) + model, n_classes, dimensionality = match.groups() dimensionality = int(dimensionality) if dimensionality is not None else DEFAULT_DIMENSIONALITY + n_classes = int(n_classes) else: LOG.info(f'Failed to parse out model params from model path: {model_path}') exit(1) + LOG.info(f'Parsed out model={model}, dimensionality={dimensionality}, n_classes={n_classes}') return model, dimensionality, n_classes @@ -108,6 +114,64 @@ def assign_proteins_to_buckets(config): LOG.info(f'Saved predictions per class in `{config.output_predictions}`') +def create_buckets( + output_chunks, output_predictions, input_path, model_dir_path, output_bucket_path, chunk_size=1000000 +): + """ + Create buckets for protein IDs based on model predictions. + + Args: + output_chunks (str): Path to a folder where temporary (per class + per slice) predictions will be saved. + output_predictions (str): Path to a folder where the per bucket objects will be saved. + input_path (str): Path to the dataset. + model_dir_path (str): Path to the model. + output_bucket_path (str): Path to output bucket data. + chunk_size (int): Chunk size for processing data. + + Returns: + None + """ + assert output_chunks is not None + assert output_predictions is not None + + LOG.info('Saving predictions per chunk and class') + + # the dir can be models/ or /checkpoint.pt + files = listdir(model_dir_path) + + if not any([f.endswith('.pt') for f in listdir(model_dir_path)]): + model_dir_path = load_newest_file_in_dir(model_dir_path) + + args = argparse.Namespace( + output_chunks=output_chunks, + output_predictions=output_predictions, + input=input_path, + model_dir_path=model_dir_path, + output_bucket_path=output_bucket_path, + chunk_size=chunk_size, + ) + + assign_proteins_to_buckets(args) + + LOG.info('Loading all data') + df = load_all_embeddings(input_path) + + create_dir(output_bucket_path) + + LOG.info(f'Saving predictions per bucket in `{output_bucket_path}`') + for f in tqdm(listdir(output_predictions)): + data_subset = df[df.index.isin(load_pickle(f'{output_predictions}/{f}'))] + save_pickle(f'{output_bucket_path}/{f}', data_subset) + + LOG.info(f'Saved predictions per bucket in `{output_bucket_path}`') + + LOG.info(f'Removing temporary files in `{output_chunks}`, `{output_predictions}`') + remove_dir(output_chunks) + remove_dir(output_predictions) + + LOG.info('Done') + + ''' The script loads a model and assigns protein IDs to buckets based on the model's predictions. 
@@ -128,73 +192,34 @@ def assign_proteins_to_buckets(config): --model-dir-path "./data/models/" ''' if __name__ == '__main__': - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(description="Create buckets for protein IDs based on model predictions") parser.add_argument( '--output-chunks', type=str, - default=('./data/chunks'), - help=( - 'Path to a folder where temporary (per class + per slice) ' - 'predictions will be saved (without the / at the end)' - ), + default='./data/chunks', + help='Path to a folder where temporary (per class + per slice) predictions will be saved (without the / at the end)', ) parser.add_argument( '--output-predictions', type=str, - default=('./data/overall'), + default='./data/overall', help='Path to a folder where the per bucket objects will be saved (without the / at the end)', ) + parser.add_argument('--input', type=str, default='./data/embeddings', help='Path to the dataset') + parser.add_argument('--model-dir-path', type=str, default='./data/models/', help='Path to the model') parser.add_argument( - '--input', - type=str, - default='./data/embeddings', - help='Path to the dataset', - ) - parser.add_argument( - '--model-dir-path', - type=str, - default=('./data/models/'), - help='Path to the model', - ) - parser.add_argument( - '--output-bucket-path', - type=str, - default='./data/bucket-data/', - help='path to output bucket data', + '--output-bucket-path', type=str, default='./data/bucket-data/', help='path to output bucket data' ) parser.add_argument('--chunk-size', type=int, default=1000000, help='Chunk size') - args = parser.parse_args() - assert args.output_chunks is not None - assert args.output_predictions is not None - logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s') - LOG.info('Saving predictions per chunk and class') - - # the dir can be models/ or /checkpoint.pt - files = listdir(args.model_dir_path) - - if not any([f.endswith('.pt') for f in listdir(args.model_dir_path)]): - args.model_dir_path = load_newest_file_in_dir(args.model_dir_path) - - assign_proteins_to_buckets(args) - - LOG.info('Loading all data') - df = load_all_embeddings(args.input) - - create_dir(args.output_bucket_path) - - LOG.info(f'Saving predictions per bucket in `{args.output_bucket_path}`') - for f in tqdm(listdir(args.output_predictions)): - data_subset = df[df.index.isin(load_pickle(f'{args.output_predictions}/{f}'))] - save_pickle(f'{args.output_bucket_path}/{f}', data_subset) - - LOG.info(f'Saved predictions per bucket in `{args.output_bucket_path}`') - - LOG.info(f'Removing temporary files in `{args.output_chunks}`, `{args.output_predictions}`') - remove_dir(args.output_chunks) - remove_dir(args.output_predictions) - - LOG.info('Done') + create_buckets( + args.output_chunks, + args.output_predictions, + args.input, + args.model_dir_path, + args.output_bucket_path, + args.chunk_size, + ) diff --git a/training/create_embedding.py b/training/alphafind_training/create_embedding.py similarity index 87% rename from training/create_embedding.py rename to training/alphafind_training/create_embedding.py index 84ddc60..f65ff26 100644 --- a/training/create_embedding.py +++ b/training/alphafind_training/create_embedding.py @@ -19,24 +19,30 @@ DST_THRESHOLD = 20.0 -def run(cif_path, output_path, granularity): +def create_embedding(input_path, output_path, granularity): """Calculate all protein descriptors Args: - cif_path (str): path to CIF - output_path (str): output file + input_path (str or Path): 
path to CIF directory + output_path (str or Path): output file path granularity (int): granularity of the descriptors + + Returns: + None """ - proteins = os.listdir(cif_path) - proteins = [file for file in proteins if file.endswith(".cif")] + input_path = Path(input_path) + output_path = Path(output_path) + + proteins = [file for file in os.listdir(input_path) if file.endswith(".cif")] LOG.info(f'Found {len(proteins)} proteins to create the embedding for') + with Pool() as pool: results = [] data = [] index = [] for protein in proteins: - result = pool.apply_async(process_protein, (cif_path / protein, granularity)) + result = pool.apply_async(process_protein, (input_path / protein, granularity)) results.append(result) LOG.info("Processing started") @@ -46,7 +52,7 @@ def run(cif_path, output_path, granularity): ] index = [n for sublist in [result.get()['index'] for result in results] for n in sublist] df = pd.DataFrame(index=index, data=data) - df.to_pickle(Path(output_path)) + df.to_pickle(output_path) t = time() - t LOG.info(f'Processing took {t:.1f} seconds') LOG.info(f'Output saved to {output_path}') @@ -194,17 +200,17 @@ def remap(n, min_, max_): python3 create-embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10 """ if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(description="Create protein descriptors from CIF files") parser.add_argument("--input", type=str, required=True, help="Path to the directory containing CIF files") parser.add_argument("--output", type=str, required=True, help="Path to the output file") - parser.add_argument( - "--granularity", type=int, required=False, default=10, help="How detailed should the descriptor be" - ) + parser.add_argument("--granularity", type=int, default=10, help="How detailed should the descriptor be") args = parser.parse_args() + logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s') + input_path = Path(args.input) output_path = Path(args.output) - assert input_path.exists() + assert input_path.exists(), f"Input path {input_path} does not exist" - run(input_path, output_path, args.granularity) + create_embedding(input_path, output_path, args.granularity) diff --git a/training/alphafind_training/create_protein_bucket_mapping.py b/training/alphafind_training/create_protein_bucket_mapping.py new file mode 100644 index 0000000..1038150 --- /dev/null +++ b/training/alphafind_training/create_protein_bucket_mapping.py @@ -0,0 +1,51 @@ +import argparse +import logging +import os + +from alphafind_training.utils import load_pickle, save_pickle +from tqdm import tqdm + +LOG = logging.getLogger(__name__) + + +def create_mapping(bucket_data_path, output_path): + """ + Creates an index mapping protein id to its bucket id and position in the bucket's DataFrame. + The index is saved as a pickle file. 
+ + Args: + bucket_data_path (str): Path to the bucket data + output_path (str): Path where the index will be saved + + Returns: + None + """ + LOG.info('Creating the index') + protein_id_to_position_mapping = dict() + + for bucket_name in tqdm(os.listdir(bucket_data_path)): + bucket_data = load_pickle(os.path.join(bucket_data_path, bucket_name)) + + for dataframe_protein_index, protein_id in enumerate(bucket_data.index): + bucket_id = int(bucket_name.replace('class-', '').replace('.pkl', '')) + + protein_id_to_position_mapping[protein_id] = (bucket_id, dataframe_protein_index) + + del bucket_data + + LOG.info(f'Saving the index to {output_path}') + save_pickle(output_path, protein_id_to_position_mapping) + + LOG.info("DONE") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Create protein-to-bucket mapping") + parser.add_argument('--bucket-path', type=str, required=True, help='Path to the bucket data') + parser.add_argument('--output', type=str, required=True, help='Path where the index will be saved') + + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s') + + create_mapping(args.bucket_path, args.output) diff --git a/training/model.py b/training/alphafind_training/model.py similarity index 99% rename from training/model.py rename to training/alphafind_training/model.py index 310a789..afd3886 100644 --- a/training/model.py +++ b/training/alphafind_training/model.py @@ -11,16 +11,15 @@ import torch.nn.functional as nnf import torch.utils.data import wandb -from torch import nn - -from clustering import assign_labels -from utils import ( +from alphafind_training.clustering import assign_labels +from alphafind_training.utils import ( dir_exists, file_exists, get_current_timestamp, load_newest_file_in_dir, load_pickle, ) +from torch import nn # set seeds for reproducibility torch.manual_seed(2023) diff --git a/training/randomize-data.py b/training/alphafind_training/randomize-data.py similarity index 96% rename from training/randomize-data.py rename to training/alphafind_training/randomize-data.py index a1b8733..b3dc6f0 100644 --- a/training/randomize-data.py +++ b/training/alphafind_training/randomize-data.py @@ -5,8 +5,13 @@ from typing import Dict, List import pandas as pd - -from utils import create_dir, load_dataset, save_json, save_pickle, write_row_to_csv +from alphafind_training.utils import ( + create_dir, + load_dataset, + save_json, + save_pickle, + write_row_to_csv, +) def construct_metadata_dict( diff --git a/training/train.py b/training/alphafind_training/train.py similarity index 51% rename from training/train.py rename to training/alphafind_training/train.py index 1f3a889..1f844fc 100644 --- a/training/train.py +++ b/training/alphafind_training/train.py @@ -6,24 +6,92 @@ import numpy as np import torch import wandb - -from model import LIDataset, load_model, save_model -from utils import create_dir, dir_exists, file_exists, get_current_timestamp +from alphafind_training.model import LIDataset, load_model, save_model +from alphafind_training.utils import ( + create_dir, + dir_exists, + file_exists, + get_current_timestamp, +) torch.manual_seed(2023) np.random.seed(2023) LOG = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s') -def run_training(config): - with wandb.init( - project='small-data-training', - entity='protein-db', - config=config, - 
settings=wandb.Settings(start_method='thread'), - ): +def train_model( + input_path, + output_model_dir, + kmeans_path, + model='MLP', + model_path=None, + epochs=10, + n_classes=2, + batch_size=32, + dimensionality=45, + wandb_project='small-data-training', + wandb_entity='protein-db', +): + """ + Train a model on the embeddings dataset. + + Args: + input_path (str): Path to the embeddings pickle file or directory of pickle files + output_model_dir (str): Path to the output model directory + kmeans_path (str): Path to the k-means model + model (str): Model to use (default: 'MLP') + model_path (str): Path to the trained model if using a pretrained model (default: None) + epochs (int): Number of epochs (default: 10) + n_classes (int): Number of classes to use (default: 2) + batch_size (int): Batch size (default: 32) + dimensionality (int): Number of dimensions of the data (default: 45) + wandb_project (str): W&B project name (default: 'small-data-training') + wandb_entity (str): W&B entity name (default: 'protein-db') + + Returns: + None + """ + pretrained = model_path is not None + if not dir_exists(output_model_dir): + create_dir(output_model_dir) + + if model_path is not None: + assert file_exists(model_path) or dir_exists(model_path), 'Model file or dir does not exist' + assert file_exists(kmeans_path), 'K-Means file does not exist' + + timestamp = get_current_timestamp() + if pretrained: + timestamp = model_path.split('--')[-1] + name = str( + f'model-{model}--pretrained-{pretrained}--n_classes-{n_classes}--epochs-{epochs}' + f'--batchsize={batch_size}--dimensionality-{dimensionality}--{timestamp}' + ) + + config = argparse.Namespace( + input=input_path, + output_model_dir=output_model_dir, + model=model, + model_path=model_path, + kmeans_path=kmeans_path, + epochs=epochs, + n_classes=n_classes, + batch_size=batch_size, + dimensionality=dimensionality, + use_wandb=False, + wandb_project=wandb_project, + wandb_entity=wandb_entity, + name=name, + ) + + if config.use_wandb: + + wandb.init( + project=wandb_project, + entity=wandb_entity, + config=config, + settings=wandb.Settings(start_method='thread'), + ) wandb.run.name = config.name LOG.info(f'Using config: {config}') @@ -74,22 +142,12 @@ def run_training(config): losses, f'{config.output_model_dir}/{config.name}/epoch-{epoch+1}.pt', ) + if config.use_wandb: + wandb.finish() -""" -This script is used to train a model on the embeddings dataset. 
- -Input: Embeddings pickle file, K-Means object -Output: Trained model, predictions - -EXAMPLE USE: -WANDB_MODE=offline python train.py\ - --input ./data/embeddings/\ - --kmeans-path ./data/kmeans.idx\ - --output-model-dir ./models/ -""" if __name__ == '__main__': - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(description="Train a model on the embeddings dataset") parser.add_argument( '--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files' ) @@ -106,19 +164,16 @@ def run_training(config): args = parser.parse_args() - pretrained = args.model_path is not None - if not dir_exists(args.output_model_dir): - create_dir(args.output_model_dir) - - if args.model_path is not None: - assert file_exists(args.model_path) or dir_exists(args.model_path), 'Model file or dir does not exist' - assert file_exists(args.kmeans_path), 'K-Means file does not exist' - - timestamp = get_current_timestamp() - if pretrained: - timestamp = args.model_path.split('--')[-1] - args.name = str( - f'model-{args.model}--pretrained-{pretrained}--n_classes-{args.n_classes}--epochs-{args.epochs}' - f'--batchsize={args.batch_size}--dimensionality-{args.dimensionality}--{timestamp}' + logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s') + + train_model( + input_path=args.input, + output_model_dir=args.output_model_dir, + kmeans_path=args.kmeans_path, + model=args.model, + model_path=args.model_path, + epochs=args.epochs, + n_classes=args.n_classes, + batch_size=args.batch_size, + dimensionality=args.dimensionality, ) - run_training(args) diff --git a/training/utils.py b/training/alphafind_training/utils.py similarity index 100% rename from training/utils.py rename to training/alphafind_training/utils.py diff --git a/training/cluster.py b/training/cluster.py deleted file mode 100644 index f9713ad..0000000 --- a/training/cluster.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse -import logging - -import numpy as np -import torch - -from clustering import run_clustering -from utils import dir_exists, file_exists, load_dataset, load_pickle - -LOG = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)-5.5s][%(name)-.20s] %(message)s') - -torch.manual_seed(2023) -np.random.seed(2023) - -""" -Script for clustering the embeddings using K-Means. 
- -Input: Embeddings pickle file -Output: K-Means object - -EXAMPLE USE: -python3 cluster.py --input=./data/embedding.pkl --output=./data/kmeans.idx -""" -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--input', type=str, required=True, help='Path to the embeddings pickle file or directory of pickle files' - ) - parser.add_argument('--output', type=str, required=True, help='Path to the output K-Means file') - parser.add_argument('--n-clusters', type=int, default=2, help='Number of clusters') - parser.add_argument('--sample-size', type=int, default=108, help='Size of the sample') - parser.add_argument('--n-iterations', type=int, default=10, help='Number of k-means iterations') - args = parser.parse_args() - - assert file_exists(args.input) or dir_exists(args.input), 'Input file or directory does not exist' - - LOG.info('Loading embeddings') - if dir_exists(args.input) and not file_exists(args.input): - embeddings, _ = load_dataset(args.input, args.sample_size, shuffle=True) - else: - embeddings = load_pickle(args.input) - - assert embeddings.shape[0] >= args.sample_size, 'Sample size must be smaller than the number of embeddings' - - LOG.info(f'Loaded embeddings of shape: {embeddings.shape}') - LOG.info(f'Running clustering, result k-means object will be saved to: {args.output}') - - run_clustering( - args.output, - embeddings.values, - args.sample_size, - args.n_clusters, - args.n_iterations, - ) diff --git a/training/create_protein_bucket_mapping.py b/training/create_protein_bucket_mapping.py deleted file mode 100644 index 48e49d1..0000000 --- a/training/create_protein_bucket_mapping.py +++ /dev/null @@ -1,56 +0,0 @@ -import argparse -import logging -import os - -from tqdm import tqdm - -from utils import load_pickle, save_pickle - -LOG = logging.getLogger(__name__) - - -def create_index(bucket_data_path, output_path): - LOG.info('Creating the index') - protein_id_to_position_mapping = dict() - - for bucket_name in tqdm(os.listdir(bucket_data_path)): - bucket_data = load_pickle(f'{bucket_data_path}/{bucket_name}') - - for dataframe_protein_index, protein_id in enumerate(bucket_data.index): - bucket_id = int(bucket_name.replace('class-', '').replace('.pkl', '')) - - protein_id_to_position_mapping[protein_id] = (bucket_id, dataframe_protein_index) - - del bucket_data - - LOG.info(f'Saving the index to {output_path}') - save_pickle(output_path, protein_id_to_position_mapping) - - LOG.info("DONE") - - -''' -Creates an index mapping protein id to its bucket id and position in the bucket's DataFrame. -The index is saved as a pickle file. - -Implementation details: -- The `create_index` goes through all the bucket data files and - creates the mapping in the form of a tuple (bucket_id, dataframe_protein_index). -- Prints the progress every 20 buckets. 
- -EXAMPLE USE: -create_protein_bucket-mapping.py \ - --bucket-data-path './data/bucket-data' \ - --output-path './data/bucket-mapping.pickle' -''' -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--bucket-path', type=str, help='Path to the bucket data') - parser.add_argument('--output', type=str, help='Path where the index will be saved') - - args = parser.parse_args() - - assert args.bucket_path is not None - assert args.output is not None - - create_index(args.bucket_path, args.output) diff --git a/training/pyproject.toml b/training/pyproject.toml index dceea6a..c7c62c0 100644 --- a/training/pyproject.toml +++ b/training/pyproject.toml @@ -5,3 +5,9 @@ line-length = 120 [tool.isort] profile = "black" known_third_party = "wandb" + +[project] +name = "alphafind_training" +version = "0.0.1" +description = "AlphaFold training -- setup for the similarity search on vast protein data" +readme = "README.md" diff --git a/training/pytest.ini b/training/pytest.ini index 7bcb504..83be5fe 100644 --- a/training/pytest.ini +++ b/training/pytest.ini @@ -1,4 +1,4 @@ [pytest] -testpaths = . -python_files = tests.py +testpaths = tests/ +python_files = tests/test_*.py addopts = --cov=. --cov-report=term-missing --cov-report=xml \ No newline at end of file diff --git a/training/run.sh b/training/run.sh index 61f79e0..6702b0b 100644 --- a/training/run.sh +++ b/training/run.sh @@ -1,19 +1,9 @@ -#/bin/bash +#!/bin/bash -# 1) ---- Create embeddings -python3 create_embedding.py --input=./data/cifs --output=./data/embedding.pkl --granularity 10 +# Set the base directory +BASE_DIR="$(dirname "$0")" +DATA_DIR="$BASE_DIR/data" +MODELS_DIR="$BASE_DIR/models" -# 2) ---- Create a K-Means object -# Clusters and saves the k-means object to to `data/kmeans.idx` -python3 cluster.py --input=./data/embedding.pkl --output=data/kmeans.idx --n-clusters=2 - -# 3) ---- Train a model -# Trains and saves a model to `models/` -python3 train.py --input=./data/embedding.pkl --kmeans-path=data/kmeans.idx --output-model-dir=./models/ --n-classes=2 - -# 4) ---- Create bucket-data -# Collects all predictions from the newest model in `models/`, and saves them to `bucket-data/` -python3 create_buckets.py --input=./data/embedding.pkl --model-dir-path=./models/ --output-chunks=./data/chunks --output-predictions=./data/overall --output-bucket-path ./data/bucket-data/ - -# 5) ---- Create bucket-data mapping to protein IDs -python3 create_protein_bucket_mapping.py --bucket-path=./data/bucket-data/ --output=./data/bucket-mapping.pkl \ No newline at end of file +# Run the train_alphafind.py script +python3 "$BASE_DIR/train_alphafind.py" --base-dir="$BASE_DIR" --data-dir="$DATA_DIR" --models-dir="$MODELS_DIR" \ No newline at end of file diff --git a/training/setup.py b/training/setup.py new file mode 100644 index 0000000..8d1a6d3 --- /dev/null +++ b/training/setup.py @@ -0,0 +1,32 @@ +from setuptools import find_packages, setup + +# Read the contents of requirements.txt +with open('requirements.txt') as f: + requirements = f.read().splitlines() + +# Read the contents of requirements-dev.txt +with open('requirements-dev.txt') as f: + dev_requirements = [line for line in f.read().splitlines() if line and not line.startswith('-r')] +print(dev_requirements) + +setup( + name="alphafind_training", + version="0.0.1", + packages=find_packages(where='alphafind_training'), + package_dir={'': 'alphafind_training'}, + install_requires=requirements, + extras_require={ + 'dev': dev_requirements, + }, + author="Terézia 
Slanináková", + author_email="slaninakova@ics.muni.cz", + description="AlphaFold training -- setup for the similarity search on vast protein data", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/Coda-Research-Group/AlphaFind", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], +) diff --git a/training/tests.py b/training/tests.py deleted file mode 100644 index b19bdee..0000000 --- a/training/tests.py +++ /dev/null @@ -1,51 +0,0 @@ -import os -import re -from pathlib import Path - -import pandas as pd -import pytest -from create_embedding import run - - -@pytest.fixture(scope="function") -def output_file(): - # Setup: Define the output file path - output_path = Path("./data/embedding.pkl") - - yield output_path - - # Teardown: Remove only the embedding.pkl file if it exists - if output_path.exists(): - os.remove(output_path) - - -def parse_protein_id(filename): - # Use regex to extract the protein ID - match = re.search(r'AF-([\w\d]+)-F1-model_v3\.cif', filename) - if match: - return match.group(1) - return None - - -def test_create_embedding(): - cif_path = "./data/cifs" - output_path = "./data/embedding.pkl" - granularity = 10 - - # 45 features for each protein - (10x10 - 10) / 2 - expected_dimensionality = 45 - - run(Path(cif_path), Path(output_path), granularity) - os.chmod(output_path, 0o777) - - assert os.path.exists(output_path) - assert os.path.getsize(output_path) > 0 - # load embedding.pkl and check if it has the correct shape - df = pd.read_pickle(output_path) - assert df.shape[0] == len(os.listdir(cif_path)) - assert df.shape[1] == expected_dimensionality - - # check if the length of the index is equal to the number of proteins - assert sorted(df.index.tolist()) == sorted( - [parse_protein_id(file) for file in os.listdir(cif_path) if file.endswith('.cif')] - ) diff --git a/training/tests/test_create_embedding.py b/training/tests/test_create_embedding.py new file mode 100644 index 0000000..66ecead --- /dev/null +++ b/training/tests/test_create_embedding.py @@ -0,0 +1,52 @@ +import os +import re +import tempfile +from pathlib import Path + +import pandas as pd +import pytest +from alphafind_training.create_embedding import create_embedding + + +@pytest.fixture(scope="function") +def output_file(): + # Setup: Define the output file path + output_path = Path("./data/embedding.pkl") + + yield output_path + + # Teardown: Remove only the embedding.pkl file if it exists + if output_path.exists(): + os.remove(output_path) + + +def parse_protein_id(filename): + # Use regex to extract the protein ID + match = re.search(r'AF-([\w\d]+)-F1-model_v3\.cif', filename) + if match: + return match.group(1) + return None + + +def test_create_embedding(): + with tempfile.TemporaryDirectory() as tmpdir: + cif_path = "./data/cifs" + output_path = f"{tmpdir}/embedding.pkl" + granularity = 10 + + # 45 features for each protein - (10x10 - 10) / 2 + expected_dimensionality = 45 + + create_embedding(Path(cif_path), Path(output_path), granularity) + + assert os.path.exists(output_path) + assert os.path.getsize(output_path) > 0 + # load embedding.pkl and check if it has the correct shape + df = pd.read_pickle(output_path) + assert df.shape[0] == len(os.listdir(cif_path)) + assert df.shape[1] == expected_dimensionality + + # check if the length of the index is equal to the number of proteins + assert sorted(df.index.tolist()) == sorted( + 
[parse_protein_id(file) for file in os.listdir(cif_path) if file.endswith('.cif')] + ) diff --git a/training/train_alphafind.py b/training/train_alphafind.py new file mode 100644 index 0000000..16873ae --- /dev/null +++ b/training/train_alphafind.py @@ -0,0 +1,64 @@ +import argparse +import os + +from alphafind_training.cluster import create_kmeans +from alphafind_training.create_buckets import create_buckets +from alphafind_training.create_embedding import create_embedding +from alphafind_training.create_protein_bucket_mapping import create_mapping +from alphafind_training.train import train_model + + +def train_alphafind(base_dir, data_dir, models_dir): + # 1) Create embeddings + create_embedding( + input_path=os.path.join(data_dir, "cifs"), output_path=os.path.join(data_dir, "embedding.pkl"), granularity=10 + ) + + # 2) Create a K-Means object + create_kmeans( + input_path=os.path.join(data_dir, "embedding.pkl"), + output_path=os.path.join(data_dir, "kmeans.idx"), + n_clusters=2, + ) + + # 3) Train a model + train_model( + input_path=os.path.join(data_dir, "embedding.pkl"), + kmeans_path=os.path.join(data_dir, "kmeans.idx"), + output_model_dir=models_dir, + n_classes=2, + ) + + # 4) Create bucket-data + create_buckets( + input_path=os.path.join(data_dir, "embedding.pkl"), + model_dir_path=models_dir, + output_chunks=os.path.join(data_dir, "chunks"), + output_predictions=os.path.join(data_dir, "overall"), + output_bucket_path=os.path.join(data_dir, "bucket-data"), + ) + + # 5) Create bucket-data mapping to protein IDs + create_mapping( + bucket_data_path=os.path.join(data_dir, "bucket-data"), output_path=os.path.join(data_dir, "bucket-mapping.pkl") + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Train AlphaFind model") + parser.add_argument( + "--base-dir", default=os.path.dirname(os.path.abspath(__file__)), help="Base directory for scripts" + ) + parser.add_argument( + "--data-dir", + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data"), + help="Data directory", + ) + parser.add_argument( + "--models-dir", + default=os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "models"), + help="Models directory", + ) + args = parser.parse_args() + + train_alphafind(args.base_dir, args.data_dir, args.models_dir)
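Usage note: with this change the training code is importable as the alphafind_training package (installed via `pip install -e .` in CI and in the Dockerfile), so the pipeline steps can also be driven directly from Python instead of through run.sh. A minimal sketch of the first three steps, mirroring train_alphafind.py and the function signatures introduced in this diff — the data paths are illustrative, not fixed by the PR:

    # Sketch only: uses the public functions added in this diff; paths are illustrative
    # and assume the sample layout that run.sh previously used (./data, ./models).
    from alphafind_training.cluster import create_kmeans
    from alphafind_training.create_embedding import create_embedding
    from alphafind_training.train import train_model

    # 1) Build per-protein descriptors from the CIF files.
    create_embedding(input_path='./data/cifs', output_path='./data/embedding.pkl', granularity=10)

    # 2) Cluster the embeddings with K-Means and persist the index.
    create_kmeans(input_path='./data/embedding.pkl', output_path='./data/kmeans.idx', n_clusters=2)

    # 3) Train the bucket-assignment model on the embeddings.
    train_model(input_path='./data/embedding.pkl', kmeans_path='./data/kmeans.idx',
                output_model_dir='./models/', n_classes=2)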