From 8853b25a3c61287511f6425d09f14caa75723e64 Mon Sep 17 00:00:00 2001 From: frapercan Date: Mon, 11 Mar 2024 09:10:10 +0100 Subject: [PATCH] embeddings storage --- protein_metamorphisms_is/config/config.yaml | 7 +- .../config/constants.yaml | 9 +++ protein_metamorphisms_is/main.py | 2 + .../operations/base/operator.py | 23 +++++- .../operations/embedding_tasks/__init__.py | 0 .../operations/embedding_tasks/esm.py | 31 ++++++++ .../operations/embedding_tasks/prost_t5.py | 25 ++++++ .../operations/embeddings.py | 79 +++++++++++++++++++ .../operations/structural_alignment.py | 2 +- protein_metamorphisms_is/sql/constants.py | 14 +++- protein_metamorphisms_is/sql/model.py | 20 +++++ 11 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 protein_metamorphisms_is/operations/embedding_tasks/__init__.py create mode 100644 protein_metamorphisms_is/operations/embedding_tasks/esm.py create mode 100644 protein_metamorphisms_is/operations/embedding_tasks/prost_t5.py create mode 100644 protein_metamorphisms_is/operations/embeddings.py diff --git a/protein_metamorphisms_is/config/config.yaml b/protein_metamorphisms_is/config/config.yaml index 87115d3..6479b50 100644 --- a/protein_metamorphisms_is/config/config.yaml +++ b/protein_metamorphisms_is/config/config.yaml @@ -11,7 +11,7 @@ DB_NAME: BioData ## Information System # Uniprot Extraction -search_criteria: '(structure_3d:true)' +search_criteria: '(organism_id:9615) AND (structure_3d:true)' limit: 100 # PDB Extraction @@ -45,3 +45,8 @@ structural_alignment: batch_size: 1000 task_timeout: 20 +# Embedding +embedding: + types: +# - 1 + - 2 diff --git a/protein_metamorphisms_is/config/constants.yaml b/protein_metamorphisms_is/config/constants.yaml index 1895bf7..b761ed9 100644 --- a/protein_metamorphisms_is/config/constants.yaml +++ b/protein_metamorphisms_is/config/constants.yaml @@ -18,3 +18,12 @@ structural_complexity_levels: - name: "Secondary Structures" description: Secondary structures refer to the local spatial arrangement of the protein's backbone, excluding the side chains of the amino acids. The two most common types of secondary structures are alpha-helices and beta-sheets. Alpha-helices are right-handed coils stabilized by hydrogen bonds between the backbone atoms, while beta-sheets consist of two or more strands aligned next to each other, forming a sheet-like structure also stabilized by hydrogen bonding. These structures are fundamental components of a protein's overall three-dimensional conformation and play critical roles in defining its function. Secondary structures are formed as a result of hydrogen bonds between the amide hydrogen and carbonyl oxygen atoms in the peptide backbone, and their formation is driven by the protein's primary sequence. +embedding_types: + - name: "ESM" + description: "Evolutionary Scale Modeling (ESM) embeddings are designed to capture the evolutionary information of protein sequences, utilizing deep learning to generate representations that enhance sequence analysis and prediction tasks." + task_name: "esm" + model_name: facebook/esm2_t6_8M_UR50D + - name: "Prost-T5" + description: "Prot-T5 embeddings leverage the capabilities of the T5 (Text-to-Text Transfer Transformer) model adapted for protein sequences, offering advanced sequence representation by considering both local and global sequence features." + task_name: prost_t5 + model_name: Rostlab/ProstT5 \ No newline at end of file diff --git a/protein_metamorphisms_is/main.py b/protein_metamorphisms_is/main.py index 2234c5f..fc4ae33 100644 --- a/protein_metamorphisms_is/main.py +++ b/protein_metamorphisms_is/main.py @@ -2,6 +2,7 @@ from protein_metamorphisms_is.information_system.pdb import PDBExtractor from protein_metamorphisms_is.information_system.uniprot import UniProtExtractor from protein_metamorphisms_is.operations.cdhit import CDHit +from protein_metamorphisms_is.operations.embeddings import EmbeddingManager from protein_metamorphisms_is.operations.structural_alignment import StructuralAlignmentManager @@ -10,6 +11,7 @@ def main(config_path="config/config.yaml"): UniProtExtractor(conf).start() PDBExtractor(conf).start() CDHit(conf).start() + EmbeddingManager(conf).start() StructuralAlignmentManager(conf).start() diff --git a/protein_metamorphisms_is/operations/base/operator.py b/protein_metamorphisms_is/operations/base/operator.py index 556c738..2e7b16a 100644 --- a/protein_metamorphisms_is/operations/base/operator.py +++ b/protein_metamorphisms_is/operations/base/operator.py @@ -5,7 +5,8 @@ from protein_metamorphisms_is.helpers.logger.logger import setup_logger from protein_metamorphisms_is.sql.base.database_manager import DatabaseManager from protein_metamorphisms_is.sql.constants import handle_structural_complexity_levels, \ - handle_structural_alignment_types + handle_structural_alignment_types, handle_embedding_types +from protein_metamorphisms_is.sql.model import PDBChains class OperatorBase(ABC): @@ -19,6 +20,8 @@ def __init__(self, conf): constants = yaml.safe_load(open(conf['constants'])) handle_structural_complexity_levels(self.session, constants) handle_structural_alignment_types(self.session, constants) + handle_embedding_types(self.session, constants) + @abstractmethod def start(self): @@ -29,3 +32,21 @@ def start(self): the specific data operation logic for each bioinformatics data source. """ pass + + + def load_chains(self): + """ + Retrieve protein chain data from the database. + + Fetches all PDBChains records from the database. The method can be configured to include or exclude multiple chain + models based on the 'allow_multiple_chain_models' (NMR samples) configuration. + + Returns: + list: A list of PDBChains objects representing protein chains. + """ + self.logger.info("Loading protein chains from the database") + if not self.conf.get("allow_multiple_chain_models"): + chains = self.session.query(PDBChains).filter(PDBChains.model == 0).all() + else: + chains = self.session.query(PDBChains).all() + return chains \ No newline at end of file diff --git a/protein_metamorphisms_is/operations/embedding_tasks/__init__.py b/protein_metamorphisms_is/operations/embedding_tasks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/protein_metamorphisms_is/operations/embedding_tasks/esm.py b/protein_metamorphisms_is/operations/embedding_tasks/esm.py new file mode 100644 index 0000000..74b9719 --- /dev/null +++ b/protein_metamorphisms_is/operations/embedding_tasks/esm.py @@ -0,0 +1,31 @@ +from transformers import AutoTokenizer, EsmModel +import torch + + +def embedding_task(session,chains,module,model_name): + # Verificar si CUDA está disponible + if not torch.cuda.is_available(): + raise Exception("CUDA is not available. This script requires a GPU with CUDA.") + + # Configurar el dispositivo + device = torch.device("cuda") + + # Cargar el tokenizador y el modelo + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = EsmModel.from_pretrained(model_name).to(device) + + # Preparar la secuencia + + with torch.no_grad(): # Desactivar el cálculo de gradientes + for chain in chains: + tokens = tokenizer(chain.sequence, return_tensors="pt", truncation=True, padding=True) + # Mover los tokens al dispositivo correcto + tokens = {k: v.to(device) for k, v in tokens.items()} + + # Obtener los embeddings del modelo + outputs = model(**tokens) + embeddings = outputs.last_hidden_state + # embeddings es un tensor de shape (batch_size, sequence_length, hidden_size) + + print(embeddings.shape) + diff --git a/protein_metamorphisms_is/operations/embedding_tasks/prost_t5.py b/protein_metamorphisms_is/operations/embedding_tasks/prost_t5.py new file mode 100644 index 0000000..01fb1c5 --- /dev/null +++ b/protein_metamorphisms_is/operations/embedding_tasks/prost_t5.py @@ -0,0 +1,25 @@ +from transformers import T5Tokenizer, T5EncoderModel +import re +import torch + + +def embedding_task(session,chains,module,model_name): + if not torch.cuda.is_available(): + raise Exception("CUDA is not available. This script requires a GPU with CUDA.") + + device = torch.device("cuda") + model_name = model_name + tokenizer = T5Tokenizer.from_pretrained(model_name, do_lower_case=False) + model = T5EncoderModel.from_pretrained(model_name).to(device) + model.eval() + + with torch.no_grad(): + for chain in chains: + sequence_processed = " ".join(list(re.sub(r"[UZOB]", "X", chain.sequence))) + sequence_processed = " " + sequence_processed if sequence_processed.isupper() else " " + sequence_processed + inputs = tokenizer(sequence_processed, return_tensors="pt", padding=True, truncation=True, + max_length=512, add_special_tokens=True).to(device) + + # Generación de embeddings + outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask) + embeddings = outputs.last_hidden_state.mean(dim=1) diff --git a/protein_metamorphisms_is/operations/embeddings.py b/protein_metamorphisms_is/operations/embeddings.py new file mode 100644 index 0000000..f505ded --- /dev/null +++ b/protein_metamorphisms_is/operations/embeddings.py @@ -0,0 +1,79 @@ +import importlib +import multiprocessing +from datetime import datetime, timedelta +from multiprocessing import Pool + +from sqlalchemy.orm import aliased + + +from protein_metamorphisms_is.operations.base.operator import OperatorBase +from protein_metamorphisms_is.sql.model import PDBChains, Cluster, PDBReference, StructuralAlignmentQueue, \ + StructuralAlignmentType, StructuralAlignmentResults, EmbeddingType + + +class EmbeddingManager(OperatorBase): + """ + lorem ipsum + + Attributes: + conf (dict): Configuration of the instance, including database connections and operational settings. + """ + + def __init__(self, conf): + """ + Initializes an instance of `StructuralAlignmentManager` with configuration settings. + + Args: + conf (dict): Configuration parameters, including database connections and operational settings. + """ + super().__init__(conf) + self.logger.info("Secuence Embedding Manager instance created.") + + def fetch_models_info(self): + """ + Fetches and prepares alignment task modules based on the configuration. + + This method dynamically imports alignment task modules specified in the configuration and stores + references to these modules in a dictionary for later use in the alignment process. + """ + embedding_types = self.session.query(EmbeddingType).all() + self.types = {} + base_module_path = 'protein_metamorphisms_is.operations.embedding_tasks' + + for type_obj in embedding_types: + if type_obj.id in self.conf['embedding']['types']: + # Construye el nombre completo del módulo + module_name = f"{base_module_path}.{type_obj.task_name}" + # Importa dinámicamente el módulo usando importlib + module = importlib.import_module(module_name) + # Almacena la referencia al módulo en el diccionario self.types + self.types[type_obj.id] = {'module': module, 'model_name' : type_obj.model_name} + + print(self.types) + + def start(self): + """ + Begin the structural alignment process. + + This method manages the workflow of the alignment process, including loading clusters, executing alignments, + and handling any exceptions encountered during the process. Progress and errors are logged appropriately. + """ + try: + self.logger.info("Starting structural alignment process.") + chains = self.load_chains() + self.fetch_models_info() + + for type in self.types.values(): + print(type) + module, model = type['module'], type['model_name'] + module.embedding_task(self.session,chains,module,model) + + + + except Exception as e: + self.logger.error(f"Error during structural alignment process: {e}") + raise + + + + diff --git a/protein_metamorphisms_is/operations/structural_alignment.py b/protein_metamorphisms_is/operations/structural_alignment.py index 811c6f2..f6a0a7e 100644 --- a/protein_metamorphisms_is/operations/structural_alignment.py +++ b/protein_metamorphisms_is/operations/structural_alignment.py @@ -91,7 +91,7 @@ def get_update_queue(self): clusters_not_queued = self.session.query(Cluster).filter( Cluster.id.notin_(queued_cluster_ids), - not Cluster.is_representative + Cluster.is_representative == False ).all() self.logger.info(f"Found {len(clusters_not_queued)} clusters not in queue, adding to queue.") diff --git a/protein_metamorphisms_is/sql/constants.py b/protein_metamorphisms_is/sql/constants.py index 5b085ae..fdab088 100644 --- a/protein_metamorphisms_is/sql/constants.py +++ b/protein_metamorphisms_is/sql/constants.py @@ -1,4 +1,4 @@ -from protein_metamorphisms_is.sql.model import StructuralComplexityLevel, StructuralAlignmentType +from protein_metamorphisms_is.sql.model import StructuralComplexityLevel, StructuralAlignmentType, EmbeddingType def handle_structural_complexity_levels(session, constants): @@ -29,3 +29,15 @@ def handle_structural_alignment_types(session, constants): # Comprometer los cambios en la base de datos session.commit() + + +def handle_embedding_types(session, constants): + embedding_types = constants['embedding_types'] + + for type_data in embedding_types: + exists = session.query(EmbeddingType).filter_by(name=type_data['name']).first() + if not exists: + embedding_type = EmbeddingType(**type_data) + session.add(embedding_type) + + session.commit() diff --git a/protein_metamorphisms_is/sql/model.py b/protein_metamorphisms_is/sql/model.py index 68bc661..c7eb688 100644 --- a/protein_metamorphisms_is/sql/model.py +++ b/protein_metamorphisms_is/sql/model.py @@ -284,6 +284,26 @@ class StructuralAlignmentType(Base): task_name = Column(String) +class EmbeddingType(Base): + """ + Represents a type of protein analysis embedding. + + This class is designed to manage different embedding techniques used in protein sequence analysis, offering a structured way to categorize and store information about various embedding methods such as ESM and Prot-T5. + + Attributes: + id (Integer): Unique identifier for each embedding type. + name (String): Unique name of the embedding type. + description (String): Detailed description of the embedding technique. + task_name (String): Name of the specific task associated with this embedding type, if applicable. + """ + __tablename__ = 'embedding_types' + id = Column(Integer, primary_key=True) + name = Column(String, nullable=False, unique=True) + description = Column(String) + task_name = Column(String) + model_name = Column(String) + + class StructuralAlignmentQueue(Base): """ Manages a queue of pending structural alignment tasks, overseeing their execution and monitoring.