From d1950f2ba6d7767a1fe90b9ee295ee57cbc95c01 Mon Sep 17 00:00:00 2001
From: gfzhou <gfzhou01@gmail.com>
Date: Wed, 12 Jun 2024 10:38:09 -0700
Subject: [PATCH 1/2] Added residue replacement to handle protein chemical
 modifications and NCAAs

---
 README.md                                     | 31 ++++++++++++++++
 examples/residue_replacement/1h4x.fasta       |  3 ++
 .../residue_replacement/SEP_ideal_trim.sdf    | 37 +++++++++++++++++++
 .../config/inference/residue_replacement.yaml | 19 ++++++++++
 rf2aa/data/covale.py                          | 25 +++++++++++++
 rf2aa/run_inference.py                        | 14 ++++---
 6 files changed, 124 insertions(+), 5 deletions(-)
 create mode 100644 examples/residue_replacement/1h4x.fasta
 create mode 100644 examples/residue_replacement/SEP_ideal_trim.sdf
 create mode 100644 rf2aa/config/inference/residue_replacement.yaml
diff --git a/README.md b/README.md
index c49448a..08d814e 100644
--- a/README.md
+++ b/README.md
@@ -263,6 +263,37 @@ becomes this so it can be parsed correctly:
 
 We know this syntax is hard to work with and we are happy to review PRs if anyone in the community can figure out how to specify all the necessary requirements in a more user friendly way!
 
+<a id="residue_replacement"></a>
+### Predicting Proteins with Chemical Modifications or Non-Carnonical Amino Acids
+To predict proteins with chemically modified residues or non-canonical amino acids, you can use residue replacement. This involves replacing the chemically modified residue or NCAA with a small molecule file that defines the structure of the modified residue. Here is an example of predicting a phosphorylated protein structure. (from `rf2aa/config/inference/residue_replacement.yaml`)
+```
+defaults:
+  - base
+job_name: "1h4x"
+
+protein_inputs:
+  A:
+    fasta_file: examples/residue_replacement/1h4x.fasta
+
+residue_replacement:
+  B:
+    protein_chain: A
+    residue_index_to_replace: 57
+    input: examples/residue_replacement/SEP_ideal_trim.sdf
+    input_type: "sdf"
+    N_index_atom: 1
+    C_index_atom: 5
+
+loader_params:
+  MAXCYCLE: 10
+```
+To predict the example, run:
+```
+python -m rf2aa.run_inference --config-name residue_replacement
+```
+In this example, the phosphoserine structure defined in `SEP_ideal_trim.sdf` is treated as an atomized residue that replaces residue 57 of chain A. Note that one oxygen atom of the carboxyl group has to be removed from the molecule file, so that only the carbonyl oxygen remains. `N_index_atom` and `C_index_atom` are the atoms of the molecule that bond to the previous and the next residue, respectively. `residue_index_to_replace`, `N_index_atom` and `C_index_atom` are all 1-based.
+
+
 <a id="outputs"></a>
 ### Understanding model outputs
 
diff --git a/examples/residue_replacement/1h4x.fasta b/examples/residue_replacement/1h4x.fasta
new file mode 100644
index 0000000..6efa63e
--- /dev/null
+++ b/examples/residue_replacement/1h4x.fasta
@@ -0,0 +1,3 @@
+>1H4X_1|Chains A, B|ANTI-SIGMA F FACTOR ANTAGONIST|BACILLUS SPHAERICUS (1421)
+MAFQLEMVTRETVVIRLFGELDHHAVEQIRAKISTAIFQGAVTTIIWNFERLSFMDSSGVGLVLGRMRELEAVAGRTILLNPSPTMRKVFQFSGLGPWMMDATEEEAIDRVRGIVNG
+
diff --git a/examples/residue_replacement/SEP_ideal_trim.sdf b/examples/residue_replacement/SEP_ideal_trim.sdf
new file mode 100644
index 0000000..359b31b
--- /dev/null
+++ b/examples/residue_replacement/SEP_ideal_trim.sdf
@@ -0,0 +1,37 @@
+SEP
+  PyMOL2.6          3D                             0
+
+ 10  9  0  0  1  0  0  0  0  0999 V2000
+    1.8550    0.4210    1.7510 N   0  0  0  0  0  0  0  0  0  0  0  0
+    0.4010    0.6200    1.6870 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -0.1390    0.0150    0.3910 C   0  0  0  0  0  0  0  0  0  0  0  0
+    0.4770    0.6550   -0.7270 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2490   -0.0530    2.8670 C   0  0  0  0  0  0  0  0  0  0  0  0
+    0.2540   -1.0380    3.3540 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1350   -0.0270   -2.0500 P   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.6010    0.1720   -2.0740 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.5200    0.6490   -3.3560 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.1910   -1.6030   -2.0410 O   0  0  0  0  0  0  0  0  0  0  0  0
+  1  2  1  0  0  0  0
+  2  3  1  0  0  0  0
+  2  5  1  0  0  0  0
+  3  4  1  0  0  0  0
+  4  7  1  0  0  0  0
+  5  6  2  0  0  0  0
+  7  8  2  0  0  0  0
+  7  9  1  0  0  0  0
+  7 10  1  0  0  0  0
+M  END
+>  <OPENEYE_ISO_SMILES>
+C([C@@H](C(=O)O)N)OP(=O)(O)O
+
+>  <OPENEYE_INCHI>
+InChI=1S/C3H8NO6P/c4-2(3(5)6)1-10-11(7,8)9/h2H,1,4H2,(H,5,6)(H2,7,8,9)/t2-/m0/s1
+
+>  <OPENEYE_INCHIKEY>
+BZQFBWGGLXLEPQ-REOHCLBHSA-N
+
+>  <FORMULA>
+C3H8NO6P
+
+$$$$
diff --git a/rf2aa/config/inference/residue_replacement.yaml b/rf2aa/config/inference/residue_replacement.yaml
new file mode 100644
index 0000000..687a7ba
--- /dev/null
+++ b/rf2aa/config/inference/residue_replacement.yaml
@@ -0,0 +1,19 @@
+defaults:
+  - base
+job_name: "1h4x"
+
+protein_inputs:
+  A:
+    fasta_file: examples/residue_replacement/1h4x.fasta
+
+residue_replacement:
+  B:
+    protein_chain: A
+    residue_index_to_replace: 57
+    input: examples/residue_replacement/SEP_ideal_trim.sdf
+    input_type: "sdf"
+    N_index_atom: 1
+    C_index_atom: 5
+
+loader_params:
+  MAXCYCLE: 10
\ No newline at end of file
diff --git a/rf2aa/data/covale.py b/rf2aa/data/covale.py
index 2aeeb49..65014e8 100644
--- a/rf2aa/data/covale.py
+++ b/rf2aa/data/covale.py
@@ -27,6 +27,31 @@ class AtomizedResidue:
     original_chain: str
     index_in_original_chain: int
 
+def load_residue_replacement(residue_replacement, model_runner):
+    """Load molecules that replace protein residues (modified residues / NCAAs).
+
+    Returns (chain id -> features, list of AtomizedResidue); ValueError on bad input_type.
+    """
+    chainid_to_input = {}
+    residues_to_atomize = []
+    for chain in residue_replacement:
+        spec = residue_replacement[chain]
+        input_type = spec["input_type"]
+        if input_type not in ("sdf", "mol2", "pdb"):  # raise, not assert: -O strips asserts
+            raise ValueError("only sdf, mol2 and pdb files are supported")
+        obmol, msa, ins, xyz, mask = parse_mol(
+            spec["input"], filetype=input_type, string=False, generate_conformer=True
+        )
+        chainid_to_input[chain] = compute_features_from_obmol(obmol, msa, xyz, model_runner)
+        residues_to_atomize.append(AtomizedResidue(
+            chain,
+            0,
+            int(spec.N_index_atom) - 1,  # presumably 1-based config -> 0-based index
+            int(spec.C_index_atom) - 1,
+            spec.protein_chain,
+            int(spec.residue_index_to_replace) - 1,
+        ))
+    return chainid_to_input, residues_to_atomize
 
 def load_covalent_molecules(protein_inputs, config, model_runner):
     if config.covale_inputs is None:
diff --git a/rf2aa/run_inference.py b/rf2aa/run_inference.py
index e25f322..4c6b87c 100644
--- a/rf2aa/run_inference.py
+++ b/rf2aa/run_inference.py
@@ -5,7 +5,7 @@
 from dataclasses import asdict
 
 from rf2aa.data.merge_inputs import merge_all
-from rf2aa.data.covale import load_covalent_molecules
+from rf2aa.data.covale import load_covalent_molecules, load_residue_replacement
 from rf2aa.data.nucleic_acid import load_nucleic_acid
 from rf2aa.data.protein import generate_msa_and_load_protein
 from rf2aa.data.small_molecule import load_small_molecule
@@ -86,10 +86,14 @@ def parse_inference_config(self):
                 sm_inputs[chain] = sm_input
 
         if self.config.residue_replacement is not None:
-            # add to the sm_inputs list
-            # add to residues to atomize
-            raise NotImplementedError("Modres inference is not implemented")
-        
+            for chain in self.config.residue_replacement:
+                protein_chain = self.config.residue_replacement[chain].protein_chain
+                if protein_chain not in protein_inputs:
+                    raise ValueError(f"Protein chain {protein_chain} not found in protein inputs")
+            # Load once, after validation, and merge into (not overwrite) sm_inputs.
+            replacement_inputs, residues_to_atomize_replacement = load_residue_replacement(self.config.residue_replacement, self)
+            sm_inputs.update(replacement_inputs)
+            residues_to_atomize.extend(residues_to_atomize_replacement)
         raw_data = merge_all(protein_inputs, na_inputs, sm_inputs, residues_to_atomize, deterministic=self.deterministic)
         self.raw_data = raw_data
 

From bf8822e20833434af8aac7ffa290f468b2e4b250 Mon Sep 17 00:00:00 2001
From: Guangfeng Zhou <gfzhou01@gmail.com>
Date: Wed, 12 Jun 2024 10:54:12 -0700
Subject: [PATCH 2/2] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 08d814e..5ab3dd3 100644
--- a/README.md
+++ b/README.md
@@ -264,7 +264,7 @@ becomes this so it can be parsed correctly:
 We know this syntax is hard to work with and we are happy to review PRs if anyone in the community can figure out how to specify all the necessary requirements in a more user friendly way!
 
 <a id="residue_replacement"></a>
-### Predicting Proteins with Chemical Modifications or Non-Carnonical Amino Acids
+### Predicting Proteins with Chemical Modifications or Non-Canonical Amino Acids
 To predict proteins with chemically modified residues or non-canonical amino acids, you can use residue replacement. This involves replacing the chemically modified residue or NCAA with a small molecule file that defines the structure of the modified residue. Here is an example of predicting a phosphorylated protein structure. (from `rf2aa/config/inference/residue_replacement.yaml`)
 ```
 defaults: