Add explicit_hydrogen parameter

biotite-dev · Feb 7, 2025 · f386ccb · f386ccb
1 parent 4c7204a
commit f386ccb
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 10 deletions.
diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py
@@ -76,6 +76,7 @@ def to_mol(
     kekulize=False,
     use_dative_bonds=False,
     include_extra_annotations=(),
+    explicit_hydrogen=True,
 ):
     """
     Convert an :class:`.AtomArray` or :class:`.AtomArrayStack` into a
@@ -105,6 +106,11 @@ def to_mol(
         are always included per default. These standard annotations can be accessed
         with :meth:`rdkit.Chem.rdchem.Atom.GetPDBResidueInfo()` for each atom in the
         returned :class:`rdkit.Chem.rdchem.Mol`.
+    explicit_hydrogen : bool, optional
+        If set to true, the conversion process expects that all hydrogen atoms are
+        explicit, i.e. each each hydrogen atom must be part of the :class:`AtomArray`.
+        If set to false, the conversion process treats all hydrogen atoms as implicit
+        and all hydrogen atoms in the :class:`AtomArray` are removed.
 
     Returns
     -------
@@ -141,12 +147,24 @@ def to_mol(
     HB3
     HXT
     """
+    hydrogen_mask = atoms.element == "H"
+    if explicit_hydrogen:
+        if not hydrogen_mask.any():
+            warnings.warn(
+                "No hydrogen found in the input, although 'explicit_hydrogen' is 'True'"
+            )
+    else:
+        atoms = atoms[..., ~hydrogen_mask]
+
     mol = EditableMol(Mol())
 
     has_annot = frozenset(atoms.get_annotation_categories())
     extra_annot = set(include_extra_annotations) - _STANDARD_ANNOTATIONS
+
     for i in range(atoms.array_length()):
         rdkit_atom = Atom(atoms.element[i].capitalize())
+        if explicit_hydrogen:
+            rdkit_atom.SetNoImplicit(True)
         if "charge" in has_annot:
             rdkit_atom.SetFormalCharge(atoms.charge[i].item())
 
@@ -176,11 +194,12 @@ def to_mol(
 
     if atoms.bonds is None:
         raise BadStructureError("An AtomArray with associated BondList is required")
-    bonds = atoms.bonds.as_array()
     if kekulize:
-        bonds = bonds.copy()
+        bonds = atoms.bonds.copy()
         bonds.remove_aromaticity()
-    for atom_i, atom_j, bond_type in atoms.bonds.as_array():
+    else:
+        bonds = atoms.bonds
+    for atom_i, atom_j, bond_type in bonds.as_array():
         if not use_dative_bonds and bond_type == BondType.COORDINATION:
             bond_type = BondType.SINGLE
         mol.AddBond(
@@ -367,7 +386,7 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None):
         except KekulizeException:
             warnings.warn(
                 "Kekulization failed, "
-                "using 'BondType.ANY' instead for aromatic bonds instead",
+                "using 'BondType.AROMATIC' instead for aromatic bonds instead",
                 LossyConversionWarning,
             )
         rdkit_bonds = list(mol.GetBonds())

diff --git a/tests/interface/test_rdkit.py b/tests/interface/test_rdkit.py
@@ -20,11 +20,9 @@ def _load_smiles():
         return file.read().splitlines()
 
 
-@pytest.mark.filterwarnings(
-    "ignore:"
-    "The coordinates are missing for some atoms. "
-    "The fallback coordinates will be used instead"
-)
+@pytest.mark.filterwarnings("ignore:Missing coordinates.*")
+@pytest.mark.filterwarnings("ignore:.*coordinates are missing.*")
+@pytest.mark.filterwarnings("ignore::biotite.interface.LossyConversionWarning")
 @pytest.mark.parametrize(
     "res_name", np.random.default_rng(0).choice(info.all_residues(), size=200).tolist()
 )
@@ -46,7 +44,7 @@ def test_conversion_from_biotite(res_name):
     # Some compounds in the CCD have missing coordinates
     assert np.allclose(test_atoms.coord, ref_atoms.coord, equal_nan=True)
 
-    # There should be now undefined bonds
+    # There should be no undefined bonds
     assert (test_atoms.bonds.as_array()[:, 2] != struc.BondType.ANY).all()
     # Kekulization returns one of multiple resonance structures, so the returned one
     # might not be the same as the input