From 6220e85578a1e0e30617cbf860f0fbd24e66add4 Mon Sep 17 00:00:00 2001
From: Loris Ercole <30901257+lorisercole@users.noreply.github.com>
Date: Thu, 7 May 2020 12:12:59 +0200
Subject: [PATCH] Support more complex formula formats in
 `aiida.orm.data.cif.parse_formula` (#3954)

The new implementation now not only supports the very strict format of
the `_chemical_formula` tag of the CIF file format, but also allows
more complex versions of the Hill notation, including element groups
denoted by curly/square brackets or parentheses.

Additionally, `CifData.get_formulae` has a new optional argument called
`custom_tags`, which takes a single string or a list of strings that
correspond to CIF tags other than the default `_chemical_formula_{}`.
Certain CIF file providers provide the formulae in these non-default tags.
---
 aiida/orm/nodes/data/cif.py | 67 +++++++++++++++++++++++++------------
 tests/orm/data/test_cif.py  | 54 ++++++++++++++++++++++++++++++
 tests/test_dataclasses.py   | 18 ----------
 3 files changed, 99 insertions(+), 40 deletions(-)
 create mode 100644 tests/orm/data/test_cif.py

diff --git a/aiida/orm/nodes/data/cif.py b/aiida/orm/nodes/data/cif.py
index ef7abc411f..1b8b6f442a 100644
--- a/aiida/orm/nodes/data/cif.py
+++ b/aiida/orm/nodes/data/cif.py
@@ -10,6 +10,7 @@
 # pylint: disable=invalid-name,too-many-locals,too-many-statements
 """Tools for handling Crystallographic Information Files (CIF)"""
 
+import re
 from aiida.common.utils import Capturing
 
 from .singlefile import SinglefileData
@@ -196,27 +197,46 @@ def pycifrw_from_cif(datablocks, loops=None, names=None):
 
 def parse_formula(formula):
     """
-    Parses the Hill formulae, written with spaces for separators.
+    Parses the Hill formulae. Does not need spaces as separators.
+    Works also for partial occupancies and for chemical groups enclosed in round/square/curly brackets.
+    Elements are counted and a dictionary is returned.
+    e.g.  'C[NH2]3NO3'  -->  {'C': 1, 'N': 4, 'H': 6, 'O': 3}
     """
-    import re
+
+    def chemcount_str_to_number(string):
+        if not string:
+            quantity = 1
+        else:
+            quantity = float(string)
+            if quantity.is_integer():
+                quantity = int(quantity)
+        return quantity
 
     contents = {}
-    for part in re.split(r'\s+', formula):
-        m = re.match(r'(\D+)([\.\d]+)?', part)
 
-        if m is None:
+    # split blocks with parentheses
+    for block in re.split(r'(\([^\)]*\)[^A-Z\(\[\{]*|\[[^\]]*\][^A-Z\(\[\{]*|\{[^\}]*\}[^A-Z\(\[\{]*)', formula):
+        if not block:  # block is void
             continue
 
-        specie = m.group(1)
-        quantity = m.group(2)
-        if quantity is None:
-            quantity = 1
+        # get molecular formula (within parentheses) & count
+        group = re.search(r'[\{\[\(](.+)[\}\]\)]([\.\d]*)', block)
+        if group is None:  # block does not contain parentheses
+            molformula = block
+            molcount = 1
         else:
-            if re.match(r'^\d+$', quantity):
-                quantity = int(quantity)
-            else:
-                quantity = float(quantity)
-        contents[specie] = quantity
+            molformula = group.group(1)
+            molcount = chemcount_str_to_number(group.group(2))
+
+        for part in re.findall(r'[A-Z][^A-Z\s]*', molformula.replace(' ', '')):  # split at uppercase letters
+            match = re.match(r'(\D+)([\.\d]+)?', part)  # separates element and count
+
+            if match is None:
+                continue
+
+            species = match.group(1)
+            quantity = chemcount_str_to_number(match.group(2)) * molcount
+            contents[species] = contents.get(species, 0) + quantity
     return contents
 
 
@@ -527,7 +547,7 @@ def set_parse_policy(self, parse_policy):
         else:
             raise ValueError('Got unknown parse_policy {}'.format(parse_policy))
 
-    def get_formulae(self, mode='sum'):
+    def get_formulae(self, mode='sum', custom_tags=None):
         """
         Return chemical formulae specified in CIF file.
 
@@ -536,12 +556,19 @@ def get_formulae(self, mode='sum'):
         """
         # note: If formulae are not None, they could be returned
         # directly (but the function is very cheap anyhow).
-        formula_tag = '_chemical_formula_{}'.format(mode)
+        formula_tags = ['_chemical_formula_{}'.format(mode)]
+        if custom_tags:
+            if not isinstance(custom_tags, (list, tuple)):
+                custom_tags = [custom_tags]
+            formula_tags.extend(custom_tags)
+
         formulae = []
         for datablock in self.values.keys():
             formula = None
-            if formula_tag in self.values[datablock].keys():
-                formula = self.values[datablock][formula_tag]
+            for formula_tag in formula_tags:
+                if formula_tag in self.values[datablock].keys():
+                    formula = self.values[datablock][formula_tag]
+                    break
             formulae.append(formula)
 
         return formulae
@@ -577,8 +604,6 @@ def has_partial_occupancies(self):
 
         :return: True if there are partial occupancies, False otherwise
         """
-        import re
-
         tag = '_atom_site_occupancy'
 
         epsilon = 1e-6
@@ -628,8 +653,6 @@ def has_undefined_atomic_sites(self):
         :return: boolean, True if no atomic sites are defined or if any of the defined sites contain undefined positions
             and False otherwise
         """
-        import re
-
         tag_x = '_atom_site_fract_x'
         tag_y = '_atom_site_fract_y'
         tag_z = '_atom_site_fract_z'
diff --git a/tests/orm/data/test_cif.py b/tests/orm/data/test_cif.py
new file mode 100644
index 0000000000..5dd7e5eddc
--- /dev/null
+++ b/tests/orm/data/test_cif.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+###########################################################################
+# Copyright (c), The AiiDA team. All rights reserved.                     #
+# This file is part of the AiiDA code.                                    #
+#                                                                         #
+# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
+# For further information on the license, see the LICENSE.txt file        #
+# For further information please visit http://www.aiida.net               #
+###########################################################################
+"""Tests for cif related functions."""
+
+import pytest
+
+from aiida.orm.nodes.data.cif import parse_formula
+
+
+def test_parse_formula():
+    """Test the `parse_formula` utility function."""
+    assert parse_formula('C H') == {'C': 1, 'H': 1}
+    assert parse_formula('C5 H1') == {'C': 5, 'H': 1}
+    assert parse_formula('Ca5 Ho') == {'Ca': 5, 'Ho': 1}
+    assert parse_formula('H0.5 O') == {'H': 0.5, 'O': 1}
+    assert parse_formula('C0 O0') == {'C': 0, 'O': 0}
+    assert parse_formula('C1 H1 ') == {'C': 1, 'H': 1}
+    assert parse_formula(' C1 H1') == {'C': 1, 'H': 1}
+    assert parse_formula('CaHClO') == {'Ca': 1, 'H': 1, 'Cl': 1, 'O': 1}
+    assert parse_formula('C70 H108 Al4 La4 N4 O10') == {'C': 70, 'H': 108, 'Al': 4, 'La': 4, 'N': 4, 'O': 10}
+    assert parse_formula('C70H108Al4Li4N4O10') == {'C': 70, 'H': 108, 'Al': 4, 'Li': 4, 'N': 4, 'O': 10}
+    assert parse_formula('C36 H59LiN2 O3 Si') == {'C': 36, 'H': 59, 'Li': 1, 'N': 2, 'O': 3, 'Si': 1}
+    assert parse_formula('C63.5H83.5Li2N2O3.25P2') == {'C': 63.5, 'H': 83.5, 'Li': 2, 'N': 2, 'O': 3.25, 'P': 2}
+    assert parse_formula('Fe Li0.667 O4 P1') == {'Fe': 1, 'Li': 0.667, 'O': 4, 'P': 1}
+    assert parse_formula('Fe2.05Ni0.05O4 Zn0.9') == {'Fe': 2.05, 'Ni': 0.05, 'O': 4, 'Zn': 0.9}
+    assert parse_formula('Li3O6(Al0.23Li0.77)2(Li0.07Te0.93)') == {'Li': 4.61, 'O': 6, 'Al': 0.46, 'Te': 0.93}
+    assert parse_formula('Li2{Cr0.05Li0.95X0.00}{Cr0.24Li0.76}2{Li0.02Te0.98}O6') == {
+        'Li': 4.49,
+        'Cr': 0.53,
+        'X': 0,
+        'Te': 0.98,
+        'O': 6
+    }
+    assert parse_formula('C[NH2]3NO3') == {'C': 1, 'N': 4, 'H': 6, 'O': 3}
+    assert parse_formula('H80 C104{C0.50 X0.50}8N8 Cl4(Cl0.50X0.50)8.0O8') == {
+        'H': 80,
+        'C': 108.0,
+        'X': 8.0,
+        'N': 8,
+        'Cl': 8.0,
+        'O': 8
+    }
+    assert parse_formula('Na1.28[NH]0.28{N H2}0.72') == {'Na': 1.28, 'N': 1.0, 'H': 1.72}
+
+    for test_formula in ('H0.5.2 O', 'Fe2.05Ni0.05.4', 'Na1.28[NH]0.28.3{NH2}0.72'):
+        with pytest.raises(ValueError):
+            parse_formula(test_formula)
diff --git a/tests/test_dataclasses.py b/tests/test_dataclasses.py
index 47f9a7199c..ec908c05ea 100644
--- a/tests/test_dataclasses.py
+++ b/tests/test_dataclasses.py
@@ -575,24 +575,6 @@ def test_refine(self):
         with self.assertRaises(ValueError):
             ret_dict = refine_inline(c)
 
-    @unittest.skipIf(not has_ase(), 'Unable to import ase')
-    @unittest.skipIf(not has_pycifrw(), 'Unable to import PyCifRW')
-    @unittest.skipIf(not has_spglib(), 'Unable to import spglib')
-    def test_parse_formula(self):
-        from aiida.orm.nodes.data.cif import parse_formula
-
-        self.assertEqual(parse_formula('C H'), {'C': 1, 'H': 1})
-        self.assertEqual(parse_formula('C5 H1'), {'C': 5, 'H': 1})
-        self.assertEqual(parse_formula('Ca5 Ho'), {'Ca': 5, 'Ho': 1})
-        self.assertEqual(parse_formula('H0.5 O'), {'H': 0.5, 'O': 1})
-        self.assertEqual(parse_formula('C0 O0'), {'C': 0, 'O': 0})
-        self.assertEqual(parse_formula('C1 H1 '), {'C': 1, 'H': 1})  # Trailing spaces should be accepted
-        self.assertEqual(parse_formula(' C1 H1'), {'C': 1, 'H': 1})  # Leading spaces should be accepted
-
-        # Invalid literal for float()
-        with self.assertRaises(ValueError):
-            parse_formula('H0.5.2 O')
-
     @unittest.skipIf(not has_pycifrw(), 'Unable to import PyCifRW')
     def test_scan_type(self):
         """Check that different scan_types of PyCifRW produce the same result."""