From 6220e85578a1e0e30617cbf860f0fbd24e66add4 Mon Sep 17 00:00:00 2001 From: Loris Ercole <30901257+lorisercole@users.noreply.github.com> Date: Thu, 7 May 2020 12:12:59 +0200 Subject: [PATCH] Support more complex formula formats in `aiida.orm.data.cif.parse_formula` (#3954) The new implementation now not only supports the very strict format of the `_chemical_formula` tag of the CIF file format, but also allows more complex versions of the Hill notation, including element groups denoted by curly/square brackets or parentheses. Additionally, `CifData.get_formulae` has a new optional argument called `custom_tags` which takes a single string or list of strings that correspond to CIF tags other than the default `_chemical_formula_{}`. Certain CIF file providers provide the formulae in these non-default tags. --- aiida/orm/nodes/data/cif.py | 67 +++++++++++++++++++++++++------------ tests/orm/data/test_cif.py | 54 ++++++++++++++++++++++++++++++ tests/test_dataclasses.py | 18 ---------- 3 files changed, 99 insertions(+), 40 deletions(-) create mode 100644 tests/orm/data/test_cif.py diff --git a/aiida/orm/nodes/data/cif.py b/aiida/orm/nodes/data/cif.py index ef7abc411f..1b8b6f442a 100644 --- a/aiida/orm/nodes/data/cif.py +++ b/aiida/orm/nodes/data/cif.py @@ -10,6 +10,7 @@ # pylint: disable=invalid-name,too-many-locals,too-many-statements """Tools for handling Crystallographic Information Files (CIF)""" +import re from aiida.common.utils import Capturing from .singlefile import SinglefileData @@ -196,27 +197,46 @@ def pycifrw_from_cif(datablocks, loops=None, names=None): def parse_formula(formula): """ - Parses the Hill formulae, written with spaces for separators. + Parses the Hill formulae. Does not need spaces as separators. + Works also for partial occupancies and for chemical groups enclosed in round/square/curly brackets. + Elements are counted and a dictionary is returned. + e.g. 
'C[NH2]3NO3' --> {'C': 1, 'N': 4, 'H': 6, 'O': 3} """ - import re + + def chemcount_str_to_number(string): + if not string: + quantity = 1 + else: + quantity = float(string) + if quantity.is_integer(): + quantity = int(quantity) + return quantity contents = {} - for part in re.split(r'\s+', formula): - m = re.match(r'(\D+)([\.\d]+)?', part) - if m is None: + # split blocks with parentheses + for block in re.split(r'(\([^\)]*\)[^A-Z\(\[\{]*|\[[^\]]*\][^A-Z\(\[\{]*|\{[^\}]*\}[^A-Z\(\[\{]*)', formula): + if not block: # block is void continue - specie = m.group(1) - quantity = m.group(2) - if quantity is None: - quantity = 1 + # get molecular formula (within parentheses) & count + group = re.search(r'[\{\[\(](.+)[\}\]\)]([\.\d]*)', block) + if group is None: # block does not contain parentheses + molformula = block + molcount = 1 else: - if re.match(r'^\d+$', quantity): - quantity = int(quantity) - else: - quantity = float(quantity) - contents[specie] = quantity + molformula = group.group(1) + molcount = chemcount_str_to_number(group.group(2)) + + for part in re.findall(r'[A-Z][^A-Z\s]*', molformula.replace(' ', '')): # split at uppercase letters + match = re.match(r'(\D+)([\.\d]+)?', part) # separates element and count + + if match is None: + continue + + species = match.group(1) + quantity = chemcount_str_to_number(match.group(2)) * molcount + contents[species] = contents.get(species, 0) + quantity return contents @@ -527,7 +547,7 @@ def set_parse_policy(self, parse_policy): else: raise ValueError('Got unknown parse_policy {}'.format(parse_policy)) - def get_formulae(self, mode='sum'): + def get_formulae(self, mode='sum', custom_tags=None): """ Return chemical formulae specified in CIF file. @@ -536,12 +556,19 @@ def get_formulae(self, mode='sum'): """ # note: If formulae are not None, they could be returned # directly (but the function is very cheap anyhow). 
- formula_tag = '_chemical_formula_{}'.format(mode) + formula_tags = ['_chemical_formula_{}'.format(mode)] + if custom_tags: + if not isinstance(custom_tags, (list, tuple)): + custom_tags = [custom_tags] + formula_tags.extend(custom_tags) + formulae = [] for datablock in self.values.keys(): formula = None - if formula_tag in self.values[datablock].keys(): - formula = self.values[datablock][formula_tag] + for formula_tag in formula_tags: + if formula_tag in self.values[datablock].keys(): + formula = self.values[datablock][formula_tag] + break formulae.append(formula) return formulae @@ -577,8 +604,6 @@ def has_partial_occupancies(self): :return: True if there are partial occupancies, False otherwise """ - import re - tag = '_atom_site_occupancy' epsilon = 1e-6 @@ -628,8 +653,6 @@ def has_undefined_atomic_sites(self): :return: boolean, True if no atomic sites are defined or if any of the defined sites contain undefined positions and False otherwise """ - import re - tag_x = '_atom_site_fract_x' tag_y = '_atom_site_fract_y' tag_z = '_atom_site_fract_z' diff --git a/tests/orm/data/test_cif.py b/tests/orm/data/test_cif.py new file mode 100644 index 0000000000..5dd7e5eddc --- /dev/null +++ b/tests/orm/data/test_cif.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +########################################################################### +# Copyright (c), The AiiDA team. All rights reserved. # +# This file is part of the AiiDA code. 
# +# # +# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core # +# For further information on the license, see the LICENSE.txt file # +# For further information please visit http://www.aiida.net # +########################################################################### +"""Tests for cif related functions.""" + +import pytest + +from aiida.orm.nodes.data.cif import parse_formula + + +def test_parse_formula(): + """Test the `parse_formula` utility function.""" + assert parse_formula('C H') == {'C': 1, 'H': 1} + assert parse_formula('C5 H1') == {'C': 5, 'H': 1} + assert parse_formula('Ca5 Ho') == {'Ca': 5, 'Ho': 1} + assert parse_formula('H0.5 O') == {'H': 0.5, 'O': 1} + assert parse_formula('C0 O0') == {'C': 0, 'O': 0} + assert parse_formula('C1 H1 ') == {'C': 1, 'H': 1} + assert parse_formula(' C1 H1') == {'C': 1, 'H': 1} + assert parse_formula('CaHClO') == {'Ca': 1, 'H': 1, 'Cl': 1, 'O': 1} + assert parse_formula('C70 H108 Al4 La4 N4 O10') == {'C': 70, 'H': 108, 'Al': 4, 'La': 4, 'N': 4, 'O': 10} + assert parse_formula('C70H108Al4Li4N4O10') == {'C': 70, 'H': 108, 'Al': 4, 'Li': 4, 'N': 4, 'O': 10} + assert parse_formula('C36 H59LiN2 O3 Si') == {'C': 36, 'H': 59, 'Li': 1, 'N': 2, 'O': 3, 'Si': 1} + assert parse_formula('C63.5H83.5Li2N2O3.25P2') == {'C': 63.5, 'H': 83.5, 'Li': 2, 'N': 2, 'O': 3.25, 'P': 2} + assert parse_formula('Fe Li0.667 O4 P1') == {'Fe': 1, 'Li': 0.667, 'O': 4, 'P': 1} + assert parse_formula('Fe2.05Ni0.05O4 Zn0.9') == {'Fe': 2.05, 'Ni': 0.05, 'O': 4, 'Zn': 0.9} + assert parse_formula('Li3O6(Al0.23Li0.77)2(Li0.07Te0.93)') == {'Li': 4.61, 'O': 6, 'Al': 0.46, 'Te': 0.93} + assert parse_formula('Li2{Cr0.05Li0.95X0.00}{Cr0.24Li0.76}2{Li0.02Te0.98}O6') == { + 'Li': 4.49, + 'Cr': 0.53, + 'X': 0, + 'Te': 0.98, + 'O': 6 + } + assert parse_formula('C[NH2]3NO3') == {'C': 1, 'N': 4, 'H': 6, 'O': 3} + assert parse_formula('H80 C104{C0.50 X0.50}8N8 Cl4(Cl0.50X0.50)8.0O8') == { + 'H': 80, + 'C': 108.0, + 'X': 8.0, + 'N': 8, + 'Cl': 8.0, 
+ 'O': 8 + } + assert parse_formula('Na1.28[NH]0.28{N H2}0.72') == {'Na': 1.28, 'N': 1.0, 'H': 1.72} + + for test_formula in ('H0.5.2 O', 'Fe2.05Ni0.05.4', 'Na1.28[NH]0.28.3{NH2}0.72'): + with pytest.raises(ValueError): + parse_formula(test_formula) diff --git a/tests/test_dataclasses.py b/tests/test_dataclasses.py index 47f9a7199c..ec908c05ea 100644 --- a/tests/test_dataclasses.py +++ b/tests/test_dataclasses.py @@ -575,24 +575,6 @@ def test_refine(self): with self.assertRaises(ValueError): ret_dict = refine_inline(c) - @unittest.skipIf(not has_ase(), 'Unable to import ase') - @unittest.skipIf(not has_pycifrw(), 'Unable to import PyCifRW') - @unittest.skipIf(not has_spglib(), 'Unable to import spglib') - def test_parse_formula(self): - from aiida.orm.nodes.data.cif import parse_formula - - self.assertEqual(parse_formula('C H'), {'C': 1, 'H': 1}) - self.assertEqual(parse_formula('C5 H1'), {'C': 5, 'H': 1}) - self.assertEqual(parse_formula('Ca5 Ho'), {'Ca': 5, 'Ho': 1}) - self.assertEqual(parse_formula('H0.5 O'), {'H': 0.5, 'O': 1}) - self.assertEqual(parse_formula('C0 O0'), {'C': 0, 'O': 0}) - self.assertEqual(parse_formula('C1 H1 '), {'C': 1, 'H': 1}) # Trailing spaces should be accepted - self.assertEqual(parse_formula(' C1 H1'), {'C': 1, 'H': 1}) # Leading spaces should be accepted - - # Invalid literal for float() - with self.assertRaises(ValueError): - parse_formula('H0.5.2 O') - @unittest.skipIf(not has_pycifrw(), 'Unable to import PyCifRW') def test_scan_type(self): """Check that different scan_types of PyCifRW produce the same result."""