From e2f235f9f46cbf2a275227b52c0195f1fc0d1b46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yifan=20Li=E6=9D=8E=E4=B8=80=E5=B8=86?= Date: Tue, 2 Apr 2024 20:43:48 -0400 Subject: [PATCH] add support for n2p2 data format (#627) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 1 + dpdata/plugins/n2p2.py | 195 +++++++++++++++++++++++++++++++++++++++++ tests/n2p2/input.data | 20 +++++ tests/test_n2p2.py | 72 +++++++++++++++ 4 files changed, 288 insertions(+) create mode 100644 dpdata/plugins/n2p2.py create mode 100644 tests/n2p2/input.data create mode 100644 tests/test_n2p2.py diff --git a/README.md b/README.md index 67942ac8..ab489a75 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ The `System` or `LabeledSystem` can be constructed from the following file forma | ABACUS | STRU | True | True | LabeledSystem | 'abacus/relax' | | ase | structure | True | True | MultiSystems | 'ase/structure' | | DFTB+ | dftbplus | False | True | LabeledSystem | 'dftbplus' | +| n2p2 | n2p2 | True | True | LabeledSystem | 'n2p2' | The Class `dpdata.MultiSystems` can read data from a dir which may contains many files of different systems, or from single xyz file which contains different systems. diff --git a/dpdata/plugins/n2p2.py b/dpdata/plugins/n2p2.py new file mode 100644 index 00000000..7162f09f --- /dev/null +++ b/dpdata/plugins/n2p2.py @@ -0,0 +1,195 @@ +import numpy as np + +from dpdata.format import Format + +from ..unit import EnergyConversion, ForceConversion, LengthConversion + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + + +def match_indices(atype1, atype2): + # Ensure atype2 is a numpy array for efficient operations + atype2 = np.array(atype2) + # Placeholder for matched indices + matched_indices = [] + # Track used indices to handle duplicates + used_indices = set() + + # Iterate over each element in atype1 + for element in atype1: + # Find all indices of the current element in atype2 + # np.where returns a tuple, so [0] is used to access the array of indices + indices = np.where(atype2 == element)[0] + + # Find the first unused index + for index in indices: + if index not in used_indices: + # Add the index to the results and mark it as used + matched_indices.append(index) + used_indices.add(index) + break # Move to the next element in atype1 + + return matched_indices + + +@Format.register("n2p2") +class N2P2Format(Format): + """n2p2. + + This class support the conversion from and to the training data of n2p2 format. + For more information about the n2p2 format, please refer to https://compphysvienna.github.io/n2p2/topics/cfg_file.html + """ + + def from_labeled_system(self, file_name, **kwargs): + """Read from n2p2 format. + + Parameters + ---------- + file_name : str + file name, i.e. the first argument + **kwargs : dict + keyword arguments that will be passed from the method + + Returns + ------- + data : dict + system data, whose keys are defined in LabeledSystem.DTYPES + """ + cells = [] + coords = [] + atypes = [] + forces = [] + energies = [] + natom0 = None + natoms0 = None + atom_types0 = None + with open(file_name) as file: + for line in file: + line = line.strip() # Remove leading/trailing whitespace + if line.lower() == "begin": + current_section = [] # Start a new section + cell = [] + coord = [] + atype = [] + force = [] + energy = None + elif line.lower() == "end": + # If we are at the end of a section, process the section + assert ( + len(coord) == len(atype) == len(force) + ), "Number of atoms, atom types, and forces must match." + + # Check if the number of atoms is consistent across all frames + natom = len(coord) + if natom0 is None: + natom0 = natom + else: + assert ( + natom == natom0 + ), "The number of atoms in all frames must be the same." + + # Check if the number of atoms of each type is consistent across all frames + atype = np.array(atype) + unique_dict = {element: None for element in atype} + unique_atypes = np.array(list(unique_dict.keys())) + unique_atypes_list = list(unique_atypes) + ntypes = len(unique_atypes) + natoms = [len(atype[atype == at]) for at in unique_atypes] + if natoms0 is None: + natoms0 = natoms + else: + assert ( + natoms == natoms0 + ), "The number of atoms of each type in all frames must be the same." + if atom_types0 is None: + atom_types0 = atype + atom_order = match_indices(atom_types0, atype) + + cell = np.array(cell, dtype=float) + coord = np.array(coord, dtype=float)[atom_order] + force = np.array(force, dtype=float)[atom_order] + + cells.append(cell) + coords.append(coord) + forces.append(force) + energies.append(float(energy)) + + current_section = None # Reset for the next section + elif current_section is not None: + # If we are inside a section, append the line to the current section + # current_section.append(line) + line_contents = line.split() + if line_contents[0] == "lattice": + cell.append(line_contents[1:]) + elif line_contents[0] == "atom": + coord.append(line_contents[1:4]) + atype.append(line_contents[4]) + force.append(line_contents[7:10]) + elif line_contents[0] == "energy": + energy = line_contents[1] + + atom_names = unique_atypes_list + atom_numbs = natoms + atom_types = np.zeros(len(atom_types0), dtype=int) + for i in range(ntypes): + atom_types[atom_types0 == unique_atypes_list[i]] = i + + cells = np.array(cells) * length_convert + coords = np.array(coords) * length_convert + forces = np.array(forces) * force_convert + energies = np.array(energies) * energy_convert + + return { + "atom_names": list(atom_names), + "atom_numbs": list(atom_numbs), + "atom_types": atom_types, + "coords": coords, + "cells": cells, + "nopbc": False, + "orig": np.zeros(3), + "energies": energies, + "forces": forces, + } + + def to_labeled_system(self, data, file_name, **kwargs): + """Write n2p2 format. + + By default, LabeledSystem.to will fallback to System.to. + + Parameters + ---------- + data : dict + system data, whose keys are defined in LabeledSystem.DTYPES + file_name : str + file name, where the data will be written + *args : list + arguments that will be passed from the method + **kwargs : dict + keyword arguments that will be passed from the method + """ + buff = [] + nframe = len(data["energies"]) + natom = len(data["atom_types"]) + atom_names = data["atom_names"] + for frame in range(nframe): + coord = data["coords"][frame] / length_convert + force = data["forces"][frame] / force_convert + energy = data["energies"][frame] / energy_convert + cell = data["cells"][frame] / length_convert + atype = data["atom_types"] + buff.append("begin") + for i in range(3): + buff.append( + f"lattice {cell[i][0]:15.6f} {cell[i][1]:15.6f} {cell[i][2]:15.6f}" + ) + for i in range(natom): + buff.append( + f"atom {coord[i][0]:15.6f} {coord[i][1]:15.6f} {coord[i][2]:15.6f} {atom_names[atype[i]]:>7} {0:15.6f} {0:15.6f} {force[i][0]:15.6e} {force[i][1]:15.6e} {force[i][2]:15.6e}" + ) + buff.append(f"energy {energy:15.6f}") + buff.append(f"charge {0:15.6f}") + buff.append("end") + with open(file_name, "w") as fp: + fp.write("\n".join(buff)) diff --git a/tests/n2p2/input.data b/tests/n2p2/input.data new file mode 100644 index 00000000..02eef060 --- /dev/null +++ b/tests/n2p2/input.data @@ -0,0 +1,20 @@ +begin +lattice 18.897261 0.000000 0.000000 +lattice 0.000000 18.897261 0.000000 +lattice 0.000000 0.000000 18.897261 +atom 1.889726 0.000000 0.000000 O 0.000000 0.000000 9.723452e-03 0.000000e+00 0.000000e+00 +atom 0.000000 0.000000 2.834589 H 0.000000 0.000000 0.000000e+00 0.000000e+00 1.458518e-02 +atom 1.889726 0.000000 5.669178 H 0.000000 0.000000 9.723452e-03 0.000000e+00 2.917036e-02 +energy 0.044099 +charge 0.000000 +end +begin +lattice 18.897261 0.000000 0.000000 +lattice 0.000000 18.897261 0.000000 +lattice 0.000000 0.000000 18.897261 +atom 3.779452 1.889726 1.889726 O 0.000000 0.000000 4.861726e-02 3.889381e-02 3.889381e-02 +atom 1.889726 1.889726 4.724315 H 0.000000 0.000000 3.889381e-02 3.889381e-02 5.347899e-02 +atom 3.779452 1.889726 7.558904 H 0.000000 0.000000 4.861726e-02 3.889381e-02 6.806416e-02 +energy 0.084523 +charge 0.000000 +end \ No newline at end of file diff --git a/tests/test_n2p2.py b/tests/test_n2p2.py new file mode 100644 index 00000000..855a2752 --- /dev/null +++ b/tests/test_n2p2.py @@ -0,0 +1,72 @@ +import os +import unittest + +import numpy as np +from context import dpdata + +from dpdata.unit import EnergyConversion, ForceConversion, LengthConversion + +length_convert = LengthConversion("bohr", "angstrom").value() +energy_convert = EnergyConversion("hartree", "eV").value() +force_convert = ForceConversion("hartree/bohr", "eV/angstrom").value() + + +class TestN2P2(unittest.TestCase): + def setUp(self): + self.data_ref = { + "atom_numbs": [1, 2], + "atom_names": ["O", "H"], + "atom_types": np.array([0, 1, 1]), + "orig": np.array([0.0, 0.0, 0.0]), + "cells": np.array( + [ + [[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]], + [[10.0, 0.0, 0.0], [0.0, 10.0, 0.0], [0.0, 0.0, 10.0]], + ] + ), + "coords": np.array( + [ + [[1.0, 0.0, 0.0], [0.0, 0.0, 1.5], [1.0, 0.0, 3.0]], + [[2.0, 1.0, 1.0], [1.0, 1.0, 2.5], [2.0, 1.0, 4.0]], + ] + ), + "energies": np.array([1.2, 2.3]), + "forces": np.array( + [ + [[0.5, 0.0, 0.0], [0.0, 0.0, 0.75], [0.5, 0.0, 1.5]], + [[2.5, 2.0, 2.0], [2.0, 2.0, 2.75], [2.5, 2.0, 3.5]], + ] + ), + } + + def test_n2p2_from_labeled_system(self): + data = dpdata.LabeledSystem("n2p2/input.data", fmt="n2p2") + for key in self.data_ref: + if key == "atom_numbs": + self.assertEqual(data[key], self.data_ref[key]) + elif key == "atom_names": + self.assertEqual(data[key], self.data_ref[key]) + elif key == "atom_types": + np.testing.assert_array_equal(data[key], self.data_ref[key]) + else: + np.testing.assert_array_almost_equal( + data[key], self.data_ref[key], decimal=5 + ) + + def test_n2p2_to_labeled_system(self): + output_file = "n2p2/output.data" + data = dpdata.LabeledSystem.from_dict({"data": self.data_ref}) + data.to_n2p2(output_file) + ref_file = "n2p2/input.data" + with open(ref_file) as file1, open(output_file) as file2: + file1_lines = file1.readlines() + file2_lines = file2.readlines() + + file1_lines = [line.strip("\n") for line in file1_lines] + file2_lines = [line.strip("\n") for line in file2_lines] + + self.assertListEqual(file1_lines, file2_lines) + + def tearDown(self): + if os.path.isfile("n2p2/output.data"): + os.remove("n2p2/output.data")