From e8601c4c501db7e9f43c5ae30f08446b3a52c7c9 Mon Sep 17 00:00:00 2001
From: Michele Ceriotti <ceriottm@users.noreply.github.com>
Date: Fri, 8 Mar 2024 15:47:20 +0100
Subject: [PATCH] Adding utilities to parse i-PI output files into more
 standard python / ase objects (#313)

* Added utilities to parse ipi output files, and also mention this in the docs
* Disable installation of tests, as it's broken!
---
 docs/src/getting-started.rst                  |   3 +-
 docs/src/output-files.rst                     |  33 +++
 .../piglet_liquid_water/input.xml             |   1 +
 ipi/__init__.py                               |  14 +-
 ipi/utils/__init__.py                         |   1 +
 ipi/utils/io/io_units.py                      |   5 +-
 ipi/utils/messages.py                         |   2 +-
 ipi/utils/parsing.py                          | 222 ++++++++++++++++++
 setup.cfg                                     |   2 +-
 setup.py                                      |   8 +-
 10 files changed, 277 insertions(+), 14 deletions(-)
 create mode 100644 ipi/utils/parsing.py
diff --git a/docs/src/getting-started.rst b/docs/src/getting-started.rst
index 8d352a782..113c7516a 100644
--- a/docs/src/getting-started.rst
+++ b/docs/src/getting-started.rst
@@ -16,9 +16,8 @@ as the system has installed:
 
 -  Python version 3.6 or greater (starting from version 2.0, i-PI is not Python2
    compatible)
-
 -  The Python numerical library NumPy
-src/getting-started.rst-
+
 See :ref:`runningsimulations` for more details on how to launch i-PI.
 
 Additionally, most client codes will have their own requirements. Many
diff --git a/docs/src/output-files.rst b/docs/src/output-files.rst
index eda6e1f42..e075f21ec 100644
--- a/docs/src/output-files.rst
+++ b/docs/src/output-files.rst
@@ -218,3 +218,36 @@ run
 
    > python i-pi RESTART
 
+
+Reading output files
+--------------------
+
+It can be useful to parse the output files of i-PI into a format that can
+be readily manipulated in a custom Python script. To this end, i-PI provides
+a few utilities. `ipi.read_output` that can be used to parse a property output 
+file, that returns data blocks as a dictionary of numpy array, and additional
+information from the header (such as units, and the description of each
+property) as a separate dictionary
+
+.. code-block::
+
+   from ipi import read_output
+   data, info = read_output("simulation.out")
+
+Trajectory files can be read with `ipi.read_trajectory`. This reads the 
+trajectory output into a list of `ase.Atoms` objects (hence this functionality
+has a dependency on `ase`), converting positions and cell to angstrom, and 
+moving other properties to arrays and converting 
+them to ASE units (e.g. forces are in eV/Å). `extras` output files (that
+contain either numerical data, or raw strings, returned by the driver code
+in addition to energy and forces) can be processed by using `format='extras'`
+and an option: the call will then return a dictionary with an entry having the
+name of the type of extra (if present) and either a list of the raw strings, 
+or a numpy array with the data. A second dictionary entry contains the list
+of step numbers. 
+
+.. code-block::
+
+   from ipi import read_trajectory
+   data = read_trajectory("simulation.dipoles", format="extras")
+
diff --git a/examples/features/path_integral_molecular_dynamics/piglet_liquid_water/input.xml b/examples/features/path_integral_molecular_dynamics/piglet_liquid_water/input.xml
index 8065346ea..1db387739 100644
--- a/examples/features/path_integral_molecular_dynamics/piglet_liquid_water/input.xml
+++ b/examples/features/path_integral_molecular_dynamics/piglet_liquid_water/input.xml
@@ -2,6 +2,7 @@
   <output prefix='simulation'>
     <properties stride='1' filename='out'>  [ step, time{picosecond}, conserved, temperature{kelvin}, kinetic_cv, potential, pressure_cv{megapascal}] </properties>
     <trajectory filename='pos' stride='20'> positions{angstrom} </trajectory>
+    <trajectory filename='for' stride='20'> forces </trajectory>
   </output>
   <total_steps> 100 </total_steps>
   <prng>
diff --git a/ipi/__init__.py b/ipi/__init__.py
index c37bad218..54fecc771 100644
--- a/ipi/__init__.py
+++ b/ipi/__init__.py
@@ -2,6 +2,18 @@
 The i-PI package.
 """
 
-__all__ = ["clients", "engine", "inputs", "interfaces", "utils", "ipi_global_settings"]
+# expose some utility functions in a more direct way
+from ipi.utils.parsing import read_output, read_trajectory
+
+__all__ = [
+    "clients",
+    "engine",
+    "inputs",
+    "interfaces",
+    "utils",
+    "ipi_global_settings",
+    "read_output",
+    "read_trajectory",
+]
 
 ipi_global_settings = {"floatformat": "%16.8e"}
diff --git a/ipi/utils/__init__.py b/ipi/utils/__init__.py
index acbbb8f32..e084081d6 100644
--- a/ipi/utils/__init__.py
+++ b/ipi/utils/__init__.py
@@ -11,6 +11,7 @@
     "mathtools",
     "prng",
     "inputvalue",
+    "parsing",
     "nmtransform",
     "messages",
     "softexit",
diff --git a/ipi/utils/io/io_units.py b/ipi/utils/io/io_units.py
index fcd649d52..b01685575 100755
--- a/ipi/utils/io/io_units.py
+++ b/ipi/utils/io/io_units.py
@@ -125,7 +125,4 @@ def process_units(
     atoms.names[:] = names
     atoms.m[:] = masses
 
-    return {
-        "atoms": atoms,
-        "cell": cell,
-    }
+    return {"atoms": atoms, "cell": cell, "comment": comment}
diff --git a/ipi/utils/messages.py b/ipi/utils/messages.py
index 5d3cadf2f..35d122d70 100644
--- a/ipi/utils/messages.py
+++ b/ipi/utils/messages.py
@@ -107,7 +107,7 @@ def banner():
 |  #################################  |
 \__#_/     \____/     \____/     \_#__/
    #    _        _______  _____    #
-   #   (_)      |_   __ \|_   _|   #      -*-     v 2.6.0  -*-
+   #   (_)      |_   __ \|_   _|   #      -*-     v 2.6.3  -*-
    #   __  ______ | |__) | | |     #
    Y  [  ||______||  ___/  | |     #      A Universal Force Engine
   0 0  | |       _| |_    _| |_    #
diff --git a/ipi/utils/parsing.py b/ipi/utils/parsing.py
new file mode 100644
index 000000000..a1612ea5b
--- /dev/null
+++ b/ipi/utils/parsing.py
@@ -0,0 +1,222 @@
+""" Utility functions to parse i-PI output files.
+
+These are meant to be used in Python post-processing pipelines, so
+trajectory files are read as ASE objects (assuming units to be 
+Angstrom and eV), and output files are read in a dictionary of 
+numpy arrays.
+"""
+
+import re
+import numpy as np
+from ipi.utils.units import unit_to_user
+
+try:
+    import ase
+except ImportError:
+    ase = None
+
+__all__ = ["read_output", "read_trajectory"]
+
+
+def read_output(filename):
+    """Reads an i-PI output file and returns a dictionary with the properties in a tidy order,
+    and information on units and descriptions of the content.
+
+    Usage:
+        read_output("filename")
+
+    Returns:
+        values, info
+
+        values: a dictionary with the property names as keys, and the values as numpy arrays
+        info: a dictionary with the property names as keys and as values tuples of (units, description)
+    """
+
+    # Regex pattern to match header lines and capture relevant parts
+    header_pattern = re.compile(
+        r"#\s*(column|cols\.)\s+(\d+)(?:-(\d+))?\s*-->\s*([^\s\{]+)(?:\{([^\}]+)\})?\s*:\s*(.*)"
+    )
+
+    # Reading the file
+    with open(filename, "r") as file:
+        lines = file.readlines()
+
+    header_lines = [line for line in lines if line.startswith("#")]
+    data_lines = [line for line in lines if not line.startswith("#") and line.strip()]
+
+    # Interprets properties
+    properties = {}
+    for line in header_lines:
+        match = header_pattern.match(line)
+        if match:
+            # Extracting matched groups
+            col_type, start_col, end_col, property_name, units, description = (
+                match.groups()
+            )
+            col_info = f"{start_col}-{end_col}" if end_col else start_col
+            properties[col_info] = {
+                "name": property_name,
+                "units": units,
+                "description": description,
+            }
+
+    # Parse data
+    values_dict = {}
+    info_dict = {}
+    for col_info, prop_info in properties.items():
+        # Initialize list to hold values for each property
+        values_dict[prop_info["name"]] = []
+        # Save units and description
+        info_dict[prop_info["name"]] = (prop_info["units"], prop_info["description"])
+
+    for line in data_lines:
+        values = line.split()
+        for column_info, prop_info in properties.items():
+            if "-" in column_info:  # Multi-column property
+                start_col, end_col = map(
+                    int, column_info.split("-")
+                )  # 1-based indexing
+                prop_values = values[
+                    start_col - 1 : end_col
+                ]  # Adjust to 0-based indexing
+            else:  # Single column property
+                col_index = int(column_info) - 1  # Adjust to 0-based indexing
+                prop_values = [values[col_index]]
+
+            values_dict[prop_info["name"]].append([float(val) for val in prop_values])
+
+    for prop_name, prop_values in values_dict.items():
+        values_dict[prop_name] = np.array(
+            prop_values
+        ).squeeze()  # make 1-col into a flat array
+
+    return values_dict, info_dict
+
+
+def read_trajectory(
+    filename,
+    format=None,
+    dimension="automatic",
+    units="automatic",
+    cell_units="automatic",
+):
+    """Reads a file in i-PI format and returns it in ASE format.
+
+    `format` can be `xyz` (i-PI formatted), `pdb`, `binary`, `json`, `ase`, and if not specified it'll
+    be inferred from the filename extension. `extras` will read a trajectory containing the additional
+    data returned from a calculator; will try to read as a float array, and if it fails it'll resort
+    to returning a list of raw strings.
+    units can be inferred from the file content, or specified with `dimension`, `units` and `cell_units`
+    """
+
+    if ase is None:
+        raise ImportError(
+            "read_trajectory requires the `ase` package to return the structure in ASE format"
+        )
+
+    from ipi.utils.io import read_file
+
+    if format is None:
+        # tries to infer the file format
+        format = filename[filename.rfind(".") + 1 :]
+        if format not in ["xyz", "pdb", "binary", "json", "ase"]:
+            raise ValueError(f"Unrecognized file format: {format}")
+
+    file_handle = open(filename, "r")
+    bohr2angstrom = unit_to_user("length", "angstrom", 1.0)
+    comment_regex = re.compile(r"(\w+)\{([^}]+)\}")
+    step_regex = re.compile(r"Step:\s+(\d+)")
+
+    frames = []
+    while True:
+        try:
+            if format == "extras":
+                file = open(filename, "r")
+                step_regex = re.compile(r"#EXTRAS\((\w+)\)# *Step:\s+(\d+)")
+                step_list = []
+                data_list = []
+                data_frame = []
+                is_array = True
+                property_name = ""
+                for line in file:
+                    matches = step_regex.findall(line)
+                    if len(matches) > 0:
+                        if len(data_frame) > 0:
+                            try:
+                                data_processed = np.loadtxt(data_frame)
+                            except:
+                                is_array = False
+                                data_processed = "\n".join(data_frame)
+                            data_list.append(data_processed)
+                            data_frame = []
+
+                        # Found a new step, update current_step and initialize the list for data
+                        step_list.append(int(matches[0][1]))
+                        if property_name == "":
+                            property_name = matches[0][0]
+                        elif property_name != matches[0][0]:
+                            raise ValueError(
+                                f"Inconsistent property {matches[0][0]} found in extras containing {property_name}"
+                            )
+                    else:
+                        data_frame.append(line)
+
+                if len(data_frame) > 0:
+                    try:
+                        data_processed = np.loadtxt(data_frame)
+                    except:
+                        is_array = False
+                        data_processed = "\n".join(data_frame)
+                    data_list.append(data_processed)
+                if is_array:
+                    data_list = np.array(data_list).squeeze()
+                return {
+                    "step": np.asarray(step_list, dtype=int),
+                    property_name: data_list,
+                }
+            else:
+                ret = read_file(
+                    format,
+                    file_handle,
+                    dimension=dimension,
+                    units=units,
+                    cell_units=cell_units,
+                )
+
+                frame = ase.Atoms(
+                    ret["atoms"].names,
+                    positions=ret["atoms"].q.reshape((-1, 3)) * bohr2angstrom,
+                    cell=ret["cell"].h.T * bohr2angstrom,
+                    pbc=True,
+                )
+
+                # parse comment to get the property
+                matches = comment_regex.findall(ret["comment"])
+
+                # get what we have found
+                if len(matches) >= 2:
+                    what = matches[-2][0]
+                else:  # defaults to reading positions
+                    what = "positions"
+
+                # ... and the step
+                matches = step_regex.findall(ret["comment"])
+                if len(matches) >= 1:
+                    frame.info["step"] = int(matches[-1][0])
+
+                # if we have forces, set positions to zero (that data is missing!) and set forces instead
+                if what == "forces":
+                    # set forces and convert to eV/angstrom
+                    frame.positions *= 0
+                    frame.arrays["forces"] = ret["atoms"].q.reshape(
+                        (-1, 3)
+                    ) * unit_to_user("force", "ev/ang", 1.0)
+
+                frames.append(frame)
+
+        except EOFError:
+            break
+        except:
+            raise
+
+    return frames
diff --git a/setup.cfg b/setup.cfg
index a2c9dcb6a..ff9f9d8b3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = ipi
-version = 2.6.2
+version = 2.6.3
 description = A Python interface for ab initio path integral molecular dynamics simulations
 long_description = file: README.md
 long_description_content_type = text/x-rst
diff --git a/setup.py b/setup.py
index 69e020cf3..bf8364962 100644
--- a/setup.py
+++ b/setup.py
@@ -2,19 +2,17 @@
 
 from setuptools import setup, find_packages
 
+# exclude i-pi-driver that is a compiled fortran file when installing from a local dir
+scripts = [str(p) for p in Path("bin").iterdir() if p.name != "i-pi-driver"]
 
 setup(
     packages=[
         *find_packages(exclude=["ipi_tests*"]),
         "ipi._driver",
         "ipi._driver.pes",
-        "ipi.tests",
-        "ipi.tests.regression_tests",
     ],
     package_dir={
         "ipi._driver": "drivers/py",
-        "ipi.tests": "ipi_tests",
     },
-    package_data={"ipi.tests.regression_tests": ["tests/NVE/NVE_1/harmonic_python/*"]},
-    scripts=[str(p) for p in Path("bin").iterdir()],
+    scripts=scripts,
 )