From 781a04f430ab8531e6510bdde625bc4115508f19 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 27 Feb 2024 15:05:54 -0500
Subject: [PATCH 01/29] minimal script to train a MTP from lammps. Beware the
 README to run on a mac

---
 README_mtp.md                  |  58 ++++++++++++
 crystal_diffusion/train_mtp.py | 158 +++++++++++++++++++++++++++++++++
 requirements.txt               |   2 +
 3 files changed, 218 insertions(+)
 create mode 100644 README_mtp.md
 create mode 100644 crystal_diffusion/train_mtp.py

diff --git a/README_mtp.md b/README_mtp.md
new file mode 100644
index 00000000..f0643f5a
--- /dev/null
+++ b/README_mtp.md
@@ -0,0 +1,58 @@
+# MTP README
+
+## How to setup:
+
+We use the maml (MAterials Machine Learning) library for the MTP implementation.
+
+pip install maml
+
+To make MTP works, we also need the mlp library
+https://mlip.skoltech.ru
+
+## Instructions to install on a Mac
+
+First, install open-mpi with brew.
+Beware that open-mpi uses Mac gcc compiler, not the brew gcc, which is not linked to a gfortran compiler.
+Set the path properly.
+
+brew install gcc  
+export HOMEBREW_CC=gcc-13  # gcc-13 is brew gcc: /usr/local/bin/gcc-13
+export HOMEBREW_CXX=g++-13  # brew g++
+brew install openmpi --build-from-source
+
+
+If openmpi is already installed, but doesn't work because of the gcc / gfortran unlinkable, you can reinstall
+
+export HOMEBREW_CC=gcc-13  # gcc-13 is brew gcc: /usr/local/bin/gcc-13
+export HOMEBREW_CXX=g++-13  # brew g++
+brew reinstall openmpi --build-from-source
+
+Then, you can install MLIP
+
+git clone https://gitlab.com/ashapeev/mlip-2.git
+cd mlip-2
+./configure
+make mlip
+
+If the installation is successful, the command is mlip-2/bin/mlp
+This should be recognized as mlp, but it was not in my case.
+This creates issues with maml that was calling mlp with python subprocess.Popen
+
+The fix was to modify the maml code (change the root path to fix your environment)
+In
+/Users/simonb/miniconda/envs/crystal_diffusion/lib/python3.11/site-packages/maml/apps/pes/_mtp.py
+
+You want to change the "mlp" variable passed in subprocess.Popen by the full path of the MLIP-2 mlp command.
+e.g. at line 597
+with open("min_dist", "w") as f, subprocess.Popen(["mlp", "mindist", atoms_filename], stdout=f) as p:
+
+was replaced with:
+
+mlp_command = '/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-2/bin/mlp'
+with open("min_dist", "w") as f, subprocess.Popen([mlp_command, "mindist", atoms_filename], stdout=f) as p:
+
+same at line 615, 722, 724
+
+Also, comment out the block starting with 
+if not which("mlp"):
+at lines 574 and 696.
\ No newline at end of file
diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
new file mode 100644
index 00000000..3b4e6484
--- /dev/null
+++ b/crystal_diffusion/train_mtp.py
@@ -0,0 +1,158 @@
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+import pandas as pd
+import yaml
+from maml.apps.pes import MTPotential
+from monty.serialization import loadfn
+from pymatgen.core import Structure
+from sklearn.metrics import mean_absolute_error
+
+
+# TODO list of yaml files should come from an external call
+# yaml dump file
+lammps_yaml = ['lammps_scripts/Si/si-custom/dump.si-300-1.yaml']
+# yaml thermodynamic variables
+lammps_thermo_yaml = ['lammps_scripts/Si/si-custom/thermo_log.yaml']
+# note that the YAML output does not contain the map from index to atomic species
+# this will have to be taken from elsewhere
+# use a manual map for now
+atom_dict = {1: 'Si'}
+
+
+def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
+        Tuple[List[Structure], List[List[float]]]:
+    """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
+
+    Args:
+        filename: path to LAMMPS output file in yaml format
+        atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str)
+
+    Returns:
+        list of pymatgen Structure containing the atoms and their positions
+        list of forces (n x 3) for each atom
+    """
+    structures = []
+    forces = []
+    with (open(filename, 'r') as f):
+        l_yaml = yaml.safe_load_all(f)
+        for d in l_yaml:  # loop over LAMMPS outputs and convert in pymatgen Structure objects
+            # lattice in yaml is 3 x 2 [0, x_lim]
+            # we assume a rectangular lattice for now with the 2nd coordinates as the lattice vectors
+            lattice = np.zeros((3, 3))
+            for i, x in enumerate(d['box']):
+                lattice[i, i] = x[1]
+            type_idx = d['keywords'].index('type')
+            species = [atom_dict[x[type_idx]] for x in d['data']]  # convert to atom type
+            coords_idx = [d['keywords'].index(x) for x in ['x', 'y', 'z']]
+            coords = [[x[i] for i in coords_idx] for x in d['data']]
+            PMStructure = Structure(lattice=lattice,
+                                    species=species,
+                                    coords=coords)
+            structures.append(PMStructure)
+            force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
+            structure_forces = [[x[i] for i in force_idx] for x in d['data']]
+            forces.append(structure_forces)
+    return structures, forces
+
+
+def extract_energy_from_thermo_log(filename: str) -> List[float]:
+    """Read energies from LAMMPS thermodynamic output file.
+
+    Args:
+        filename: path to LAMMPS thermodynamic output file in yaml format.
+
+    Returns:
+        list of energies (1 value per configuration)
+    """
+    with open(filename, 'r') as f:
+        log_yaml = yaml.safe_load(f)
+        kin_idx = log_yaml['keywords'].index('KinEng')
+        pot_idx = log_yaml['keywords'].index('PotEng')
+        energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]
+    return energies
+
+
+def prepare_mtp_inputs_from_lammps(output_yaml: List[str], thermo_yaml: List[str], atom_dict: Dict[int, Any]) -> \
+    Dict[str, Any]:
+    """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
+
+    Args:
+        output_yaml: list of LAMMPS output files as yaml.
+        thermo_yaml: list of LAMMPS thermodynamic output files as yaml.
+        atom_dict: mapping of LAMMPS indices to atom type.
+
+    Returns:
+        dict with structure, energies and forces usable by MTP.
+    """
+    mtp_inputs = {
+        'structure': [],
+        'energy': [],
+        'forces': []
+    }
+    for filename in output_yaml:
+        structures, forces = extract_structure_and_forces_from_file(filename, atom_dict)
+        mtp_inputs['structure'] += structures
+        mtp_inputs['forces'] += forces
+    for filename in thermo_yaml:
+        mtp_inputs['energy'] += extract_energy_from_thermo_log(filename)
+    return mtp_inputs
+
+
+def train_mtp(train_inputs: Dict[str, Any], valid_inputs: Dict[str, Any], max_dist: float=5) -> \
+    Tuple[pd.DataFrame, pd.DataFrame]:
+    """Create and evaluate a MTP potential.
+
+    Args:
+        train_inputs: inputs for training. Should contain structure, energies and forces
+        valid_inputs: inputs for validation.
+        max_dist (optional): radial cutoff. Defaults to 5.
+
+    Returns:
+       dataframe with original and predicted energies and forces.
+    """
+    # TODO more kwargs for MTP training. See maml documentation.
+    # create MTP
+    mtp = MTPotential()
+
+    # train
+    mtp.train(
+        train_structures=train_inputs["structure"],
+        train_energies=train_inputs["energy"],
+        train_forces=train_inputs["forces"],
+        train_stresses=None,
+        max_dist=5,
+        stress_weight=0,
+    )
+
+    # evaluate
+    df_orig, df_predict = mtp.evaluate(
+        test_structures=train_inputs["structure"],
+        test_energies=train_inputs["energy"],
+        test_forces=train_inputs["forces"],
+        test_stresses=None,
+    )
+
+    # TODO also output the MTP
+    return df_orig, df_predict
+
+
+def main():
+    mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict)
+    df_orig, df_predict = train_mtp(mtp_inputs, mtp_inputs)
+    # from demo in maml
+    E_p = np.array(df_predict[df_predict["dtype"] == "energy"]["y_orig"])
+    E_p /= df_predict[df_predict["dtype"] == "energy"]["n"]
+    E_o = np.array(df_orig[df_orig["dtype"] == "energy"]["y_orig"])
+    E_o /= df_orig[df_orig["dtype"] == "energy"]["n"]
+    print(f"MAE of training energy prediction is {mean_absolute_error(E_o, E_p) * 1000} meV/atom")
+
+    F_p = np.array(df_predict[df_predict["dtype"] == "force"]["y_orig"])
+    F_p /= df_predict[df_predict["dtype"] == "force"]["n"]
+    F_o = np.array(df_orig[df_orig["dtype"] == "force"]["y_orig"])
+    F_o /= df_orig[df_orig["dtype"] == "force"]["n"]
+    print(f"MAE of training force prediction is {mean_absolute_error(F_o, F_p)} eV/Å")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
index cc765920..ce0d9a06 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,9 +5,11 @@ gitpython==3.1.27
 isort==5.13.2
 jupyter==1.0.0
 jinja2==3.1.2
+maml==2023.9.9
 myst-parser==2.0.0
 orion>=0.2.4.post1
 pyarrow==15.0.0
+pymatgen==2024.2.23
 pyyaml==6.0
 pytest==7.1.2
 pytest-cov==3.0.0

From cb66544e5078e9c842caf55c31190637e67a0fd2 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 6 Mar 2024 12:45:05 -0500
Subject: [PATCH 02/29] rewrite for mlip3

---
 crystal_diffusion/train_mtp.py | 408 ++++++++++++++++++++++++++++++---
 1 file changed, 379 insertions(+), 29 deletions(-)

diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
index 3b4e6484..1d0b8202 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/train_mtp.py
@@ -1,11 +1,21 @@
-from typing import Any, Dict, List, Tuple
+import io
+import itertools
+import json
+import os
+import re
+import shutil
+import subprocess
+from collections import defaultdict, OrderedDict
+from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
 import pandas as pd
 import yaml
 from maml.apps.pes import MTPotential
-from monty.serialization import loadfn
-from pymatgen.core import Structure
+from maml.utils import check_structures_forces_stresses, pool_from
+from monty.io import zopen
+from monty.tempfile import ScratchDir
+from pymatgen.core import Lattice, Structure
 from sklearn.metrics import mean_absolute_error
 
 
@@ -20,6 +30,310 @@
 atom_dict = {1: 'Si'}
 
 
+def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
+    """Method to convert a list of docs into DataFrame usable for computing metrics and analysis.
+
+    Modified from maml.utils._data_conversion.py
+
+    Args:
+        docs: list of docs generated by mlip-3. Each doc should be structured as a dict.
+
+    Returns:
+        A DataFrame with energy, force, and nbh grades (MaxVol factor) per atom and per structure. Energy is repeated
+        for each atom and corresponds to the total energy predicted.
+    """
+    # TODO move to another .py file
+    df = defaultdict(list)
+    for s_idx, d in enumerate(docs):
+        outputs = d["outputs"]
+        force_arr = np.array(outputs["forces"])
+        n_atom = force_arr.shape[0]
+        for i, x in enumerate(['x', 'y', 'z']):
+            df[f'f{x}'] += force_arr[:, i].tolist()
+        df['energy'] += [outputs['energy']] * n_atom  # copy the value to all atoms
+        if "nbh_grades" in outputs.keys():
+            nbh_grades = outputs["nbh_grades"]
+            df['nbh_grades'] += nbh_grades
+        df['atom_index'] += list(range(n_atom))
+        df['structure_index'] += [s_idx] * n_atom
+
+    df = pd.DataFrame(df)
+    return df
+
+
+class MTP_with_MLIP3(MTPotential):
+    """MTP with MLIP-3."""
+    def __init__(self,
+                 mlip_path: str,
+                 maml_path: str,
+                 name: Optional[str] = None,
+                 param: Optional[Dict[Any, Any]] = None,
+                 version:Optional[str] = None):
+        """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3
+
+        Args:
+            mlip_path: path to mlip3 mlp command
+            maml_path: path to maml _mtp.py file
+            name: MTPotential argument
+            param: MTPotential argument
+            version: MTPotential argument
+        """
+        super().__init__(name, param, version)
+        self.mlp_command = mlip_path
+        # TODO there is a better way to do this
+        self.module_dir = maml_path
+        self.fitted_mtp = None
+
+    def to_lammps_format(self):
+        # TODO
+        # write params write the fitted mtp in a LAMMPS compatible format
+        # self.write_param(
+        #    fitted_mtp=fitted_mtp,
+        #    Abinitio=0,
+        #    Driver=1,
+        #    Write_cfgs=predict_file,
+        #    Database_filename=original_file,
+        #    **kwargs,
+        # )
+        pass
+
+    def evaluate(self,
+                 fitted_mtp_path: str,
+                 test_structures: List[Structure],
+                 test_energies: List[float],
+                 test_forces: List[List[float]],
+                 test_stresses: Optional[List[List[float]]]=None,
+    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """
+        Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained
+        interatomic potentials.
+
+        Args:
+            fitted_mtp_path: path to fitted mtp file.
+            test_structures: evaluation set of pymatgen Structure Objects.
+            test_energies: list of total energies of each structure to evaluation in test_structures list.
+            test_forces: list of calculated (m, 3) forces of each evaluation structure with m atoms in structures list.
+                m can be varied with each single structure case.
+            test_stresses (optional): list of calculated (6, ) virial stresses of each evaluation structure in
+                test_structures list. If None, do not evaluate on stresses. Default to None.
+
+        Returns:
+            dataframe with ground truth energies, forces
+            dataframe with predicted energies, forces, MaxVol gamma (nbh grades)
+        """
+        fitted_mtp = fitted_mtp_path
+        original_file = "original.cfgs"
+        predict_file = "predict.cfgs"
+        test_structures, test_forces, test_stresses = check_structures_forces_stresses(
+            test_structures, test_forces, test_stresses
+        )
+        predict_pool = pool_from(test_structures, test_energies, test_forces, test_stresses)
+
+        with ScratchDir("."):  # mlip needs a tmp_work_dir - we will manually copy relevant outputs elsewhere
+            # write the structures to evaluate in a mlp compatible format
+            original_file = self.write_cfg(original_file, cfg_pool=predict_pool)
+            df_orig = self.read_cfgs(original_file, nbh_grade=False)  # read original values as a DataFrame
+
+            # calculate_grade is the method to get the forces, energy & maxvol values
+            cmd = [self.mlp_command, "calculate_grade", fitted_mtp, original_file, predict_file]
+            predict_file += '.0'  # added by mlp...
+            with subprocess.Popen(cmd, stdout=subprocess.PIPE) as p:  # run mlp
+                stdout = p.communicate()[0]
+                rc = p.returncode
+            if rc != 0:
+                error_msg = f"mlp exited with return code {rc}"
+                msg = stdout.decode("utf-8").split("\n")[:-1]
+                try:
+                    error_line = next(i for i, m in enumerate(msg) if m.startswith("ERROR"))
+                    error_msg += ", ".join(msg[error_line:])
+                except Exception:
+                    error_msg += msg[-1]
+                raise RuntimeError(error_msg)
+
+            df_predict = self.read_cfgs(predict_file, nbh_grade=True)
+        return df_orig, df_predict
+
+    def read_cfgs(self, filename: str, nbh_grade: bool = False) -> pd.DataFrame:
+        """Read mlp output when MaxVol gamma factor is present.
+
+        Args:
+            filename: name of mlp output file to be parsed.
+            nbh_grade (optional): if True, add the nbh_grades in the resulting dataframe. Defaults to False.
+
+        Returns:
+            dataframe with energies, forces, optional nbh grades (MaxVol gamma)
+        """
+        def formatify(string: str) -> List[float]:
+            """Utility to convert string to a list of float"""
+            return [float(s) for s in string.split()]
+
+        if not self.elements:
+            raise ValueError("No species given.")
+
+        data_pool = []
+        with zopen(filename, "rt") as f:
+            lines = f.read()
+
+        block_pattern = re.compile("BEGIN_CFG\n(.*?)\nEND_CFG", re.S)
+        size_pattern = re.compile("Size\n(.*?)\n SuperCell", re.S | re.I)
+        if nbh_grade:
+            position_pattern = re.compile("nbh_grades\n(.*?)\n Energy", re.S)
+        else:
+            position_pattern = re.compile("fz\n(.*?)\n Energy", re.S)
+        energy_pattern = re.compile("Energy\n(.*?)\n (?=PlusStress|Stress)", re.S)
+        # stress_pattern = re.compile("xy\n(.*?)(?=\n|$)", re.S)  # TODO stress values
+
+        for block in block_pattern.findall(lines):
+            d = {"outputs": {}}
+            size_str = size_pattern.findall(block)[0]
+            size = int(size_str.lstrip())
+            position_str = position_pattern.findall(block)[0]
+            position = np.array(list(map(formatify, position_str.split("\n"))))
+            species = np.array(self.elements)[position[:, 1].astype(np.int64)]
+            forces = position[:, 5:8].tolist()
+
+            energy_str = energy_pattern.findall(block)[0]
+            energy = float(energy_str.lstrip())
+            # TODO add stress
+            # stress_str = stress_pattern.findall(block)[0]
+            # virial_stress = (np.array(list(map(formatify, stress_str.split()))).reshape(6,).tolist())
+            # virial_stress = [virial_stress[self.mtp_stress_order.index(n)] for n in self.vasp_stress_order]
+            d["outputs"]["energy"] = energy
+            d["num_atoms"] = size
+            d["outputs"]["forces"] = forces
+            d["outputs"]["species"] = species
+            # d["outputs"]["virial_stress"] = virial_stress
+            if nbh_grade:
+                nbh_grade_values = position[:, 8].tolist()
+                d["outputs"]["nbh_grades"] = nbh_grade_values
+            data_pool.append(d)
+
+        # originally used convert_docs from maml.utils, but it hard-coded the structure of the dataframe and does not
+        # manage nbh_grades; we use our own implementation instead
+        df = convert_to_dataframe(docs=data_pool)
+        return df
+
+    def train(
+            self,
+            train_structures: List[Structure],
+            train_energies: List[float],
+            train_forces: List[np.array],
+            train_stresses: List[List[float]],
+            unfitted_mtp: str = "08g.mtp",  # TODO fix this
+            fitted_mtp_savedir: str = '../',
+            max_dist: float = 5,
+            radial_basis_size: int = 8,
+            max_iter: int = 1000,  # TODO check the next kwargs in mlip3
+            energy_weight: float = 1,
+            force_weight: float = 1e-2,
+            stress_weight: float = 1e-3,
+            init_params: str = "same",
+            scale_by_force: float = 0,
+            bfgs_conv_tol: float = 1e-3,
+            weighting: float = "vibration",
+    ) -> int:
+        """Training data with moment tensor method using MLIP-3.
+
+        Override the base class method.
+
+        Args:
+            train_structures: The list of Pymatgen Structure object.
+            energies: The list of total energies of each structure in structures list.
+            train_energies: List of total energies of each structure in structures list.
+            train_forces: List of (m, 3) forces array of each structure with m atoms in structures list.
+                m can be varied with each single structure case.
+            train_stresses: List of (6, ) virial stresses of each structure in structures list.
+            unfitted_mtp (optional): Define the initial mtp file.
+                Default to 08g.mtp -  the mtp file stored in .params directory. TODO change
+            fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd)
+            max_dist (optional): The actual radial cutoff. Defaults to 5.
+            radial_basis_size (optional): Relevant to number of radial basis function. Defaults to 8.
+            max_iter (optional): The number of maximum iteration. Defaults to 1000.
+            energy_weight (optional): The weight of energy. Defaults to 1
+            force_weight (optional): The weight of forces. Defaults to 1e-2
+            stress_weight (optional): The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+            init_params (optional): How to initialize parameters if a potential was not
+                pre-fitted. Choose from "same" and "random". Defaults to "same".
+            scale_by_force (optional): If >0 then configurations near equilibrium
+               (with roughly force < scale_by_force) get more weight. Defaults to 0.
+            bfgs_conv_tol (optional): Stop training if error dropped by a factor smaller than this
+                over 50 BFGS iterations. Defaults to 1e-3.
+            weighting (optional): How to weight configuration with different sizes relative to each other.
+                Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
+        Returns:
+            rc : return code of the mlp training script
+        """
+        train_structures, train_forces, train_stresses = check_structures_forces_stresses(
+            train_structures, train_forces, train_stresses
+        )
+        train_pool = pool_from(train_structures, train_energies, train_forces, train_stresses)
+        elements = sorted(set(itertools.chain(*[struct.species for struct in train_structures])))
+        self.elements = [str(element) for element in elements]  # TODO move to __init__
+
+        atoms_filename = "train.cfgs"
+
+        with ScratchDir("."):  # create a tmpdir - deleted afterew
+            atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool)
+
+            if not unfitted_mtp:
+                raise RuntimeError("No specific parameter file provided.")
+            # TODO move initial mtp file to our repo to remove the dependency on module_dir
+            mtp_file_path = os.path.join(self.module_dir, "params", unfitted_mtp)
+            shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp))
+            commands = [self.mlp_command, "mindist", atoms_filename]
+            with open("min_dist", "w") as f, subprocess.Popen(commands, stdout=f)  as p:
+                p.communicate()[0]
+
+            with open("min_dist") as f:
+                lines = f.readlines()
+
+            split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
+            min_dist = float(lines[-1].split(split_symbol)[1])
+
+            with open(unfitted_mtp) as f:
+                template = f.read()
+
+            s = template % (len(self.elements), min_dist, max_dist, radial_basis_size)
+            with open(unfitted_mtp, "w") as f:
+                f.write(s)
+
+            save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]])
+            cmds_list = [
+                self.mlp_command,
+                "train",
+                unfitted_mtp,
+                atoms_filename,
+                f"--save_to={save_fitted_mtp}",
+                f"--iteration_limit={max_iter}",
+                "--al_mode=nbh",  # active learning mode - required to get extrapolation grade
+                # f"--curr-pot-name={unfitted_mtp}",  # TODO check those kwargs
+                #f"--energy-weight={energy_weight}",
+                #f"--force-weight={force_weight}",
+                #f"--stress-weight={stress_weight}",
+                #f"--init-params={init_params}",
+                #f"--scale-by-force={scale_by_force}",
+                #f"--bfgs-conv-tol={bfgs_conv_tol}",
+                #f"--weighting={weighting}",
+             ]
+            with subprocess.Popen(cmds_list, stdout=subprocess.PIPE) as p:
+                stdout = p.communicate()[0]
+                rc = p.returncode
+            if rc != 0:
+                error_msg = f"MLP exited with return code {rc}"
+                msg = stdout.decode("utf-8").split("\n")[:-1]
+                try:
+                    error_line = next(i for i, m in enumerate(msg) if m.startswith("ERROR"))
+                    error_msg += ", ".join(msg[error_line:])
+                except Exception:
+                    error_msg += msg[-1]
+                raise RuntimeError(error_msg)
+            # copy the fitted mtp outside the working directory
+            self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp)
+            shutil.copy(save_fitted_mtp, self.fitted_mtp)
+        return rc
+
+
 def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
         Tuple[List[Structure], List[List[float]]]:
     """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
@@ -46,10 +360,10 @@ def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, A
             species = [atom_dict[x[type_idx]] for x in d['data']]  # convert to atom type
             coords_idx = [d['keywords'].index(x) for x in ['x', 'y', 'z']]
             coords = [[x[i] for i in coords_idx] for x in d['data']]
-            PMStructure = Structure(lattice=lattice,
+            pm_structure = Structure(lattice=lattice,
                                     species=species,
                                     coords=coords)
-            structures.append(PMStructure)
+            structures.append(pm_structure)
             force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
             structure_forces = [[x[i] for i in force_idx] for x in d['data']]
             forces.append(structure_forces)
@@ -99,13 +413,11 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str], thermo_yaml: List[str
     return mtp_inputs
 
 
-def train_mtp(train_inputs: Dict[str, Any], valid_inputs: Dict[str, Any], max_dist: float=5) -> \
-    Tuple[pd.DataFrame, pd.DataFrame]:
+def train_mtp(train_inputs: Dict[str, Any], max_dist: float=5) -> MTP_with_MLIP3:
     """Create and evaluate a MTP potential.
 
     Args:
         train_inputs: inputs for training. Should contain structure, energies and forces
-        valid_inputs: inputs for validation.
         max_dist (optional): radial cutoff. Defaults to 5.
 
     Returns:
@@ -113,8 +425,9 @@ def train_mtp(train_inputs: Dict[str, Any], valid_inputs: Dict[str, Any], max_di
     """
     # TODO more kwargs for MTP training. See maml documentation.
     # create MTP
-    mtp = MTPotential()
-
+    mlp_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/build/mlp"
+    maml_path = "/Users/simonb/miniconda/envs/crystal_diffusion/lib/python3.11/site-packages/maml/apps/pes/"
+    mtp = MTP_with_MLIP3(mlip_path=mlp_path, maml_path=maml_path)
     # train
     mtp.train(
         train_structures=train_inputs["structure"],
@@ -122,36 +435,73 @@ def train_mtp(train_inputs: Dict[str, Any], valid_inputs: Dict[str, Any], max_di
         train_forces=train_inputs["forces"],
         train_stresses=None,
         max_dist=5,
-        stress_weight=0,
+        stress_weight=0,  # TODO kwargs and savedir as args
+        fitted_mtp_savedir='/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/debug_mlip3/'
     )
 
+    return mtp
+
+
+def evaluate_mtp(eval_inputs: Dict[str, Any], mtp: MTP_with_MLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Create and evaluate a MTP potential.
+
+    Args:
+        eval_inputs: inputs to evaluate. Should contain structure, energies and forces
+        mtp: trained MTP potential.
+
+    Returns:
+       dataframe with original and predicted energies and forces.
+    """
     # evaluate
     df_orig, df_predict = mtp.evaluate(
-        test_structures=train_inputs["structure"],
-        test_energies=train_inputs["energy"],
-        test_forces=train_inputs["forces"],
+        fitted_mtp_path='/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/debug_mlip3/08g_fitted.mtp',
+        test_structures=eval_inputs["structure"],
+        test_energies=eval_inputs["energy"],
+        test_forces=eval_inputs["forces"],
         test_stresses=None,
     )
-
-    # TODO also output the MTP
     return df_orig, df_predict
 
 
+def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tuple[float, float]:
+    """Get mean absolute error on energy and forces from the outputs of MTP.
+
+    Args:
+        df_orig: dataframe with ground truth values
+        df_predict: dataframe with MTP predictions
+
+    Returns:
+        MAE on energy in eV/atom and MAE on forces in eV/Å
+    """
+    # from demo in maml
+    # get a single predicted energy per structure
+    predicted_energy = df_predict.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'})
+    # normalize by number of atoms
+    predicted_energy = (predicted_energy['energy'] / predicted_energy['atom_index']).to_numpy()
+    # same for ground truth
+    gt_energy = df_orig.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'})
+    gt_energy = (gt_energy['energy'] / gt_energy['atom_index']).to_numpy()
+
+    predicted_forces = df_predict.groupby('structure_index').agg({'fx': 'sum', 'fy': 'sum', 'fz': 'sum',
+                                                                  'atom_index': 'count'})
+    predicted_forces = np.concatenate([(predicted_forces[f'f{x}'] / predicted_forces['atom_index']).to_numpy()
+                                  for x in ['x', 'y', 'z']])
+
+    gt_forces = df_orig.groupby('structure_index').agg({'fx': 'sum', 'fy': 'sum', 'fz': 'sum',
+                                                                  'atom_index': 'count'})
+    gt_forces = np.concatenate([(gt_forces[f'f{x}'] / gt_forces['atom_index']).to_numpy() for x in ['x', 'y', 'z']])
+
+    return mean_absolute_error(predicted_energy, gt_energy) , mean_absolute_error(predicted_forces, gt_forces)
+
+
 def main():
     mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict)
-    df_orig, df_predict = train_mtp(mtp_inputs, mtp_inputs)
-    # from demo in maml
-    E_p = np.array(df_predict[df_predict["dtype"] == "energy"]["y_orig"])
-    E_p /= df_predict[df_predict["dtype"] == "energy"]["n"]
-    E_o = np.array(df_orig[df_orig["dtype"] == "energy"]["y_orig"])
-    E_o /= df_orig[df_orig["dtype"] == "energy"]["n"]
-    print(f"MAE of training energy prediction is {mean_absolute_error(E_o, E_p) * 1000} meV/atom")
-
-    F_p = np.array(df_predict[df_predict["dtype"] == "force"]["y_orig"])
-    F_p /= df_predict[df_predict["dtype"] == "force"]["n"]
-    F_o = np.array(df_orig[df_orig["dtype"] == "force"]["y_orig"])
-    F_o /= df_orig[df_orig["dtype"] == "force"]["n"]
-    print(f"MAE of training force prediction is {mean_absolute_error(F_o, F_p)} eV/Å")
+    mtp = train_mtp(mtp_inputs)
+    print("Training is done")
+    df_orig, df_predict = evaluate_mtp(mtp_inputs, mtp)
+    energy_mae, force_mae = get_metrics_from_pred(df_orig, df_predict)
+    print(f"MAE of training energy prediction is {energy_mae * 1000} meV/atom")
+    print(f"MAE of training force prediction is {force_mae} eV/Å")
 
 
 if __name__ == '__main__':

From 5cd18c3e757329b19683172ac2fd9b3e11239852 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 6 Mar 2024 14:42:54 -0500
Subject: [PATCH 03/29] flake8

---
 crystal_diffusion/train_mtp.py | 71 +++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
index 1d0b8202..3fcc3540 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/train_mtp.py
@@ -1,11 +1,9 @@
-import io
 import itertools
-import json
 import os
 import re
 import shutil
 import subprocess
-from collections import defaultdict, OrderedDict
+from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -15,7 +13,7 @@
 from maml.utils import check_structures_forces_stresses, pool_from
 from monty.io import zopen
 from monty.tempfile import ScratchDir
-from pymatgen.core import Lattice, Structure
+from pymatgen.core import Structure
 from sklearn.metrics import mean_absolute_error
 
 
@@ -61,15 +59,15 @@ def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
     return df
 
 
-class MTP_with_MLIP3(MTPotential):
+class MTPWithMLIP3(MTPotential):
     """MTP with MLIP-3."""
     def __init__(self,
                  mlip_path: str,
                  maml_path: str,
                  name: Optional[str] = None,
                  param: Optional[Dict[Any, Any]] = None,
-                 version:Optional[str] = None):
-        """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3
+                 version: Optional[str] = None):
+        """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3.
 
         Args:
             mlip_path: path to mlip3 mlp command
@@ -83,8 +81,10 @@ def __init__(self,
         # TODO there is a better way to do this
         self.module_dir = maml_path
         self.fitted_mtp = None
+        self.elements = None
 
     def to_lammps_format(self):
+        """Write the trained MTP in a LAMMPS compatible format."""
         # TODO
         # write params write the fitted mtp in a LAMMPS compatible format
         # self.write_param(
@@ -102,11 +102,9 @@ def evaluate(self,
                  test_structures: List[Structure],
                  test_energies: List[float],
                  test_forces: List[List[float]],
-                 test_stresses: Optional[List[List[float]]]=None,
-    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        """
-        Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained
-        interatomic potentials.
+                 test_stresses: Optional[List[List[float]]] = None,
+                 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained MTP.
 
         Args:
             fitted_mtp_path: path to fitted mtp file.
@@ -164,7 +162,7 @@ def read_cfgs(self, filename: str, nbh_grade: bool = False) -> pd.DataFrame:
             dataframe with energies, forces, optional nbh grades (MaxVol gamma)
         """
         def formatify(string: str) -> List[float]:
-            """Utility to convert string to a list of float"""
+            """Convert string to a list of float."""
             return [float(s) for s in string.split()]
 
         if not self.elements:
@@ -238,7 +236,6 @@ def train(
 
         Args:
             train_structures: The list of Pymatgen Structure object.
-            energies: The list of total energies of each structure in structures list.
             train_energies: List of total energies of each structure in structures list.
             train_forces: List of (m, 3) forces array of each structure with m atoms in structures list.
                 m can be varied with each single structure case.
@@ -282,7 +279,7 @@ def train(
             mtp_file_path = os.path.join(self.module_dir, "params", unfitted_mtp)
             shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp))
             commands = [self.mlp_command, "mindist", atoms_filename]
-            with open("min_dist", "w") as f, subprocess.Popen(commands, stdout=f)  as p:
+            with open("min_dist", "w") as f, subprocess.Popen(commands, stdout=f) as p:
                 p.communicate()[0]
 
             with open("min_dist") as f:
@@ -308,14 +305,14 @@ def train(
                 f"--iteration_limit={max_iter}",
                 "--al_mode=nbh",  # active learning mode - required to get extrapolation grade
                 # f"--curr-pot-name={unfitted_mtp}",  # TODO check those kwargs
-                #f"--energy-weight={energy_weight}",
-                #f"--force-weight={force_weight}",
-                #f"--stress-weight={stress_weight}",
-                #f"--init-params={init_params}",
-                #f"--scale-by-force={scale_by_force}",
-                #f"--bfgs-conv-tol={bfgs_conv_tol}",
-                #f"--weighting={weighting}",
-             ]
+                # f"--energy-weight={energy_weight}",
+                # f"--force-weight={force_weight}",
+                # f"--stress-weight={stress_weight}",
+                # f"--init-params={init_params}",
+                # f"--scale-by-force={scale_by_force}",
+                # f"--bfgs-conv-tol={bfgs_conv_tol}",
+                # f"--weighting={weighting}",
+            ]
             with subprocess.Popen(cmds_list, stdout=subprocess.PIPE) as p:
                 stdout = p.communicate()[0]
                 rc = p.returncode
@@ -361,8 +358,8 @@ def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, A
             coords_idx = [d['keywords'].index(x) for x in ['x', 'y', 'z']]
             coords = [[x[i] for i in coords_idx] for x in d['data']]
             pm_structure = Structure(lattice=lattice,
-                                    species=species,
-                                    coords=coords)
+                                     species=species,
+                                     coords=coords)
             structures.append(pm_structure)
             force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']]
             structure_forces = [[x[i] for i in force_idx] for x in d['data']]
@@ -387,8 +384,10 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]:
     return energies
 
 
-def prepare_mtp_inputs_from_lammps(output_yaml: List[str], thermo_yaml: List[str], atom_dict: Dict[int, Any]) -> \
-    Dict[str, Any]:
+def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
+                                   thermo_yaml: List[str],
+                                   atom_dict: Dict[int, Any]
+                                   ) -> Dict[str, Any]:
     """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
 
     Args:
@@ -413,12 +412,11 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str], thermo_yaml: List[str
     return mtp_inputs
 
 
-def train_mtp(train_inputs: Dict[str, Any], max_dist: float=5) -> MTP_with_MLIP3:
-    """Create and evaluate a MTP potential.
+def train_mtp(train_inputs: Dict[str, Any]) -> MTPWithMLIP3:
+    """Create and evaluate an MTP potential.
 
     Args:
         train_inputs: inputs for training. Should contain structure, energies and forces
-        max_dist (optional): radial cutoff. Defaults to 5.
 
     Returns:
        dataframe with original and predicted energies and forces.
@@ -427,7 +425,7 @@ def train_mtp(train_inputs: Dict[str, Any], max_dist: float=5) -> MTP_with_MLIP3
     # create MTP
     mlp_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/build/mlp"
     maml_path = "/Users/simonb/miniconda/envs/crystal_diffusion/lib/python3.11/site-packages/maml/apps/pes/"
-    mtp = MTP_with_MLIP3(mlip_path=mlp_path, maml_path=maml_path)
+    mtp = MTPWithMLIP3(mlip_path=mlp_path, maml_path=maml_path)
     # train
     mtp.train(
         train_structures=train_inputs["structure"],
@@ -442,8 +440,8 @@ def train_mtp(train_inputs: Dict[str, Any], max_dist: float=5) -> MTP_with_MLIP3
     return mtp
 
 
-def evaluate_mtp(eval_inputs: Dict[str, Any], mtp: MTP_with_MLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Create and evaluate a MTP potential.
+def evaluate_mtp(eval_inputs: Dict[str, Any], mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Create and evaluate an MTP potential.
 
     Args:
         eval_inputs: inputs to evaluate. Should contain structure, energies and forces
@@ -485,16 +483,17 @@ def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tu
     predicted_forces = df_predict.groupby('structure_index').agg({'fx': 'sum', 'fy': 'sum', 'fz': 'sum',
                                                                   'atom_index': 'count'})
     predicted_forces = np.concatenate([(predicted_forces[f'f{x}'] / predicted_forces['atom_index']).to_numpy()
-                                  for x in ['x', 'y', 'z']])
+                                       for x in ['x', 'y', 'z']])
 
     gt_forces = df_orig.groupby('structure_index').agg({'fx': 'sum', 'fy': 'sum', 'fz': 'sum',
-                                                                  'atom_index': 'count'})
+                                                        'atom_index': 'count'})
     gt_forces = np.concatenate([(gt_forces[f'f{x}'] / gt_forces['atom_index']).to_numpy() for x in ['x', 'y', 'z']])
 
-    return mean_absolute_error(predicted_energy, gt_energy) , mean_absolute_error(predicted_forces, gt_forces)
+    return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces)
 
 
 def main():
+    """Train and evaluate an example for MTP."""
     mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict)
     mtp = train_mtp(mtp_inputs)
     print("Training is done")

From aa8c888b16eb62ec65a1c46b0703754182182835 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 6 Mar 2024 16:03:42 -0500
Subject: [PATCH 04/29] mlip3 instructions

---
 README_mtp.md                                 |   93 +-
 mlip3_missing_files/add_missing_files.sh      |   23 +
 .../cmake/Modules/FindCBLAS.cmake             |  176 ++
 .../cmake/Modules/FindMKL.cmake               |   53 +
 .../cmake/Modules/FindOpenBLAS.cmake          |   57 +
 .../cmake/PreventInSourceBuilds.cmake         |   27 +
 mlip3_missing_files/file1.txt                 |   88 +
 mlip3_missing_files/mlp_commands.cpp          | 2192 +++++++++++++++++
 mlip3_missing_files/src_CMakeLists.txt        |   34 +
 mlip3_missing_files/src_common_CMakeLists.txt |   10 +
 .../src_drivers_CMakeLists.txt                |    7 +
 .../src_external_CMakeLists.txt               |    3 +
 mlip3_missing_files/src_mlp_CMakeLists.txt    |    6 +
 mlip3_missing_files/test_CMakeLists.txt       |    4 +
 .../test_examples_00_CMakeLists.txt           |   14 +
 .../test_examples_01_CMakeLists.txt           |   15 +
 .../test_examples_02_CMakeLists.txt           |   15 +
 .../test_examples_03_CMakeLists.txt           |   15 +
 .../test_examples_04_CMakeLists.txt           |   15 +
 .../test_examples_05_CMakeLists.txt           |   14 +
 .../test_examples_06_CMakeLists.txt           |   16 +
 .../test_examples_07_CMakeLists.txt           |   16 +
 .../test_examples_08_CMakeLists.txt           |   15 +
 .../test_examples_CMakeLists.txt              |    9 +
 .../test_selftest_CMakeLists.txt              |   21 +
 25 files changed, 2913 insertions(+), 25 deletions(-)
 create mode 100644 mlip3_missing_files/add_missing_files.sh
 create mode 100644 mlip3_missing_files/cmake/Modules/FindCBLAS.cmake
 create mode 100644 mlip3_missing_files/cmake/Modules/FindMKL.cmake
 create mode 100644 mlip3_missing_files/cmake/Modules/FindOpenBLAS.cmake
 create mode 100644 mlip3_missing_files/cmake/PreventInSourceBuilds.cmake
 create mode 100644 mlip3_missing_files/file1.txt
 create mode 100644 mlip3_missing_files/mlp_commands.cpp
 create mode 100644 mlip3_missing_files/src_CMakeLists.txt
 create mode 100644 mlip3_missing_files/src_common_CMakeLists.txt
 create mode 100644 mlip3_missing_files/src_drivers_CMakeLists.txt
 create mode 100644 mlip3_missing_files/src_external_CMakeLists.txt
 create mode 100644 mlip3_missing_files/src_mlp_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_00_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_01_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_02_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_03_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_04_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_05_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_06_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_07_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_08_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_examples_CMakeLists.txt
 create mode 100644 mlip3_missing_files/test_selftest_CMakeLists.txt

diff --git a/README_mtp.md b/README_mtp.md
index f0643f5a..c36d6260 100644
--- a/README_mtp.md
+++ b/README_mtp.md
@@ -4,10 +4,12 @@
 
 We use the maml (MAterials Machine Learning) library for the MTP implementation.
 
+```
 pip install maml
+```
 
 To make MTP works, we also need the mlp library
-https://mlip.skoltech.ru
+https://gitlab.com/ashapeev/mlip-3
 
 ## Instructions to install on a Mac
 
@@ -15,44 +17,85 @@ First, install open-mpi with brew.
 Beware that open-mpi uses Mac gcc compiler, not the brew gcc, which is not linked to a gfortran compiler.
 Set the path properly.
 
+```
 brew install gcc  
 export HOMEBREW_CC=gcc-13  # gcc-13 is brew gcc: /usr/local/bin/gcc-13
 export HOMEBREW_CXX=g++-13  # brew g++
 brew install openmpi --build-from-source
-
+```
 
 If openmpi is already installed, but doesn't work because of the gcc / gfortran unlinkable, you can reinstall
 
+```
 export HOMEBREW_CC=gcc-13  # gcc-13 is brew gcc: /usr/local/bin/gcc-13
 export HOMEBREW_CXX=g++-13  # brew g++
 brew reinstall openmpi --build-from-source
+```
 
 Then, you can install MLIP
 
-git clone https://gitlab.com/ashapeev/mlip-2.git
-cd mlip-2
-./configure
-make mlip
-
-If the installation is successful, the command is mlip-2/bin/mlp
-This should be recognized as mlp, but it was not in my case.
-This creates issues with maml that was calling mlp with python subprocess.Popen
-
-The fix was to modify the maml code (change the root path to fix your environment)
-In
-/Users/simonb/miniconda/envs/crystal_diffusion/lib/python3.11/site-packages/maml/apps/pes/_mtp.py
+```
+git clone https://gitlab.com/ashapeev/mlip-3.git
+```
 
-You want to change the "mlp" variable passed in subprocess.Popen by the full path of the MLIP-2 mlp command.
-e.g. at line 597
-with open("min_dist", "w") as f, subprocess.Popen(["mlp", "mindist", atoms_filename], stdout=f) as p:
+The build instructions are supposed to be:
 
-was replaced with:
-
-mlp_command = '/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-2/bin/mlp'
-with open("min_dist", "w") as f, subprocess.Popen([mlp_command, "mindist", atoms_filename], stdout=f) as p:
+```
+cd mlip-3
+./configure
+make mlip
+```
 
-same at line 615, 722, 724
+But this doesn't work. Instead, we have to use cmake:
 
-Also, comment out the block starting with 
-if not which("mlp"):
-at lines 574 and 696.
\ No newline at end of file
+```
+cd mlip-3
+./configure
+mkdir build              # create a build directory
+cd build
+cmake ..                 # configuration
+```
+
+But this doesn't work either, because the cmake instructions are missing from the repo.
+
+```
+cp mlip3_missing_files/CMakeLists.txt mlip-3/CMakeLists.txt
+cp mlip3_missing_files/src_CMakeLists.txt mlip-3/src/CMakeLists.txt
+cp mlip3_missing_files/test_CMakeLists.txt mlip-3/test/CMakeLists.txt
+cp mlip3_missing_files/src_mlp_CMakeLists.txt mlip-3/src/mlp/CMakeLists.txt
+cp mlip3_missing_files/src_common_CMakeLists.txt mlip-3/src/common/CMakeLists.txt
+cp mlip3_missing_files/src_drivers_CMakeLists.txt mlip-3/src/drivers/CMakeLists.txt
+cp mlip3_missing_files/src_external_CMakeLists.txt mlip-3/src/external/CMakeLists.txt
+cp mlip3_missing_files/test_selftest_CMakeLists.txt mlip-3/test/self-test/CMakeLists.txt
+cp mlip3_missing_files/test_examples_CMakeLists.txt mlip-3/test/examples/CMakeLists.txt
+cp mlip3_missing_files/test_examples_00_CMakeLists.txt mlip-3/test/examples/00.convert_vasp_outcar/CMakeLists.txt
+cp mlip3_missing_files/test_examples_01_CMakeLists.txt mlip-3/test/examples/01.train/CMakeLists.txt
+cp mlip3_missing_files/test_examples_02_CMakeLists.txt mlip-3/test/examples/02.check_errors/CMakeLists.txt
+cp mlip3_missing_files/test_examples_03_CMakeLists.txt mlip-3/test/examples/03.calculate_efs/CMakeLists.txt
+cp mlip3_missing_files/test_examples_04_CMakeLists.txt mlip-3/test/examples/04.calculate_grade/CMakeLists.txt
+cp mlip3_missing_files/test_examples_05_CMakeLists.txt mlip-3/test/examples/05.cut_extrapolative_neighborhood/CMakeLists.txt
+cp mlip3_missing_files/test_examples_06_CMakeLists.txt mlip-3/test/examples/06.select_add/CMakeLists.txt
+cp mlip3_missing_files/test_examples_07_CMakeLists.txt mlip-3/test/examples/07.relax/CMakeLists.txt
+cp mlip3_missing_files/test_examples_08_CMakeLists.txt mlip-3/test/examples/08.relax_preselect/CMakeLists.txt
+cp -r mlip3_missing_files/cmake mlip-3/
+cp mlip3_missing_files/mlp_commands.cpp mlip-3/src/mlp/mlp_commands.cpp
+```
+A .sh file is provided in the missing_data folder for this.
+
+
+then we can cmake. For the last file, we commented the tests as they were not working. 
+On mac, we use openblas:
+
+```
+cmake .. -DBLAS_ROOT=/usr/local/opt/openblas/
+make
+```
+
+If the installation is successful, the command is 
+```
+mlip-3/build/mlp
+```
+
+You can try running the tests in *mlip-3/test/examples/* and compare to the results in the respective *sample_out* folder.
+
+The *train_mtp.py* script 
\ No newline at end of file
diff --git a/mlip3_missing_files/add_missing_files.sh b/mlip3_missing_files/add_missing_files.sh
new file mode 100644
index 00000000..1fc2ca18
--- /dev/null
+++ b/mlip3_missing_files/add_missing_files.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+cp mlip3_missing_files/CMakeLists.txt mlip-3/CMakeLists.txt
+cp mlip3_missing_files/src_CMakeLists.txt mlip-3/src/CMakeLists.txt
+cp mlip3_missing_files/test_CMakeLists.txt mlip-3/test/CMakeLists.txt
+cp mlip3_missing_files/src_mlp_CMakeLists.txt mlip-3/src/mlp/CMakeLists.txt
+cp mlip3_missing_files/src_common_CMakeLists.txt mlip-3/src/common/CMakeLists.txt
+cp mlip3_missing_files/src_drivers_CMakeLists.txt mlip-3/src/drivers/CMakeLists.txt
+cp mlip3_missing_files/src_external_CMakeLists.txt mlip-3/src/external/CMakeLists.txt
+cp mlip3_missing_files/test_selftest_CMakeLists.txt mlip-3/test/self-test/CMakeLists.txt
+cp mlip3_missing_files/test_examples_CMakeLists.txt mlip-3/test/examples/CMakeLists.txt
+cp mlip3_missing_files/test_examples_00_CMakeLists.txt mlip-3/test/examples/00.convert_vasp_outcar/CMakeLists.txt
+cp mlip3_missing_files/test_examples_01_CMakeLists.txt mlip-3/test/examples/01.train/CMakeLists.txt
+cp mlip3_missing_files/test_examples_02_CMakeLists.txt mlip-3/test/examples/02.check_errors/CMakeLists.txt
+cp mlip3_missing_files/test_examples_03_CMakeLists.txt mlip-3/test/examples/03.calculate_efs/CMakeLists.txt
+cp mlip3_missing_files/test_examples_04_CMakeLists.txt mlip-3/test/examples/04.calculate_grade/CMakeLists.txt
+cp mlip3_missing_files/test_examples_05_CMakeLists.txt mlip-3/test/examples/05.cut_extrapolative_neighborhood/CMakeLists.txt
+cp mlip3_missing_files/test_examples_06_CMakeLists.txt mlip-3/test/examples/06.select_add/CMakeLists.txt
+cp mlip3_missing_files/test_examples_07_CMakeLists.txt mlip-3/test/examples/07.relax/CMakeLists.txt
+cp mlip3_missing_files/test_examples_08_CMakeLists.txt mlip-3/test/examples/08.relax_preselect/CMakeLists.txt
+cp -r mlip3_missing_files/cmake mlip-3/
+cp mlip3_missing_files/mlp_commands.cpp mlip-3/src/mlp/mlp_commands.cpp
+
diff --git a/mlip3_missing_files/cmake/Modules/FindCBLAS.cmake b/mlip3_missing_files/cmake/Modules/FindCBLAS.cmake
new file mode 100644
index 00000000..277562c3
--- /dev/null
+++ b/mlip3_missing_files/cmake/Modules/FindCBLAS.cmake
@@ -0,0 +1,176 @@
+#
+# The script to detect CBLAS headers and libraries
+#
+# The default option (WITH_SYSTEM_BLAS=ON), is to use the system installed
+# BLAS library and its C interface. The priority is with the Intel
+# compiler and the MKL package if it exists with its environment variables
+# set. Then it searches for OpenBLAS installation, and next, it looks for
+# other libraries like Atlas or LAPACK and checks if it can use them. In
+# case it can not be used or is unable to find any system installed
+# libraries, it would build its embedded library and use it.
+# In the case of WITH_SYSTEM_BLAS=OFF value, it would escape the system
+# search and build its embedded library and use it.
+#
+# To use the specific user/system installed library, you can set a BLAS_ROOT
+# variable to a directory that contains a BLAS installation.
+#
+# It will use a system built CBLAS or build an
+# embedded one and define:
+#
+# CBLAS_FOUND         - True if CBLAS found
+# CBLAS_INCLUDE_DIRS  - CBLAS include folder, where to find cblas.h.
+# CBLAS_LIBRARIES     - CBLAS libraries.
+#
+
+if(BLAS_ROOT)
+  if(IS_DIRECTORY "${BLAS_ROOT}")
+    set(CBLAS_INCLUDE_DIRS_HINTS ${BLAS_ROOT}/include ${BLAS_ROOT})
+    set(CBLAS_LIBRARIES_HINTS ${BLAS_ROOT}/lib ${BLAS_ROOT}/lib64 ${BLAS_ROOT})
+
+    find_path(CBLAS_INCLUDE_DIRS NAMES cblas.h HINTS ${CBLAS_INCLUDE_DIRS_HINTS})
+    find_library(CBLAS_LIBRARIES NAMES cblas HINTS ${CBLAS_LIBRARIES_HINTS})
+
+    include(FindPackageHandleStandardArgs)
+    find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
+
+    if(CBLAS_FOUND)
+      add_library(mlip_cblas UNKNOWN IMPORTED)
+      set_target_properties(mlip_cblas
+        PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${CBLAS_INCLUDE_DIRS}"
+        IMPORTED_LOCATION "${CBLAS_LIBRARIES}"
+      )
+      unset(WITH_SYSTEM_BLAS)
+    endif(CBLAS_FOUND)
+  else()
+    message(FATAL_ERROR "Wrong option, BLAS_ROOT = ${BLAS_ROOT} is not a directory")
+  endif(IS_DIRECTORY "${BLAS_ROOT}")
+endif(BLAS_ROOT)
+
+if(WITH_SYSTEM_BLAS)
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+    find_package(MKL)
+    if(MKL_FOUND)
+      set(CBLAS_FOUND ON)
+      add_library(mlip_cblas UNKNOWN IMPORTED)
+      set_target_properties(mlip_cblas
+        PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${MKL_INCLUDE_DIRS}"
+        IMPORTED_LOCATION "${MKL_LIBRARIES}"
+      )
+    endif(MKL_FOUND)
+  endif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+
+  if(NOT CBLAS_FOUND)
+    find_package(OpenBLAS)
+    if(OpenBLAS_FOUND)
+      set(CBLAS_FOUND ON)
+      add_library(mlip_cblas UNKNOWN IMPORTED)
+      set_target_properties(mlip_cblas
+        PROPERTIES
+        INTERFACE_INCLUDE_DIRECTORIES "${OpenBLAS_INCLUDE_DIRS}"
+        IMPORTED_LOCATION "${OpenBLAS_LIBRARIES}"
+      )
+    endif(OpenBLAS_FOUND)
+  endif(NOT CBLAS_FOUND)
+
+  if(NOT CBLAS_FOUND)
+    find_package(BLAS QUIET)
+    if(BLAS_FOUND)
+      include(CheckSymbolExists)
+      set(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES})
+      check_symbol_exists(cblas_dgemm "cblas.h" BLAS_HAS_CBLAS)
+      list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES "${BLAS_LIBRARIES}")
+
+      if(BLAS_HAS_CBLAS)
+        set(CBLAS_INCLUDE_DIRS_HINTS
+          $ENV{Atlas_HOME}/include/atlas
+          $ENV{Atlas_HOME}/include/atlas-base
+          $ENV{Atlas_HOME}/include
+          $ENV{Atlas_DIR}/include/atlas
+          $ENV{Atlas_DIR}/include/atlas-base
+          $ENV{Atlas_DIR}/include
+          $ENV{Atlas_ROOT_DIR}/include/atlas
+          $ENV{Atlas_ROOT_DIR}/include/atlas-base
+          $ENV{Atlas_ROOT_DIR}/include
+          $ENV{CBLASDIR}/include
+          $ENV{CBLASDIR}
+          /usr/local/opt/atlas/include
+          /usr/local/include/atlas
+          /usr/local/include/atlas-base
+          /usr/local/include
+          /usr/include/atlas
+          /usr/include/atlas-base
+          /usr/include
+        )
+
+        set(CBLAS_LIBRARIES_HINTS
+          $ENV{Atlas_HOME}/lib
+          $ENV{Atlas_HOME}
+          $ENV{Atlas_DIR}/lib
+          $ENV{Atlas_DIR}
+          $ENV{Atlas_ROOT_DIR}/lib
+          $ENV{Atlas_ROOT_DIR}
+          $ENV{CBLASDIR}/lib
+          $ENV{CBLASDIR}/lib64
+          /usr/local/opt/atlas/lib
+          /usr/lib/atlas
+          /usr/lib/atlas-base
+          /usr/lib
+          /usr/lib64/atlas
+          /usr/lib64/atlas-base
+          /usr/lib64
+          /usr/local/lib64/atlas
+          /usr/local/lib64/atlas-base
+          /usr/local/lib64
+          /usr/local/lib/atlas
+          /usr/local/lib/atlas-base
+          /usr/local/lib
+          /lib/atlas
+          /lib/atlas-base
+          /lib
+          /lib64/atlas
+          /lib64/atlas-base
+          /lib64
+        )
+
+        foreach(src_file ${BLAS_LIBRARIES})
+          get_filename_component(src_file_path "${src_file}" DIRECTORY)
+          list(APPEND ${CBLAS_INCLUDE_DIRS_HINTS} "${src_file_path}")
+          list(APPEND ${CBLAS_LIBRARIES_HINTS} "${src_file_path}")
+        endforeach(src_file ${BLAS_LIBRARIES})
+
+        find_path(CBLAS_INCLUDE_DIRS NAMES cblas.h HINTS ${CBLAS_INCLUDE_DIRS_HINTS})
+        find_library(CBLAS_LIBRARIES NAMES cblas HINTS ${CBLAS_LIBRARIES_HINTS})
+
+        include(FindPackageHandleStandardArgs)
+        find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES)
+
+        if(CBLAS_FOUND)
+          add_library(mlip_cblas UNKNOWN IMPORTED)
+          set_target_properties(mlip_cblas
+            PROPERTIES
+            INTERFACE_INCLUDE_DIRECTORIES "${CBLAS_INCLUDE_DIRS}"
+            IMPORTED_LOCATION "${CBLAS_LIBRARIES}"
+          )
+        endif(CBLAS_FOUND)
+      endif(BLAS_HAS_CBLAS)
+    endif(BLAS_FOUND)
+  endif(NOT CBLAS_FOUND)
+
+  if(NOT CBLAS_FOUND)
+    if(NOT BLAS_ROOT)
+      message(WARNING " \
+
+To use the specific system installed library you can set \
+a ``BLAS_ROOT`` variable to a directory that contains a CBLAS \
+installation: `cmake .. -DBLAS_ROOT=<Path to a directory on disk>`")
+    endif(NOT BLAS_ROOT)
+  endif(NOT CBLAS_FOUND)
+endif(WITH_SYSTEM_BLAS)
+
+if(NOT CBLAS_FOUND)
+  add_subdirectory(${CMAKE_SOURCE_DIR}/cblas)
+endif(NOT CBLAS_FOUND)
+
+mark_as_advanced(CBLAS_INCLUDE_DIRS CBLAS_LIBRARIES CBLAS)
diff --git a/mlip3_missing_files/cmake/Modules/FindMKL.cmake b/mlip3_missing_files/cmake/Modules/FindMKL.cmake
new file mode 100644
index 00000000..c6a2157e
--- /dev/null
+++ b/mlip3_missing_files/cmake/Modules/FindMKL.cmake
@@ -0,0 +1,53 @@
+#
+# The script to detect Intel(R) Math Kernel Library (MKL)
+# headers and libraries.
+#
+# On return this will define:
+#
+# MKL_FOUND         - True if Intel MKL found
+# MKL_INCLUDE_DIRS  - MKL include folder, where to find mkl.h, etc.
+# MKL_LIBRARIES     - MKL libraries when using mkl.
+#
+
+set(MKL_INCLUDE_DIRS_HINTS
+  $ENV{MKLROOT}/include
+  $ENV{MKLROOT}
+  $ENV{MKL_ROOT_DIR}/include
+  $ENV{MKL_ROOT_DIR}
+  /opt/intel/mkl/include
+  ${MKL_ROOT_DIR}/include
+)
+
+set(MKL_LIBRARIES_HINTS
+  $ENV{MKLROOT}/lib
+  $ENV{MKLROOT}/lib/intel64
+  $ENV{MKLROOT}/lib/ia32
+  $ENV{MKLROOT}
+  $ENV{MKL_ROOT_DIR}/lib
+  $ENV{MKL_ROOT_DIR}/lib/intel64
+  $ENV{MKL_ROOT_DIR}/lib/ia32
+  $ENV{MKL_ROOT_DIR}
+  /opt/intel/mkl/lib
+  /opt/intel/mkl/lib/intel64
+  /opt/intel/mkl/lib/ia32
+  ${MKL_ROOT_DIR}/lib
+  ${MKL_ROOT_DIR}/lib/intel64
+  ${MKL_ROOT_DIR}/lib/ia32
+)
+
+find_path(MKL_INCLUDE_DIRS NAMES mkl.h HINTS ${MKL_INCLUDE_DIRS_HINTS})
+find_library(MKL_LIBRARIES NAMES mkl_rt HINTS ${MKL_LIBRARIES_HINTS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MKL DEFAULT_MSG MKL_INCLUDE_DIRS MKL_LIBRARIES)
+
+if(NOT MKL_FOUND)
+  if(NOT DEFINED ENV{MKLROOT})
+     message(WARNING "\
+
+Set the ``MKLROOT`` environment variable to a directory that contains an MKL installation.
+")
+  endif(NOT DEFINED ENV{MKLROOT})
+endif(NOT MKL_FOUND)
+
+mark_as_advanced(MKL_INCLUDE_DIRS MKL_LIBRARIES MKL)
diff --git a/mlip3_missing_files/cmake/Modules/FindOpenBLAS.cmake b/mlip3_missing_files/cmake/Modules/FindOpenBLAS.cmake
new file mode 100644
index 00000000..e108e806
--- /dev/null
+++ b/mlip3_missing_files/cmake/Modules/FindOpenBLAS.cmake
@@ -0,0 +1,57 @@
+#
+# The script to detect OpenBLAS Library
+# headers and libraries.
+#
+# On return this will define:
+#
+# OpenBLAS_FOUND         - True if OpenBLAS found
+# OpenBLAS_INCLUDE_DIRS  - OpenBLAS include folder, where to find cblas.h.
+# OpenBLAS_LIBRARIES     - OpenBLAS libraries.
+#
+
+set(OpenBLAS_INCLUDE_DIRS_HINTS
+  $ENV{OpenBLAS}/include/openblas
+  $ENV{OpenBLAS}/include
+  $ENV{OpenBLAS}
+  $ENV{OpenBLAS_HOME}/include/openblas
+  $ENV{OpenBLAS_HOME}/include
+  $ENV{OpenBLAS_HOME}
+  $ENV{OpenBLAS_DIR}/include/openblas
+  $ENV{OpenBLAS_DIR}/include
+  $ENV{OpenBLAS_DIR}
+  /usr/local/opt/openblas/include
+  /opt/OpenBLAS/include
+  /usr/local/include/openblas
+  /usr/include/openblas
+  /usr/local/include/openblas-base
+  /usr/include/openblas-base
+  /usr/local/include
+  /usr/include
+)
+
+set(OpenBLAS_LIBRARIES_HINTS
+  $ENV{OpenBLAS}/lib
+  $ENV{OpenBLAS}
+  $ENV{OpenBLAS_HOME}/lib
+  $ENV{OpenBLAS_HOME}
+  $ENV{OpenBLAS_DIR}/lib
+  $ENV{OpenBLAS_DIR}
+  /usr/local/opt/openblas/lib
+  /opt/OpenBLAS/lib
+  /usr/local/lib64
+  /usr/local/lib
+  /lib/openblas-base
+  /lib64
+  /lib
+  /usr/lib/openblas-base
+  /usr/lib64
+  /usr/lib
+)
+
+find_path(OpenBLAS_INCLUDE_DIRS NAMES cblas.h HINTS ${OpenBLAS_INCLUDE_DIRS_HINTS})
+find_library(OpenBLAS_LIBRARIES NAMES openblas HINTS ${OpenBLAS_LIBRARIES_HINTS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(OpenBLAS DEFAULT_MSG OpenBLAS_INCLUDE_DIRS OpenBLAS_LIBRARIES)
+
+mark_as_advanced(OpenBLAS_INCLUDE_DIRS OpenBLAS_LIBRARIES OpenBLAS)
diff --git a/mlip3_missing_files/cmake/PreventInSourceBuilds.cmake b/mlip3_missing_files/cmake/PreventInSourceBuilds.cmake
new file mode 100644
index 00000000..47df5f99
--- /dev/null
+++ b/mlip3_missing_files/cmake/PreventInSourceBuilds.cmake
@@ -0,0 +1,27 @@
+# - Prevent in-source builds.
+# https://stackoverflow.com/questions/1208681/with-cmake-how-would-you-disable-in-source-builds/
+
+function(prevent_in_source_builds)
+
+  # make sure the user doesn't play dirty with symlinks
+  get_filename_component(srcdir1 "${CMAKE_SOURCE_DIR}"    REALPATH)
+  get_filename_component(srcdir2 "${MLIP_SOURCE_DIR}"     REALPATH)
+  get_filename_component(srcdir3 "${MLIP_SOURCE_DEV_DIR}" REALPATH)
+  get_filename_component(bindir  "${CMAKE_BINARY_DIR}"    REALPATH)
+
+  # disallow in-source builds
+  if("${srcdir1}" STREQUAL "${bindir}" OR
+     "${srcdir2}" STREQUAL "${bindir}" OR
+     "${srcdir3}" STREQUAL "${bindir}")
+    message(FATAL_ERROR "\
+
+One must not run the CMake within the source directory. \
+Instead, create a dedicated ``build`` directory and run CMake there. \
+To clean up after this aborted in-place compilation: `rm -fr CMakeCache.txt CMakeFiles`
+")
+  endif("${srcdir1}" STREQUAL "${bindir}" OR
+        "${srcdir2}" STREQUAL "${bindir}" OR
+        "${srcdir3}" STREQUAL "${bindir}")
+endfunction()
+
+prevent_in_source_builds()
diff --git a/mlip3_missing_files/file1.txt b/mlip3_missing_files/file1.txt
new file mode 100644
index 00000000..dc8cf14f
--- /dev/null
+++ b/mlip3_missing_files/file1.txt
@@ -0,0 +1,88 @@
+#
+# CMake build system
+# This file is part of MLIP
+#
+# Contributors:
+#    Yaser Afshar
+#    Ryan S. Elliott
+#
+cmake_minimum_required(VERSION 3.10)
+
+# Define main project
+project(MLIP LANGUAGES CXX C Fortran VERSION 0.2)
+
+# Project authors
+set(AUTHOR "Alexander Shapeev, Evgeny Podryabinkin, Konstantin Gubaev, and Ivan Novikov")
+set(AUTHOR_DETAILS "")
+set(DESCRIPTION "MLIP is a software for Machine Learning Interatomic Potentials.")
+
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
+  message(STATUS "Setting the build type to \"RelWithDebInfo\" as none was specified.")
+  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Set the build type in this build tree." FORCE)
+  # Possible values of build type in MLIP for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo" "MinSizeRel")
+endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+                      "${CMAKE_SOURCE_DIR}/cmake"
+                      "${CMAKE_SOURCE_DIR}/cmake/Modules")
+
+# Project build options
+option(WITH_MPI "Build an MPI version" ON)
+if(WITH_MPI)
+  find_package(MPI QUIET) # sets MPI_FOUND=TRUE if successful
+  if(MPI_FOUND)
+    message(STATUS "Found MPI: TRUE (found version \"${MPI_CXX_VERSION}\")")
+  endif(MPI_FOUND)
+endif(WITH_MPI)
+option(WITH_SELFTEST "Enable 'self-testing' implementation" ON)
+option(WITH_LOSSLESS_CFG "Enable 'lossless configuration' file writing" OFF)
+option(WITH_SYSTEM_BLAS "BLAS library, system installed library (with a C-style interface)" ON)
+set(BLAS_ROOT " /usr/local/opt/openblas/" CACHE STRING "Path to the BLAS directory on the system")
+find_package(CBLAS)
+
+add_executable(mlp "")
+set_target_properties(mlp
+  PROPERTIES
+  CXX_STANDARD 11
+  CXX_STANDARD_REQUIRED ON
+  )
+target_link_libraries(mlp PRIVATE
+  mlip_cblas
+  $<$<BOOL:${MPI_FOUND}>:MPI::MPI_CXX>
+  $<$<CXX_COMPILER_ID:Intel>:ifcore>
+  )
+target_compile_definitions(mlp PRIVATE
+  "MLIP_DEV"
+  $<$<BOOL:${MPI_FOUND}>:"MLIP_MPI">
+  $<$<NOT:$<BOOL:${WITH_SELFTEST}>>:"MLIP_NO_SELFTEST">
+  $<$<BOOL:${WITH_LOSSLESS_CFG}>:"MLIP_LOSSLESS_CFG">
+  $<$<BOOL:${MKL_FOUND}>:"MLIP_INTEL_MKL">
+  $<$<CONFIG:Debug>:"MLIP_DEBUG">
+  )
+
+add_library(libinterface "")
+set_target_properties(libinterface
+  PROPERTIES
+  OUTPUT_NAME "_mlip_interface"
+  CXX_STANDARD 11
+  CXX_STANDARD_REQUIRED ON
+  POSITION_INDEPENDENT_CODE ON
+  )
+target_link_libraries(libinterface PRIVATE
+  mlip_cblas
+  $<$<CXX_COMPILER_ID:Intel>:ifcore>
+  )
+target_compile_definitions(libinterface PRIVATE
+  "MLIP_DEV"
+  $<$<BOOL:${WITH_LOSSLESS_CFG}>:"MLIP_LOSSLESS_CFG">
+  $<$<BOOL:${MKL_FOUND}>:"MLIP_INTEL_MKL">
+  $<$<CONFIG:Debug>:"MLIP_DEBUG">
+  )
+
+install(TARGETS mlp libinterface)
+
+enable_testing()
+
+add_subdirectory(src)
+add_subdirectory(test)
diff --git a/mlip3_missing_files/mlp_commands.cpp b/mlip3_missing_files/mlp_commands.cpp
new file mode 100644
index 00000000..ab63daf4
--- /dev/null
+++ b/mlip3_missing_files/mlp_commands.cpp
@@ -0,0 +1,2192 @@
+/*   MLIP is a software for Machine Learning Interatomic Potentials
+ *   MLIP is released under the "New BSD License", see the LICENSE file.
+ *   Contributors: Alexander Shapeev, Evgeny Podryabinkin, Ivan Novikov
+ */
+#include <algorithm>
+#include <sstream>
+#include "mlp.h"
+#include "../mtpr_trainer.h"
+#include "../wrapper.h"
+#include "../drivers/basic_drivers.h"
+#include "../drivers/relaxation.h"
+#include "../magnetic_moments.h"
+
+
+//bool RunAllTests(bool is_parallel);
+using namespace std;
+
+
+// does a number of unit tests (not dev)
+// returns true if all tests are successful
+// otherwise returns false and stops further tests
+
+bool self_test()
+{
+    ofstream logstream("temp/log");
+    SetStreamForOutput(&logstream);
+
+#ifndef MLIP_DEBUG
+    if (mpi.rank == 0) {
+        std::cout << "Note: self-test is running without #define MLIP_DEBUG;\n"
+            << "      build with -DMLIP_DEBUG and run if troubles encountered" << std::endl;
+    }
+#endif
+
+#ifndef MLIP_MPI
+    cout << "Serial tests:" << endl;
+    // if (!RunAllTests(false)) return false;
+#else
+    if (mpi.rank == 0) 
+        cout << "mpi tests (" << mpi.size << " cores):" << endl;
+    // if (!RunAllTests(true)) return false;
+#endif
+
+    logstream.close();
+    return true;
+}
+
+
+bool Commands(const string& command, vector<string>& args, map<string, string>& opts)
+{
+    bool is_command_found = false;
+
+    if (command == "list" || command == "help") 
+    {
+        if (command == "list" || args.size() == 0) is_command_found = true;
+        if (mpi.rank == 0) 
+        {
+            cout << "mlp "
+#ifdef MLIP_MPI
+                << "mpi version (" << mpi.size << " cores), "
+#else
+                << "serial version, "
+#endif
+                << "(C) A. Shapeev, E. Podryabinkin, I. Novikov (Skoltech).\n";
+            if (command == "help" && args.size() == 0)
+                cout << USAGE;
+            if (command == "list")
+                cout << "List of available commands:\n";
+        }
+    }
+
+#include "../sample_potentials.h"
+
+    BEGIN_UNDOCUMENTED_COMMAND("run",
+        "reads configurations from a cfg_filename and processes them according to the settings specified in mlip.ini",
+        "mlp run mlip.ini cfg_filename [settings]:\n"
+        "Settings can be given in any order and are with the same names as in mlip.ini.\n"
+        "    Settings given from the command line overrides the settings taken from the file.\n"
+    )
+    {
+        if (args.size() != 2) {
+            cout << "mlp run: 2 argument required\n";
+            return 1;
+        }
+        string mlip_fnm = args[0];
+        Message("Settings are read from \""+mlip_fnm+"\"");
+        string read_cfg_fnm = args[1];
+        Message("Configurations are read from \""+read_cfg_fnm+"\"");
+
+        ios::sync_with_stdio(false);
+
+        Settings settings;
+        settings = LoadSettings(mlip_fnm);
+        settings.Modify(opts);
+
+        Wrapper mlip_wrapper(settings);
+
+        map<string, string> driver_settings;
+        driver_settings.emplace("cfg_filename", read_cfg_fnm);
+
+        CfgReader cfg_reader(driver_settings, &mlip_wrapper);
+
+        cfg_reader.Run();
+
+        Message("Configuration reading complete");
+    } END_COMMAND;
+
+    BEGIN_COMMAND("calculate_efs",
+        "calculate energy, forces and stresses for configurations from a file",
+        "mlp calculate_efs pot.mtp cfg_filename [settings]:\n"
+        "reads configurations from a cfg_filename and calculates energy, forces and stresses for them"
+        "Settings can be given in any order"
+        "    --output_filename=<filename> file for output configurations. The same as cfg_filename by default.\n"
+    )
+    {
+        if (args.size() != 2) {
+            cout << "mlp calculate_efs: 2 arguments required\n";
+            return 1;
+        }
+        string mlip_filename = args[0];
+        string read_cfg_fnm = args[1];
+
+        MLMTPR mtp(mlip_filename);
+        Message("MTP loaded from " + mlip_filename);
+
+        if (opts["output_filename"].empty())
+        {
+            auto cfgs = MPI_LoadCfgs(read_cfg_fnm);
+            Message("Configurations are read from \""+read_cfg_fnm+"\"");
+            for (auto& cfg : cfgs)
+                mtp.CalcEFS(cfg);
+            
+            {
+                ofstream ofs(read_cfg_fnm);
+                if (!ofs.is_open())
+                    ERROR("Can't open output configurations file");
+            }
+
+            int max_count = 0;
+            int local_count = (int)cfgs.size();
+            MPI_Allreduce(&local_count, &max_count, 1, MPI_INT, MPI_MAX, mpi.comm);
+
+            for (Configuration& cfg : cfgs)
+                for (int rnk=0; rnk<mpi.size; rnk++)
+                {
+                    if (rnk == mpi.rank)
+                        cfg.AppendToFile(read_cfg_fnm);
+
+                    MPI_Barrier(mpi.comm);
+                }
+            if (cfgs.size() < max_count)
+                for (int rnk=0; rnk<mpi.size; rnk++)
+                    MPI_Barrier(mpi.comm);
+        }
+        else
+        {
+            ifstream ifs(read_cfg_fnm, ios::binary);
+            if (ifs.is_open())
+                Message("Configurations are read from \""+read_cfg_fnm+"\"");
+            else
+                ERROR("Can't open input configurations file");
+
+            string out_filename = opts["output_filename"] + mpi.fnm_ending;
+            {
+                ofstream ofs(out_filename);
+                if (!ofs.is_open())
+                    ERROR("Can't open output configurations file");
+            }
+
+            int count=0;
+            for (Configuration cfg; cfg.Load(ifs); count++)
+                 if (count % mpi.size == mpi.rank)
+                 {
+                     mtp.CalcEFS(cfg);
+                     cfg.AppendToFile(out_filename);
+                 }
+        }
+
+        Message("EFS calculation complete");
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("add_species",
+        "adds species to an MTP",
+        "mlp set_species pot.mtp 0 1 2\n"
+        "Settings can be given in any order"
+        "    --random_parameters=<filename> initialize parameters randomly. True by default.\n"
+    )
+    {
+        if (args.size() < 2) {
+            cout << "mlp set_species: at least 2 arguments required\n";
+            return 1;
+        }
+        string mlip_filename = args[0];
+
+        bool init_random = true;
+        if (opts["random_parameters"] == "false" || opts["random_parameters"] == "False" || opts["random_parameters"] == "FALSE")
+            init_random = false;
+
+        MLMTPR mtp(mlip_filename);
+        Message("MTP loaded from " + mlip_filename);
+
+        set<int> new_species_set;
+        for (int i=1; i<args.size(); i++)
+        {
+            int sp = stoi(args[i]);
+            if (sp<0)
+                ERROR("Species numbers must be positive or zero");
+            new_species_set.insert(sp);
+        }
+        mtp.AddSpecies(new_species_set, init_random);
+        mtp.Save(mlip_filename);
+
+        Message("Species added");
+    } END_COMMAND;
+
+    BEGIN_COMMAND("relax",
+        "relaxes the configuration(s)",
+        "mlp relax pot.mtp for_relax_filename relaxed_filename [settings]:\n"
+        "for_relax_filename is a file with the configurations for relaxation.\n"
+        "relaxed_filename is a file with the relaxed configurations.\n"
+        "Settings can be given in any order.\n"
+        "    --extrapolation_control=<true/false>: switch-on preselection if specified, not specified by default.\n"
+        "    --threshold_save=<double>: configurations with grade above this value (the preselected ones) will be saved to output file, default=2.1.\n"
+        "    --threshold_break=<double>: grade calculation will break if configuration grade is above this value, default=11.\n"
+        "    --save_extrapolative_to=<filename>: output file where preselected configurations will be saved, default: preselected.cfg.\n"
+        "    --save_unrelaxed=<filename>: cfg_relax_failed_filename is a file with the original configurations which relaxation failed.\n"
+        "    --relaxation_settings=<filename>: file with the settings for the relaxation driver.\n"
+        "    Settings given from the command line overrides the settings taken from the file.\n"
+    ) 
+    {
+        if (args.size() != 3)
+        {
+            cout << "mlp relax: 3 argument required\n";
+            return 1;
+        }
+        string mtp_fnm = args[0];
+        Message("Potential was read from \""+mtp_fnm+"\"");
+        string read_cfg_fnm = args[1];
+        Message("Configurations are read from \""+read_cfg_fnm+"\"");
+        string relaxed_cfg_fnm = args[2];
+        Message("Relaxed configurations will be saved to \""+relaxed_cfg_fnm+"\"");
+
+        string unrelaxed_cfg_fnm;
+        if (!opts["save_unrelaxed"].empty())
+        {
+            unrelaxed_cfg_fnm = opts["save_unrelaxed"];
+            Message("Unrelaxed configurations will be saved to \""+relaxed_cfg_fnm+"\"");
+        }
+        string relaxation_settings_fnm = "";
+        if (!opts["relaxation_settings"].empty())
+        {
+            relaxation_settings_fnm = opts["relaxation_settings"];
+            Message("Relaxation settings will be loaded from \""+relaxation_settings_fnm+"\"");
+        }
+
+        Settings settings;
+        if (relaxation_settings_fnm != "") settings = LoadSettings(relaxation_settings_fnm);
+        settings.Modify(opts);
+        relaxed_cfg_fnm += mpi.fnm_ending;
+        if (!unrelaxed_cfg_fnm.empty())
+            unrelaxed_cfg_fnm += mpi.fnm_ending;
+
+        ios::sync_with_stdio(false);
+
+        // Fix settings, switch off settings enabling interprocessor communication
+        settings["mlip"] = "true";
+        settings["calculate_efs"] = "true";
+        settings["mlip:load_from"] = mtp_fnm;
+        bool preselect = opts.count("extrapolation_control") > 0;
+        settings["extrapolation_control"] = "FALSE";
+        settings["extrapolation_control:threshold_save"] = to_string(2.1);
+        settings["extrapolation_control:threshold_break"] = to_string(10);
+        settings["extrapolation_control:save_extrapolative_to"] = "preselected.cfg";
+        if (preselect) {
+            settings["extrapolation_control"] = "TRUE";
+            if (!opts["threshold_save"].empty()) settings["extrapolation_control:threshold_save"] = opts["threshold_save"];
+            if (!opts["threshold_break"].empty()) settings["extrapolation_control:threshold_break"] = opts["threshold_break"];
+            if (!opts["save_extrapolative_to"].empty()) settings["extrapolation_control:save_extrapolative_to"] = opts["save_extrapolative_to"];
+        }
+        settings["fit"] = "FALSE";
+        if (settings["fit"] == "TRUE" ||
+            settings["fit"] == "True" ||
+            settings["fit"] == "true")
+            ERROR("Learning is not compatible with relaxation. \
+                   Change \"fit\" setting to \"FALSE\"");
+        if (settings["lotf"] == "TRUE" ||
+            settings["lotf"] == "True" ||
+            settings["lotf"] == "true")
+            ERROR("Learning is not compatible with relaxation. \
+                   Change \"lotf\" setting to \"FALSE\"");
+        if (settings["select:update_active_set"] == "TRUE" ||
+            settings["select:update_active_set"] == "True" ||
+            settings["select:update_active_set"] == "true")
+            ERROR("Active set update is not allowed while relaxation. \
+                   Change \"select:update_active_set\" setting to \"FALSE\"");
+
+        Wrapper mlip_wrapper(settings);
+
+        Settings driver_settings(settings.ExtractSubSettings("relax"));
+
+        Relaxation rlx(driver_settings, &mlip_wrapper);
+
+        ofstream ofs_relaxed, ofs_unrelaxed;
+        // check and clean output files
+        ifstream ifs(read_cfg_fnm, ios::binary);
+        if (!ifs.is_open())
+            ERROR("Cannot open \"" + read_cfg_fnm + "\" file for input");
+        ofs_relaxed.open(relaxed_cfg_fnm, ios::binary);
+        if (!ofs_relaxed.is_open())
+            ERROR("Cannot open \"" + relaxed_cfg_fnm + "\" file for output");
+
+        if (relaxed_cfg_fnm != unrelaxed_cfg_fnm && !unrelaxed_cfg_fnm.empty())
+        {
+            ofs_unrelaxed.open(unrelaxed_cfg_fnm, ios::binary);
+            if (!ofs_unrelaxed.is_open())
+                ERROR("Cannot open \"" + unrelaxed_cfg_fnm + "\" file for output");
+        }
+
+        int relaxed_proc=0;
+
+        Configuration cfg_orig;                                //to save this cfg if failed to relax
+        int count=0;
+        bool error;                                            //was the relaxation successful or not
+        for (Configuration cfg; cfg.Load(ifs); count++)
+            if (count%mpi.size == mpi.rank)
+            {
+                if (cfg.size() != 0)
+                {
+                    error = false;
+                    cfg_orig = cfg;
+                    rlx.cfg = cfg;
+
+                    try { rlx.Run(); }
+                    catch (MlipException& e) {
+                        Warning("Relaxation of cfg#"+to_string(count+1)+" failed: "+e.message);
+                        error = true;
+                    }
+
+                    if (!error) 
+                    {
+                        rlx.cfg.Save(ofs_relaxed);
+                        relaxed_proc++;
+                    }
+                    else if (!unrelaxed_cfg_fnm.empty()) 
+                    {
+                        if (rlx.cfg.features["from"]=="" || 
+                            rlx.cfg.features["from"]=="relaxation_OK")
+                            rlx.cfg.features["from"]="relaxation_FAILED";
+
+                        if (relaxed_cfg_fnm != unrelaxed_cfg_fnm)
+                            cfg_orig.Save(ofs_unrelaxed);
+                        else 
+                            cfg_orig.Save(ofs_relaxed);
+                    }
+                }
+            }
+
+        int relaxed_total=relaxed_proc;
+        MPI_Allreduce(&relaxed_proc, &relaxed_total, 1, MPI_INT, MPI_SUM, mpi.comm);
+
+        Message("From " + to_string(count) + " configurations " + to_string(relaxed_total) + " were relaxed successfully\n");
+
+    } END_COMMAND;
+
+    BEGIN_COMMAND("select_add",
+        "Actively selects configurations to be added to the current training set",
+        "mlp select_add pot.mtp train_set.cfg candidates.cfg new_selected.cfg:\n"
+        "actively selects configurations from candidates.cfg and saves to new_selected.cfg those that are required to update the current training set (in train_set.cfg)\n"
+        "The specified selection weights overwrites the weights stored in MTP file (if MTP file contain the selection state). Default values of weights are set only if MTP file has no selection state data and the weights are not specified by options"
+        "  Options:\n"
+//        "  --energy_weight=<double>: set the weight for energy equation, default=1\n"
+//        "  --force_weight=<double>: set the weight for force equations, default=0\n"
+//        "  --stress_weight=<double>: set the weight for stress equations, default=0\n"
+//        "  --site_en_weight=<double>: set the weight for site energy equations, default=0\n"
+//        "  --weight_scaling=<0 or 1 or 2>: how energy and stress weights are scaled with the number of atoms. final_energy_weight = energy_weight/N^(weight_scaling/2)\n"
+        "  --swap_limit=<int>: swap limit for multiple selection, unlimited by default\n"
+//        "  --batch_size=<int>: size of batch, default=9999 \n"
+//        "  --save_to=<string>: save maxvol state to a new mtp file\n"
+//        "  --save_selected_to=<string>: filename for saving the active set\n"
+//        "  --threshold=<double>: set the select threshold to num, default=1.001\n"
+        "  --log=<string>: where to write the selection log, default="" (none) \n"
+    ) {
+
+        if (args.size() != 4) {
+            std::cout << "\tError: 4 arguments required\n";
+            return 1;
+        }
+
+        const string mtp_filename = args[0];
+        const string train_filename = args[1];   
+        const string candidates_filename = args[2];
+        const string new_selected_filename = args[3];
+
+        int selection_limit=0;                      //limits the number of swaps in MaxVol
+
+        Settings settings;
+
+        settings.Modify(opts);
+
+        MLMTPR mtp(mtp_filename);
+        Message("MTP loaded from " + mtp_filename);
+
+        CfgSelection select(&mtp, settings);
+        if (settings.GetMap().count("energy_weight") == 0 &&
+            settings.GetMap().count("force_weight") == 0 &&
+            settings.GetMap().count("stress_weight") == 0 &&
+            settings.GetMap().count("site_en_weight") == 0 &&
+            settings.GetMap().count("weight_scaling") == 0)
+        {
+            try { select.Load(args[0]); }
+            catch (MlipException& excp) { Message(excp.What()); }
+        }
+//        select.Reset();
+
+        auto train_cfgs = MPI_LoadCfgs(train_filename);
+        Message("Training set loaded from " + train_filename);
+
+        // selection from training set
+        for (auto& cfg : train_cfgs)
+            select.AddForSelection(cfg);
+        double threshold_backup = select.threshold_select;
+        select.threshold_select = 1.001; // set minimal threshold for selection from the training set
+        select.Select();
+        select.threshold_select = threshold_backup; // restore given threshold
+        train_cfgs.clear();
+        int local = (int)select.selected_cfgs.size();
+        int global;
+        MPI_Reduce(&local, &global, 1, MPI_INT, MPI_SUM, 0, mpi.comm);
+        Message(to_string(global) + " configurations selected from training set");
+
+        // selection from candidates
+        ifstream ifs(candidates_filename, ios::binary);
+        if (!ifs.is_open())
+            ERROR("Can't open file \"" + candidates_filename + "\" for reading configurations");
+        int count = 0;
+        for (Configuration cfg; cfg.Load(ifs); count++)
+        {
+            if (count % mpi.size == mpi.rank)
+            {
+                cfg.features["!@$%^&is_new"] = "true";
+                select.Process(cfg);
+            }
+        }
+        ifs.close();
+        Configuration cfg;
+        if ((count % mpi.size != 0) && mpi.rank >= (count % mpi.size))
+            select.Process(cfg);
+        select.Select((select.swap_limit>0) ? select.swap_limit : HUGE_INT);
+        Message("Loading and selection of candidates completed");
+        
+        // selecting from training set again
+        train_cfgs = MPI_LoadCfgs(train_filename);
+        for (auto& cfg : train_cfgs)
+            select.AddForSelection(cfg);
+        threshold_backup = select.threshold_select;
+        select.Select();
+        select.threshold_select = 1.001; // set minimal threshold for selection from the training set
+        select.threshold_select = threshold_backup; // restore given threshold
+        train_cfgs.clear();
+
+        // saving freshely selected configurations
+        if (mpi.rank == 0)
+            ofstream ofs(new_selected_filename); // clean output file
+
+        count = 0;
+        for (int rnk=0; rnk<mpi.size; rnk++)
+        {
+            if (rnk == mpi.rank)
+            {
+                ofstream ofs(new_selected_filename, ios::binary | ios::app);
+                if (!ofs.is_open())
+                    ERROR("Can not open file \"" + new_selected_filename+ "\" for output");
+
+                for (Configuration& cfg : select.selected_cfgs)
+                {
+                    if (cfg.features["!@$%^&is_new"] == "true" && cfg.size() > 0)
+                    {
+                        cfg.features.erase("!@$%^&is_new");
+
+                        cfg.features.erase("selected_eqn_inds");
+                        int first = *cfg.fitting_items.begin();
+                        for (int ind : cfg.fitting_items)
+                            cfg.features["selected_eqn_inds"] += ((ind == first) ? "" : ",") + to_string(ind);
+
+                        cfg.Save(ofs, Configuration::SAVE_GHOST_ATOMS);
+                        count++;
+                    }
+                    else
+                        cfg.features.erase("!@$%^&is_new");
+                }
+            }
+
+            MPI_Barrier(mpi.comm);
+        }
+
+        int new_count=0;
+        MPI_Allreduce(&count, &new_count, 1, MPI_INT, MPI_SUM, mpi.comm);
+        if (mpi.rank==0)
+            cout << "Training set was increased by " << new_count << " configurations" << endl;
+
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("sample",
+        "separates and saves the configurations with the high extrapolation grade",
+        "mlp sample mlip.mtp in.cfg sampled.cfg [options]:\n"
+        "actively samples the configurations from in.cfg and saves them to sampled.cfg\n"
+        "  Options:\n"
+        "  --threshold_save=<double>: configurations with grade above this value will be saved to output file, default=2.1\n"
+        "  --threshold_break=<double>: grade calculation will break if configuration grade is above this value, default=11\n"
+        "  --add_grade_feature=<bool>: adds grade as a feature to configurations, default true"
+        "  --log=<string>: log, default=stdout\n"
+    ) {
+        if (args.size() != 3) {
+            std::cout << "\tError: 3 arguments required\n";
+            return 1;
+        }
+
+        const string mtp_filename = args[0];
+        const string input_filename = args[1];
+        string output_filename = args[2];
+
+        cout << "MTPR from " << mtp_filename
+            << ", input: " << input_filename
+            << endl;
+        MLMTPR mtpr(mtp_filename);
+
+        Settings settings;
+        settings["add_grade_feature"] = "true";
+        settings["save_extrapolative_to"] = output_filename;
+        if (settings["log"] != "none")
+            settings["log"] = "stdout";
+        else
+            settings["log"] = "";
+        settings.Modify(opts);
+
+        CfgSampling sampler(&mtpr, mtp_filename, settings);
+
+        Message("Starting grades evaluation");
+
+        ifstream ifss(input_filename, ios::binary);
+        if (!ifss.is_open())
+            ERROR("Can't open file \"" + input_filename + "\" for reading configurations");
+        int count = 0;
+        for (Configuration cfg; cfg.Load(ifss); count++)
+            if (count % mpi.size == mpi.rank)
+                sampler.Evaluate(cfg);
+
+        if (mpi.rank == 0)
+            Message("Sampling complete");
+    } END_COMMAND;
+
+    BEGIN_COMMAND("train",
+        "fits an MTP",
+        "mlp train potential.mtp train_set.cfg [options]:\n"
+        "  trains potential.mtp on the training set from train_set.cfg\n"
+        "  Options include:\n"
+        "    --save_to=<string>: filename for trained potential. By default equal to the first argument (overwrites input MTP file)\n"
+        "    --energy_weight=<double>: weight of energies in the fitting. Default=1\n"
+        "    --force_weight=<double>: weight of forces in the fitting. Default=0.01\n"
+        "    --stress_weight=<double>: weight of stresses in the fitting. Default=0.001\n"
+        "    --weight_scaling=<0 or 1 or 2>: defines how energy and stress weights are scaled with the number of atoms (when fitting to configuration of different size). Default=1.0\n"
+        "    --weight_scaling_forces=<0 or 1 or 2>: defines how forces weights are scaled with the number of atoms (when fitting to configuration of different size). Default=0.0\n"
+        "        Typical combinations of weight_scaling and weight_scaling_forces: \n"
+        "           1) weight_scaling=1, weight_scaling_forces=0 used for fiting the configurations sampled from MD trajectories \n"
+        "           2) weight_scaling=0, weight_scaling_forces=0 used for fiting of molecules (non-periodic) \n"
+        "           3) weight_scaling=2, weight_scaling_forces=1 used for fiting of structures (periodic) \n"
+        "    --scale_by_force=<double>: if >0 wights for the small forces increse. Default=0\n"
+        "    --iteration_limit=<int>: maximal number of iterations. Default=1000\n"
+        "    --tolerance=<double>: stopping criterion for optimization. Default=1e-8\n"
+        "    --init_random=<bool>: Initialize parameters if a not pre-fitted MTP. Default is false - this is when interaction of all species is the same (more accurate fit, but longer optimization)\n"
+//        "    --skip_preinit=<bool>: skip the 75 iterations done when params are not given\n"
+        "    --no_mindist_update=<bool>: if true prevent updating of mindist parameter with actual minimal interatomic distance in the training set. Default=false\n"
+        "    --al_mode=<cfg or nbh>: Active learning mode: by configuration (cfg) or by neighborhoods (nbh). Applicable only for training from scratch. Default=cfg\n"
+        "    --log=<string>: where to write log. --log=none switches logging off. Default=stdout\n"
+    ) {
+        if (args.size() != 2) 
+        {
+            cout << "mlp train: 2 arguments are required\n";
+            return 1;
+        }
+
+        Settings settings;
+        settings.Modify(opts);
+
+        if (settings["log"]=="")
+            settings["log"]="stdout";
+        if (settings["log"]=="none")
+            settings["log"]="";
+        if (settings["save_to"]=="")
+            settings["save_to"]=args[0];
+        if (settings["al_mode"]=="")
+            settings["al_mode"]="cfg";
+
+        SetTagLogStream("dev", &std::cout);
+        MLMTPR mtp(args[0]);
+
+        bool has_selection = false;
+        double sel_ene_wgt = 0.0;
+        double sel_frc_wgt = 0.0;
+        double sel_str_wgt = 0.0;
+        double sel_nbh_wgt = 0.0;
+        int sel_wgt_scl = 2;
+        
+        {
+            ifstream ifs(args[0], ios::binary);
+            ifs.ignore(HUGE_INT, '#');
+            if (ifs.fail() || ifs.eof())
+            {
+                Message("Selection data was not found in MTP and will be set");
+                if (settings["al_mode"] == "nbh")
+                {
+                    Message("Selection by neighborhoods is set");
+                    sel_nbh_wgt = 1.0;
+                }
+                else if (settings["al_mode"] == "cfg")
+                {
+                    Message("Selection by configurations is set");
+                    sel_ene_wgt = 1.0;
+                }
+                else
+                {
+                    ERROR("Invalid al_mode");
+                }
+            }
+            else
+            {
+                Message("Selection data found");
+
+                has_selection = true;
+
+                string tmpstr;
+                ifs >> tmpstr;
+                if (tmpstr != "MVS_v1.1")
+                    ERROR("Invalid MVS-file format");
+
+                ifs >> tmpstr;
+                if (tmpstr != "energy_weight")
+                    ERROR("Invalid MVS-file format");
+                ifs >> sel_ene_wgt;
+
+                ifs >> tmpstr;
+                if (tmpstr != "force_weight")
+                    ERROR("Invalid MVS-file format");
+                ifs >> sel_frc_wgt;
+
+                ifs >> tmpstr;
+                if (tmpstr != "stress_weight")
+                    ERROR("Invalid MVS-file format");
+                ifs >> sel_str_wgt;
+
+                ifs >> tmpstr;
+                if (tmpstr != "site_en_weight")
+                    ERROR("Invalid MVS-file format");
+                ifs >> sel_nbh_wgt;
+
+                ifs >> tmpstr;
+                if (tmpstr != "weight_scaling")
+                    ERROR("Invalid MVS-file format");
+                ifs >> sel_wgt_scl;
+            }
+        }
+
+#ifdef MLIP_MPI
+        vector<Configuration> training_set = MPI_LoadCfgs(args[1]);
+#else
+        vector<Configuration> training_set = LoadCfgs(args[1]);
+#endif
+        // training
+        MTPR_trainer mtptr(&mtp, settings);
+        mtptr.Train(training_set);
+
+        // training errrors calculation
+        Settings erm_settings;
+        erm_settings["reprt_to"] = settings["log"];
+        ErrorMonitor errmon(erm_settings);
+        for (Configuration& cfg : training_set)
+        {
+            Configuration cfg_check(cfg);
+            mtp.CalcEFS(cfg_check);
+            errmon.AddToCompare(cfg, cfg_check);
+        }
+        errmon.GetReport();
+
+        {
+            Settings select_setup;
+            CfgSelection select(&mtp, select_setup);
+//            if (has_selection)
+            {
+                select.MV_ene_cmpnts_weight = sel_ene_wgt;
+                select.MV_frc_cmpnts_weight = sel_frc_wgt;
+                select.MV_str_cmpnts_weight = sel_str_wgt;
+                select.MV_nbh_cmpnts_weight = sel_nbh_wgt;
+                select.wgt_scale_power = sel_wgt_scl;
+            }
+
+            for (auto& cfg : training_set)
+                select.AddForSelection(cfg);
+
+            select.Select();
+
+            select.Save(settings["save_to"]);
+        }
+
+        Message("training complete");
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("select",
+        "Acively selects configurations from a pull and adds the selection state data to an MTP",
+        "mlp select mtp_filename cfg_filename [options]:\n"
+        "Settings can be given in any order.\n"
+        "  Options include:\n"
+        "  --energy_weight=<double>: set the weight for energy equation, default=1\n"
+        "  --force_weight=<double>: set the weight for force equations, default=0\n"
+        "  --stress_weight=<double>: set the weight for stress equations, default=0\n"
+        "  --site_en_weight=<double>: set the weight for site energy equations, default=0\n"
+        "  --weight_scaling=<0 or 1 or 2>: how energy and stress weights are scaled with the number of atoms. final_energy_weight = energy_weight/N^(weight_scaling/2)\n"
+        "  --save_to=<string>: were to save mtp with selection state data, equal to the first argument by default\n"
+        "  --save_selected_to=<string>: were to save selected configurations, default="" (not saved)\n"
+        "  --batch_size=<int>: Configurations are committed for selection by batches. Configuratoions are accumulated in batch until the number of committed configurations reaches this number. After that, selection among all collected configurations is performed. Unlimited if 0 is set"
+        "  --log=<int>: Where to write selection log. \"stdout\" and \"stderr\" corresponds to standard output streams; Default="" (none)"
+    )
+    {
+        if (args.size() != 2) 
+        {
+            cout << "mlp select: 2 argument required\n";
+            return 1;
+        }
+
+        Settings settings;
+        if (!opts["settings_file"].empty())
+            settings = LoadSettings(opts["settings_file"]);
+        settings.Modify(opts);
+
+        if (settings["save_to"].empty())
+            settings["save_to"] = args[0];
+
+        MLMTPR mtp(args[0]);
+        CfgSelection selector(&mtp, settings);
+
+#ifdef MLIP_MPI
+        auto cfgs = MPI_LoadCfgs(args[1]);
+#else 
+        auto cfgs = LoadCfgs(args[1]);
+#endif
+
+        for (auto& cfg : cfgs)
+            cfg.InitNbhs(mtp.CutOff());
+
+        for (auto& cfg : cfgs)
+            selector.Process(cfg);
+
+        selector.Select();
+
+    } END_COMMAND;
+
+    BEGIN_COMMAND("check_errors",
+        "calculates EFS for configuraion in database and compares them with the provided",
+        "mlp check_errors mlip.mtp db.cfg:\n"
+        "calculates errors of \"mlip.mtp\" on the database \"db.cfg\"\n"
+        "  Options include:\n"
+        "  --report_to <string> where to write the integral report. Default=stdout\n"
+        "  --log <string> where to write the report for each configuration\n"
+    ) {
+
+        if (args.size() < 2) {
+            cout << "mlp check_errors: 2 arguments are required\n";
+            return 1;
+        }
+
+        Settings settings;
+        settings.Modify(opts);
+
+        MLMTPR mtp(args[0]);
+
+        ErrorMonitor errmon(settings);
+
+        ifstream ifs(args[1], ios::binary);
+        if (!ifs.is_open())
+        {
+            Message("Can not open a file with configurations \""+args[1]+"\" for reading");
+        }
+        Configuration cfg;
+        int count=0;
+        for (; cfg.Load(ifs); count++)
+            if (count % mpi.size == mpi.rank)
+            {
+                Configuration cfg_copy(cfg);
+                mtp.CalcEFS(cfg_copy);
+                errmon.AddToCompare(cfg, cfg_copy);
+            }
+        if ((count % mpi.size != 0) && mpi.rank >= (count % mpi.size))
+            errmon.AddToCompare(cfg, cfg);
+
+        errmon.GetReport();
+
+    } END_COMMAND;
+
+    BEGIN_COMMAND("cut_extrapolative_nbh",
+        "converts neighborhood to sperical configuration (non-periodic boundary conditions)",
+        "mlp cut_extrapolative_nbh in.cfg out.cfg [options]:\n"
+        "for each configuration in \"in.cfg\" finds the neighborhood with the maximal extrapolation grade,\n"
+        "  converts it to the non-periodic configuration cut from in.cfg, and saves it in \"out.cfg\"\n"
+        "  Options include:\n"
+        "  --cutoff=<double>: set cutoff radius of sphere to cut from in.cfg. Default=7.0 Angstrom\n"
+//        "  --no_save_additional_atoms: indicate do not save atoms beyond the sphere.\n"
+//        "  --threshold_save=<double>: extrapolative environments with the extrapolation grade higher then this value will be saved. Default: 3.0\n"
+    ) {
+
+        if (args.size() != 2) {
+            cout << "mlp cut_extrapolative_nbh: 2 arguments required\n";
+            return 1;
+        }
+
+        double cutoff = 7.0;
+            if (opts["cutoff"] != "") 
+                cutoff = stod(opts["cutoff"]);
+
+        ifstream ifs(args[0]);
+        ofstream ofs(args[1]);
+        ofs.close();
+
+        Configuration cfg;
+
+        int cntr = 0;
+        while (cfg.Load(ifs))
+        {
+            int i = -1;
+            double max_grade = 0.0;
+            for (int j=0; j<cfg.size(); j++) 
+                if (cfg.nbh_grades(j) > max_grade) 
+                {
+                    max_grade = cfg.nbh_grades(j);
+                    i = j;
+                }
+            if (max_grade < 1.1)
+                continue;
+
+            cfg.InitNbhs(cutoff);
+
+            Configuration nbh;
+            nbh.features["cfg#"] = to_string(cntr);
+            nbh.features["nbh#"] = to_string(i);
+            nbh.features["extrapolation_grade"] = to_string(cfg.nbh_grades(i));
+            nbh.resize(cfg.nbhs[i].count + 1);
+            nbh.pos(0) = Vector3(0, 0, 0);
+            nbh.type(0) = cfg.type(i);
+                
+            for (int j=0; j<cfg.nbhs[i].count; j++)
+            {
+                nbh.pos(j+1) = cfg.nbhs[i].vecs[j];
+                nbh.type(j+1) = cfg.nbhs[i].types[j];
+            } 
+                    
+            nbh.AppendToFile(args[1]);
+
+            cout << "from cfg#" << cntr+1 << " cut nbh#" << i << " with grade=" << max_grade << " (atoms count is " << cfg.nbhs[i].count << ')' << endl;
+            cntr++;
+/*
+            if (cfg.lattice.det() != 0.0)
+            {
+
+                Vector3 v1(cfg.lattice[0][0], cfg.lattice[0][1], cfg.lattice[0][2]);
+                Vector3 v2(cfg.lattice[1][0], cfg.lattice[1][1], cfg.lattice[1][2]);
+                Vector3 v3(cfg.lattice[2][0], cfg.lattice[2][1], cfg.lattice[2][2]);
+            
+                Vector3 n3 = Vector3::VectProd(v1, v2) / Vector3::VectProd(v1, v2).Norm();
+                Vector3 n2 = Vector3::VectProd(v1, v3) / Vector3::VectProd(v1, v3).Norm();
+                Vector3 n1 = Vector3::VectProd(v3, v2) / Vector3::VectProd(v3, v2).Norm();
+                
+                int times1 = ceil((n1 * (n1 * v1)).Norm() / cutoff);
+                int times2 = ceil((n2 * (n2 * v2)).Norm() / cutoff);
+                int times3 = ceil((n3 * (n3 * v3)).Norm() / cutoff);
+            
+                int sz = cfg.size();
+                for (int i=1; i<=times1; i++)
+                {
+                    int oldsz = cfg.size();
+                    cfg.resize(oldsz + sz);
+                    for (int j=0; j<sz; j++)
+                    {
+                        cfg.pos(oldsz + j) = cfg.pos(j) + v1*i;
+                        cfg.type(oldsz + j) = cfg.type(j);
+                    }
+                    oldsz = cfg.size();
+                    cfg.resize(oldsz + sz);
+                    for (int j=0; j<sz; j++)
+                    {
+                        cfg.pos(oldsz + j) = cfg.pos(j) - v1*i;
+                        cfg.type(oldsz + j) = cfg.type(j);
+                    }
+                }
+                for (int i=1; i<=times2; i++)
+                {
+                    int oldsz = cfg.size();
+                    cfg.resize(oldsz + sz);
+                    for (int j=0; j<sz; j++)
+                    {
+                        cfg.pos(oldsz + j) = cfg.pos(j) + v2*i;
+                        cfg.type(oldsz + j) = cfg.type(j);
+                    }
+                    oldsz = cfg.size();
+                    cfg.resize(oldsz + sz);
+                    for (int j=0; j<sz; j++)
+                    {
+                        cfg.pos(oldsz + j) = cfg.pos(j) - v2*i;
+                        cfg.type(oldsz + j) = cfg.type(j);
+                    }
+                }
+                for (int i=1; i<=times3; i++)
+                {
+                    int oldsz = cfg.size();
+                    cfg.resize(oldsz + sz);
+                    for (int j=0; j<sz; j++)
+                    {
+                        cfg.pos(oldsz + j) = cfg.pos(j) + v3*i;
+                        cfg.type(oldsz + j) = cfg.type(j);
+                    }
+                    oldsz = cfg.size();
+                    cfg.resize(oldsz + sz);
+                    for (int j=0; j<sz; j++)
+                    {
+                        cfg.pos(oldsz + j) = cfg.pos(j) - v3*i;
+                        cfg.type(oldsz + j) = cfg.type(j);
+                    }
+                }
+
+            }
+            
+            for (int i : nbhs)
+            {
+                Vector3 pos(cfg.pos(i));
+                Configuration nbh;
+                nbh.resize(cfg.size() + 1);
+                nbh.pos(nbh.size()-1) = cfg.pos(i);
+                nbh.type(nbh.size()-1) = cfg.type(i);
+                
+                for (int j=0; j<cfg.size(); j++)
+                    if (distance(cfg.pos(i), cfg.pos(j)) <= cutoff)
+                    {
+                        nbh.resize(cfg.size() + 1);
+                        nbh.pos(nbh.size()-1) = cfg.pos(j);
+                        nbh.type(nbh.size()-1) = cfg.type(j);
+                    } 
+                    
+                nbh.AppendToFile(args[1]);
+            }
+*/
+        }
+        
+/*
+        double rcut  = 9;
+        if (!opts["cutoff"].empty())
+            rcut = stod(opts["cutoff"]);
+
+        double save_additional_atoms = false;
+
+        ifstream ifs(args[0]);
+        ofstream ofs(args[1]);
+
+        Configuration nbh;
+
+        while (nbh.Load(ifs)) {
+
+            double max_grade = 0;
+            int max_idx = 0;
+
+            for (int i = 0; i < nbh.size(); i++) {
+                if (max_grade < nbh.nbh_grades(i)) {
+                    max_grade = nbh.nbh_grades(i);
+                    max_idx = i;
+                }
+            }
+
+            Configuration cfg;
+            vector<Vector3> pos_not_deleted;
+            vector<int> type_not_deleted;
+            vector<Vector3> pos_deleted;
+            vector<int> type_deleted;
+
+            for (int i = 0; i < nbh.size(); i++) {
+                if (distance(nbh.pos(i), nbh.pos(max_idx)) < rcut) {
+                    pos_not_deleted.push_back(nbh.pos(i));
+                    type_not_deleted.push_back(nbh.type(i));
+                }
+                else {
+                    pos_deleted.push_back(nbh.pos(i));
+                    type_deleted.push_back(nbh.type(i));
+                }
+            }
+
+
+            cfg.resize(int(pos_not_deleted.size()+pos_deleted.size()));
+            for (int i = 0; i < pos_not_deleted.size(); i++) {
+                cfg.pos(i) = pos_not_deleted[i];
+                cfg.type(i) = type_not_deleted[i];
+            }
+            for (int i = (int)pos_not_deleted.size(); i < pos_deleted.size()+pos_not_deleted.size(); i++) {
+                cfg.pos(i) = pos_deleted[i-pos_not_deleted.size()];
+                cfg.type(i) = type_deleted[i-pos_not_deleted.size()];
+            }
+            string neighborhood_info = to_string(pos_not_deleted.size());
+            neighborhood_info += " first atoms are inside the neighborhood within the cut-off radius of ";
+            neighborhood_info += to_string(rcut);
+            neighborhood_info += " angstrom, the rest of the atoms were deleted";
+            cfg.features["Neighborhood_information:"] = neighborhood_info;
+
+            if (opts["no_save_additional_atoms"] != "") {
+                cfg.resize(int(pos_not_deleted.size()));
+            }
+
+            cfg.Save(ofs);
+
+        }
+*/
+    } END_COMMAND;
+
+    BEGIN_COMMAND("calculate_grade",
+        "calculates extrapolation grade for configurations",
+        "mlp calculate_grade mlip.almtp in.cfg out.cfg [options]:\n"
+        "loads an active learning state from \"mlip.almtp\" and calculates extrapolation grade for configurations from in.cfg.\n"
+        "Configurations with the grades will be written to \"out.cfg\"\n"
+        "and writes them to out.cfg.\n"
+        "  Options:\n"
+        "  --log=<num>: log, default=stdout, output grade for each configuration to this file\n"
+    ) {
+
+        if (args.size() != 3) {
+            std::cout << "\tError: 3 arguments required\n";
+            return 1;
+        }
+
+        const string mtp_filename = args[0];
+        const string input_filename = args[1];
+        string output_filename = args[2];
+
+        cout << "MTPR from " << mtp_filename
+            << ", input: " << input_filename
+            << endl;
+        MLMTPR mtpr(mtp_filename);
+
+        Settings settings;
+        settings["threshold_save"] = "0";
+        settings["threshold_break"] = "0";
+        settings["log"] = "stdout";
+        settings["add_grade_feature"] = "true";
+        settings["save_extrapolative_to"] = output_filename;
+        settings.Modify(opts);
+
+        CfgSampling sampler(&mtpr, mtp_filename, settings);
+
+        Message("Starting grades evaluation");
+
+        ifstream ifss(input_filename, ios::binary);
+        if (!ifss.is_open())
+            ERROR("Can't open file \"" + input_filename + "\" for reading configurations");
+
+        int count = 0;
+        for (Configuration cfg; cfg.Load(ifss); count++)
+        {
+            cfg.features["ID"] = to_string(count);
+            if (count % mpi.size == mpi.rank)
+                sampler.Evaluate(cfg);
+        }
+
+        ifss.close();
+        if (mpi.rank == 0)
+            Message("Grade evaluation complete");
+
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("make_cell_square",
+        "makes the unit cell as square as possible by replicating it",
+        "mlp make_cell_square in.cfg out.cfg\n"
+        "  Options:\n"
+        "  --min_atoms=<int>: minimal number of atoms (default: 32)\n"
+        "  --max_atoms=<int>: maximal number of atoms (default: 64)\n"
+    ) {
+        int min_atoms=32, max_atoms=64;
+        if (opts["min_atoms"] != "") {
+            try { min_atoms = stoi(opts["min_atoms"]); }
+            catch (invalid_argument) {
+                cerr << (string)"mlp error: " + opts["min_atoms"] + " is not an integer\n";
+            }
+        }
+        if (opts["max_atoms"] != "") {
+            try { max_atoms = stoi(opts["max_atoms"]); }
+            catch (invalid_argument) {
+                cerr << (string)"mlp error: " + opts["max_atoms"] + " is not an integer\n";
+            }
+        }
+
+        Configuration cfg;
+        ifstream ifs(args[0], ios::binary);
+        ofstream ofs(args[1], ios::binary);
+        while(cfg.Load(ifs))
+        {
+            int cfg_size = cfg.size();
+            const int M = 4;
+            Matrix3 A, B;
+            Matrix3 L = cfg.lattice * cfg.lattice.transpose();
+            Matrix3 X;
+            double score = 1e99;
+            for (A[0][0] = -M; A[0][0] <= M; A[0][0]++)
+                for (A[0][1] = -M; A[0][1] <= M; A[0][1]++)
+                    for (A[0][2] = -M; A[0][2] <= M; A[0][2]++)
+                        for (A[1][0] = -M; A[1][0] <= M; A[1][0]++)
+                            for (A[1][1] = -M; A[1][1] <= M; A[1][1]++)
+                                for (A[1][2] = -M; A[1][2] <= M; A[1][2]++)
+                                    for (A[2][0] = -M; A[2][0] <= M; A[2][0]++)
+                                        for (A[2][1] = -M; A[2][1] <= M; A[2][1]++)
+                                            for (A[2][2] = -M; A[2][2] <= M; A[2][2]++) {
+                                                if (fabs(A.det()) * cfg_size < min_atoms) continue;
+                                                if (fabs(A.det()) * cfg_size > max_atoms) continue;
+                                                X = A*L*A.transpose();
+                                                double shift = (X[0][0] + X[1][1] + X[2][2]) / 3;
+                                                X[0][0] -= shift;
+                                                X[1][1] -= shift;
+                                                X[2][2] -= shift;
+                                                double curr_score = X.NormFrobeniusSq();
+                                                if (curr_score < score) {
+                                                    score = curr_score;
+                                                    B = A;
+                                                }
+                                            }
+            for (int a = 0; a < 3; a++) {
+                for (int b = 0; b < 3; b++)
+                    cout << (int)B[a][b] << " ";
+                cout << "\n";
+            }
+            cout << B.det() << "\n";
+            cfg.ReplicateUnitCell(templateMatrix3<int>((int)B[0][0], (int)B[0][1], (int)B[0][2], (int)B[1][0], (int)B[1][1], (int)B[1][2], (int)B[2][0], (int)B[2][1], (int)B[2][2]));
+            cfg.CorrectSupercell();
+            cfg.Save(ofs);
+        }
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("replicate_cell",
+        "replicates unit cell",
+        "mlp replicate_cell in.cfg out.cfg nx ny nz:\n"
+        "or"
+        "mlp replicate_cell in.cfg out.cfg nxx nxy nxz nyx nyy nyz nzx nzy nzz:\n"
+        "Replicate the supercells by nx*ny*nz.\n"
+        "Alternatively, applies the 3x3 integer matrix to the lattice\n"
+    ) {
+        if (args.size() == 5) {
+            int nx, ny, nz;
+            try { nx = stoi(args[2]); }
+            catch (invalid_argument) {
+                cerr << (string)"mlp error: " + args[2] + " is not an integer\n";
+            }
+            try { ny = stoi(args[3]); }
+            catch (invalid_argument) {
+                cerr << (string)"mlp error: " + args[3] + " is not an integer\n";
+            }
+            try { nz = stoi(args[4]); }
+            catch (invalid_argument) {
+                cerr << (string)"mlp error: " + args[4] + " is not an integer\n";
+            }
+
+            Configuration cfg;
+            ifstream ifs(args[0], ios::binary);
+            ofstream ofs(args[1], ios::binary);
+            while (cfg.Load(ifs)) {
+                cfg.ReplicateUnitCell(nx, ny, nz);
+                cfg.Save(ofs);
+            }
+        } else if (args.size() == 11) {
+            int args_int[3][3];
+            for (int i = 0; i < 9; i++) {
+                try { args_int[i/3][i%3] = stoi(args[i+2]); }
+                catch (invalid_argument) {
+                    cerr << (string)"mlp error: " + args[i+2] + " is not an integer\n";
+                }
+            }
+            templateMatrix3<int> A(args_int);
+
+            Configuration cfg;
+            ifstream ifs(args[0], ios::binary);
+            ofstream ofs(args[1], ios::binary);
+            while (cfg.Load(ifs)) {
+                cfg.ReplicateUnitCell(A);
+                cfg.Save(ofs);
+            }
+        }
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("mindist_filter",
+        "removes the configuration with too short interatomic distance",
+        "mlp mindist_filter input.cfg 2.1 output.cfg:\n"
+    ) {
+        if (args.size() != 3) {
+            cout << "3 arguments are required\n";
+            return 1;
+        }
+
+        {    ofstream ofs(args[2]);    }
+
+        auto cfgs = MPI_LoadCfgs(args[0]);
+        double md = stod(args[1]);
+        for (int rnk=0; rnk<mpi.size; rnk++)
+        {
+            MPI_Barrier(mpi.comm);
+            if (rnk == mpi.rank)
+                for (auto cfg : cfgs)
+                    if (cfg.MinDist() >= md)
+                        cfg.AppendToFile(args[2]);
+        }
+    } END_COMMAND;
+
+    BEGIN_COMMAND("convert",
+        "converts a file with configurations from one format to another",
+        "mlp convert [options] inputfilename outputfilename\n"
+        "  input filename sould always be entered before output filename\n"
+        "  Options can be given in any order. Options include:\n"
+        "  --input_format=<format>: format of the input file\n"
+        "  --output_format=<format>: format of the output file\n"
+//        "  --input_chgcar=<filename>: name of the chgcar file\n"
+//        "  --input_near_neigh=<filename>: name of the file with nearest neighbor distances\n"
+        "  --append: opens output file in append mode\n"
+        "  --last: ignores all configurations in the input file except the last one\n"
+        "          (useful with relaxation)\n"
+        "  --fix_lattice: creates an equivalent configuration by moving the atoms into\n"
+        "                 the supercell (if they are outside) and tries to make the\n"
+        "                 lattice as much orthogonal as possible and lower triangular\n"
+        "  --save_nonconverged: writes configurations with nonconverged VASP calculations, otherwise they are ignored\n"
+        "  --absolute_types: writes absolute atomic numbers into the cfg file instead of 0,1,2,.... Disabled by default, used only while reading from OUTCAR\n"
+        "  --type_order=18,22,46,... atomic numbers separated with commas in the order as they are in POTCAR. Used only while writing to POSCAR\n"
+        "  --no_forces_stresses: no saving of forces and stresses into the cfg file\n"  
+        "\n"
+        "  <format> can be:\n"
+        "  txt (default):   mlip textual format\n"
+//        "  bin:           mlip binary format\n"
+        "  outcar:          only as input; VASP versions 5.3.5 and 5.4.1 were tested\n"
+        "  poscar:          only as output. When writing multiple configurations,\n"
+        "                   POSCAR0, POSCAR1, etc. are created\n"
+        "  lammps_dump:     only as input. Only lattice, atomic positions and types are saved. Only cartesian coordinates are processed.\n"
+        "  lammps_datafile: only as output. Can be read by read_data from lammps.\n"
+        "                 Multiple configurations are saved to several files.\n"
+
+        ) {
+
+        if (opts["input_format"] == "") opts["input_format"] = "txt";
+        if (opts["output_format"] == "") opts["output_format"] = "txt";
+
+        if (opts["output_format"] == "vasp_outcar" || opts["output_format"] == "outcar" || opts["output_format"] == "lammps_dump")
+            ERROR("Format " + opts["output_format"] + " is not allowed for output");
+        if (opts["input_format"] == "vasp_poscar" || opts["input_format"] == "poscar" || opts["input_format"] == "lammps_datafile")
+            ERROR("Format " + opts["input_format"] + " is not allowed for input");
+
+        const bool possibly_many_output_files = (opts["output_format"] == "vasp_poscar" || opts["output_format"] == "poscar")
+            || (opts["output_format"] == "lammps_datafile");
+
+        ifstream ifs(args[0], ios::binary);
+        if (ifs.fail()) ERROR("cannot open " + args[0] + " for reading");
+
+        ofstream ofs;
+        if (!possibly_many_output_files) {
+            if (opts["append"] == "") {
+                ofs.open(args[1], ios::binary);
+                if (ofs.fail()) ERROR("cannot open " + args[1] + " for writing");
+            } else {
+                ofs.open(args[1], ios::binary | ios::app);
+                if (ofs.fail()) ERROR("cannot open " + args[1] + " for appending");
+            }
+        }
+
+        vector<Configuration> db;
+        vector<int> db_species;
+        vector<int> types_mapping;
+        int count = 0;
+        bool read_success = true;
+
+        // block that reads configurations from VASP OUTCAR into the database
+        if (opts["input_format"] == "vasp_outcar" || opts["input_format"] == "outcar") 
+        {
+            ifs.close();
+	        Configuration::LoadDynamicsFromOUTCAR(db, args[0],opts["save_nonconverged"] != "",opts["absolute_types"] == "");
+            //to ensure the unified way of POSCAR writing    
+            for (int i=0;i<200;i++)
+                types_mapping.push_back(i);
+
+            //block that creates magnetic moments
+            if (opts["input_chgcar"] != "" && opts["input_near_neigh"] != "") {
+                ifstream ifs_nnd(opts["input_near_neigh"]);
+                int n_species;
+                ifs_nnd >> n_species;
+
+                Array1D nearest_neigh_dist(n_species);
+                for (int i = 0; i < n_species; i++) {
+                    ifs_nnd >> nearest_neigh_dist[i];
+                    nearest_neigh_dist[i] /= 4;
+                }
+
+                MagneticMoments MagMom(false, nearest_neigh_dist);
+                MagMom.LoadDensityFromCHGCAR(opts["input_chgcar"], n_species);
+                MagMom.CalculateWeights(db[0]);
+                MagMom.Calculate(db[0]);
+                if (opts["no_forces_stresses"] != "") {
+                    db[0].has_forces(false);
+                    db[0].has_stresses(false);
+                }
+            }
+        }
+
+        //block that reads configurations from LAMMPS dump file into the database
+        bool is_eof_lammps_file = true;
+        if ((opts["input_format"] == "lammps_dump") && (opts["output_format"] != "poscar")) 
+        {
+            while (is_eof_lammps_file) {
+                Configuration cfg;
+                is_eof_lammps_file = cfg.LoadNextFromLAMMPSDump(ifs);
+                if (is_eof_lammps_file) db.push_back(cfg);
+            }
+        }  
+
+        Configuration cfg_last; // used to differentiate between single-configuration and multiple-configuration inputs
+            
+        //scanning the entire database for all atomic numbers present
+        if ((opts["output_format"] == "vasp_poscar" || opts["output_format"] == "poscar") && ((opts["input_format"] == "txt") || (opts["input_format"] == "bin") || (opts["input_format"] == "") || (opts["input_format"] == "lammps_dump")))
+        {
+            while (read_success)
+            {
+                Configuration cfg;
+
+                // Read cfg
+                if (db_species.size() == 0) {
+                    if (opts["input_format"] == "txt" || opts["input_format"] == "bin") {
+                        read_success = cfg.Load(ifs);
+                    }
+                    else if (opts["input_format"] == "lammps_dump") {
+                        read_success = cfg.LoadNextFromLAMMPSDump(ifs);
+                        if (read_success) db.push_back(cfg);
+                    }
+                    else {
+                        ERROR("unknown file format");
+                    }
+                }
+                else {
+                    if (opts["input_format"] == "txt" || opts["input_format"] == "bin") {
+                        read_success = cfg.Load(ifs);
+                    }
+                    else if (opts["input_format"] == "lammps_dump") {
+                        read_success = cfg.LoadNextFromLAMMPSDump(ifs);
+                        if (read_success) db.push_back(cfg);
+                    }
+                }
+
+                for (int i=0; i<cfg.size(); i++)
+                    if (std::find(db_species.begin(), db_species.end(), cfg.type(i)) == db_species.end())
+                        db_species.push_back(cfg.type(i));
+            }
+
+            //if (std::distance(db_species.begin(), std::max_element(db_species.begin(), db_species.end()))==db_species[db_species.size() - 1])
+            bool relative=true;
+            for (int i=0; i<db_species.size(); i++)
+                if (std::find(db_species.begin(), db_species.end(), i) == db_species.end())
+                    relative=false;
+            if (relative)
+            {
+                cout << "relative numeration of species detected" << endl;
+                for (int i=0; i<db_species.size(); i++)
+                    types_mapping.push_back(i);
+            }
+            else
+            {
+                cout << "absolute numeration of species detected" << endl;
+                if (opts["type_order"] == "")
+                {
+                    std::set<int> tmp;
+                    for (int el : db_species)
+                        tmp.insert(el);
+                    for (int el : tmp)
+                        types_mapping.push_back(el);
+                }
+
+                string t = opts["type_order"];
+                int k = 0;
+                while (k<t.size())
+                {
+                    string type ="";
+                    while ((t[k]!=',')&&(k<t.size()))
+                    {
+                        type+=t[k++];
+                    }
+                    types_mapping.push_back(std::stoi(type));
+                    k+=1;
+                }
+
+            }
+
+            for (int i = 0; i < db_species.size(); i++)
+            {
+                bool found = false;
+                for (int j=0; j < types_mapping.size(); j++)
+                    if (db_species[i]==types_mapping[j])
+                        found=true;
+
+                if (!found)
+                    ERROR("Not all the species were mentioned in [--type_order] option: " + std::to_string(db_species[i]) + " atomic number is missing.");
+            }
+            ifs.close();
+            ifs.open(args[0], ios::binary);
+        }
+        
+        read_success = true;
+
+        while (read_success) {
+            Configuration cfg;
+
+            // Read cfg
+            if (db.size() == 0) {
+                if (opts["input_format"] == "txt" || opts["input_format"] == "bin")
+                    read_success = cfg.Load(ifs);
+                else
+                {
+                    Warning("no confgiurations will be saved");
+                    read_success=false;
+                }
+                    //ERROR("unknown file format");
+            } else {
+                read_success = (count < db.size());
+                if (read_success)
+                    cfg = db[count];
+            }
+            if (!read_success) break;
+
+            if (opts["fix_lattice"] != "") cfg.CorrectSupercell();
+
+            // Save cfg
+            if (opts["last"] == "") {
+                if (opts["output_format"] == "txt")
+                    cfg.Save(ofs);
+                if (opts["output_format"] == "bin")
+                    cfg.SaveBin(ofs);
+                if (opts["output_format"] == "vasp_poscar" || opts["output_format"] == "poscar") {
+                    if (count != 0) {
+                        if (count == 1) cfg_last.WriteVaspPOSCAR(args[1] + "0",types_mapping);
+                        cfg.WriteVaspPOSCAR(args[1] + to_string(count),types_mapping);
+                    }
+                }
+                if (opts["output_format"] == "lammps_datafile") {
+                    if (count != 0) {
+                        if (count == 1) cfg_last.WriteLammpsDatafile(args[1] + "0");
+                        cfg.WriteLammpsDatafile(args[1] + to_string(count));
+                    }
+                }
+            }
+
+            cfg_last = cfg;
+            count++;
+        }
+        if (opts["last"] == "") {
+            if ((opts["output_format"] == "vasp_poscar" || opts["output_format"] == "poscar") && count == 1)
+                cfg_last.WriteVaspPOSCAR(args[1],types_mapping);
+            if (opts["output_format"] == "lammps_datafile" && count == 1)
+                cfg_last.WriteLammpsDatafile(args[1]);
+        } else {
+            // --last option given
+            if (opts["output_format"] == "txt")
+                cfg_last.Save(ofs);
+            if (opts["output_format"] == "bin")
+                cfg_last.SaveBin(ofs);
+            if (opts["output_format"] == "vasp_poscar" || opts["output_format"] == "poscar")
+                cfg_last.WriteVaspPOSCAR(args[1],types_mapping);
+            if (opts["output_format"] == "lammps_datafile")
+                cfg_last.WriteLammpsDatafile(args[1]);
+        }
+
+        std::cout << "Processed " << count << " configurations" << std::endl;
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("add_from_outcar",
+        "reads one configuration from the OUTCAR(s)",
+        "mlp add_from_outcar db outcar [outcar2 outcar3 ...]:\n"
+        "reads one configuration from each of the outcars\n"
+        "and APPENDs to the database db\n"
+        ) {
+
+        if (args.size() < 2) {
+            cout << "mlp add_from_outcar: minimum 2 arguments required\n";
+            return 1;
+        }
+
+        ofstream ofs(args[0], std::ios::app|std::ios::binary);
+        Configuration cfg;
+        for (int i = 1; i < args.size(); i++) {
+            cfg.LoadFromOUTCAR(args[i],opts["absolute_types"] == "");
+
+			if (opts["save_nonconverged"] != "")
+				cfg.Save(ofs);
+			else
+				if (cfg.features["EFS_by"] != "VASP_not_converged")
+            		cfg.Save(ofs);
+				else
+					Warning("non-converged OUTCAR detected. Use '--save_noncoverged' option to save it");
+        }
+        ofs.close();
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("add_from_outcar_dyn",
+        "reads dynamics from the OUTCAR(s)",
+        "mlp add_from_outcar_dyn db outcar [outcar2 outcar3 ...]:\n"
+        "reads dynamics configuration from each of the outcars\n"
+        "each configuration from the dynamics is APPENDed to the database\n"
+        ) {
+
+        if (args.size() < 2) {
+            cout << "mlp add_from_outcar_dyn: minimum 2 arguments required\n";
+            return 1;
+        }
+
+        ofstream ofs(args[0], ios::app|ios::binary);
+        for (int i = 1; i < args.size(); i++) {
+            vector<Configuration> db;
+            if (!Configuration::LoadDynamicsFromOUTCAR(db, args[i],opts["save_nonconverged"] != "",opts["absolute_types"] == ""))
+                cerr << "Warning: OUTCAR is broken" << endl;
+            for (Configuration cfg : db)
+                cfg.Save(ofs);
+        }
+        ofs.close();
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("add_from_outcar_last",
+        "reads the dynamics from OUTCAR(s) and gets the last configuration from each OUTCAR",
+        "mlp add_from_outcar_last db outcar [outcar2 outcar3 ...]:\n"
+        "reads the dynamics from OUTCAR(s) and gets the last configuration from each OUTCAR.\n"
+        "APPENDs all last configurations to the database db\n"
+        ) {
+
+        if (args.size() < 2) {
+            std::cout << "mlp add_from_outcar: minimum 2 arguments required\n";
+            return 1;
+        }
+
+        ofstream ofs(args[0], ios::app|ios::binary);
+        Configuration cfg;
+        for (int i = 1; i < args.size(); i++) {
+            if (!cfg.LoadLastFromOUTCAR(args[i],opts["absolute_types"] == ""))
+                cerr << "Warning: OUTCAR is broken" << endl;
+            cfg.Save(ofs);
+        }
+        ofs.close();
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("to_bin",
+        "converts a database to the binary format",
+        "mlp to_bin db.cfgs db.bin.cfgs:\n"
+        "Reads the database db.cfgs and saves in a binary format to db.bin.cfgs\n"
+        ) {
+
+        if (args.size() < 3) {
+            cout << "mlp to_bin: minimum 2 arguments required\n";
+            return 1;
+        }
+
+        ifstream ifs(args[0], ios::binary);
+        ofstream ofs(args[2], ios::binary);
+        Configuration cfg;
+        int count;
+        for (count = 0; cfg.Load(ifs); count++)
+            cfg.SaveBin(ofs);
+        ofs.close();
+        cout << count << " configurations processed.\n";
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("from_bin",
+        "converts a database from the binary format",
+        "mlp from_bin db.bin.cfgs db.cfgs:\n"
+        "Reads the binary database db.bin.cfgs and saves to db.cfgs\n"
+        ) {
+
+        if (args.size() < 3) {
+            cout << "mlp to_bin: minimum 2 arguments required\n";
+            return 1;
+        }
+
+        ifstream ifs(args[0], ios::binary);
+        ofstream ofs(args[2], ios::binary);
+        Configuration cfg;
+        int count;
+
+        for (count = 0; cfg.Load(ifs); count++)
+            cfg.Save(ofs);
+
+        ofs.close();
+        cout << count << " configurations processed.\n";
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("fix_lattice",
+        "puts the atoms into supercell (if they are outside) and makes lattice as much orthogonal as possible and lower triangular",
+        "mlp fix_lattice in.cfg out.cfg:\n"
+        ) {
+
+        if (args.size() < 2) {
+            cout << "mlp to_bin: minimum 2 arguments required\n";
+            return 1;
+        }
+
+        ifstream ifs(args[0], ios::binary);
+        ofstream ofs(args[1], ios::binary);
+        Configuration cfg;
+        int count;
+        for (count = 0; cfg.Load(ifs); count++) {
+            cfg.CorrectSupercell();
+            cfg.Save(ofs);
+        }
+        ofs.close();
+        cout << count << " configurations processed.\n";
+    } END_COMMAND;
+
+    BEGIN_COMMAND("mindist",
+        "reads a cfg file and saves it with added mindist feature",
+        "mlp mindist db.cfg [Options]\n"
+        "  Options can be given in any order. Options include:\n"
+        "  --no_overwrite=<true/false>: do not change db.cfg, only print global mindist\n"
+        "  --no_types=<true/false>: calculate a single mindist for all types"
+        ) 
+    {
+
+        if (args.size() != 1) {
+            cout << "mlp mindist: 1 arguments required\n";
+            return 1;
+        }
+
+        auto cfgs = MPI_LoadCfgs(args[0]);
+        if (cfgs.back().size() == 0)
+            cfgs.pop_back();
+        bool no_overwrite = opts.count("no_overwrite") > 0;
+        bool no_types = opts.count("no_types") > 0;
+
+        string outfnm = args[0];
+        if (mpi.size > 1)
+            outfnm += mpi.fnm_ending;
+        if (!no_overwrite) { ofstream ofs(outfnm); }
+
+        map<pair<int, int>, double> global_mindists;
+        double global_md = HUGE_DOUBLE;
+        for (auto& cfg : cfgs)
+        {
+            auto typeset = cfg.GetTypes();
+            auto mdm = cfg.GetTypeMindists();
+            
+            double local_md = HUGE_DOUBLE;
+            for (int t1 : typeset)
+                for (int t2 : typeset)
+                    local_md = __min(local_md, mdm[make_pair(t1, t2)]);
+            global_md = __min(local_md, global_md);
+
+            for (auto& md : mdm)
+                if (global_mindists.find(md.first) != global_mindists.end())
+                    global_mindists[md.first] = __min(md.second, global_mindists[md.first]);
+                else
+                    global_mindists[md.first] = md.second;
+
+            if (!no_overwrite)
+            {
+                if (!no_types)
+                {
+                    string str = "{";
+                    for (auto& md : mdm)
+                        if (md.first.first <= md.first.second)
+                            str += "(" + to_string(md.first.first) + "," + to_string(md.first.second) + "):" + to_string(md.second) + ",";
+                    str = str.substr(0, str.length()-1);
+                    cfg.features["mindists"] = str + '}';
+                    cfg.AppendToFile(outfnm);
+                }
+                else
+                {
+                    cfg.features["mindists"] = to_string(local_md);
+                    cfg.AppendToFile(outfnm);
+                }
+            }
+        }
+
+        if (mpi.rank == 0)
+        {
+            cout << "Global mindist: " << global_md << endl;
+            for (auto& md : global_mindists)
+                if (md.first.first <= md.first.second)
+                    cout << "mindist(" << md.first.first << ", " << md.first.second << ") = " << md.second << endl;
+        }
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("fix_mindist",
+        "fixes too short minimal interatomic distance in all configurations from a file by relaxation with repulsive pair potential",
+        "mlp fix_mindist input.cfg output.cfg 1.6 [Options]\n"
+        "the third argument is new minimal allowed interatomic distance\n"
+        "Options:\n"
+        "   --correct_cell=<true/false>: orthogonolize supercell "
+    ) {
+
+        if (args.size() != 3) ERROR("mlp fix_mindist: 3 arguments required\n");
+
+        Settings settings;
+        settings["init_mindist"] = args[2];
+        settings["iteration_limit"] = "0";
+        if (!opts["log"].empty())
+            settings["log"] = opts["log"];
+        settings["correct_cell"] = opts["correct_cell"];
+
+        Relaxation rlx(settings);
+
+        ifstream ifs(args[0], ios::binary);
+        ofstream ofs(args[1] + mpi.fnm_ending, ios::binary);
+
+        if (!ifs.is_open()) ERROR("input file is not open");
+        if (!ofs.is_open()) ERROR("output file is not open");
+
+        int count=0;
+        for (Configuration cfg; cfg.Load(ifs); count++)
+            if (count%mpi.size == mpi.rank)
+            {
+                if (cfg.size() != 0)
+                {
+                    rlx.cfg = cfg;
+
+                    try { rlx.Run(); }
+                    catch (MlipException& e) 
+                    {
+                        Warning("Mindist fixing in cfg#"+to_string(count+1)+" failed: "+e.message);
+                    }
+
+                    rlx.cfg.Save(ofs);
+                }
+            }
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("subsample",
+        "subsamples a database (reduces size by skiping every N configurations)",
+        "mlp subsample db_in.cfg db_out.cfg skipN\n"
+        ) {
+
+        if (args.size() != 3) {
+            cout << "mlp subsample: 3 arguments required\n";
+            return 1;
+        }
+
+        ifstream ifs(args[0], ios::binary);
+        ofstream ofs(args[1], ios::binary);
+        Configuration cfg;
+        int skipN = stoi(args[2]);
+        for (int i = 0; cfg.Load(ifs); i++) {
+            if (i % skipN == 0)
+                cfg.Save(ofs);
+        }
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("fix_cfg",
+        "Fixes a database by reading and writing it again",
+        "mlp fix_db db.cfg [options]\n"
+        "  Options:\n"
+        "  --no_loss: writes a file with a very high precision\n"
+    ) {
+        unsigned int flag = 0U;
+        if (opts["no_loss"] != "")
+            flag |= Configuration::SAVE_NO_LOSS;
+
+        std::vector<Configuration> db;
+        {
+            Configuration cfg;
+            std::ifstream ifs(args[0], std::ios::binary);
+            while (cfg.Load(ifs)) {
+                    db.push_back(cfg);
+            }
+            ifs.close();
+        }
+        {
+            Configuration cfg;
+            std::ofstream ofs(args[0], std::ios::binary);
+            for (int i = 0; i < db.size(); i++) {
+                db[i].Save(ofs, flag);
+            }
+            ofs.close();
+        }
+    }
+    END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("filter_nonconv",
+        "removes configurations with feature[EFS_by] containing not_converged",
+        "mlp filter_nonconv db.cfg\n"
+        ) {
+        if (args.size() != 1) {
+        std::cerr << "Error: 1 argument required" << std::endl;
+        std::cerr << "Usage: mlp filter_nonconv db.cfg" << std::endl;
+        return 1;
+    }
+    std::vector<Configuration> db;
+    {
+        Configuration cfg;
+        std::ifstream ifs(args[0], std::ios::binary);
+        while (cfg.Load(ifs)) {
+            if (cfg.features["EFS_by"].find("not_converged") == std::string::npos)
+                db.push_back(cfg);
+        }
+        ifs.close();
+    }
+    {
+        Configuration cfg;
+        std::ofstream ofs(args[0], std::ios::binary);
+        for (int i = 0; i < db.size(); i++) {
+            db[i].Save(ofs);
+        }
+        ofs.close();
+    }
+    } END_COMMAND;
+
+    BEGIN_COMMAND("test",
+        "performs a number of unit tests",
+        "mlp test\n"
+    ) {
+        if(!self_test()) exit(1);
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("remap_species",
+        "changes species numbering",
+        "mlp remap_species in.cfg out.cfg n0 n1 ...\n"
+    ) {
+        if (args.size() <= 3) {
+            std::cout << "\tError: at least 3 arguments required\n";
+            return 1;
+        }
+
+        vector<int> new_species;
+        for(int i=2; i < args.size(); i++)
+            new_species.push_back(std::stoi(args[i]));
+
+        ifstream ifs(args[0], std::ios::binary);
+        ofstream ofs(args[1], std::ios::binary);
+        Configuration cfg;
+        while (cfg.Load(ifs)) {
+            for(int i=0; i<cfg.size(); i++)
+                cfg.type(i) = new_species[cfg.type(i)];
+            cfg.Save(ofs);
+        }
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("extract_nbhs",
+        "Creates small configurations from a large ones by extracting neighborhoods encoded in features (\"nbhs_inds_to_extract\")",
+        "mlp extract_nbhs large.cfg nbhs.cfg\n"
+        "neighborhood index numeration starts with 1\n"
+        "Options:\n"
+        "   --lattice=<double>,<double>,<double>,<double>,<double>,<double>,<double>,<double>,<double>\n"
+        "   --cutoff=<double>\n"
+        "   --mindist=<double>/<string>"
+    ) {
+        if (args.size() != 2) {
+            std::cout << "\tError: 2 arguments required\n";
+            return 1;
+        }
+
+        Message("Initialization");
+
+        const string inp_filename = args[0];
+        const string out_filename = args[1];
+
+        double cutoff = 5.0;
+        bool change_cutoff = false;
+        if (!opts["cutoff"].empty())
+            cutoff = stod(opts["cutoff"]);
+
+        if (!opts["lattice"].empty())
+            change_cutoff = true;
+
+        string lat_str = (opts["lattice"].empty()) ?
+            to_string(2*cutoff) + ",0.0,0.0,0.0," + to_string(2*cutoff) + ",0.0,0.0,0.0," + to_string(2*cutoff) :
+            opts["lattice"];
+
+        Matrix3 lattice;
+        for (int i=0; i<8; i++)
+        {
+            lattice[i/3][i%3] = stod(lat_str.substr(0, lat_str.find(',')));
+            lat_str = lat_str.substr(lat_str.find(',')+1);
+        }
+        lattice[2][2] = stod(lat_str);
+
+        if (change_cutoff)
+        {
+            cutoff = Vector3(lattice[0][0], lattice[0][1], lattice[0][2]).Norm()/2;
+            cutoff = min(cutoff, Vector3(lattice[1][0], lattice[1][1], lattice[1][2]).Norm()/2);
+            cutoff = min(cutoff, Vector3(lattice[2][0], lattice[2][1], lattice[2][2]).Norm()/2);
+        }
+
+        vector<Configuration> nbhcfgs;
+
+        Settings settings;
+        settings["init_mindist"] = opts["mindist"];
+        settings["iteration_limit"] = "0";
+        settings["use_gd_method"] = "true";
+        settings["log"] = "stdout";
+
+        Relaxation rlx(settings);
+
+        ifstream ifs(inp_filename, ios::binary);
+        int cntr = 1;
+        for (Configuration cfg; cfg.Load(ifs); cntr++)
+        {
+            int nbh_cntr = 0;
+            if (cfg.features["nbhs_inds_to_extract"].empty())
+                ERROR("\"nbhs_inds_to_extract\" feature not found in cfg#" + to_string(cntr));
+            string inds_string = cfg.features["nbhs_inds_to_extract"];
+
+            while (!inds_string.empty())
+            {
+                int ind = -1;
+                auto comma_pos = inds_string.find(',');
+                if (comma_pos != string::npos)
+                {
+                    ind = stoi(inds_string.substr(0, comma_pos+1));
+                    inds_string = inds_string.substr(comma_pos+1);
+                }
+                else
+                {
+                    ind = stoi(inds_string);
+                    inds_string.clear();
+                }
+                ind--;
+                //ind -= 1 - 3*(cfg.size()-cfg.ghost_inds.size()) - 9;
+
+                nbhcfgs.push_back(cfg.GetCfgFromLocalEnv(ind, lattice));
+                nbh_cntr++;
+
+                if (!opts["mindist"].empty())
+                {
+                    Configuration& extracted_cfg = nbhcfgs.back();
+
+                    set<int> fixed_inds;
+
+                    fixed_inds.insert(0);
+                    extracted_cfg.InitNbhs(cutoff);
+                    for (int i=0; i<extracted_cfg.nbhs[0].count; i++)
+                        fixed_inds.insert(extracted_cfg.nbhs[0].inds[i]);
+
+                    // check the minimal distance between fixed atoms
+                    {
+                        Configuration cfg_tmp;
+                        for (int i : fixed_inds)
+                        {
+                            cfg_tmp.resize(cfg_tmp.size()+1);
+                            cfg_tmp.pos(cfg_tmp.size()-1) = extracted_cfg.pos(i);
+                            cfg_tmp.type(cfg_tmp.size()-1) = extracted_cfg.type(i);
+                        }
+
+                        if (opts["mindist"].find(':') != string::npos)
+                        {
+                            auto typeset = cfg.GetTypes();
+                            auto md_map = cfg_tmp.GetTypeMindists();
+                            std::map<int, double> mindist_cores;
+
+                            string mindist_string = opts["mindist"];
+                            mindist_string = mindist_string.substr(1);
+                            while (!mindist_string.empty())
+                            {
+                                int endpos = (int)mindist_string.find(':', 0);
+                                int spec = stoi(mindist_string.substr(0, endpos));
+                                mindist_string = mindist_string.substr(endpos+1);
+                                if ((endpos = (int)mindist_string.find(',', 0)) != -1)
+                                {
+                                    double rad = stod(mindist_string.substr(0, endpos));
+                                    mindist_string = mindist_string.substr(endpos+1);
+                                    mindist_cores[spec] = rad;
+                                }
+                                else if ((endpos = (int)mindist_string.find('}', 0)) != -1)
+                                {
+                                    double rad = stod(mindist_string.substr(0, endpos));
+                                    mindist_cores[spec] = rad;
+                                    break;
+                                }
+                                else
+                                    ERROR("Incorrect setting for\'mindist\'");
+                            }
+
+                            double delta = HUGE_DOUBLE;
+                            for (int i : typeset)
+                                for (int j : typeset)
+                                    delta = min(delta, md_map[make_pair(i, j)] - (mindist_cores[i]+mindist_cores[j]));
+
+                            if (delta < 0)
+                            {
+                                Warning("Neighborhood mindist is smaller than the target mindist!");
+                                continue;
+                            }
+                        }
+                        else
+                        {
+                            double mindist = stod(opts["mindist"]);
+                            if (mindist > cfg_tmp.MinDist())
+                            {
+                                Warning("Neighborhood mindist (" 
+                                        + to_string(cfg_tmp.MinDist()) 
+                                        + ") is smaller than target mindist (" 
+                                        + to_string(mindist) 
+                                        + ")!");
+                                continue;
+                            }
+                        }
+                    }
+
+                    for (int i : fixed_inds)
+                        extracted_cfg.features["relax:fixed_atoms"] += to_string(i) + ",";
+                    extracted_cfg.features["relax:fixed_atoms"].resize(extracted_cfg.features["relax:fixed_atoms"].size()-1);
+
+                    rlx.cfg = extracted_cfg;
+
+                    double mindist = rlx.cfg.MinDist();
+                    double target_mindist = stod(opts["mindist"]);
+                    while (mindist < target_mindist)
+                    {
+                        try { rlx.Run(); }
+                        catch (MlipException& e) { Warning("Mindist correction failed: " + e.message); }
+                        for (int i=0; i<extracted_cfg.size(); i++)
+                            if (fixed_inds.count(i) > 0)
+                                rlx.cfg.pos(i) = extracted_cfg.pos(i);
+                        mindist = rlx.cfg.MinDist();
+                    }
+
+                    extracted_cfg = rlx.cfg;
+                }
+            }
+            cout << "from configuration #" << cntr << " extracted " << nbh_cntr << " configuration(s)" << endl;
+        }
+
+        { ofstream ofs(out_filename); }
+        for (int i=0; i<mpi.size; i++)
+        {
+            if (i == mpi.rank)
+            {
+                ofstream ofs(args[1], ios::binary | ios::app);
+                for (auto& cfg : nbhcfgs)
+                    cfg.Save(ofs);
+            }
+            MPI_Barrier(mpi.comm);
+        }
+
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("invert_stress",
+        "xxx",
+        "mlp invert_stress input.cfg:\n"
+    ) {
+
+        if (args.size() != 1) {
+            std::cout << "\tError: 1 arguments required\n";
+            return 1;
+        }
+
+        const string cfg_filename = args[0];
+
+        auto cfgs = LoadCfgs(cfg_filename);
+        
+        { ofstream ofs(cfg_filename); }
+
+        int counter=0;
+        for (auto cfg : cfgs)
+        {
+            cfg.stresses *= -1;
+            cfg.AppendToFile(cfg_filename);
+            cout << ++counter << endl;
+        }
+
+    } END_COMMAND;
+
+    BEGIN_UNDOCUMENTED_COMMAND("recognize",
+        "XXX",
+        "mlp recognize MTP100.mtp input.cfg --train=out.mvs:\n"
+    ) 
+    {
+        if (args.size() != 2) {
+            std::cout << "\tError: 2 arguments required\n";
+            return 1;
+        }
+
+        const string mtp_filename = args[0];
+        const string cfg_filename = args[1];
+
+        MTP mtp(mtp_filename);
+
+        if (!opts["train"].empty())
+        {
+            Settings set;
+            set["site_en_weight"] = "1";
+            set["energy_weight"] = "0";
+            set["force_weight"] = "0";
+            set["stress_weight"] = "0";
+            set["weight_scaling"] = "0";
+                
+            CfgSelection selector(&mtp, set);
+
+            selector.LoadSelected(args[1]);
+
+            selector.Save(opts["train"]);
+        }
+        else
+        {
+            auto cfgs = MPI_LoadCfgs(args[1]);
+
+            for (auto& cfg : cfgs)
+                for (int i=0; i< cfg.size(); i++)
+                    cfg.type(i) = 0;
+            
+            Settings set;
+            set["threshold_save"] = "1.";
+            set["threshold_break"] = "0";
+            set["save_extrapolative_to"] = "";
+            set["add_grade_feature"] = "true";
+            set["log"] = "stdout";
+
+            CfgSampling samp(&mtp, mtp_filename, set);
+
+            ofstream ofs("recognition.txt");
+
+            int cntr = 0;
+            for (auto& cfg : cfgs)
+            {
+                samp.Grade(cfg);
+
+                // find two atoms with maximal grades
+                int i1=-1, i2=-1;
+                double g1=-1, g2=-1;
+                for (int i=0; i<cfg.size(); i++)
+                    if (cfg.nbh_grades(i) > g1)
+                    {
+                        i1 = i;
+                        g1 = cfg.nbh_grades(i);
+                    }
+                for (int i=0; i<cfg.size(); i++)
+                    if (cfg.nbh_grades(i) > g2 && i != i1)
+                    {
+                        i2 = i;
+                        g2 = cfg.nbh_grades(i);
+                    }
+
+                //
+                bool high_grades = false;
+                for (int i=0; i<cfg.size(); i++)
+                    if (cfg.nbh_grades(i) > g2/5.0 && i!=i1 && i!=i2)
+                        high_grades = true;
+                bool low_grades = false;
+                if (g2 < 11.0)
+                    low_grades = true;
+
+                //
+                Vector3 l1(cfg.lattice[0][0], cfg.lattice[0][1], cfg.lattice[0][2]);
+                Vector3 l2(cfg.lattice[1][0], cfg.lattice[1][1], cfg.lattice[1][2]);
+                Vector3 l3(cfg.lattice[2][0], cfg.lattice[2][1], cfg.lattice[2][2]);
+                bool is_lattice_ortog = (fabs(l1*l2) + fabs(l2*l3) + fabs(l1*l3) < 0.1);
+                double a = (l1.Norm() + l2.Norm() + l3.Norm()) / 3/4;
+                //cout << " lat_dist=" << a << ' ';
+                bool is_lattice_cubic = ((l1-l2).Norm() + (l2-l3).Norm() + (l1-l3).Norm() < 0.1*a);
+                //
+                Vector3 d12 = cfg.pos(i1) - cfg.pos(i2);
+                bool wrong_orientation = false;
+                bool wrong_dist = false;
+                if (d12.Norm() < 0.85*1.45*sqrt(2)*a/3 || d12.Norm() > 1.15*1.45*sqrt(2)*a/3)
+                    wrong_dist = true;
+                //cout << ' ' << d12.Norm() << ' ' << sqrt(2)*a/3 << ' ' << d12.Norm()/(sqrt(2)*a/3) << ' ';
+                if (fabs(d12*Vector3(1,1,0)) < 0.9)
+                    wrong_orientation = true;
+
+                bool shifted = false;
+                if (cfg.pos(i1, 0) < 0.05*a ||
+                    cfg.pos(i1, 1) < 0.05*a ||
+                    cfg.pos(i1, 2) < 0.4*a  ||
+                    cfg.pos(i1, 0) > 0.95*a ||
+                    cfg.pos(i1, 1) > 0.95*a ||
+                    cfg.pos(i1, 2) > 0.6*a  ||
+                    cfg.pos(i2, 0) < 0.05*a ||
+                    cfg.pos(i2, 1) < 0.05*a ||
+                    cfg.pos(i2, 2) < 0.4*a  ||
+                    cfg.pos(i2, 0) > 0.95*a ||
+                    cfg.pos(i2, 1) > 0.95*a ||
+                    cfg.pos(i2, 2) > 0.6*a)
+                    shifted = true;
+
+                ofs << "cfg# " << ++cntr 
+                    << "\ni1= " << i1 << " g1=" << g1 << "\ti2= " << i2  << " g2=" << g2
+                    << "\nhigh_grades_check " << high_grades
+                    << "\nlow_grades_check " << low_grades
+                    << "\nlattice_ortogonality_check " << is_lattice_ortog
+                    << "\nlattice_cubic_check " << is_lattice_cubic
+                    << "\nwrong_orientation_check " << wrong_orientation
+                    << "\nwrong_distance_check " << wrong_dist
+                    << "\ndisplscement_check " << shifted
+                    << "\n"
+                    << endl;
+
+            }
+        }
+
+    } END_COMMAND;
+
+    return is_command_found;
+}
diff --git a/mlip3_missing_files/src_CMakeLists.txt b/mlip3_missing_files/src_CMakeLists.txt
new file mode 100644
index 00000000..97238e72
--- /dev/null
+++ b/mlip3_missing_files/src_CMakeLists.txt
@@ -0,0 +1,34 @@
+set(SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/basic_mlip.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/basic_potentials.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/basic_trainer.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/basis.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cfg_sampling.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cfg_selection.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/configuration.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/eam.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/error_monitor.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/external_potential.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/lammps_potential.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/linear_regression.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/lotf.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/lotf_linear.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/magnetic_moments.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/maxvol.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/mtp.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/mtpr.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/mtpr_trainer.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/non_linear_regression.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/pair_potentials.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/sw.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/vasp_potential.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/wrapper.cpp
+  )
+
+target_sources(mlp PRIVATE ${SOURCES})
+target_sources(libinterface PRIVATE ${SOURCES})
+
+add_subdirectory(mlp)
+add_subdirectory(common)
+add_subdirectory(drivers)
+add_subdirectory(external)
diff --git a/mlip3_missing_files/src_common_CMakeLists.txt b/mlip3_missing_files/src_common_CMakeLists.txt
new file mode 100644
index 00000000..a56453c6
--- /dev/null
+++ b/mlip3_missing_files/src_common_CMakeLists.txt
@@ -0,0 +1,10 @@
+set(SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/bfgs.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/comm.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/mpi_stubs.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/stdafx.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
+  )
+
+target_sources(mlp PRIVATE ${SOURCES})
+target_sources(libinterface PRIVATE ${SOURCES})
\ No newline at end of file
diff --git a/mlip3_missing_files/src_drivers_CMakeLists.txt b/mlip3_missing_files/src_drivers_CMakeLists.txt
new file mode 100644
index 00000000..3b7486d8
--- /dev/null
+++ b/mlip3_missing_files/src_drivers_CMakeLists.txt
@@ -0,0 +1,7 @@
+set(SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/basic_drivers.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/relaxation.cpp
+  )
+
+target_sources(mlp PRIVATE ${SOURCES})
+target_sources(libinterface PRIVATE ${SOURCES})
diff --git a/mlip3_missing_files/src_external_CMakeLists.txt b/mlip3_missing_files/src_external_CMakeLists.txt
new file mode 100644
index 00000000..1396a3f7
--- /dev/null
+++ b/mlip3_missing_files/src_external_CMakeLists.txt
@@ -0,0 +1,3 @@
+set(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/interface.cpp)
+
+target_sources(libinterface PRIVATE ${SOURCES})
\ No newline at end of file
diff --git a/mlip3_missing_files/src_mlp_CMakeLists.txt b/mlip3_missing_files/src_mlp_CMakeLists.txt
new file mode 100644
index 00000000..11cab618
--- /dev/null
+++ b/mlip3_missing_files/src_mlp_CMakeLists.txt
@@ -0,0 +1,6 @@
+set(SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/mlp.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/mlp_commands.cpp
+  )
+
+target_sources(mlp PRIVATE ${SOURCES})
diff --git a/mlip3_missing_files/test_CMakeLists.txt b/mlip3_missing_files/test_CMakeLists.txt
new file mode 100644
index 00000000..136a10d7
--- /dev/null
+++ b/mlip3_missing_files/test_CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TEST_BASE_DIR ${CMAKE_BINARY_DIR}/Testing)
+
+add_subdirectory(self-test)
+add_subdirectory(examples)
\ No newline at end of file
diff --git a/mlip3_missing_files/test_examples_00_CMakeLists.txt b/mlip3_missing_files/test_examples_00_CMakeLists.txt
new file mode 100644
index 00000000..3ad61c0b
--- /dev/null
+++ b/mlip3_missing_files/test_examples_00_CMakeLists.txt
@@ -0,0 +1,14 @@
+set(TEST_NAME "convert_vasp_outcar")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/OUTCAR
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> convert --input_format=outcar OUTCAR out/relax.cfg > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_01_CMakeLists.txt b/mlip3_missing_files/test_examples_01_CMakeLists.txt
new file mode 100644
index 00000000..6fd6d996
--- /dev/null
+++ b/mlip3_missing_files/test_examples_01_CMakeLists.txt
@@ -0,0 +1,15 @@
+set(TEST_NAME "train")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/06.almtp
+  ${CMAKE_CURRENT_SOURCE_DIR}/train.cfg
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> train 06.almtp train.cfg --save_to=./out/pot.almtp --iteration_limit=100 --al_mode=nbh > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_02_CMakeLists.txt b/mlip3_missing_files/test_examples_02_CMakeLists.txt
new file mode 100644
index 00000000..74fcf1f0
--- /dev/null
+++ b/mlip3_missing_files/test_examples_02_CMakeLists.txt
@@ -0,0 +1,15 @@
+set(TEST_NAME "check_errors")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/pot.almtp
+  ${CMAKE_CURRENT_SOURCE_DIR}/train.cfg
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> check_errors pot.almtp train.cfg --log=stdout --report_to=./out/errors.txt > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_03_CMakeLists.txt b/mlip3_missing_files/test_examples_03_CMakeLists.txt
new file mode 100644
index 00000000..e1184239
--- /dev/null
+++ b/mlip3_missing_files/test_examples_03_CMakeLists.txt
@@ -0,0 +1,15 @@
+set(TEST_NAME "calculate_efs")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/pot.almtp
+  ${CMAKE_CURRENT_SOURCE_DIR}/train.cfg
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> calculate_efs pot.almtp train.cfg --output_filename=./out/calculated.cfg > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_04_CMakeLists.txt b/mlip3_missing_files/test_examples_04_CMakeLists.txt
new file mode 100644
index 00000000..4365711b
--- /dev/null
+++ b/mlip3_missing_files/test_examples_04_CMakeLists.txt
@@ -0,0 +1,15 @@
+set(TEST_NAME "calculate_grade")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/pot.almtp
+  ${CMAKE_CURRENT_SOURCE_DIR}/train.cfg
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> calculate_grade pot.almtp train.cfg ./out/graded.cfg > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_05_CMakeLists.txt b/mlip3_missing_files/test_examples_05_CMakeLists.txt
new file mode 100644
index 00000000..06973258
--- /dev/null
+++ b/mlip3_missing_files/test_examples_05_CMakeLists.txt
@@ -0,0 +1,14 @@
+set(TEST_NAME "cut_extrapolative_neighborhood")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/selected.cfg
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> cut_extrapolative_nbh selected.cfg ./out/spherical.cfg --cutoff=8 > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_06_CMakeLists.txt b/mlip3_missing_files/test_examples_06_CMakeLists.txt
new file mode 100644
index 00000000..45899b4d
--- /dev/null
+++ b/mlip3_missing_files/test_examples_06_CMakeLists.txt
@@ -0,0 +1,16 @@
+set(TEST_NAME "select_add")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/pot.almtp
+  ${CMAKE_CURRENT_SOURCE_DIR}/train.cfg
+  ${CMAKE_CURRENT_SOURCE_DIR}/preselected.cfg
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> select_add pot.almtp train.cfg preselected.cfg ./out/selected.cfg > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_07_CMakeLists.txt b/mlip3_missing_files/test_examples_07_CMakeLists.txt
new file mode 100644
index 00000000..e28995d3
--- /dev/null
+++ b/mlip3_missing_files/test_examples_07_CMakeLists.txt
@@ -0,0 +1,16 @@
+set(TEST_NAME "relax")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/pot.almtp
+  ${CMAKE_CURRENT_SOURCE_DIR}/to-relax.cfg
+  ${CMAKE_CURRENT_SOURCE_DIR}/relax.ini
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> relax pot.almtp to-relax.cfg ./out/relaxed.cfg --relaxation_settings=relax.ini > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
+
diff --git a/mlip3_missing_files/test_examples_08_CMakeLists.txt b/mlip3_missing_files/test_examples_08_CMakeLists.txt
new file mode 100644
index 00000000..210a10a5
--- /dev/null
+++ b/mlip3_missing_files/test_examples_08_CMakeLists.txt
@@ -0,0 +1,15 @@
+set(TEST_NAME "relax_preselect")
+set(TEST_DIR ${TEST_BASE_DIR}/examples/${TEST_NAME})
+file(MAKE_DIRECTORY ${TEST_DIR}/out)
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/pot.almtp
+  ${CMAKE_CURRENT_SOURCE_DIR}/to-relax.cfg
+  ${CMAKE_CURRENT_SOURCE_DIR}/relax.ini
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+add_test(
+    NAME ${TEST_NAME}
+    COMMAND sh -c "$<TARGET_FILE:mlp> relax pot.almtp to-relax.cfg ./out/relaxed.cfg --relaxation_settings=relax.ini --extrapolation_control --save_extrapolative_to=./out/preselected.cfg --threshold_save=2 --threshold_break=10 > ./out/stdout.log"
+    WORKING_DIRECTORY ${TEST_DIR}
+)
diff --git a/mlip3_missing_files/test_examples_CMakeLists.txt b/mlip3_missing_files/test_examples_CMakeLists.txt
new file mode 100644
index 00000000..1f8762cf
--- /dev/null
+++ b/mlip3_missing_files/test_examples_CMakeLists.txt
@@ -0,0 +1,9 @@
+add_subdirectory(00.convert_vasp_outcar)
+add_subdirectory(01.train)
+add_subdirectory(02.check_errors)
+add_subdirectory(03.calculate_efs)
+add_subdirectory(04.calculate_grade)
+add_subdirectory(05.cut_extrapolative_neighborhood)
+add_subdirectory(06.select_add)
+add_subdirectory(07.relax)
+add_subdirectory(08.relax_preselect)
\ No newline at end of file
diff --git a/mlip3_missing_files/test_selftest_CMakeLists.txt b/mlip3_missing_files/test_selftest_CMakeLists.txt
new file mode 100644
index 00000000..01f67694
--- /dev/null
+++ b/mlip3_missing_files/test_selftest_CMakeLists.txt
@@ -0,0 +1,21 @@
+set(TEST_DIR ${TEST_BASE_DIR}/self-test)
+file(MAKE_DIRECTORY ${TEST_DIR})
+set(TEST_SOURCES
+  ${CMAKE_CURRENT_SOURCE_DIR}/configurations
+  ${CMAKE_CURRENT_SOURCE_DIR}/mlips
+  )
+file(COPY ${TEST_SOURCES} DESTINATION ${TEST_DIR})
+
+set(TEST_DEV_SOURCES ${PROJECT_SOURCE_DIR})
+
+add_test(
+  NAME UNIT_TEST
+  COMMAND mlp self-test
+  WORKING_DIRECTORY ${TEST_DIR}
+)
+
+add_test(
+  NAME UNIT_TEST_DEV
+  COMMAND mlp self-test-dev
+  WORKING_DIRECTORY ${TEST_DIR}
+)
\ No newline at end of file

From 76f3a223152b92f825aba91de78e26ded9e67eb3 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 7 Mar 2024 08:23:49 -0500
Subject: [PATCH 05/29] add dependencies

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index ce0d9a06..ec3bc073 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ isort==5.13.2
 jupyter==1.0.0
 jinja2==3.1.2
 maml==2023.9.9
+monty==2024.2.2
 myst-parser==2.0.0
 orion>=0.2.4.post1
 pyarrow==15.0.0

From 193e765f695452f1be090345f6d0d1ea6f26cfe2 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 7 Mar 2024 10:06:27 -0500
Subject: [PATCH 06/29] refactor to clean up script

---
 crystal_diffusion/models/mtp.py | 318 +++++++++++++++++++++++++++++
 crystal_diffusion/train_mtp.py  | 340 ++------------------------------
 2 files changed, 336 insertions(+), 322 deletions(-)
 create mode 100644 crystal_diffusion/models/mtp.py

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
new file mode 100644
index 00000000..45c4217b
--- /dev/null
+++ b/crystal_diffusion/models/mtp.py
@@ -0,0 +1,318 @@
+"""Moment Tensor Potential model.
+
+This script defines a MTP model in a lightning like manner, with a train() and evaluate() method.
+However, it cannot be called as a standard lightning module as it relies on the MLIP-3 library for the model
+implementation.
+"""
+import itertools
+import os
+import re
+import shutil
+import subprocess
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+from maml.apps.pes import MTPotential
+from maml.utils import check_structures_forces_stresses, pool_from
+from monty.io import zopen
+from monty.tempfile import ScratchDir
+from pymatgen.core import Structure
+
+
+class MTPWithMLIP3(MTPotential):
+    """MTP with MLIP-3."""
+
+    def __init__(self,
+                 mlip_path: str,
+                 name: Optional[str] = None,
+                 param: Optional[Dict[Any, Any]] = None,
+                 version: Optional[str] = None):
+        """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3.
+
+        Args:
+            mlip_path: path to mlip3 library
+            name: MTPotential argument
+            param: MTPotential argument
+            version: MTPotential argument
+        """
+        super().__init__(name, param, version)
+        self.mlp_command = os.path.join(mlip_path, "build", "mlp")
+        assert os.path.exists(self.mlp_command), "mlp command not found in mlip-3 build folder"
+        self.mlp_templates = os.path.join(mlip_path, "MTP_templates")
+        assert os.path.exists(self.mlp_templates), "MTP templates not found in mlip-3 folder"
+        self.fitted_mtp = None
+        self.elements = None
+
+    def to_lammps_format(self):
+        """Write the trained MTP in a LAMMPS compatible format."""
+        # TODO
+        # write params write the fitted mtp in a LAMMPS compatible format
+        # self.write_param(
+        #    fitted_mtp=fitted_mtp,
+        #    Abinitio=0,
+        #    Driver=1,
+        #    Write_cfgs=predict_file,
+        #    Database_filename=original_file,
+        #    **kwargs,
+        # )
+        pass
+
+    def evaluate(self,
+                 test_structures: List[Structure],
+                 test_energies: List[float],
+                 test_forces: List[List[float]],
+                 test_stresses: Optional[List[List[float]]] = None,
+                 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained MTP.
+
+        Args:
+            test_structures: evaluation set of pymatgen Structure Objects.
+            test_energies: list of total energies of each structure to evaluation in test_structures list.
+            test_forces: list of calculated (m, 3) forces of each evaluation structure with m atoms in structures list.
+                m can be varied with each single structure case.
+            test_stresses (optional): list of calculated (6, ) virial stresses of each evaluation structure in
+                test_structures list. If None, do not evaluate on stresses. Default to None.
+
+        Returns:
+            dataframe with ground truth energies, forces
+            dataframe with predicted energies, forces, MaxVol gamma (nbh grades)
+        """
+        original_file = "original.cfgs"
+        predict_file = "predict.cfgs"
+        test_structures, test_forces, test_stresses = check_structures_forces_stresses(
+            test_structures, test_forces, test_stresses
+        )
+        predict_pool = pool_from(test_structures, test_energies, test_forces, test_stresses)
+
+        with ScratchDir("."):  # mlip needs a tmp_work_dir - we will manually copy relevant outputs elsewhere
+            # write the structures to evaluate in a mlp compatible format
+            original_file = self.write_cfg(original_file, cfg_pool=predict_pool)
+            df_orig = self.read_cfgs(original_file, nbh_grade=False)  # read original values as a DataFrame
+
+            # calculate_grade is the method to get the forces, energy & maxvol values
+            cmd = [self.mlp_command, "calculate_grade", self.fitted_mtp, original_file, predict_file]
+            predict_file += '.0'  # added by mlp...
+            with subprocess.Popen(cmd, stdout=subprocess.PIPE) as p:  # run mlp
+                stdout = p.communicate()[0]
+                rc = p.returncode
+            if rc != 0:
+                error_msg = f"mlp exited with return code {rc}"
+                msg = stdout.decode("utf-8").split("\n")[:-1]
+                try:
+                    error_line = next(i for i, m in enumerate(msg) if m.startswith("ERROR"))
+                    error_msg += ", ".join(msg[error_line:])
+                except Exception:
+                    error_msg += msg[-1]
+                raise RuntimeError(error_msg)
+
+            df_predict = self.read_cfgs(predict_file, nbh_grade=True)
+        return df_orig, df_predict
+
+    def read_cfgs(self, filename: str, nbh_grade: bool = False) -> pd.DataFrame:
+        """Read mlp output when MaxVol gamma factor is present.
+
+        Args:
+            filename: name of mlp output file to be parsed.
+            nbh_grade (optional): if True, add the nbh_grades in the resulting dataframe. Defaults to False.
+
+        Returns:
+            dataframe with energies, forces, optional nbh grades (MaxVol gamma)
+        """
+        def formatify(string: str) -> List[float]:
+            """Convert string to a list of float."""
+            return [float(s) for s in string.split()]
+
+        if not self.elements:
+            raise ValueError("No species given.")
+
+        data_pool = []
+        with zopen(filename, "rt") as f:
+            lines = f.read()
+
+        block_pattern = re.compile("BEGIN_CFG\n(.*?)\nEND_CFG", re.S)
+        size_pattern = re.compile("Size\n(.*?)\n SuperCell", re.S | re.I)
+        if nbh_grade:
+            position_pattern = re.compile("nbh_grades\n(.*?)\n Energy", re.S)
+        else:
+            position_pattern = re.compile("fz\n(.*?)\n Energy", re.S)
+        energy_pattern = re.compile("Energy\n(.*?)\n (?=PlusStress|Stress)", re.S)
+        # stress_pattern = re.compile("xy\n(.*?)(?=\n|$)", re.S)  # TODO stress values
+
+        for block in block_pattern.findall(lines):
+            d = {"outputs": {}}
+            size_str = size_pattern.findall(block)[0]
+            size = int(size_str.lstrip())
+            position_str = position_pattern.findall(block)[0]
+            position = np.array(list(map(formatify, position_str.split("\n"))))
+            species = np.array(self.elements)[position[:, 1].astype(np.int64)]
+            forces = position[:, 5:8].tolist()
+
+            energy_str = energy_pattern.findall(block)[0]
+            energy = float(energy_str.lstrip())
+            # TODO add stress
+            # stress_str = stress_pattern.findall(block)[0]
+            # virial_stress = (np.array(list(map(formatify, stress_str.split()))).reshape(6,).tolist())
+            # virial_stress = [virial_stress[self.mtp_stress_order.index(n)] for n in self.vasp_stress_order]
+            d["outputs"]["energy"] = energy
+            d["num_atoms"] = size
+            d["outputs"]["forces"] = forces
+            d["outputs"]["species"] = species
+            # d["outputs"]["virial_stress"] = virial_stress
+            if nbh_grade:
+                nbh_grade_values = position[:, 8].tolist()
+                d["outputs"]["nbh_grades"] = nbh_grade_values
+            data_pool.append(d)
+
+        # originally used convert_docs from maml.utils, but it hard-coded the structure of the dataframe and does not
+        # manage nbh_grades; we use our own implementation instead
+        df = self.convert_to_dataframe(docs=data_pool)
+        return df
+
+    @staticmethod
+    def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
+        """Convert a list of docs into DataFrame usable for computing metrics and analysis.
+
+        Modified from maml.utils._data_conversion.py
+
+        Args:
+            docs: list of docs generated by mlip-3. Each doc should be structured as a dict.
+
+        Returns:
+            A DataFrame with energy, force, and nbh grades (MaxVol factor) per atom and per structure. Energy is repeated
+            for each atom and corresponds to the total energy predicted.
+        """
+        df = defaultdict(list)
+        for s_idx, d in enumerate(docs):
+            outputs = d["outputs"]
+            force_arr = np.array(outputs["forces"])
+            n_atom = force_arr.shape[0]
+            for i, x in enumerate(['x', 'y', 'z']):
+                df[f'f{x}'] += force_arr[:, i].tolist()
+            df['energy'] += [outputs['energy']] * n_atom  # copy the value to all atoms
+            if "nbh_grades" in outputs.keys():
+                nbh_grades = outputs["nbh_grades"]
+                df['nbh_grades'] += nbh_grades
+            df['atom_index'] += list(range(n_atom))
+            df['structure_index'] += [s_idx] * n_atom
+
+        df = pd.DataFrame(df)
+        return df
+
+    def train(
+            self,
+            train_structures: List[Structure],
+            train_energies: List[float],
+            train_forces: List[np.array],
+            train_stresses: List[List[float]],
+            unfitted_mtp: str = "08g.amltp",
+            fitted_mtp_savedir: str = '../',
+            max_dist: float = 5,
+            radial_basis_size: int = 8,
+            max_iter: int = 1000,  # TODO check the next kwargs in mlip3
+            energy_weight: float = 1,
+            force_weight: float = 1e-2,
+            stress_weight: float = 1e-3,
+            init_params: str = "same",
+            scale_by_force: float = 0,
+            bfgs_conv_tol: float = 1e-3,
+            weighting: float = "vibration",
+    ) -> int:
+        """Training data with moment tensor method using MLIP-3.
+
+        Override the base class method.
+
+        Args:
+            train_structures: The list of Pymatgen Structure object.
+            train_energies: List of total energies of each structure in structures list.
+            train_forces: List of (m, 3) forces array of each structure with m atoms in structures list.
+                m can be varied with each single structure case.
+            train_stresses: List of (6, ) virial stresses of each structure in structures list.
+            unfitted_mtp (optional): Define the initial mtp file. Default to 08g.amltp
+            fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd)
+            max_dist (optional): The actual radial cutoff. Defaults to 5.
+            radial_basis_size (optional): Relevant to number of radial basis function. Defaults to 8.
+            max_iter (optional): The number of maximum iteration. Defaults to 1000.
+            energy_weight (optional): The weight of energy. Defaults to 1
+            force_weight (optional): The weight of forces. Defaults to 1e-2
+            stress_weight (optional): The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
+            init_params (optional): How to initialize parameters if a potential was not
+                pre-fitted. Choose from "same" and "random". Defaults to "same".
+            scale_by_force (optional): If >0 then configurations near equilibrium
+               (with roughly force < scale_by_force) get more weight. Defaults to 0.
+            bfgs_conv_tol (optional): Stop training if error dropped by a factor smaller than this
+                over 50 BFGS iterations. Defaults to 1e-3.
+            weighting (optional): How to weight configuration with different sizes relative to each other.
+                Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
+
+        Returns:
+            rc : return code of the mlp training script
+        """
+        train_structures, train_forces, train_stresses = check_structures_forces_stresses(
+            train_structures, train_forces, train_stresses
+        )
+        train_pool = pool_from(train_structures, train_energies, train_forces, train_stresses)
+        elements = sorted(set(itertools.chain(*[struct.species for struct in train_structures])))
+        self.elements = [str(element) for element in elements]  # TODO move to __init__
+
+        atoms_filename = "train.cfgs"
+
+        with ScratchDir("."):  # create a tmpdir - deleted afterew
+            atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool)
+
+            if not unfitted_mtp:
+                raise RuntimeError("No specific parameter file provided.")
+            mtp_file_path = os.path.join(self.mlp_templates, unfitted_mtp)
+            shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp))
+            commands = [self.mlp_command, "mindist", atoms_filename]
+            with open("min_dist", "w") as f, subprocess.Popen(commands, stdout=f) as p:
+                p.communicate()[0]
+
+            with open("min_dist") as f:
+                lines = f.readlines()
+
+            split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
+            min_dist = float(lines[-1].split(split_symbol)[1])
+
+            with open(unfitted_mtp) as f:
+                template = f.read()
+
+            s = template % (len(self.elements), min_dist, max_dist, radial_basis_size)
+            with open(unfitted_mtp, "w") as f:
+                f.write(s)
+
+            save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]])
+            cmds_list = [
+                self.mlp_command,
+                "train",
+                unfitted_mtp,
+                atoms_filename,
+                f"--save_to={save_fitted_mtp}",
+                f"--iteration_limit={max_iter}",
+                "--al_mode=nbh",  # active learning mode - required to get extrapolation grade
+                # f"--curr-pot-name={unfitted_mtp}",  # TODO check those kwargs
+                # f"--energy-weight={energy_weight}",
+                # f"--force-weight={force_weight}",
+                # f"--stress-weight={stress_weight}",
+                # f"--init-params={init_params}",
+                # f"--scale-by-force={scale_by_force}",
+                # f"--bfgs-conv-tol={bfgs_conv_tol}",
+                # f"--weighting={weighting}",
+            ]
+            with subprocess.Popen(cmds_list, stdout=subprocess.PIPE) as p:
+                stdout = p.communicate()[0]
+                rc = p.returncode
+            if rc != 0:
+                error_msg = f"MLP exited with return code {rc}"
+                msg = stdout.decode("utf-8").split("\n")[:-1]
+                try:
+                    error_line = next(i for i, m in enumerate(msg) if m.startswith("ERROR"))
+                    error_msg += ", ".join(msg[error_line:])
+                except Exception:
+                    error_msg += msg[-1]
+                raise RuntimeError(error_msg)
+            # copy the fitted mtp outside the working directory
+            self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp)
+            shutil.copy(save_fitted_mtp, self.fitted_mtp)
+        return rc
diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
index 3fcc3540..9a880fde 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/train_mtp.py
@@ -1,21 +1,21 @@
-import itertools
+"""Script to train and evaluate a MTP.
+
+Running the main() runs a debugging example. Entry points are train_mtp and evaluate_mtp.
+"""
 import os
-import re
-import shutil
-import subprocess
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Tuple
 
 import numpy as np
 import pandas as pd
 import yaml
-from maml.apps.pes import MTPotential
-from maml.utils import check_structures_forces_stresses, pool_from
-from monty.io import zopen
-from monty.tempfile import ScratchDir
 from pymatgen.core import Structure
 from sklearn.metrics import mean_absolute_error
 
+from crystal_diffusion.models.mtp import MTPWithMLIP3
+
+
+MLIP_PATH = os.path.join(os.getcwd(), "mlip-3")
+SAVE_DIR = os.path.join(os.getcwd(), "debug_mlip3")  # for demo only
 
 # TODO list of yaml files should come from an external call
 # yaml dump file
@@ -28,309 +28,6 @@
 atom_dict = {1: 'Si'}
 
 
-def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
-    """Method to convert a list of docs into DataFrame usable for computing metrics and analysis.
-
-    Modified from maml.utils._data_conversion.py
-
-    Args:
-        docs: list of docs generated by mlip-3. Each doc should be structured as a dict.
-
-    Returns:
-        A DataFrame with energy, force, and nbh grades (MaxVol factor) per atom and per structure. Energy is repeated
-        for each atom and corresponds to the total energy predicted.
-    """
-    # TODO move to another .py file
-    df = defaultdict(list)
-    for s_idx, d in enumerate(docs):
-        outputs = d["outputs"]
-        force_arr = np.array(outputs["forces"])
-        n_atom = force_arr.shape[0]
-        for i, x in enumerate(['x', 'y', 'z']):
-            df[f'f{x}'] += force_arr[:, i].tolist()
-        df['energy'] += [outputs['energy']] * n_atom  # copy the value to all atoms
-        if "nbh_grades" in outputs.keys():
-            nbh_grades = outputs["nbh_grades"]
-            df['nbh_grades'] += nbh_grades
-        df['atom_index'] += list(range(n_atom))
-        df['structure_index'] += [s_idx] * n_atom
-
-    df = pd.DataFrame(df)
-    return df
-
-
-class MTPWithMLIP3(MTPotential):
-    """MTP with MLIP-3."""
-    def __init__(self,
-                 mlip_path: str,
-                 maml_path: str,
-                 name: Optional[str] = None,
-                 param: Optional[Dict[Any, Any]] = None,
-                 version: Optional[str] = None):
-        """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3.
-
-        Args:
-            mlip_path: path to mlip3 mlp command
-            maml_path: path to maml _mtp.py file
-            name: MTPotential argument
-            param: MTPotential argument
-            version: MTPotential argument
-        """
-        super().__init__(name, param, version)
-        self.mlp_command = mlip_path
-        # TODO there is a better way to do this
-        self.module_dir = maml_path
-        self.fitted_mtp = None
-        self.elements = None
-
-    def to_lammps_format(self):
-        """Write the trained MTP in a LAMMPS compatible format."""
-        # TODO
-        # write params write the fitted mtp in a LAMMPS compatible format
-        # self.write_param(
-        #    fitted_mtp=fitted_mtp,
-        #    Abinitio=0,
-        #    Driver=1,
-        #    Write_cfgs=predict_file,
-        #    Database_filename=original_file,
-        #    **kwargs,
-        # )
-        pass
-
-    def evaluate(self,
-                 fitted_mtp_path: str,
-                 test_structures: List[Structure],
-                 test_energies: List[float],
-                 test_forces: List[List[float]],
-                 test_stresses: Optional[List[List[float]]] = None,
-                 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
-        """Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained MTP.
-
-        Args:
-            fitted_mtp_path: path to fitted mtp file.
-            test_structures: evaluation set of pymatgen Structure Objects.
-            test_energies: list of total energies of each structure to evaluation in test_structures list.
-            test_forces: list of calculated (m, 3) forces of each evaluation structure with m atoms in structures list.
-                m can be varied with each single structure case.
-            test_stresses (optional): list of calculated (6, ) virial stresses of each evaluation structure in
-                test_structures list. If None, do not evaluate on stresses. Default to None.
-
-        Returns:
-            dataframe with ground truth energies, forces
-            dataframe with predicted energies, forces, MaxVol gamma (nbh grades)
-        """
-        fitted_mtp = fitted_mtp_path
-        original_file = "original.cfgs"
-        predict_file = "predict.cfgs"
-        test_structures, test_forces, test_stresses = check_structures_forces_stresses(
-            test_structures, test_forces, test_stresses
-        )
-        predict_pool = pool_from(test_structures, test_energies, test_forces, test_stresses)
-
-        with ScratchDir("."):  # mlip needs a tmp_work_dir - we will manually copy relevant outputs elsewhere
-            # write the structures to evaluate in a mlp compatible format
-            original_file = self.write_cfg(original_file, cfg_pool=predict_pool)
-            df_orig = self.read_cfgs(original_file, nbh_grade=False)  # read original values as a DataFrame
-
-            # calculate_grade is the method to get the forces, energy & maxvol values
-            cmd = [self.mlp_command, "calculate_grade", fitted_mtp, original_file, predict_file]
-            predict_file += '.0'  # added by mlp...
-            with subprocess.Popen(cmd, stdout=subprocess.PIPE) as p:  # run mlp
-                stdout = p.communicate()[0]
-                rc = p.returncode
-            if rc != 0:
-                error_msg = f"mlp exited with return code {rc}"
-                msg = stdout.decode("utf-8").split("\n")[:-1]
-                try:
-                    error_line = next(i for i, m in enumerate(msg) if m.startswith("ERROR"))
-                    error_msg += ", ".join(msg[error_line:])
-                except Exception:
-                    error_msg += msg[-1]
-                raise RuntimeError(error_msg)
-
-            df_predict = self.read_cfgs(predict_file, nbh_grade=True)
-        return df_orig, df_predict
-
-    def read_cfgs(self, filename: str, nbh_grade: bool = False) -> pd.DataFrame:
-        """Read mlp output when MaxVol gamma factor is present.
-
-        Args:
-            filename: name of mlp output file to be parsed.
-            nbh_grade (optional): if True, add the nbh_grades in the resulting dataframe. Defaults to False.
-
-        Returns:
-            dataframe with energies, forces, optional nbh grades (MaxVol gamma)
-        """
-        def formatify(string: str) -> List[float]:
-            """Convert string to a list of float."""
-            return [float(s) for s in string.split()]
-
-        if not self.elements:
-            raise ValueError("No species given.")
-
-        data_pool = []
-        with zopen(filename, "rt") as f:
-            lines = f.read()
-
-        block_pattern = re.compile("BEGIN_CFG\n(.*?)\nEND_CFG", re.S)
-        size_pattern = re.compile("Size\n(.*?)\n SuperCell", re.S | re.I)
-        if nbh_grade:
-            position_pattern = re.compile("nbh_grades\n(.*?)\n Energy", re.S)
-        else:
-            position_pattern = re.compile("fz\n(.*?)\n Energy", re.S)
-        energy_pattern = re.compile("Energy\n(.*?)\n (?=PlusStress|Stress)", re.S)
-        # stress_pattern = re.compile("xy\n(.*?)(?=\n|$)", re.S)  # TODO stress values
-
-        for block in block_pattern.findall(lines):
-            d = {"outputs": {}}
-            size_str = size_pattern.findall(block)[0]
-            size = int(size_str.lstrip())
-            position_str = position_pattern.findall(block)[0]
-            position = np.array(list(map(formatify, position_str.split("\n"))))
-            species = np.array(self.elements)[position[:, 1].astype(np.int64)]
-            forces = position[:, 5:8].tolist()
-
-            energy_str = energy_pattern.findall(block)[0]
-            energy = float(energy_str.lstrip())
-            # TODO add stress
-            # stress_str = stress_pattern.findall(block)[0]
-            # virial_stress = (np.array(list(map(formatify, stress_str.split()))).reshape(6,).tolist())
-            # virial_stress = [virial_stress[self.mtp_stress_order.index(n)] for n in self.vasp_stress_order]
-            d["outputs"]["energy"] = energy
-            d["num_atoms"] = size
-            d["outputs"]["forces"] = forces
-            d["outputs"]["species"] = species
-            # d["outputs"]["virial_stress"] = virial_stress
-            if nbh_grade:
-                nbh_grade_values = position[:, 8].tolist()
-                d["outputs"]["nbh_grades"] = nbh_grade_values
-            data_pool.append(d)
-
-        # originally used convert_docs from maml.utils, but it hard-coded the structure of the dataframe and does not
-        # manage nbh_grades; we use our own implementation instead
-        df = convert_to_dataframe(docs=data_pool)
-        return df
-
-    def train(
-            self,
-            train_structures: List[Structure],
-            train_energies: List[float],
-            train_forces: List[np.array],
-            train_stresses: List[List[float]],
-            unfitted_mtp: str = "08g.mtp",  # TODO fix this
-            fitted_mtp_savedir: str = '../',
-            max_dist: float = 5,
-            radial_basis_size: int = 8,
-            max_iter: int = 1000,  # TODO check the next kwargs in mlip3
-            energy_weight: float = 1,
-            force_weight: float = 1e-2,
-            stress_weight: float = 1e-3,
-            init_params: str = "same",
-            scale_by_force: float = 0,
-            bfgs_conv_tol: float = 1e-3,
-            weighting: float = "vibration",
-    ) -> int:
-        """Training data with moment tensor method using MLIP-3.
-
-        Override the base class method.
-
-        Args:
-            train_structures: The list of Pymatgen Structure object.
-            train_energies: List of total energies of each structure in structures list.
-            train_forces: List of (m, 3) forces array of each structure with m atoms in structures list.
-                m can be varied with each single structure case.
-            train_stresses: List of (6, ) virial stresses of each structure in structures list.
-            unfitted_mtp (optional): Define the initial mtp file.
-                Default to 08g.mtp -  the mtp file stored in .params directory. TODO change
-            fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd)
-            max_dist (optional): The actual radial cutoff. Defaults to 5.
-            radial_basis_size (optional): Relevant to number of radial basis function. Defaults to 8.
-            max_iter (optional): The number of maximum iteration. Defaults to 1000.
-            energy_weight (optional): The weight of energy. Defaults to 1
-            force_weight (optional): The weight of forces. Defaults to 1e-2
-            stress_weight (optional): The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3.
-            init_params (optional): How to initialize parameters if a potential was not
-                pre-fitted. Choose from "same" and "random". Defaults to "same".
-            scale_by_force (optional): If >0 then configurations near equilibrium
-               (with roughly force < scale_by_force) get more weight. Defaults to 0.
-            bfgs_conv_tol (optional): Stop training if error dropped by a factor smaller than this
-                over 50 BFGS iterations. Defaults to 1e-3.
-            weighting (optional): How to weight configuration with different sizes relative to each other.
-                Choose from "vibrations", "molecules" and "structures". Defaults to "vibration".
-
-        Returns:
-            rc : return code of the mlp training script
-        """
-        train_structures, train_forces, train_stresses = check_structures_forces_stresses(
-            train_structures, train_forces, train_stresses
-        )
-        train_pool = pool_from(train_structures, train_energies, train_forces, train_stresses)
-        elements = sorted(set(itertools.chain(*[struct.species for struct in train_structures])))
-        self.elements = [str(element) for element in elements]  # TODO move to __init__
-
-        atoms_filename = "train.cfgs"
-
-        with ScratchDir("."):  # create a tmpdir - deleted afterew
-            atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool)
-
-            if not unfitted_mtp:
-                raise RuntimeError("No specific parameter file provided.")
-            # TODO move initial mtp file to our repo to remove the dependency on module_dir
-            mtp_file_path = os.path.join(self.module_dir, "params", unfitted_mtp)
-            shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp))
-            commands = [self.mlp_command, "mindist", atoms_filename]
-            with open("min_dist", "w") as f, subprocess.Popen(commands, stdout=f) as p:
-                p.communicate()[0]
-
-            with open("min_dist") as f:
-                lines = f.readlines()
-
-            split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
-            min_dist = float(lines[-1].split(split_symbol)[1])
-
-            with open(unfitted_mtp) as f:
-                template = f.read()
-
-            s = template % (len(self.elements), min_dist, max_dist, radial_basis_size)
-            with open(unfitted_mtp, "w") as f:
-                f.write(s)
-
-            save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]])
-            cmds_list = [
-                self.mlp_command,
-                "train",
-                unfitted_mtp,
-                atoms_filename,
-                f"--save_to={save_fitted_mtp}",
-                f"--iteration_limit={max_iter}",
-                "--al_mode=nbh",  # active learning mode - required to get extrapolation grade
-                # f"--curr-pot-name={unfitted_mtp}",  # TODO check those kwargs
-                # f"--energy-weight={energy_weight}",
-                # f"--force-weight={force_weight}",
-                # f"--stress-weight={stress_weight}",
-                # f"--init-params={init_params}",
-                # f"--scale-by-force={scale_by_force}",
-                # f"--bfgs-conv-tol={bfgs_conv_tol}",
-                # f"--weighting={weighting}",
-            ]
-            with subprocess.Popen(cmds_list, stdout=subprocess.PIPE) as p:
-                stdout = p.communicate()[0]
-                rc = p.returncode
-            if rc != 0:
-                error_msg = f"MLP exited with return code {rc}"
-                msg = stdout.decode("utf-8").split("\n")[:-1]
-                try:
-                    error_line = next(i for i, m in enumerate(msg) if m.startswith("ERROR"))
-                    error_msg += ", ".join(msg[error_line:])
-                except Exception:
-                    error_msg += msg[-1]
-                raise RuntimeError(error_msg)
-            # copy the fitted mtp outside the working directory
-            self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp)
-            shutil.copy(save_fitted_mtp, self.fitted_mtp)
-        return rc
-
-
 def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \
         Tuple[List[Structure], List[List[float]]]:
     """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods.
@@ -412,20 +109,20 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
     return mtp_inputs
 
 
-def train_mtp(train_inputs: Dict[str, Any]) -> MTPWithMLIP3:
+def train_mtp(train_inputs: Dict[str, Any], mlip_cmd_path: str, save_dir: str) -> MTPWithMLIP3:
     """Create and evaluate an MTP potential.
 
     Args:
         train_inputs: inputs for training. Should contain structure, energies and forces
+        mlip_cmd_path: path to MLIP-3 folder
+        save_dir: path to directory where to save the fitted model
 
     Returns:
        dataframe with original and predicted energies and forces.
     """
-    # TODO more kwargs for MTP training. See maml documentation.
+    # TODO more kwargs for MTP training. See maml and mlip-3 documentation.
     # create MTP
-    mlp_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/build/mlp"
-    maml_path = "/Users/simonb/miniconda/envs/crystal_diffusion/lib/python3.11/site-packages/maml/apps/pes/"
-    mtp = MTPWithMLIP3(mlip_path=mlp_path, maml_path=maml_path)
+    mtp = MTPWithMLIP3(mlip_path=mlip_cmd_path)
     # train
     mtp.train(
         train_structures=train_inputs["structure"],
@@ -433,8 +130,8 @@ def train_mtp(train_inputs: Dict[str, Any]) -> MTPWithMLIP3:
         train_forces=train_inputs["forces"],
         train_stresses=None,
         max_dist=5,
-        stress_weight=0,  # TODO kwargs and savedir as args
-        fitted_mtp_savedir='/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/debug_mlip3/'
+        stress_weight=0,
+        fitted_mtp_savedir=save_dir,
     )
 
     return mtp
@@ -452,7 +149,6 @@ def evaluate_mtp(eval_inputs: Dict[str, Any], mtp: MTPWithMLIP3) -> Tuple[pd.Dat
     """
     # evaluate
     df_orig, df_predict = mtp.evaluate(
-        fitted_mtp_path='/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/debug_mlip3/08g_fitted.mtp',
         test_structures=eval_inputs["structure"],
         test_energies=eval_inputs["energy"],
         test_forces=eval_inputs["forces"],
@@ -495,7 +191,7 @@ def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tu
 def main():
     """Train and evaluate an example for MTP."""
     mtp_inputs = prepare_mtp_inputs_from_lammps(lammps_yaml, lammps_thermo_yaml, atom_dict)
-    mtp = train_mtp(mtp_inputs)
+    mtp = train_mtp(mtp_inputs, MLIP_PATH, SAVE_DIR)
     print("Training is done")
     df_orig, df_predict = evaluate_mtp(mtp_inputs, mtp)
     energy_mae, force_mae = get_metrics_from_pred(df_orig, df_predict)

From a55747b0902ad888207db6f5b5fb8eedf25b67c6 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 7 Mar 2024 10:12:46 -0500
Subject: [PATCH 07/29] fixed typo in kwarg default

---
 crystal_diffusion/models/mtp.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 45c4217b..a735cbbb 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -206,7 +206,7 @@ def train(
             train_energies: List[float],
             train_forces: List[np.array],
             train_stresses: List[List[float]],
-            unfitted_mtp: str = "08g.amltp",
+            unfitted_mtp: str = "08.almtp",
             fitted_mtp_savedir: str = '../',
             max_dist: float = 5,
             radial_basis_size: int = 8,
@@ -275,13 +275,6 @@ def train(
             split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
             min_dist = float(lines[-1].split(split_symbol)[1])
 
-            with open(unfitted_mtp) as f:
-                template = f.read()
-
-            s = template % (len(self.elements), min_dist, max_dist, radial_basis_size)
-            with open(unfitted_mtp, "w") as f:
-                f.write(s)
-
             save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]])
             cmds_list = [
                 self.mlp_command,

From 49dde3d037431fd3f6eda39d65b980d2d41f156a Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 7 Mar 2024 10:14:40 -0500
Subject: [PATCH 08/29] comment out a unused piece of code

---
 crystal_diffusion/models/mtp.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index a735cbbb..4e12aedc 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -269,11 +269,11 @@ def train(
             with open("min_dist", "w") as f, subprocess.Popen(commands, stdout=f) as p:
                 p.communicate()[0]
 
-            with open("min_dist") as f:
-                lines = f.readlines()
-
-            split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
-            min_dist = float(lines[-1].split(split_symbol)[1])
+            # TODO check what min_dist is used for in maml
+            # with open("min_dist") as f:
+            #    lines = f.readlines()
+            # split_symbol = "="  # different for mlip-2 (":") and mlip-3 ("=")
+            # min_dist = float(lines[-1].split(split_symbol)[1])
 
             save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]])
             cmds_list = [

From 944101d7f8414ceaac8a07145fd8fe46fc816ad0 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 7 Mar 2024 14:31:54 -0500
Subject: [PATCH 09/29] basic unit test for train_mtp

---
 tests/models/test_mtp.py | 62 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 tests/models/test_mtp.py

diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
new file mode 100644
index 00000000..dc84bcd9
--- /dev/null
+++ b/tests/models/test_mtp.py
@@ -0,0 +1,62 @@
+import pytest
+
+from crystal_diffusion.models.mtp import MTPWithMLIP3
+
+
+class MockStructure:
+    """Mock a pymatgen structure"""
+    def __init__(self, species):
+        self.species = species
+
+def passthrough(*args, **kwargs):
+    """Function to return arguments as passed.
+
+       Useful for mocking a function to return its input arguments directly.
+    """
+    return args if len(kwargs) == 0 else (args, kwargs)
+
+
+# Mock the external dependencies and method calls within the MTPWithMLIP3.train method
+@pytest.mark.parametrize("mock_subprocess", [0])  # Here, 0 simulates a successful subprocess return code
+def test_train_method(mocker, mock_subprocess):
+    # Mock os.path.exists to always return True
+    mocker.patch("os.path.exists", return_value=True)
+
+    # Mock the ScratchDir context manager to do nothing
+    mocker.patch("crystal_diffusion.models.mtp.ScratchDir", mocker.MagicMock())
+
+    # Mock check_structures_forces_stresses to return a value without needing real input
+    mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", side_effect=passthrough)
+
+    # Mock pool_from to return a simplified pool object
+    mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="simple_pool_object")
+
+    # Mock self.write_cfg to simulate creating a config file without file operations
+    mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
+
+    # mocker.patch("crystal_diffusion.models.mtp.itertools.chain", return_value=[1, 2, 3])
+    mocker.patch("shutil.copyfile", return_value=None)
+
+    # Mock subprocess.Popen to simulate an external call to `mlp` command
+    mock_popen = mocker.patch("subprocess.Popen")
+    mock_popen.return_value.__enter__.return_value.communicate.return_value = (b'', b'')  # stdout, stderr
+    mock_popen.return_value.__enter__.return_value.returncode = mock_subprocess
+
+    # Initialize MTPWithMLIP3 with mock parameters
+    model = MTPWithMLIP3(mlip_path="/mock/path", name="test_model")
+    # Call the train method
+
+
+    return_code = model.train(
+        train_structures=[MockStructure(['H', 'O']), MockStructure(['Si'])],
+        train_energies=[1, 2],
+        train_forces=[],
+        train_stresses=[],
+        unfitted_mtp="08.almtp",
+        fitted_mtp_savedir="/mock/dir"
+    )
+    # Assert the expected results
+    assert return_code == mock_subprocess, "The train method should return the mocked subprocess return code."
+
+    # Optionally, assert that mocked methods were called the expected number of times
+    model.write_cfg.assert_called()
\ No newline at end of file

From 8457fa16b2bf14ea11760638006b5772c3b3cd39 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 7 Mar 2024 15:54:20 -0500
Subject: [PATCH 10/29] file rename

---
 .../{file1.txt => CMakeLists.txt}             |  0
 tests/models/test_mtp.py                      | 66 ++++++++++++++++++-
 2 files changed, 64 insertions(+), 2 deletions(-)
 rename mlip3_missing_files/{file1.txt => CMakeLists.txt} (100%)

diff --git a/mlip3_missing_files/file1.txt b/mlip3_missing_files/CMakeLists.txt
similarity index 100%
rename from mlip3_missing_files/file1.txt
rename to mlip3_missing_files/CMakeLists.txt
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index dc84bcd9..53d8ee45 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -1,4 +1,5 @@
 import pytest
+from pymatgen.core import Structure
 
 from crystal_diffusion.models.mtp import MTPWithMLIP3
 
@@ -46,7 +47,6 @@ def test_train_method(mocker, mock_subprocess):
     model = MTPWithMLIP3(mlip_path="/mock/path", name="test_model")
     # Call the train method
 
-
     return_code = model.train(
         train_structures=[MockStructure(['H', 'O']), MockStructure(['Si'])],
         train_energies=[1, 2],
@@ -59,4 +59,66 @@ def test_train_method(mocker, mock_subprocess):
     assert return_code == mock_subprocess, "The train method should return the mocked subprocess return code."
 
     # Optionally, assert that mocked methods were called the expected number of times
-    model.write_cfg.assert_called()
\ No newline at end of file
+    model.write_cfg.assert_called()
+
+
+@pytest.fixture
+def mock_structure():
+    # Setup a simple mock structure object
+    # Replace with appropriate structure setup for your use case
+    return Structure(lattice=[1, 0, 0, 0, 1, 0, 0, 0, 1], species=[""], coords=[[0, 0, 0]])
+
+@pytest.fixture
+def mtp_instance(mocker):
+    # Mock __init__ to not execute its original behavior
+    mocker.patch.object(MTPWithMLIP3, '__init__', lambda x, y: None)
+    # Setup a mocked instance with necessary attributes
+    instance = MTPWithMLIP3("mock_path")
+    instance.mlp_command = "mock_mlp_command"
+    instance.fitted_mtp = "mock_fitted_mtp"
+    instance.elements = ["Si"]
+    return instance
+
+
+@pytest.mark.parametrize("mock_subprocess", [0])  # Here, 0 simulates a successful subprocess return code
+def test_evaluate(mocker, mock_structure, mtp_instance, mock_subprocess):
+    test_structures = [mock_structure]
+    test_energies = [1.0]
+    test_forces = [[[0, 0, 0]]]
+    test_stresses = None  # or appropriate mock stresses
+
+    # Mock check_structures_forces_stresses to return the arguments unmodified
+    mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses",
+                 side_effect=lambda s, f, st: (s, f, st))
+
+    # Mock pool_from to return a mocked value
+    mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="mock_pool")
+
+    # Mock self.write_cfg to simulate creating a config file without file operations
+    mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
+
+    # Mock subprocess.Popen for evaluate method's call
+    # Mock subprocess.Popen to simulate an external call to `mlp` command
+    mock_popen = mocker.patch("subprocess.Popen")
+    mock_popen.return_value.__enter__.return_value.communicate.return_value = (b'', b'')  # stdout, stderr
+    mock_popen.return_value.__enter__.return_value.returncode = mock_subprocess
+
+    # process_mock = mocker.Mock()
+    # attrs = {'communicate.return_value': (b'mock_stdout', b'mock_stderr'), 'returncode': 0}
+    # process_mock.configure_mock(**attrs)
+    # mocker.patch('subprocess.Popen', return_value=process_mock)
+
+    # Mock read_cfgs to simulate reading of configurations without accessing the file system
+    mocker.patch.object(MTPWithMLIP3, "read_cfgs", return_value="mock_dataframe")
+
+    # Mock os.remove, shutil.copyfile and os.path.exists since evaluate interacts with the filesystem
+    mocker.patch("os.remove", return_value=None)
+    mocker.patch("shutil.copyfile", return_value=None)
+    mocker.patch("os.path.exists", return_value=True)
+
+    # Perform the test
+    df_orig, df_predict = mtp_instance.evaluate(test_structures, test_energies, test_forces, test_stresses)
+
+    # Assertions can vary based on the real output of `read_cfgs`
+    # Here's an example assertion assuming `read_cfgs` returns a string in this mocked scenario
+    assert df_orig == "mock_dataframe" and df_predict == "mock_dataframe", "Evaluate method should return mock dataframes"
\ No newline at end of file

From 02bbf41056c2782ce78cbc49261064cc4012a7ee Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 7 Mar 2024 16:10:11 -0500
Subject: [PATCH 11/29] unit tests

---
 tests/models/mtp_cfg_examples.txt | 20 ++++++++++++++++++++
 tests/models/test_mtp.py          | 30 +++++++++++++++++++++---------
 2 files changed, 41 insertions(+), 9 deletions(-)
 create mode 100644 tests/models/mtp_cfg_examples.txt

diff --git a/tests/models/mtp_cfg_examples.txt b/tests/models/mtp_cfg_examples.txt
new file mode 100644
index 00000000..1034c9d5
--- /dev/null
+++ b/tests/models/mtp_cfg_examples.txt
@@ -0,0 +1,20 @@
+BEGIN_CFG
+ Size
+    3
+ Supercell
+         1.000000      0.000000      0.000000
+         0.000000      2.000000      0.000000
+         0.000000      0.000000      3.000000
+ AtomData:  id type       cartes_x      cartes_y      cartes_z           fx          fy          fz       nbh_grades
+             1    0       0.100000      1.100000      2.100000     3.100000    4.100000    5.100000          6.10000
+             2    0       0.200000      1.200000      2.200000     3.200000    4.200000    5.200000          6.20000
+             3    0       0.300000      1.300000      2.300000     3.300000    4.300000    5.300000          6.30000
+ Energy
+       -7.0000
+ PlusStress:  xx          yy          zz          yz          xz          xy
+        8.10000    8.20000    8.3000 0    8.40000     8.50000     8.60000
+ Feature   EFS_by	VASP
+ Feature   ID	0
+ Feature   MV_grade 9.000000	
+ Feature   mindist	10.00000
+END_CFG
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index 53d8ee45..cb9b08d1 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -1,18 +1,21 @@
+from pathlib import Path
+
+import numpy as np
 import pytest
 from pymatgen.core import Structure
 
 from crystal_diffusion.models.mtp import MTPWithMLIP3
 
-
 class MockStructure:
     """Mock a pymatgen structure"""
     def __init__(self, species):
         self.species = species
 
+
 def passthrough(*args, **kwargs):
-    """Function to return arguments as passed.
+    """Return arguments as passed.
 
-       Useful for mocking a function to return its input arguments directly.
+    Useful for mocking a function to return its input arguments directly.
     """
     return args if len(kwargs) == 0 else (args, kwargs)
 
@@ -103,11 +106,6 @@ def test_evaluate(mocker, mock_structure, mtp_instance, mock_subprocess):
     mock_popen.return_value.__enter__.return_value.communicate.return_value = (b'', b'')  # stdout, stderr
     mock_popen.return_value.__enter__.return_value.returncode = mock_subprocess
 
-    # process_mock = mocker.Mock()
-    # attrs = {'communicate.return_value': (b'mock_stdout', b'mock_stderr'), 'returncode': 0}
-    # process_mock.configure_mock(**attrs)
-    # mocker.patch('subprocess.Popen', return_value=process_mock)
-
     # Mock read_cfgs to simulate reading of configurations without accessing the file system
     mocker.patch.object(MTPWithMLIP3, "read_cfgs", return_value="mock_dataframe")
 
@@ -121,4 +119,18 @@ def test_evaluate(mocker, mock_structure, mtp_instance, mock_subprocess):
 
     # Assertions can vary based on the real output of `read_cfgs`
     # Here's an example assertion assuming `read_cfgs` returns a string in this mocked scenario
-    assert df_orig == "mock_dataframe" and df_predict == "mock_dataframe", "Evaluate method should return mock dataframes"
\ No newline at end of file
+    assert df_orig == "mock_dataframe" and df_predict == "mock_dataframe", "Evaluate method should return mock" + \
+                                                                           "dataframes"
+
+
+def test_read_cfgs(mtp_instance):
+    cfg_path = Path(__file__).parent.joinpath("mtp_cfg_examples.txt")
+    df = mtp_instance.read_cfgs(cfg_path, True)
+    print(df.keys())
+    assert np.array_equal(df['x'], [0.1, 0.2, 0.3])
+    assert np.array_equal(df['y'], [1.1, 1.2, 1.3])
+    assert np.array_equal(df['z'], [2.1, 2.2, 2.3])
+    assert np.array_equal(df['fx'], [3.1, 3.2, 3.3])
+    assert np.array_equal(df['fy'], [4.1, 4.2, 4.3])
+    assert np.array_equal(df['fz'], [5.1, 5.2, 5.3])
+    assert np.array_equal(df['nbh_grades'], [6.1, 6.2, 6.3])

From 66a30b47147ee0c01fadbffaa41ba36856b93d61 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 8 Mar 2024 09:36:43 -0500
Subject: [PATCH 12/29] update the readme mtp file

---
 README_mtp.md | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/README_mtp.md b/README_mtp.md
index c36d6260..161b733f 100644
--- a/README_mtp.md
+++ b/README_mtp.md
@@ -32,6 +32,8 @@ export HOMEBREW_CXX=g++-13  # brew g++
 brew reinstall openmpi --build-from-source
 ```
 
+## Build instructions from the package
+
 Then, you can install MLIP
 
 ```
@@ -46,6 +48,8 @@ cd mlip-3
 make mlip
 ```
 
+## Corrected build instructions
+
 But this doesn't work. Instead, we have to use cmake:
 
 ```
@@ -81,21 +85,30 @@ cp -r mlip3_missing_files/cmake mlip-3/
 cp mlip3_missing_files/mlp_commands.cpp mlip-3/src/mlp/mlp_commands.cpp
 ```
 A .sh file is provided in the missing_data folder for this.
+For the last file, we commented the tests as they were not working.
+
+The CMakeLists files are based on those present in MLIP-2, but modified for the content of MLIP-3.
 
+We can now compile.
 
-then we can cmake. For the last file, we commented the tests as they were not working. 
 On mac, we use openblas:
 
 ```
 cmake .. -DBLAS_ROOT=/usr/local/opt/openblas/
 make
 ```
+This takes a few minutes and raises a lot of warnings. If no error message appears, then MLIP has compiled!
 
-If the installation is successful, the command is 
+You can try running the tests in *mlip-3/test/examples/* and compare to the results in the respective *sample_out* folder.
+This can be done automatically with the following command:
 ```
-mlip-3/build/mlp
+make tests
 ```
 
-You can try running the tests in *mlip-3/test/examples/* and compare to the results in the respective *sample_out* folder.
+## Running MLIP
 
-The *train_mtp.py* script 
\ No newline at end of file
+If the installation is successful, the command is 
+```
+mlip-3/build/mlp
+```
+This is called by the *train_mtp.py* script.

From 90e23490009eaa0b04e1483d4e158e61729199a1 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 8 Mar 2024 09:48:21 -0500
Subject: [PATCH 13/29] missing blank line

---
 tests/models/test_mtp.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index cb9b08d1..cbf5b2bc 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -6,6 +6,7 @@
 
 from crystal_diffusion.models.mtp import MTPWithMLIP3
 
+
 class MockStructure:
     """Mock a pymatgen structure"""
     def __init__(self, species):

From 59002f1499bd9708224ffa853135118d7e4238f1 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 8 Mar 2024 09:50:45 -0500
Subject: [PATCH 14/29] line length 121 - obviously

---
 crystal_diffusion/models/mtp.py | 9 ++++++---
 tests/models/test_mtp.py        | 1 +
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 4e12aedc..725f2413 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -157,6 +157,7 @@ def formatify(string: str) -> List[float]:
             # virial_stress = [virial_stress[self.mtp_stress_order.index(n)] for n in self.vasp_stress_order]
             d["outputs"]["energy"] = energy
             d["num_atoms"] = size
+            d["outputs"]["position"] = position[:, 2:5].tolist()
             d["outputs"]["forces"] = forces
             d["outputs"]["species"] = species
             # d["outputs"]["virial_stress"] = virial_stress
@@ -180,15 +181,17 @@ def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
             docs: list of docs generated by mlip-3. Each doc should be structured as a dict.
 
         Returns:
-            A DataFrame with energy, force, and nbh grades (MaxVol factor) per atom and per structure. Energy is repeated
-            for each atom and corresponds to the total energy predicted.
+            A DataFrame with energy, force, and nbh grades (MaxVol factor) per atom and per structure. Energy is
+            repeated for each atom and corresponds to the total energy predicted.
         """
         df = defaultdict(list)
         for s_idx, d in enumerate(docs):
             outputs = d["outputs"]
+            pos_arr = np.array(outputs["position"])
             force_arr = np.array(outputs["forces"])
             n_atom = force_arr.shape[0]
             for i, x in enumerate(['x', 'y', 'z']):
+                df[x] += pos_arr[:, i].tolist()
                 df[f'f{x}'] += force_arr[:, i].tolist()
             df['energy'] += [outputs['energy']] * n_atom  # copy the value to all atoms
             if "nbh_grades" in outputs.keys():
@@ -307,5 +310,5 @@ def train(
                 raise RuntimeError(error_msg)
             # copy the fitted mtp outside the working directory
             self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp)
-            shutil.copy(save_fitted_mtp, self.fitted_mtp)
+            shutil.copyfile(save_fitted_mtp, self.fitted_mtp)
         return rc
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index cbf5b2bc..cb7e14f5 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -72,6 +72,7 @@ def mock_structure():
     # Replace with appropriate structure setup for your use case
     return Structure(lattice=[1, 0, 0, 0, 1, 0, 0, 0, 1], species=[""], coords=[[0, 0, 0]])
 
+
 @pytest.fixture
 def mtp_instance(mocker):
     # Mock __init__ to not execute its original behavior

From e911272a95ee00e884468ba74f5d9f4dd247ce9c Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 8 Mar 2024 09:57:48 -0500
Subject: [PATCH 15/29] extra whiteline

---
 crystal_diffusion/train_mtp.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
index 9a880fde..c84daec3 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/train_mtp.py
@@ -13,7 +13,6 @@
 
 from crystal_diffusion.models.mtp import MTPWithMLIP3
 
-
 MLIP_PATH = os.path.join(os.getcwd(), "mlip-3")
 SAVE_DIR = os.path.join(os.getcwd(), "debug_mlip3")  # for demo only
 

From b80bc34ce08342fbc2180c013f076125f387d2cd Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 8 Mar 2024 10:03:35 -0500
Subject: [PATCH 16/29] adding pytest-mock to requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index ec3bc073..1511b50e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ pymatgen==2024.2.23
 pyyaml==6.0
 pytest==7.1.2
 pytest-cov==3.0.0
+pytest-mock==3.12.0
 pytorch_lightning>=2.2.0
 pytype==2024.2.13
 sphinx==7.2.6

From 35c555169cae7a552c60d2ce74d5b100d1e6116e Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 8 Mar 2024 10:25:55 -0500
Subject: [PATCH 17/29] typing fixes

---
 crystal_diffusion/models/mtp.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 725f2413..001c199c 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -207,8 +207,8 @@ def train(
             self,
             train_structures: List[Structure],
             train_energies: List[float],
-            train_forces: List[np.array],
-            train_stresses: List[List[float]],
+            train_forces: List[List[float]],
+            train_stresses: Optional[List[List[float]]] = None,
             unfitted_mtp: str = "08.almtp",
             fitted_mtp_savedir: str = '../',
             max_dist: float = 5,
@@ -220,7 +220,7 @@ def train(
             init_params: str = "same",
             scale_by_force: float = 0,
             bfgs_conv_tol: float = 1e-3,
-            weighting: float = "vibration",
+            weighting: str = "vibration",
     ) -> int:
         """Training data with moment tensor method using MLIP-3.
 
@@ -231,7 +231,8 @@ def train(
             train_energies: List of total energies of each structure in structures list.
             train_forces: List of (m, 3) forces array of each structure with m atoms in structures list.
                 m can be varied with each single structure case.
-            train_stresses: List of (6, ) virial stresses of each structure in structures list.
+            train_stresses (optional): List of (6, ) virial stresses of each structure in structures list.
+                Defaults to None.
             unfitted_mtp (optional): Define the initial mtp file. Default to 08g.amltp
             fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd)
             max_dist (optional): The actual radial cutoff. Defaults to 5.

From 4fe53b547ca32ab0eb08df3126f5cd025ee6625c Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 09:00:03 -0400
Subject: [PATCH 18/29] first batch of code review

---
 crystal_diffusion/models/mtp.py      |  2 +-
 mlip3_missing_files/mlp_commands.cpp |  3 ++
 tests/models/test_mtp.py             | 43 ++++++++++++----------------
 3 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 001c199c..49ac2f9c 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -262,7 +262,7 @@ def train(
 
         atoms_filename = "train.cfgs"
 
-        with ScratchDir("."):  # create a tmpdir - deleted afterew
+        with ScratchDir("."):  # create a tmpdir - deleted afterwards
             atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool)
 
             if not unfitted_mtp:
diff --git a/mlip3_missing_files/mlp_commands.cpp b/mlip3_missing_files/mlp_commands.cpp
index ab63daf4..45a7e5d8 100644
--- a/mlip3_missing_files/mlp_commands.cpp
+++ b/mlip3_missing_files/mlp_commands.cpp
@@ -1,3 +1,6 @@
+/* Code changed to allow compilation. RunAllTests fails compilation, so we commented them out.
+ */
+
 /*   MLIP is a software for Machine Learning Interatomic Potentials
  *   MLIP is released under the "New BSD License", see the LICENSE file.
  *   Contributors: Alexander Shapeev, Evgeny Podryabinkin, Ivan Novikov
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index cb7e14f5..f4b96f92 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -7,7 +7,7 @@
 from crystal_diffusion.models.mtp import MTPWithMLIP3
 
 
-class MockStructure:
+class FakeStructure:
     """Mock a pymatgen structure"""
     def __init__(self, species):
         self.species = species
@@ -21,15 +21,20 @@ def passthrough(*args, **kwargs):
     return args if len(kwargs) == 0 else (args, kwargs)
 
 
+@pytest.fixture
+def mock_popen(mocker):
+    # mock subprocess calling mlp
+    mock_popen = mocker.patch("subprocess.Popen")
+    mock_popen.return_value.__enter__.return_value.communicate.return_value = (b'', b'')  # stdout, stderr
+    mock_popen.return_value.__enter__.return_value.returncode = 0
+    return mock_popen
+
+
 # Mock the external dependencies and method calls within the MTPWithMLIP3.train method
-@pytest.mark.parametrize("mock_subprocess", [0])  # Here, 0 simulates a successful subprocess return code
-def test_train_method(mocker, mock_subprocess):
+def test_train(mocker, mock_popen):
     # Mock os.path.exists to always return True
     mocker.patch("os.path.exists", return_value=True)
 
-    # Mock the ScratchDir context manager to do nothing
-    mocker.patch("crystal_diffusion.models.mtp.ScratchDir", mocker.MagicMock())
-
     # Mock check_structures_forces_stresses to return a value without needing real input
     mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", side_effect=passthrough)
 
@@ -39,20 +44,14 @@ def test_train_method(mocker, mock_subprocess):
     # Mock self.write_cfg to simulate creating a config file without file operations
     mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
 
-    # mocker.patch("crystal_diffusion.models.mtp.itertools.chain", return_value=[1, 2, 3])
     mocker.patch("shutil.copyfile", return_value=None)
 
-    # Mock subprocess.Popen to simulate an external call to `mlp` command
-    mock_popen = mocker.patch("subprocess.Popen")
-    mock_popen.return_value.__enter__.return_value.communicate.return_value = (b'', b'')  # stdout, stderr
-    mock_popen.return_value.__enter__.return_value.returncode = mock_subprocess
-
     # Initialize MTPWithMLIP3 with mock parameters
     model = MTPWithMLIP3(mlip_path="/mock/path", name="test_model")
     # Call the train method
 
     return_code = model.train(
-        train_structures=[MockStructure(['H', 'O']), MockStructure(['Si'])],
+        train_structures=[FakeStructure(['H', 'O']), FakeStructure(['Si'])],
         train_energies=[1, 2],
         train_forces=[],
         train_stresses=[],
@@ -60,14 +59,14 @@ def test_train_method(mocker, mock_subprocess):
         fitted_mtp_savedir="/mock/dir"
     )
     # Assert the expected results
-    assert return_code == mock_subprocess, "The train method should return the mocked subprocess return code."
+    assert return_code == 0  # The train method should return the mocked subprocess success return code from mock_open
 
-    # Optionally, assert that mocked methods were called the expected number of times
+    # Assert that mocked methods were called
     model.write_cfg.assert_called()
 
 
 @pytest.fixture
-def mock_structure():
+def fake_structure():
     # Setup a simple mock structure object
     # Replace with appropriate structure setup for your use case
     return Structure(lattice=[1, 0, 0, 0, 1, 0, 0, 0, 1], species=[""], coords=[[0, 0, 0]])
@@ -85,9 +84,9 @@ def mtp_instance(mocker):
     return instance
 
 
-@pytest.mark.parametrize("mock_subprocess", [0])  # Here, 0 simulates a successful subprocess return code
-def test_evaluate(mocker, mock_structure, mtp_instance, mock_subprocess):
-    test_structures = [mock_structure]
+# def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
+def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen):
+    test_structures = [fake_structure]
     test_energies = [1.0]
     test_forces = [[[0, 0, 0]]]
     test_stresses = None  # or appropriate mock stresses
@@ -102,12 +101,6 @@ def test_evaluate(mocker, mock_structure, mtp_instance, mock_subprocess):
     # Mock self.write_cfg to simulate creating a config file without file operations
     mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg")
 
-    # Mock subprocess.Popen for evaluate method's call
-    # Mock subprocess.Popen to simulate an external call to `mlp` command
-    mock_popen = mocker.patch("subprocess.Popen")
-    mock_popen.return_value.__enter__.return_value.communicate.return_value = (b'', b'')  # stdout, stderr
-    mock_popen.return_value.__enter__.return_value.returncode = mock_subprocess
-
     # Mock read_cfgs to simulate reading of configurations without accessing the file system
     mocker.patch.object(MTPWithMLIP3, "read_cfgs", return_value="mock_dataframe")
 

From 0b6e90a888b5b501ce396ccdd216adf281fcb9ce Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 10:34:44 -0400
Subject: [PATCH 19/29] second batch of code review

---
 crystal_diffusion/models/mtp.py               |  51 +++-
 examples/local/mtp_example/dump.si-300-1.yaml | 220 ++++++++++++++++++
 examples/local/mtp_example/thermo_log.yaml    |  15 ++
 3 files changed, 275 insertions(+), 11 deletions(-)
 create mode 100644 examples/local/mtp_example/dump.si-300-1.yaml
 create mode 100644 examples/local/mtp_example/thermo_log.yaml

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 49ac2f9c..104c00d3 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -4,6 +4,7 @@
 However, it cannot be called as a standard lightning module as it relies on the MLIP-3 library for the model
 implementation.
 """
+import _io
 import itertools
 import os
 import re
@@ -94,9 +95,7 @@ def evaluate(self,
             # calculate_grade is the method to get the forces, energy & maxvol values
             cmd = [self.mlp_command, "calculate_grade", self.fitted_mtp, original_file, predict_file]
             predict_file += '.0'  # added by mlp...
-            with subprocess.Popen(cmd, stdout=subprocess.PIPE) as p:  # run mlp
-                stdout = p.communicate()[0]
-                rc = p.returncode
+            stdout, rc = self._call_mlip(cmd)
             if rc != 0:
                 error_msg = f"mlp exited with return code {rc}"
                 msg = stdout.decode("utf-8").split("\n")[:-1]
@@ -115,7 +114,8 @@ def read_cfgs(self, filename: str, nbh_grade: bool = False) -> pd.DataFrame:
 
         Args:
             filename: name of mlp output file to be parsed.
-            nbh_grade (optional): if True, add the nbh_grades in the resulting dataframe. Defaults to False.
+            nbh_grade (optional): if True, add the nbh_grades (neighborhood-based approach to determine the MaxVol gamma
+                values - see MLIP3 paper) in the resulting dataframe. Defaults to False.
 
         Returns:
             dataframe with energies, forces, optional nbh grades (MaxVol gamma)
@@ -186,16 +186,19 @@ def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
         """
         df = defaultdict(list)
         for s_idx, d in enumerate(docs):
+            n_atom = d['num_atoms']
             outputs = d["outputs"]
             pos_arr = np.array(outputs["position"])
+            assert n_atom == pos_arr.shape[0], "Number of positions do not match number of atoms"
             force_arr = np.array(outputs["forces"])
-            n_atom = force_arr.shape[0]
+            assert n_atom == force_arr.shape[0], "Number of forces do not match number of atoms"
             for i, x in enumerate(['x', 'y', 'z']):
                 df[x] += pos_arr[:, i].tolist()
                 df[f'f{x}'] += force_arr[:, i].tolist()
             df['energy'] += [outputs['energy']] * n_atom  # copy the value to all atoms
             if "nbh_grades" in outputs.keys():
                 nbh_grades = outputs["nbh_grades"]
+                assert n_atom == len(nbh_grades), "Number of gamma values do not match number of atoms"
                 df['nbh_grades'] += nbh_grades
             df['atom_index'] += list(range(n_atom))
             df['structure_index'] += [s_idx] * n_atom
@@ -203,6 +206,34 @@ def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
         df = pd.DataFrame(df)
         return df
 
+
+    @staticmethod
+    def _call_mlip(cmd_list: List[str]) -> Tuple[str, int]:
+        """Call MLIP library with subprocess.
+
+        Args:
+            cmd_list: list of commands & arguments to execute
+
+        Returns:
+            stdout: output of the executed commands
+            rc: return code of the executed commands
+        """
+        with subprocess.Popen(cmd_list, stdout=subprocess.PIPE) as p:
+            stdout = p.communicate()[0]
+            rc = p.returncode
+        return stdout, rc
+
+    @staticmethod
+    def _call_cmd_to_stdout(cmd: List[str], output_file: _io.TextIOWrapper):
+        """Call commands with subprocess.POpen and pipe output to a file
+
+        Args:
+            cmd: list of commands & arguments to run
+            output_file: name of the file where the stdout is redirected
+        """
+        with subprocess.Popen(cmd, stdout=output_file) as p:
+            p.communicate()[0]
+
     def train(
             self,
             train_structures: List[Structure],
@@ -262,7 +293,7 @@ def train(
 
         atoms_filename = "train.cfgs"
 
-        with ScratchDir("."):  # create a tmpdir - deleted afterwards
+        with (ScratchDir(".")):  # create a tmpdir - deleted afterwards
             atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool)
 
             if not unfitted_mtp:
@@ -270,8 +301,8 @@ def train(
             mtp_file_path = os.path.join(self.mlp_templates, unfitted_mtp)
             shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp))
             commands = [self.mlp_command, "mindist", atoms_filename]
-            with open("min_dist", "w") as f, subprocess.Popen(commands, stdout=f) as p:
-                p.communicate()[0]
+            with open("min_dist", "w") as f:
+                self._call_cmd_to_stdout(commands, f)
 
             # TODO check what min_dist is used for in maml
             # with open("min_dist") as f:
@@ -297,9 +328,7 @@ def train(
                 # f"--bfgs-conv-tol={bfgs_conv_tol}",
                 # f"--weighting={weighting}",
             ]
-            with subprocess.Popen(cmds_list, stdout=subprocess.PIPE) as p:
-                stdout = p.communicate()[0]
-                rc = p.returncode
+            stdout, rc = self._call_mlip(cmds_list)
             if rc != 0:
                 error_msg = f"MLP exited with return code {rc}"
                 msg = stdout.decode("utf-8").split("\n")[:-1]
diff --git a/examples/local/mtp_example/dump.si-300-1.yaml b/examples/local/mtp_example/dump.si-300-1.yaml
new file mode 100644
index 00000000..55a41e75
--- /dev/null
+++ b/examples/local/mtp_example/dump.si-300-1.yaml
@@ -0,0 +1,220 @@
+---
+creator: LAMMPS
+timestep: 0
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0 , 0 , 0 , 0 , 8.67362e-19 , 0, ]
+  - [ 2 , 1 , 0 , 2.715 , 2.715 , 0 , 0 , 0, ]
+  - [ 3 , 1 , 2.715 , 0 , 2.715 , -3.16605e-25 , 0 , 3.16605e-25, ]
+  - [ 4 , 1 , 2.715 , 2.715 , 0 , 0 , 8.67362e-19 , 0, ]
+  - [ 5 , 1 , 1.3575 , 1.3575 , 1.3575 , 8.67361e-19 , 0 , 0, ]
+  - [ 6 , 1 , 1.3575 , 4.0725 , 4.0725 , 4.33681e-19 , 4.3368e-19 , 4.33681e-19, ]
+  - [ 7 , 1 , 4.0725 , 1.3575 , 4.0725 , 4.33681e-19 , 4.33681e-19 , 4.33681e-19, ]
+  - [ 8 , 1 , 4.0725 , 4.0725 , 1.3575 , 4.33681e-19 , 4.33681e-19 , 4.33681e-19, ]
+...
+---
+creator: LAMMPS
+timestep: 1
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.000261578 , 0.00145878 , -0.00141573 , 0.00973571 , -0.0309875 , 0.0337626, ]
+  - [ 2 , 1 , 0.000286538 , 2.71877 , 2.71314 , 0.00944413 , -0.0947069 , -0.00717711, ]
+  - [ 3 , 1 , 2.71153 , 0.00284738 , 2.71681 , 0.0078028 , -0.0841485 , -0.00622089, ]
+  - [ 4 , 1 , 2.71444 , 2.71061 , 0.00313226 , 0.100613 , 0.0759878 , -0.0820386, ]
+  - [ 5 , 1 , 1.36199 , 1.35863 , 1.3543 , -0.122359 , -0.0151081 , 0.071805, ]
+  - [ 6 , 1 , 1.35475 , 4.06856 , 4.07467 , 0.0580653 , 0.0503925 , 0.00158432, ]
+  - [ 7 , 1 , 4.07417 , 1.3535 , 4.06875 , -0.0197861 , 0.101362 , 0.0435984, ]
+  - [ 8 , 1 , 4.07257 , 4.07562 , 1.36061 , -0.0435151 , -0.0027912 , -0.0553137, ]
+...
+---
+creator: LAMMPS
+timestep: 2
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.000526508 , 0.00290696 , -0.00281991 , 0.0193296 , -0.0616508 , 0.0673476, ]
+  - [ 2 , 1 , 0.00057633 , 2.7225 , 2.71128 , 0.0184908 , -0.188194 , -0.0139953, ]
+  - [ 3 , 1 , 2.70806 , 0.00566594 , 2.71862 , 0.0154165 , -0.167209 , -0.012478, ]
+  - [ 4 , 1 , 2.71392 , 2.70625 , 0.00623643 , 0.201037 , 0.153072 , -0.164629, ]
+  - [ 5 , 1 , 1.36643 , 1.35975 , 1.35113 , -0.244605 , -0.0320252 , 0.143804, ]
+  - [ 6 , 1 , 1.35202 , 4.06463 , 4.07684 , 0.115375 , 0.10016 , 0.0035132, ]
+  - [ 7 , 1 , 4.07583 , 1.34954 , 4.06501 , -0.0386702 , 0.20142 , 0.0864668, ]
+  - [ 8 , 1 , 4.07263 , 4.07875 , 1.36371 , -0.0863736 , -0.0055738 , -0.110029, ]
+...
+---
+creator: LAMMPS
+timestep: 3
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.000798128 , 0.00433421 , -0.0042012 , 0.0285237 , -0.0916794 , 0.100457, ]
+  - [ 2 , 1 , 0.000872527 , 2.72617 , 2.70941 , 0.0269245 , -0.279457 , -0.0202524, ]
+  - [ 3 , 1 , 2.7046 , 0.00842757 , 2.72043 , 0.0228881 , -0.248335 , -0.0187064, ]
+  - [ 4 , 1 , 2.71346 , 2.70195 , 0.00928461 , 0.300135 , 0.230667 , -0.246927, ]
+  - [ 5 , 1 , 1.3708 , 1.36086 , 1.348 , -0.36548 , -0.0506129 , 0.215279, ]
+  - [ 6 , 1 , 1.34933 , 4.06074 , 4.07901 , 0.171493 , 0.148824 , 0.00561127, ]
+  - [ 7 , 1 , 4.07748 , 1.34565 , 4.06129 , -0.0564696 , 0.299179 , 0.128284, ]
+  - [ 8 , 1 , 4.07267 , 4.08187 , 1.36677 , -0.128014 , -0.00858581 , -0.163744, ]
+...
+---
+creator: LAMMPS
+timestep: 4
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.0010797 , 0.00573079 , -0.00554876 , 0.0370629 , -0.120778 , 0.132798, ]
+  - [ 2 , 1 , 0.00117815 , 2.72975 , 2.70754 , 0.0345505 , -0.367544 , -0.0257618, ]
+  - [ 3 , 1 , 2.70114 , 0.0111054 , 2.72223 , 0.0302774 , -0.326725 , -0.0248355, ]
+  - [ 4 , 1 , 2.71311 , 2.69772 , 0.0122497 , 0.396769 , 0.308134 , -0.328061, ]
+  - [ 5 , 1 , 1.37504 , 1.36196 , 1.34495 , -0.483729 , -0.0706455 , 0.285495, ]
+  - [ 6 , 1 , 1.34669 , 4.05689 , 4.08118 , 0.226006 , 0.195933 , 0.00768446, ]
+  - [ 7 , 1 , 4.07911 , 1.34186 , 4.05763 , -0.0730398 , 0.393694 , 0.168758, ]
+  - [ 8 , 1 , 4.07265 , 4.08499 , 1.36977 , -0.167897 , -0.0120688 , -0.216077, ]
+...
+---
+creator: LAMMPS
+timestep: 5
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.0013744 , 0.00708769 , -0.00685246 , 0.0446998 , -0.148674 , 0.164093, ]
+  - [ 2 , 1 , 0.00149605 , 2.73321 , 2.70565 , 0.0411962 , -0.451566 , -0.0303529, ]
+  - [ 3 , 1 , 2.69769 , 0.0136745 , 2.72403 , 0.0376582 , -0.40164 , -0.0307935, ]
+  - [ 4 , 1 , 2.7129 , 2.69358 , 0.0151059 , 0.489823 , 0.384791 , -0.407145, ]
+  - [ 5 , 1 , 1.37911 , 1.36303 , 1.34199 , -0.598122 , -0.0918186 , 0.353719, ]
+  - [ 6 , 1 , 1.34413 , 4.05311 , 4.08336 , 0.278535 , 0.241076 , 0.00952454, ]
+  - [ 7 , 1 , 4.08071 , 1.3382 , 4.05401 , -0.0882757 , 0.484095 , 0.207636, ]
+  - [ 8 , 1 , 4.07258 , 4.08811 , 1.37271 , -0.205516 , -0.016264 , -0.266681, ]
+...
+---
+creator: LAMMPS
+timestep: 6
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.00168521 , 0.00839691 , -0.00810303 , 0.0511995 , -0.175121 , 0.19408, ]
+  - [ 2 , 1 , 0.00182893 , 2.73651 , 2.70375 , 0.0467106 , -0.530715 , -0.0338702, ]
+  - [ 3 , 1 , 2.69425 , 0.016112 , 2.72582 , 0.045119 , -0.472412 , -0.0365098, ]
+  - [ 4 , 1 , 2.71285 , 2.68958 , 0.0178293 , 0.578222 , 0.459939 , -0.483301, ]
+  - [ 5 , 1 , 1.383 , 1.36407 , 1.33915 , -0.707477 , -0.113761 , 0.419225, ]
+  - [ 6 , 1 , 1.34166 , 4.0494 , 4.08555 , 0.328738 , 0.283887 , 0.0109139, ]
+  - [ 7 , 1 , 4.08229 , 1.33469 , 4.05046 , -0.10211 , 0.569591 , 0.244705, ]
+  - [ 8 , 1 , 4.07244 , 4.09124 , 1.37556 , -0.240402 , -0.0214093 , -0.315243, ]
+...
+---
+creator: LAMMPS
+timestep: 7
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.00201502 , 0.00965157 , -0.00929226 , 0.0563421 , -0.199902 , 0.222522, ]
+  - [ 2 , 1 , 0.00217935 , 2.73965 , 2.70183 , 0.0509632 , -0.604272 , -0.0361712, ]
+  - [ 3 , 1 , 2.6908 , 0.0183975 , 2.7276 , 0.0527626 , -0.538455 , -0.041919, ]
+  - [ 4 , 1 , 2.713 , 2.68571 , 0.0203983 , 0.66094 , 0.532875 , -0.555675, ]
+  - [ 5 , 1 , 1.38665 , 1.36508 , 1.33643 , -0.810678 , -0.136048 , 0.481313, ]
+  - [ 6 , 1 , 1.33929 , 4.04577 , 4.08775 , 0.376313 , 0.324051 , 0.0116303, ]
+  - [ 7 , 1 , 4.08385 , 1.33137 , 4.04697 , -0.114511 , 0.649488 , 0.279794, ]
+  - [ 8 , 1 , 4.07221 , 4.09437 , 1.37831 , -0.272132 , -0.0277369 , -0.361494, ]
+...
+---
+creator: LAMMPS
+timestep: 8
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.00236654 , 0.010846 , -0.010413 , 0.0599248 , -0.222835 , 0.249205, ]
+  - [ 2 , 1 , 0.00254978 , 2.74261 , 2.69988 , 0.0538418 , -0.671608 , -0.037124, ]
+  - [ 3 , 1 , 2.68735 , 0.0205133 , 2.72938 , 0.0607067 , -0.599267 , -0.0469634, ]
+  - [ 4 , 1 , 2.71338 , 2.682 , 0.0227936 , 0.737014 , 0.602913 , -0.62345, ]
+  - [ 5 , 1 , 1.39006 , 1.36605 , 1.33387 , -0.906686 , -0.158223 , 0.539316, ]
+  - [ 6 , 1 , 1.33704 , 4.04223 , 4.08997 , 0.421007 , 0.361305 , 0.0114504, ]
+  - [ 7 , 1 , 4.08537 , 1.32825 , 4.04356 , -0.125479 , 0.723187 , 0.312775, ]
+  - [ 8 , 1 , 4.07189 , 4.09751 , 1.38096 , -0.300329 , -0.0354728 , -0.405209, ]
+...
+---
+creator: LAMMPS
+timestep: 9
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.0027424 , 0.0119759 , -0.0114594 , 0.0617614 , -0.24377 , 0.273941, ]
+  - [ 2 , 1 , 0.00294264 , 2.74536 , 2.6979 , 0.0552479 , -0.732186 , -0.0366024, ]
+  - [ 3 , 1 , 2.68388 , 0.0224441 , 2.73117 , 0.0690844 , -0.654427 , -0.0515962, ]
+  - [ 4 , 1 , 2.71402 , 2.67845 , 0.0249984 , 0.805554 , 0.669406 , -0.685861, ]
+  - [ 5 , 1 , 1.39318 , 1.36697 , 1.33146 , -0.994553 , -0.17981 , 0.592611, ]
+  - [ 6 , 1 , 1.3349 , 4.03878 , 4.09222 , 0.462606 , 0.395442 , 0.0101527, ]
+  - [ 7 , 1 , 4.08686 , 1.32534 , 4.04022 , -0.135042 , 0.790183 , 0.343562, ]
+  - [ 8 , 1 , 4.07146 , 4.10067 , 1.38349 , -0.324658 , -0.0448379 , -0.446207, ]
+...
+---
+creator: LAMMPS
+timestep: 10
+natoms: 8
+boundary: [ p, p, p, p, p, p, ]
+box:
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+  - [ 0, 5.43 ]
+keywords: [ id, type, x, y, z, fx, fy, fz,  ]
+data:
+  - [ 1 , 1 , 0.00314522 , 0.0130383 , -0.0124265 , 0.0616811 , -0.262587 , 0.29657, ]
+  - [ 2 , 1 , 0.00336046 , 2.7479 , 2.69588 , 0.0550928 , -0.785551 , -0.0344815, ]
+  - [ 3 , 1 , 2.68039 , 0.024177 , 2.73296 , 0.0780454 , -0.70359 , -0.0557829, ]
+  - [ 4 , 1 , 2.71495 , 2.67509 , 0.0269986 , 0.865734 , 0.731754 , -0.742201, ]
+  - [ 5 , 1 , 1.39601 , 1.36785 , 1.32922 , -1.07342 , -0.200336 , 0.640623, ]
+  - [ 6 , 1 , 1.3329 , 4.03541 , 4.0945 , 0.500945 , 0.426302 , 0.00751765, ]
+  - [ 7 , 1 , 4.08834 , 1.32267 , 4.03695 , -0.143248 , 0.85006 , 0.372109, ]
+  - [ 8 , 1 , 4.07092 , 4.10386 , 1.38591 , -0.344829 , -0.056052 , -0.484354, ]
+...
diff --git a/examples/local/mtp_example/thermo_log.yaml b/examples/local/mtp_example/thermo_log.yaml
new file mode 100644
index 00000000..feb4497f
--- /dev/null
+++ b/examples/local/mtp_example/thermo_log.yaml
@@ -0,0 +1,15 @@
+---
+keywords: ['Step', 'Temp', 'KinEng', 'PotEng', 'E_bond', 'E_angle', 'E_dihed', 'E_impro', 'E_vdwl', 'E_coul', 'E_long', 'Press', ]
+data:
+  - [0, 300, 0.2714463045, -34.6927860387157, 0, 0, 0, 0, -34.6927860387157, 0, 0, 2343.50586002891, ]
+  - [1, 298.189192725463, 0.269807848023884, -34.691140928288, 0, 0, 0, 0, -34.691140928288, 0, 0, 2346.19351433979, ]
+  - [2, 292.849234893736, 0.264976141958524, -34.6862550971077, 0, 0, 0, 0, -34.6862550971077, 0, 0, 2354.29914839224, ]
+  - [3, 284.257531905119, 0.257202188539785, -34.6782877748402, 0, 0, 0, 0, -34.6782877748402, 0, 0, 2367.86508750942, ]
+  - [4, 272.862158239342, 0.246891414973212, -34.6675042945152, 0, 0, 0, 0, -34.6675042945152, 0, 0, 2386.88523776537, ]
+  - [5, 259.248925478159, 0.234573875888807, -34.6542622858419, 0, 0, 0, 0, -34.6542622858419, 0, 0, 2411.23505368503, ]
+  - [6, 244.102558151161, 0.22086912443043, -34.6389932119674, 0, 0, 0, 0, -34.6389932119674, 0, 0, 2440.60961539168, ]
+  - [7, 228.168192263628, 0.206451375314691, -34.6221807744643, 0, 0, 0, 0, -34.6221807744643, 0, 0, 2474.49206612378, ]
+  - [8, 212.218632643712, 0.192019878590596, -34.6043377888619, 0, 0, 0, 0, -34.6043377888619, 0, 0, 2512.16855003701, ]
+  - [9, 197.031060870793, 0.178277844483638, -34.5859830675481, 0, 0, 0, 0, -34.5859830675481, 0, 0, 2552.79500740595, ]
+  - [10, 183.374480382415, 0.165921083464715, -34.5676197655996, 0, 0, 0, 0, -34.5676197655996, 0, 0, 2595.50653276987, ]
+...

From c13472ca779895b45dd3f697c4295ff197bf4123 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 11:53:48 -0400
Subject: [PATCH 20/29] adding new unit tests for train_mtp to refactor later

---
 tests/models/test_mtp.py | 109 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index f4b96f92..1c91da04 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -1,9 +1,12 @@
+import os
 from pathlib import Path
 
 import numpy as np
 import pytest
+import yaml
 from pymatgen.core import Structure
 
+from crystal_diffusion.train_mtp import extract_structure_and_forces_from_file, extract_energy_from_thermo_log, prepare_mtp_inputs_from_lammps
 from crystal_diffusion.models.mtp import MTPWithMLIP3
 
 
@@ -129,3 +132,109 @@ def test_read_cfgs(mtp_instance):
     assert np.array_equal(df['fy'], [4.1, 4.2, 4.3])
     assert np.array_equal(df['fz'], [5.1, 5.2, 5.3])
     assert np.array_equal(df['nbh_grades'], [6.1, 6.2, 6.3])
+
+
+def test_extract_structure_and_forces_from_file(tmpdir):
+    # test the function reading the lammps files.
+    # TODO refactor to move to data/
+    # Create a mock LAMMPS output
+    yaml_content = {
+        'box': [[0, 10], [0, 10], [0, 10]],  # x_lim, y_lim, z_lim
+        'keywords': ['x', 'y', 'z', 'type', 'fx', 'fy', 'fz'],
+        'data': [[1, 1, 1, 1, 0.1, 0.2, 0.3], [2, 2, 2, 2, 0.4, 0.5, 0.6]]
+    }
+    yaml_file = os.path.join(tmpdir, "lammps.yaml")
+    with open(yaml_file, "w") as f:
+        yaml.dump(yaml_content, f, sort_keys=False)
+
+    # Mock atom dict that the function expects
+    atom_dict = {1: 'H', 2: 'He'}
+
+    # Call the function
+    structures, forces = extract_structure_and_forces_from_file(yaml_file, atom_dict)
+
+    # Verify structures
+    assert isinstance(structures, list)
+    assert len(structures) == 1
+    assert all(isinstance(structure, Structure) for structure in structures)
+
+    # Verify the lattice was set up correctly, assuming a simple cubic lattice
+    assert np.allclose(structures[0].lattice.matrix, np.diag([10, 10, 10]))
+
+    # Verify species and positions
+    species = structures[0].species
+    assert [str(s) for s in species] == ['H', 'He']
+    np.testing.assert_array_almost_equal(structures[0].frac_coords, [[1, 1, 1], [2, 2, 2]])
+
+    # Verify forces
+    assert isinstance(forces, list)
+    assert len(forces) == 1
+    assert len(forces[0]) == 2  # Two sets of forces for two atoms
+    expected_forces = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
+    assert np.allclose(forces[0], expected_forces)
+
+
+def test_extract_energy_from_thermo_log(tmpdir):
+    # test the function reading the lammps thermodynamic output files.
+    # TODO refactor to move to data/
+    # Create a mock LAMMPS thermodynamic output
+    log_content = """
+    keywords:
+      - Step
+      - KinEng
+      - PotEng
+    data:
+      - [0, 50.5, -100.0]
+      - [1, 51.0, -101.5]
+    """
+    yaml_path = os.path.join(tmpdir, "thermo.yaml")
+    with open(yaml_path, "w") as f:
+        f.write(log_content)
+
+    # Call the function
+    energies = extract_energy_from_thermo_log(yaml_path)
+
+    # Check the results
+    expected_energies = [-49.5, -50.5]  # KinEng + PotEng for each step
+    assert isinstance(energies, list)
+    assert energies == expected_energies
+
+
+@pytest.fixture
+def mock_extract_energy_from_thermo_log(mocker):
+    return mocker.patch('crystal_diffusion.train_mtp.extract_energy_from_thermo_log', return_value=[])
+
+@pytest.fixture
+def mock_extract_structure_and_forces(mocker):
+    return mocker.patch('crystal_diffusion.train_mtp.extract_structure_and_forces_from_file', return_value=([], []))
+
+def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_extract_energy_from_thermo_log, tmpdir):
+    # Create mock file paths
+    output_yaml_files = [os.path.join(tmpdir, "output1.yaml"), os.path.join(tmpdir, "output2.yaml")]
+    thermo_yaml_files = [os.path.join(tmpdir, "thermo1.yaml"), os.path.join(tmpdir, "thermo2.yaml")]
+
+    # Mock atom dictionary
+    atom_dict = {1: 'H', 2: 'He'}
+
+    # Call the function
+    mtp_inputs = prepare_mtp_inputs_from_lammps(output_yaml_files, thermo_yaml_files, atom_dict)
+
+    # Verify that the mocks were called correctly
+    assert mock_extract_structure_and_forces.call_count == 2
+    mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict)
+
+    assert mock_extract_energy_from_thermo_log.call_count == 2
+    mock_extract_energy_from_thermo_log.assert_called_with(thermo_yaml_files[1])
+
+    # Verify that the result is correctly structured
+    assert 'structure' in mtp_inputs
+    assert 'energy' in mtp_inputs
+    assert 'forces' in mtp_inputs
+    assert isinstance(mtp_inputs['structure'], list)
+    assert isinstance(mtp_inputs['energy'], list)
+    assert isinstance(mtp_inputs['forces'], list)
+
+    # Verify that the data from the mocks is aggregated into the results correctly
+    assert mtp_inputs['structure'] == mock_extract_structure_and_forces.return_value[0] * len(output_yaml_files)
+    assert mtp_inputs['forces'] == mock_extract_structure_and_forces.return_value[1] * len(output_yaml_files)
+    assert mtp_inputs['energy'] == mock_extract_energy_from_thermo_log.return_value * len(thermo_yaml_files)

From 3f81f0f3de2e008a8f2724c5dab852435280fb3d Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 13:32:12 -0400
Subject: [PATCH 21/29] code review part 3

---
 crystal_diffusion/models/mtp.py |  3 +++
 crystal_diffusion/train_mtp.py  | 27 ++++++++-----------
 tests/models/test_mtp.py        | 48 ++++++++++++++++++++++++++++++++-
 3 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 104c00d3..9cee4074 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -80,6 +80,9 @@ def evaluate(self,
             dataframe with ground truth energies, forces
             dataframe with predicted energies, forces, MaxVol gamma (nbh grades)
         """
+        if self.fitted_mtp is None:
+            raise AttributeError('MTP was not trained. Please call train() before evaluate().')
+
         original_file = "original.cfgs"
         predict_file = "predict.cfgs"
         test_structures, test_forces, test_stresses = check_structures_forces_stresses(
diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
index c84daec3..7d73fe8d 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/train_mtp.py
@@ -3,6 +3,7 @@
 Running the main() runs a debugging example. Entry points are train_mtp and evaluate_mtp.
 """
 import os
+from collections import namedtuple
 from typing import Any, Dict, List, Tuple
 
 import numpy as np
@@ -18,9 +19,9 @@
 
 # TODO list of yaml files should come from an external call
 # yaml dump file
-lammps_yaml = ['lammps_scripts/Si/si-custom/dump.si-300-1.yaml']
+lammps_yaml = ['examples/local/mtp_example/dump.si-300-1.yaml']
 # yaml thermodynamic variables
-lammps_thermo_yaml = ['lammps_scripts/Si/si-custom/thermo_log.yaml']
+lammps_thermo_yaml = ['examples/local/mtp_example/thermo_log.yaml']
 # note that the YAML output does not contain the map from index to atomic species
 # this will have to be taken from elsewhere
 # use a manual map for now
@@ -108,12 +109,12 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
     return mtp_inputs
 
 
-def train_mtp(train_inputs: Dict[str, Any], mlip_cmd_path: str, save_dir: str) -> MTPWithMLIP3:
-    """Create and evaluate an MTP potential.
+def train_mtp(train_inputs: namedtuple, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
+    """Create and train an MTP potential.
 
     Args:
         train_inputs: inputs for training. Should contain structure, energies and forces
-        mlip_cmd_path: path to MLIP-3 folder
+        mlip_folder_path: path to MLIP-3 folder
         save_dir: path to directory where to save the fitted model
 
     Returns:
@@ -121,7 +122,7 @@ def train_mtp(train_inputs: Dict[str, Any], mlip_cmd_path: str, save_dir: str) -
     """
     # TODO more kwargs for MTP training. See maml and mlip-3 documentation.
     # create MTP
-    mtp = MTPWithMLIP3(mlip_path=mlip_cmd_path)
+    mtp = MTPWithMLIP3(mlip_path=mlip_folder_path)
     # train
     mtp.train(
         train_structures=train_inputs["structure"],
@@ -136,8 +137,8 @@ def train_mtp(train_inputs: Dict[str, Any], mlip_cmd_path: str, save_dir: str) -
     return mtp
 
 
-def evaluate_mtp(eval_inputs: Dict[str, Any], mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Create and evaluate an MTP potential.
+def evaluate_mtp(eval_inputs: namedtuple, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Evaluate a trained MTP potential.
 
     Args:
         eval_inputs: inputs to evaluate. Should contain structure, energies and forces
@@ -175,14 +176,8 @@ def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tu
     gt_energy = df_orig.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'})
     gt_energy = (gt_energy['energy'] / gt_energy['atom_index']).to_numpy()
 
-    predicted_forces = df_predict.groupby('structure_index').agg({'fx': 'sum', 'fy': 'sum', 'fz': 'sum',
-                                                                  'atom_index': 'count'})
-    predicted_forces = np.concatenate([(predicted_forces[f'f{x}'] / predicted_forces['atom_index']).to_numpy()
-                                       for x in ['x', 'y', 'z']])
-
-    gt_forces = df_orig.groupby('structure_index').agg({'fx': 'sum', 'fy': 'sum', 'fz': 'sum',
-                                                        'atom_index': 'count'})
-    gt_forces = np.concatenate([(gt_forces[f'f{x}'] / gt_forces['atom_index']).to_numpy() for x in ['x', 'y', 'z']])
+    predicted_forces = (df_predict[['fx', 'fy', 'fz']].to_numpy().flatten())
+    gt_forces = (df_orig[['fx', 'fy', 'fz']].to_numpy().flatten())
 
     return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces)
 
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index 1c91da04..252fab10 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -2,12 +2,16 @@
 from pathlib import Path
 
 import numpy as np
+import pandas as pd
 import pytest
 import yaml
 from pymatgen.core import Structure
+from sklearn.metrics import mean_absolute_error
 
-from crystal_diffusion.train_mtp import extract_structure_and_forces_from_file, extract_energy_from_thermo_log, prepare_mtp_inputs_from_lammps
 from crystal_diffusion.models.mtp import MTPWithMLIP3
+from crystal_diffusion.train_mtp import (
+    extract_energy_from_thermo_log, extract_structure_and_forces_from_file,
+    get_metrics_from_pred, prepare_mtp_inputs_from_lammps)
 
 
 class FakeStructure:
@@ -238,3 +242,45 @@ def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_
     assert mtp_inputs['structure'] == mock_extract_structure_and_forces.return_value[0] * len(output_yaml_files)
     assert mtp_inputs['forces'] == mock_extract_structure_and_forces.return_value[1] * len(output_yaml_files)
     assert mtp_inputs['energy'] == mock_extract_energy_from_thermo_log.return_value * len(thermo_yaml_files)
+
+
+def test_get_metrics_from_pred():
+    # test function from train_mtp
+    # TODO get better metrics and refactor the script
+    # Assuming there are 2 structures, each with 2 atoms (Total 4 readings)
+    df_orig = pd.DataFrame({
+        'structure_index': [0, 0, 1, 1],
+        'atom_index': [0, 1, 0, 1],
+        'energy': [1, 1, 3, 3],  # Total energy for the structure is the same for both atoms
+        'fx': [0.1, -0.1, 0.2, -0.2],
+        'fy': [0.1, -0.1, 0.2, -0.2],
+        'fz': [0.1, -0.1, 0.2, -0.2]
+    })
+
+    # Predicted data with some error introduced
+    df_predict = pd.DataFrame({
+        'structure_index': [0, 0, 1, 1],
+        'atom_index': [0, 1, 0, 1],
+        'energy': [1.05, 0.95, 3.1, 2.9],  # Energy has a slight variation
+        'fx': [0.12, -0.08, 0.23, -0.17],
+        'fy': [0.09, -0.11, 0.19, -0.21],
+        'fz': [0.11, -0.09, 0.18, -0.22]
+    })
+
+    # Calculate expected MAE for energy and forces
+    expected_mae_energy = mean_absolute_error(
+        df_orig.groupby('structure_index')['energy'].mean(),
+        df_predict.groupby('structure_index')['energy'].mean()
+    )
+
+    expected_mae_forces = mean_absolute_error(
+        df_orig[['fx', 'fy', 'fz']].values.flatten(),
+        df_predict[['fx', 'fy', 'fz']].values.flatten()
+    )
+
+    # Call the function under test
+    mae_energy, mae_forces = get_metrics_from_pred(df_orig, df_predict)
+
+    # Verify the MAE values are as expected
+    assert mae_energy == expected_mae_energy
+    assert mae_forces == expected_mae_forces

From c6e6e8c9e28d0319e4c7e92b0b26ba6a664068aa Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 13:39:19 -0400
Subject: [PATCH 22/29] flake8

---
 crystal_diffusion/models/mtp.py | 1 -
 tests/models/test_mtp.py        | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 9cee4074..1f85c333 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -209,7 +209,6 @@ def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
         df = pd.DataFrame(df)
         return df
 
-
     @staticmethod
     def _call_mlip(cmd_list: List[str]) -> Tuple[str, int]:
         """Call MLIP library with subprocess.
diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index 252fab10..6cb353b2 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -208,10 +208,12 @@ def test_extract_energy_from_thermo_log(tmpdir):
 def mock_extract_energy_from_thermo_log(mocker):
     return mocker.patch('crystal_diffusion.train_mtp.extract_energy_from_thermo_log', return_value=[])
 
+
 @pytest.fixture
 def mock_extract_structure_and_forces(mocker):
     return mocker.patch('crystal_diffusion.train_mtp.extract_structure_and_forces_from_file', return_value=([], []))
 
+
 def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_extract_energy_from_thermo_log, tmpdir):
     # Create mock file paths
     output_yaml_files = [os.path.join(tmpdir, "output1.yaml"), os.path.join(tmpdir, "output2.yaml")]

From 2c7077d1a8af6b35c7158af165872c11171d753d Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 13:45:40 -0400
Subject: [PATCH 23/29] missing period

---
 crystal_diffusion/models/mtp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 1f85c333..23f283c2 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -227,7 +227,7 @@ def _call_mlip(cmd_list: List[str]) -> Tuple[str, int]:
 
     @staticmethod
     def _call_cmd_to_stdout(cmd: List[str], output_file: _io.TextIOWrapper):
-        """Call commands with subprocess.POpen and pipe output to a file
+        """Call commands with subprocess.POpen and pipe output to a file.
 
         Args:
             cmd: list of commands & arguments to run

From ad2e425c9d0b3982d62f1198c7de03780546e495 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 13:49:06 -0400
Subject: [PATCH 24/29] isort

---
 crystal_diffusion/models/mtp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 23f283c2..68c00f1f 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -4,7 +4,6 @@
 However, it cannot be called as a standard lightning module as it relies on the MLIP-3 library for the model
 implementation.
 """
-import _io
 import itertools
 import os
 import re
@@ -13,6 +12,7 @@
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple
 
+import _io
 import numpy as np
 import pandas as pd
 from maml.apps.pes import MTPotential

From be5f3450d14155a83081238f715a04f1d62dbfc1 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 14:02:05 -0400
Subject: [PATCH 25/29] pytype fixes

---
 crystal_diffusion/models/mtp.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mtp.py
index 68c00f1f..67570495 100644
--- a/crystal_diffusion/models/mtp.py
+++ b/crystal_diffusion/models/mtp.py
@@ -10,9 +10,8 @@
 import shutil
 import subprocess
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, TextIO, Tuple
 
-import _io
 import numpy as np
 import pandas as pd
 from maml.apps.pes import MTPotential
@@ -210,7 +209,7 @@ def convert_to_dataframe(docs: List[Dict[str, Any]]) -> pd.DataFrame:
         return df
 
     @staticmethod
-    def _call_mlip(cmd_list: List[str]) -> Tuple[str, int]:
+    def _call_mlip(cmd_list: List[str]) -> Tuple[bytes, int]:
         """Call MLIP library with subprocess.
 
         Args:
@@ -226,7 +225,7 @@ def _call_mlip(cmd_list: List[str]) -> Tuple[str, int]:
         return stdout, rc
 
     @staticmethod
-    def _call_cmd_to_stdout(cmd: List[str], output_file: _io.TextIOWrapper):
+    def _call_cmd_to_stdout(cmd: List[str], output_file: TextIO):
         """Call commands with subprocess.POpen and pipe output to a file.
 
         Args:

From 99f7ea12cf1f31301711fe2c9f98f0aa06c9f475 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 14:47:52 -0400
Subject: [PATCH 26/29] pytype & namedtuple funtime

---
 crystal_diffusion/train_mtp.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
index 7d73fe8d..0b7fa87a 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/train_mtp.py
@@ -4,7 +4,7 @@
 """
 import os
 from collections import namedtuple
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, NamedTuple, Tuple
 
 import numpy as np
 import pandas as pd
@@ -81,10 +81,17 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]:
     return energies
 
 
+class MTP_Inputs(NamedTuple):
+    """Create a namedtuple instance for MTP inputs."""
+    structure: List[Structure]
+    forces: List[List[float]]
+    energy: List[float]
+
+
 def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
                                    thermo_yaml: List[str],
                                    atom_dict: Dict[int, Any]
-                                   ) -> Dict[str, Any]:
+                                   ) -> MTP_Inputs:
     """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format.
 
     Args:
@@ -93,7 +100,7 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
         atom_dict: mapping of LAMMPS indices to atom type.
 
     Returns:
-        dict with structure, energies and forces usable by MTP.
+        namedtuple with structure, energies and forces usable by MTP.
     """
     mtp_inputs = {
         'structure': [],
@@ -106,10 +113,13 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str],
         mtp_inputs['forces'] += forces
     for filename in thermo_yaml:
         mtp_inputs['energy'] += extract_energy_from_thermo_log(filename)
+    mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'],
+                            energy=mtp_inputs['energy'],
+                            forces=mtp_inputs['forces'])
     return mtp_inputs
 
 
-def train_mtp(train_inputs: namedtuple, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
+def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3:
     """Create and train an MTP potential.
 
     Args:
@@ -125,9 +135,9 @@ def train_mtp(train_inputs: namedtuple, mlip_folder_path: str, save_dir: str) ->
     mtp = MTPWithMLIP3(mlip_path=mlip_folder_path)
     # train
     mtp.train(
-        train_structures=train_inputs["structure"],
-        train_energies=train_inputs["energy"],
-        train_forces=train_inputs["forces"],
+        train_structures=train_inputs.structure,
+        train_energies=train_inputs.energy,
+        train_forces=train_inputs.forces,
         train_stresses=None,
         max_dist=5,
         stress_weight=0,
@@ -137,7 +147,7 @@ def train_mtp(train_inputs: namedtuple, mlip_folder_path: str, save_dir: str) ->
     return mtp
 
 
-def evaluate_mtp(eval_inputs: namedtuple, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
+def evaluate_mtp(eval_inputs: MTP_Inputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Evaluate a trained MTP potential.
 
     Args:
@@ -149,9 +159,9 @@ def evaluate_mtp(eval_inputs: namedtuple, mtp: MTPWithMLIP3) -> Tuple[pd.DataFra
     """
     # evaluate
     df_orig, df_predict = mtp.evaluate(
-        test_structures=eval_inputs["structure"],
-        test_energies=eval_inputs["energy"],
-        test_forces=eval_inputs["forces"],
+        test_structures=eval_inputs.structure,
+        test_energies=eval_inputs.energy,
+        test_forces=eval_inputs.forces,
         test_stresses=None,
     )
     return df_orig, df_predict

From 890eda1e44190dda67155421470087e66815080f Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 14:50:18 -0400
Subject: [PATCH 27/29] flake8

---
 crystal_diffusion/train_mtp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/train_mtp.py
index 0b7fa87a..923ad007 100644
--- a/crystal_diffusion/train_mtp.py
+++ b/crystal_diffusion/train_mtp.py
@@ -3,7 +3,6 @@
 Running the main() runs a debugging example. Entry points are train_mtp and evaluate_mtp.
 """
 import os
-from collections import namedtuple
 from typing import Any, Dict, List, NamedTuple, Tuple
 
 import numpy as np
@@ -83,6 +82,7 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]:
 
 class MTP_Inputs(NamedTuple):
     """Create a namedtuple instance for MTP inputs."""
+
     structure: List[Structure]
     forces: List[List[float]]
     energy: List[float]

From 597c83a09132ac687b042adfa48818ca1a9f8db3 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 12 Mar 2024 14:56:21 -0400
Subject: [PATCH 28/29] unit test fix for namedtuple

---
 tests/models/test_mtp.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index 6cb353b2..030c8635 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -233,17 +233,14 @@ def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_
     mock_extract_energy_from_thermo_log.assert_called_with(thermo_yaml_files[1])
 
     # Verify that the result is correctly structured
-    assert 'structure' in mtp_inputs
-    assert 'energy' in mtp_inputs
-    assert 'forces' in mtp_inputs
-    assert isinstance(mtp_inputs['structure'], list)
-    assert isinstance(mtp_inputs['energy'], list)
-    assert isinstance(mtp_inputs['forces'], list)
+    assert isinstance(mtp_inputs.structure, list)
+    assert isinstance(mtp_inputs.energy, list)
+    assert isinstance(mtp_inputs.forces, list)
 
     # Verify that the data from the mocks is aggregated into the results correctly
-    assert mtp_inputs['structure'] == mock_extract_structure_and_forces.return_value[0] * len(output_yaml_files)
-    assert mtp_inputs['forces'] == mock_extract_structure_and_forces.return_value[1] * len(output_yaml_files)
-    assert mtp_inputs['energy'] == mock_extract_energy_from_thermo_log.return_value * len(thermo_yaml_files)
+    assert mtp_inputs.structure == mock_extract_structure_and_forces.return_value[0] * len(output_yaml_files)
+    assert mtp_inputs.forces == mock_extract_structure_and_forces.return_value[1] * len(output_yaml_files)
+    assert mtp_inputs.energy == mock_extract_energy_from_thermo_log.return_value * len(thermo_yaml_files)
 
 
 def test_get_metrics_from_pred():

From d9f0756b5da1a58d591662b86771791ccf087db7 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Wed, 13 Mar 2024 14:54:49 -0400
Subject: [PATCH 29/29] fixing mae on energy

---
 tests/models/test_mtp.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py
index 030c8635..2ad3fa6c 100644
--- a/tests/models/test_mtp.py
+++ b/tests/models/test_mtp.py
@@ -260,17 +260,16 @@ def test_get_metrics_from_pred():
     df_predict = pd.DataFrame({
         'structure_index': [0, 0, 1, 1],
         'atom_index': [0, 1, 0, 1],
-        'energy': [1.05, 0.95, 3.1, 2.9],  # Energy has a slight variation
+        'energy': [1.1, 1.1, 3.3, 3.3],  # energy cannot be different per atom for a given structure
         'fx': [0.12, -0.08, 0.23, -0.17],
         'fy': [0.09, -0.11, 0.19, -0.21],
         'fz': [0.11, -0.09, 0.18, -0.22]
     })
 
     # Calculate expected MAE for energy and forces
-    expected_mae_energy = mean_absolute_error(
-        df_orig.groupby('structure_index')['energy'].mean(),
-        df_predict.groupby('structure_index')['energy'].mean()
-    )
+    # 1 value per structure - here, we take indices 0 and 2
+    expected_mae_energy = np.abs(df_orig['energy'].iloc[[0, 2]] - df_predict['energy'].iloc[[0, 2]]).mean() / 2
+    # we take the energy per atom. 2 atoms per structure here, so we can simply divide by 2
 
     expected_mae_forces = mean_absolute_error(
         df_orig[['fx', 'fy', 'fz']].values.flatten(),