diff --git a/lobsterpy/featurize/__init__.py b/lobsterpy/featurize/__init__.py index f4885426..354e5941 100644 --- a/lobsterpy/featurize/__init__.py +++ b/lobsterpy/featurize/__init__.py @@ -2,3 +2,4 @@ # Distributed under the terms of a BSD 3-Clause "New" or "Revised" License """This package provides the modules for featurzing Lobster data ready for ML studies.""" +from .utils import get_file_paths, get_structure_path diff --git a/lobsterpy/featurize/batch.py b/lobsterpy/featurize/batch.py index a7513dda..76ad58e4 100644 --- a/lobsterpy/featurize/batch.py +++ b/lobsterpy/featurize/batch.py @@ -12,7 +12,6 @@ import numpy as np import pandas as pd -from monty.os.path import zpath from tqdm.autonotebook import tqdm from lobsterpy.featurize.core import ( @@ -24,6 +23,8 @@ ) from lobsterpy.structuregraph.graph import LobsterGraph +from . import get_file_paths + warnings.filterwarnings("ignore") @@ -67,13 +68,26 @@ def __init__( Possible options are `bonding`, `antibonding` or `overall` :param charge_type: set charge type used for computing ionicity. Possible options are `mulliken`, `loewdin` or `both`. 
- :param bonds: `all_bonds` or `cation_anion_bonds` + :param bonds: `all` or `cation-anion` bonds :param orbital_resolved: bool indicating whether LobsterPy analysis is performed orbital wise :param include_cobi_data: bool stating to include COBICAR.lobster features :param include_coop_data: bool stating to include COOPCAR.lobster features :param e_range: range of energy relative to fermi for which moment features needs to be computed :param n_jobs: parallel processes to run """ + # Check for valid parameters of string type + allowed_str_inputs = { + "charge_type": ["mulliken", "loewdin", "both"], + "bonds": ["all", "cation-anion"], + "feature_type": ["bonding", "antibonding", "overall"], + } + for param, param_string in zip([charge_type, bonds, feature_type], ["charge_type", "bonds", "feature_type"]): + if param not in allowed_str_inputs[param_string]: + raise ValueError( + f"Parameter {param_string} set to {param} but must be in " + f"{list(allowed_str_inputs[param_string])}." + ) + self.path_to_lobster_calcs = path_to_lobster_calcs self.path_to_jsons = path_to_jsons self.feature_type = feature_type @@ -125,126 +139,57 @@ def _featurizecoxx(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: Effective interaction number and moment features (center, width, skewness and kurtosis) """ - dir_name = Path(path_to_lobster_calc) + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, requested_files=["poscar", "cohpcar", "icohplist"] + ) + structure_path = file_paths.get("poscar") + + coxx = FeaturizeCOXX( + path_to_coxxcar=str(file_paths.get("cohpcar")), + path_to_icoxxlist=str(file_paths.get("icohplist")), + path_to_structure=str(structure_path), + feature_type=self.feature_type, + e_range=self.e_range, + ) + + df = coxx.get_summarized_coxx_df() + del coxx + + if self.include_cobi_data: + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, requested_files=["cobicar", "icobilist"] + ) - req_files = { - "structure_path": 
"POSCAR", - "coxxcar_path": "COHPCAR.lobster", - "icoxxlist_path": "ICOHPLIST.lobster", - } - for file, default_value in req_files.items(): - # Check if "POSCAR" exists, and if not, check for "POSCAR.lobster" - if file == "structure_path": - for filename in [default_value, "POSCAR.lobster"]: - poscar_path = dir_name / filename - req_files[file] = poscar_path # type: ignore - if not poscar_path.exists(): - gz_file_path = Path(zpath(poscar_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - break - else: - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - coxxcar_path = req_files.get("coxxcar_path") - structure_path = req_files.get("structure_path") - icoxxlist_path = req_files.get("icoxxlist_path") - - if ( - coxxcar_path.exists() # type: ignore - and structure_path.exists() # type: ignore - and icoxxlist_path.exists() # type: ignore - ): coxx = FeaturizeCOXX( - path_to_coxxcar=str(coxxcar_path), - path_to_icoxxlist=str(icoxxlist_path), + path_to_coxxcar=str(file_paths.get("cobicar")), + path_to_icoxxlist=str(file_paths.get("icobilist")), path_to_structure=str(structure_path), feature_type=self.feature_type, e_range=self.e_range, + are_cobis=True, ) - df_cohp = coxx.get_summarized_coxx_df() + df_cobi = coxx.get_summarized_coxx_df() + df = pd.concat([df, df_cobi], axis=1) del coxx - else: - raise Exception(f"COHPCAR.lobster or POSCAR or ICOHPLIST.lobster file not found in {dir_name.name}") - - if self.include_cobi_data: - req_files = { - "coxxcar_path": "COBICAR.lobster", - "icoxxlist_path": "ICOBILIST.lobster", - } - for file, default_value in req_files.items(): - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - 
req_files[file] = gz_file_path # type: ignore - - coxxcar_path = req_files.get("coxxcar_path") - icoxxlist_path = req_files.get("icoxxlist_path") - - if coxxcar_path.exists() and icoxxlist_path.exists(): # type: ignore - coxx = FeaturizeCOXX( - path_to_coxxcar=str(coxxcar_path), - path_to_icoxxlist=str(icoxxlist_path), - path_to_structure=str(structure_path), - feature_type=self.feature_type, - e_range=self.e_range, - are_cobis=True, - ) - - df_cobi = coxx.get_summarized_coxx_df() - del coxx - - else: - raise Exception(f"COBICAR.lobster or ICOBILIST.lobster file not found in {dir_name.name}") if self.include_coop_data: - req_files = { - "coxxcar_path": "COOPCAR.lobster", - "icoxxlist_path": "ICOOPLIST.lobster", - } - for file, default_value in req_files.items(): - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - coxxcar_path = req_files.get("coxxcar_path") - icoxxlist_path = req_files.get("icoxxlist_path") - - if coxxcar_path.exists() and icoxxlist_path.exists(): # type: ignore - coxx = FeaturizeCOXX( - path_to_coxxcar=str(coxxcar_path), - path_to_icoxxlist=str(icoxxlist_path), - path_to_structure=str(structure_path), - feature_type=self.feature_type, - e_range=self.e_range, - are_coops=True, - ) - - df_coop = coxx.get_summarized_coxx_df() - del coxx + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, requested_files=["coopcar", "icooplist"] + ) - else: - raise Exception(f"COOPCAR.lobster or ICOOPLIST.lobster file not found in {dir_name.name}") + coxx = FeaturizeCOXX( + path_to_coxxcar=str(file_paths.get("coopcar")), + path_to_icoxxlist=str(file_paths.get("icooplist")), + path_to_structure=str(structure_path), + feature_type=self.feature_type, + e_range=self.e_range, + are_coops=True, + ) - if self.include_cobi_data and self.include_coop_data: - df = 
pd.concat([df_cohp, df_cobi, df_coop], axis=1) - elif self.include_cobi_data and not self.include_coop_data: - df = pd.concat([df_cohp, df_cobi], axis=1) - elif not self.include_cobi_data and self.include_coop_data: - df = pd.concat([df_cohp, df_coop], axis=1) - else: - df = df_cohp + df_coop = coxx.get_summarized_coxx_df() + df = pd.concat([df, df_coop], axis=1) + del coxx return df @@ -258,66 +203,38 @@ def _featurizecharges(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: A pandas dataframe with computed ionicity for the structure """ - dir_name = Path(path_to_lobster_calc) + file_paths = get_file_paths(path_to_lobster_calc=path_to_lobster_calc, requested_files=["poscar", "charge"]) - req_files = { - "charge_path": "CHARGE.lobster", - "structure_path": "POSCAR", - } - for file, default_value in req_files.items(): - if file == "structure_path": - for filename in [default_value, "POSCAR.lobster"]: - poscar_path = dir_name / filename - req_files[file] = poscar_path # type: ignore - if not poscar_path.exists(): - gz_file_path = Path(zpath(poscar_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - break - else: - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - charge_path = req_files.get("charge_path") - structure_path = req_files.get("structure_path") - - if charge_path.exists() and structure_path.exists(): # type: ignore - if self.charge_type == "mulliken": - charge_mull = FeaturizeCharges( - path_to_charge=str(charge_path), - path_to_structure=str(structure_path), - charge_type="mulliken", - ) - df = charge_mull.get_df() - elif self.charge_type == "loewdin": - charge_loew = FeaturizeCharges( - path_to_charge=str(charge_path), - path_to_structure=str(structure_path), - charge_type="loewdin", - ) - df = charge_loew.get_df() - elif 
self.charge_type == "both": - charge_mull = FeaturizeCharges( - path_to_charge=str(charge_path), - path_to_structure=str(structure_path), - charge_type="mulliken", - ) - df_mull = charge_mull.get_df() + if self.charge_type == "mulliken": + charge_mull = FeaturizeCharges( + path_to_charge=str(file_paths.get("charge")), + path_to_structure=str(file_paths.get("poscar")), + charge_type="mulliken", + ) + df = charge_mull.get_df() + elif self.charge_type == "loewdin": + charge_loew = FeaturizeCharges( + path_to_charge=str(file_paths.get("charge")), + path_to_structure=str(file_paths.get("poscar")), + charge_type="loewdin", + ) + df = charge_loew.get_df() + else: + charge_mull = FeaturizeCharges( + path_to_charge=str(file_paths.get("charge")), + path_to_structure=str(file_paths.get("poscar")), + charge_type="mulliken", + ) + df_mull = charge_mull.get_df() - charge_loew = FeaturizeCharges( - path_to_charge=str(charge_path), - path_to_structure=str(structure_path), - charge_type="loewdin", - ) - df_loew = charge_loew.get_df() + charge_loew = FeaturizeCharges( + path_to_charge=str(file_paths.get("charge")), + path_to_structure=str(file_paths.get("poscar")), + charge_type="loewdin", + ) + df_loew = charge_loew.get_df() - df = pd.concat([df_mull, df_loew], axis=1) - else: - raise Exception(f"CHARGE.lobster or POSCAR not found in {dir_name.name}") + df = pd.concat([df_mull, df_loew], axis=1) return df @@ -568,74 +485,40 @@ def _fingerprint_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: A pandas dataframe with COXX fingerprint object """ - dir_name = Path(path_to_lobster_calc) - if self.fingerprint_for.upper() == "COBI": - req_files = { - "coxxcar_path": "COBICAR.lobster", - "icoxxlist_path": "ICOBILIST.lobster", - } - for file, default_value in req_files.items(): - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = 
gz_file_path # type: ignore - - coxxcar_path = req_files.get("coxxcar_path") - icoxxlist_path = req_files.get("icoxxlist_path") + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, requested_files=["poscar", "cobicar", "icobilist"] + ) + + coxxcar_path = file_paths.get("cobicar") + icoxxlist_path = file_paths.get("icobilist") are_cobis = True are_coops = False elif self.fingerprint_for.upper() == "COOP": - req_files = { - "coxxcar_path": "COOPCAR.lobster", - "icoxxlist_path": "ICOOPLIST.lobster", - } - for file, default_value in req_files.items(): - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - coxxcar_path = req_files.get("coxxcar_path") - icoxxlist_path = req_files.get("icoxxlist_path") + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, requested_files=["poscar", "coopcar", "icooplist"] + ) + + coxxcar_path = file_paths.get("coopcar") + icoxxlist_path = file_paths.get("icooplist") are_cobis = False are_coops = True else: - req_files = { - "coxxcar_path": "COHPCAR.lobster", - "icoxxlist_path": "ICOHPLIST.lobster", - } - for file, default_value in req_files.items(): - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - coxxcar_path = req_files.get("coxxcar_path") - icoxxlist_path = req_files.get("icoxxlist_path") + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, requested_files=["poscar", "cohpcar", "icohplist"] + ) + + coxxcar_path = file_paths.get("cohpcar") + icoxxlist_path = file_paths.get("icohplist") are_cobis = False are_coops = False - for filename in ["POSCAR", "POSCAR.lobster"]: - structure_path = dir_name / filename - if not 
structure_path.exists(): - gz_file_path = Path(zpath(structure_path)) - if gz_file_path.exists(): - structure_path = gz_file_path # type: ignore - break - coxx = FeaturizeCOXX( path_to_coxxcar=str(coxxcar_path), path_to_icoxxlist=str(icoxxlist_path), - path_to_structure=str(structure_path), + path_to_structure=str(file_paths.get("poscar")), feature_type=self.feature_type, e_range=self.e_range, are_coops=are_coops, @@ -729,52 +612,20 @@ def _get_sg_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: A structure graph with LOBSTER data as edge and node properties in structure graph objects """ dir_name = Path(path_to_lobster_calc) - - req_files = { - "charge_path": "CHARGE.lobster", - "cohpcar_path": "COHPCAR.lobster", - "icohplist_path": "ICOHPLIST.lobster", - "icooplist_path": "ICOOPLIST.lobster", - "icobilist_path": "ICOBILIST.lobster", - "madelung_path": "MadelungEnergies.lobster", - "structure_path": "POSCAR", - } - - for file, default_value in req_files.items(): - if file == "structure_path": - for filename in [default_value, "POSCAR.lobster"]: - poscar_path = dir_name / filename - req_files[file] = poscar_path # type: ignore - if not poscar_path.exists(): - gz_file_path = Path(zpath(poscar_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - break - else: - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - charge_path = str(req_files.get("charge_path")) - cohpcar_path = str(req_files.get("cohpcar_path")) - icohplist_path = str(req_files.get("icohplist_path")) - icooplist_path = str(req_files.get("icooplist_path")) - icobilist_path = str(req_files.get("icobilist_path")) - madelung_path = str(req_files.get("madelung_path")) - structure_path = str(req_files.get("structure_path")) + file_paths = get_file_paths( + 
path_to_lobster_calc=path_to_lobster_calc, + requested_files=["charge", "cohpcar", "icohplist", "icooplist", "icobilist", "madelung", "poscar"], + ) graph = LobsterGraph( - path_to_poscar=structure_path, - path_to_charge=charge_path, - path_to_cohpcar=cohpcar_path, - path_to_icohplist=icohplist_path, + path_to_poscar=str(file_paths.get("poscar")), + path_to_charge=str(file_paths.get("charge")), + path_to_cohpcar=str(file_paths.get("cohpcar")), + path_to_icohplist=str(file_paths.get("icohplist")), add_additional_data_sg=self.add_additional_data_sg, - path_to_icooplist=icooplist_path, - path_to_icobilist=icobilist_path, - path_to_madelung=madelung_path, + path_to_icooplist=str(file_paths.get("icooplist")), + path_to_icobilist=str(file_paths.get("icobilist")), + path_to_madelung=str(file_paths.get("madelung")), which_bonds=self.which_bonds, start=self.start, ) @@ -875,44 +726,20 @@ def _get_dos_moments_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: Returns: A pandas dataframe with computed PDOS moment features """ - dir_name = Path(path_to_lobster_calc) - req_files = { - "doscar_path": ("DOSCAR.LSO.lobster" if self.use_lso_dos else "DOSCAR.lobster"), - "structure_path": "POSCAR", - } - for file, default_value in req_files.items(): - if file == "structure_path": - for filename in [default_value, "POSCAR.lobster"]: - poscar_path = dir_name / filename - req_files[file] = poscar_path # type: ignore - if not poscar_path.exists(): - gz_file_path = Path(zpath(poscar_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - break - else: - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - doscar_path = req_files.get("doscar_path") - structure_path = req_files.get("structure_path") - - if doscar_path.exists() and structure_path.exists(): # type: ignore - 
featurize_dos = FeaturizeDoscar( - path_to_doscar=str(doscar_path), - path_to_structure=str(structure_path), - add_element_dos_moments=self.add_element_dos_moments, - e_range=self.e_range, - ) - df = featurize_dos.get_df() - else: - raise Exception(f"DOSCAR.lobster or DOSCAR.LSO.lobster or POSCAR not found in {dir_name.name}") + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, + requested_files=["poscar", "doscar"], + use_lso_dos=self.use_lso_dos, + ) - return df + featurize_dos = FeaturizeDoscar( + path_to_doscar=str(file_paths.get("doscar")), + path_to_structure=str(file_paths.get("poscar")), + add_element_dos_moments=self.add_element_dos_moments, + e_range=self.e_range, + ) + + return featurize_dos.get_df() def _get_dos_fingerprints_df(self, path_to_lobster_calc: str | Path) -> pd.DataFrame: """ @@ -923,48 +750,23 @@ def _get_dos_fingerprints_df(self, path_to_lobster_calc: str | Path) -> pd.DataF Returns: A pandas dataframe with DOS fingerprint objects """ - dir_name = Path(path_to_lobster_calc) + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, + requested_files=["poscar", "doscar"], + use_lso_dos=self.use_lso_dos, + ) - req_files = { - "doscar_path": ("DOSCAR.LSO.lobster" if self.use_lso_dos else "DOSCAR.lobster"), - "structure_path": "POSCAR", - } - for file, default_value in req_files.items(): - if file == "structure_path": - for filename in [default_value, "POSCAR.lobster"]: - poscar_path = dir_name / filename - req_files[file] = poscar_path # type: ignore - if not poscar_path.exists(): - gz_file_path = Path(zpath(poscar_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - break - else: - file_path = dir_name / default_value - req_files[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files[file] = gz_file_path # type: ignore - - doscar_path = req_files.get("doscar_path") - structure_path = 
req_files.get("structure_path") - - if doscar_path.exists() and structure_path.exists(): # type: ignore - featurize_dos = FeaturizeDoscar( - path_to_doscar=str(doscar_path), - path_to_structure=str(structure_path), - e_range=self.e_range, - ) - df = featurize_dos.get_fingerprint_df( - fp_type=self.fingerprint_type, - normalize=self.normalize, - n_bins=self.n_bins, - ) - else: - raise Exception(f"DOSCAR.lobster or DOSCAR.LSO.lobster or POSCAR not found in {dir_name.name}") + featurize_dos = FeaturizeDoscar( + path_to_doscar=str(file_paths.get("doscar")), + path_to_structure=str(file_paths.get("poscar")), + e_range=self.e_range, + ) - return df + return featurize_dos.get_fingerprint_df( + fp_type=self.fingerprint_type, + normalize=self.normalize, + n_bins=self.n_bins, + ) def get_df(self) -> pd.DataFrame: """ diff --git a/lobsterpy/featurize/core.py b/lobsterpy/featurize/core.py index abbab454..cb00bd4a 100644 --- a/lobsterpy/featurize/core.py +++ b/lobsterpy/featurize/core.py @@ -14,7 +14,6 @@ import numpy as np import pandas as pd from mendeleev import element -from monty.os.path import zpath from numpy import ndarray from pymatgen.core.structure import Structure from pymatgen.electronic_structure.cohp import CompleteCohp @@ -25,6 +24,8 @@ from lobsterpy.cohp.analyze import Analysis +from . 
import get_file_paths + warnings.filterwarnings("ignore") @@ -268,54 +269,19 @@ def get_lobsterpy_cba_dict(path_to_lobster_calc: str | Path, bonds: str, orbital Returns a dictionary with lobster summarized bonding analysis data """ - dir_name = Path(str(path_to_lobster_calc)) - - # check if files are compressed (.gz) and update file paths - req_files_lobsterpy = { - "structure_path": "POSCAR", - "cohpcar_path": "COHPCAR.lobster", - "icohplist_path": "ICOHPLIST.lobster", - "charge_path": "CHARGE.lobster", - } - - for file, default_value in req_files_lobsterpy.items(): - if file == "structure_path": - for filename in [default_value, "POSCAR.lobster"]: - poscar_path = dir_name / filename - req_files_lobsterpy[file] = poscar_path # type: ignore - if not poscar_path.exists(): - gz_file_path = Path(zpath(poscar_path)) - if gz_file_path.exists(): - req_files_lobsterpy[file] = gz_file_path # type: ignore - break - else: - file_path = dir_name / default_value - req_files_lobsterpy[file] = file_path # type: ignore - if not file_path.exists(): - gz_file_path = Path(zpath(file_path)) - if gz_file_path.exists(): - req_files_lobsterpy[file] = gz_file_path # type: ignore - else: - raise Exception( - "Path provided for Lobster calc directory seems incorrect." 
- "It does not contain COHPCAR.lobster, ICOHPLIST.lobster, POSCAR and " - "CHARGE.lobster files needed for automatic analysis using LobsterPy" - ) - - cohpcar_path = req_files_lobsterpy.get("cohpcar_path") - charge_path = req_files_lobsterpy.get("charge_path") - structure_path = req_files_lobsterpy.get("structure_path") - icohplist_path = req_files_lobsterpy.get("icohplist_path") + file_paths = get_file_paths( + path_to_lobster_calc=path_to_lobster_calc, requested_files=["poscar", "cohpcar", "icohplist", "charge"] + ) which_bonds = bonds.replace("-", "_") bond_type = f"{which_bonds}_bonds" try: analyse = Analysis( - path_to_poscar=str(structure_path), - path_to_icohplist=str(icohplist_path), - path_to_cohpcar=str(cohpcar_path), - path_to_charge=str(charge_path), + path_to_poscar=str(file_paths.get("poscar")), + path_to_icohplist=str(file_paths.get("icohplist")), + path_to_cohpcar=str(file_paths.get("cohpcar")), + path_to_charge=str(file_paths.get("charge")), summed_spins=False, # we will always use spin polarization here cutoff_icohp=0.10, which_bonds=which_bonds, @@ -326,15 +292,9 @@ def get_lobsterpy_cba_dict(path_to_lobster_calc: str | Path, bonds: str, orbital except ValueError: data = {bond_type: {"lobsterpy_data": {}}} - madelung_energies_path = dir_name / "MadelungEnergies.lobster" - # check if .gz file exists and update Madelung Energies path - if not madelung_energies_path.exists(): - gz_file_path = Path(zpath(madelung_energies_path)) - if gz_file_path.exists(): - madelung_energies_path = gz_file_path - - if madelung_energies_path.exists(): - madelung_obj = MadelungEnergies(filename=str(madelung_energies_path)) + try: + madelung_path = get_file_paths(path_to_lobster_calc=path_to_lobster_calc, requested_files=["madelung"]) + madelung_obj = MadelungEnergies(filename=str(madelung_path.get("madelung"))) madelung_energies = { "Mulliken": madelung_obj.madelungenergies_Mulliken, @@ -342,8 +302,7 @@ def get_lobsterpy_cba_dict(path_to_lobster_calc: str | Path, 
def get_file_paths(
    path_to_lobster_calc: str | Path = "",
    requested_files: list[str] | None = None,
    use_lso_dos: bool = True,
) -> dict[str, Path]:
    """
    Get file paths for LobsterPy featurizations, raise Exception if not all of requested paths exist.

    :param path_to_lobster_calc: path to root LOBSTER calc directory
    :param requested_files: keys of the files to return paths for. Known keys are
        `poscar`, `cohpcar`, `icohplist`, `cobicar`, `icobilist`, `coopcar`,
        `icooplist`, `charge`, `madelung` and `doscar`. Any other entry is treated
        as a literal file name relative to `path_to_lobster_calc`.
        `None` (default) is handled like an empty list.
    :param use_lso_dos: solely required for BatchDosFeaturizer.
        Will force featurizer to use DOSCAR.LSO.lobster instead of DOSCAR.lobster.

    :return: dict that assigns each item of requested_files its path

    :raises Exception: if at least one requested file is found neither in plain
        nor in compressed form; the message lists every missing file name.
    """
    default_values = {
        "poscar": "POSCAR",
        "cohpcar": "COHPCAR.lobster",
        "icohplist": "ICOHPLIST.lobster",
        "cobicar": "COBICAR.lobster",
        "icobilist": "ICOBILIST.lobster",
        "coopcar": "COOPCAR.lobster",
        "icooplist": "ICOOPLIST.lobster",
        "charge": "CHARGE.lobster",
        "madelung": "MadelungEnergies.lobster",
        "doscar": ("DOSCAR.LSO.lobster" if use_lso_dos else "DOSCAR.lobster"),
    }

    lobster_path = Path(path_to_lobster_calc)
    file_paths: dict[str, Path] = {}
    missing_files: list[str] = []

    # `None` replaces the former mutable `[]` default; both mean "nothing requested".
    for file in requested_files or []:
        if file == "poscar":
            # The structure file has two accepted names, so it is resolved separately.
            try:
                file_paths[file] = get_structure_path(lobster_path=lobster_path)
            except FileNotFoundError:
                missing_files.append(default_values["poscar"])
            continue

        # Unknown keys fall back to being used as the file name itself.
        file_str = default_values.get(file, file)
        file_path = lobster_path / file_str
        if file_path.exists():
            file_paths[file] = file_path
            continue

        # zpath is expected to return the name of a compressed variant if one exists.
        gz_file_path = Path(zpath(file_path))
        if gz_file_path.exists():
            file_paths[file] = gz_file_path
        else:
            # Report the resolved name; indexing default_values here would raise
            # KeyError for keys that are not part of the lookup table.
            missing_files.append(file_str)

    if missing_files:
        # All missing files are collected first so that one error names them all.
        raise Exception(f"Files {missing_files} not found in {lobster_path.name}.")

    return file_paths


def get_structure_path(lobster_path: str | Path) -> Path:
    """
    Search iteratively for (unzipped / zipped) structure file.

    POSCAR is prioritized over POSCAR.lobster. For each name the plain file is
    checked before its compressed variant.

    :param lobster_path: path to root LOBSTER calc directory

    :return: path to structure file

    :raises FileNotFoundError: if neither POSCAR nor POSCAR.lobster exists in
        plain or compressed form.
    """
    lobster_path = Path(lobster_path)

    for filename in ("POSCAR", "POSCAR.lobster"):
        poscar_path = lobster_path / filename
        if poscar_path.exists():
            return poscar_path
        # zpath is expected to return the name of a compressed variant if one exists.
        gz_file_path = Path(zpath(poscar_path))
        if gz_file_path.exists():
            return gz_file_path

    raise FileNotFoundError(f"No POSCAR or POSCAR.lobster (plain or compressed) found in {lobster_path}.")
with pytest.raises(Exception) as err2: # noqa: PT012, PT011 self.summary_featurize_with_json_ex2 = BatchSummaryFeaturizer( @@ -701,7 +717,7 @@ def test_batch_summary_featurizer_exception(self): _ = self.summary_featurize_with_json_ex2.get_df() - assert str(err2.value) == "COOPCAR.lobster or ICOOPLIST.lobster file not found in mp-1000" + assert str(err2.value) == "Files ['COOPCAR.lobster', 'ICOOPLIST.lobster'] not found in mp-1000." # COXX exception with pytest.raises(Exception) as err3: # noqa: PT012, PT011 @@ -711,7 +727,7 @@ def test_batch_summary_featurizer_exception(self): path_to_lobster_calc=self.raise_coxx_exception.path_to_lobster_calcs ) - assert str(err3.value) == "COHPCAR.lobster or POSCAR or ICOHPLIST.lobster file not found in JSONS" + assert str(err3.value) == "Files ['POSCAR', 'COHPCAR.lobster', 'ICOHPLIST.lobster'] not found in JSONS." # Charges exception with pytest.raises(Exception) as err4: # noqa: PT012, PT011 @@ -721,7 +737,7 @@ def test_batch_summary_featurizer_exception(self): path_to_lobster_calc=self.raise_ch_exception.path_to_lobster_calcs ) - assert str(err4.value) == "CHARGE.lobster or POSCAR not found in JSONS" + assert str(err4.value) == "Files ['POSCAR', 'CHARGE.lobster'] not found in JSONS." # Fingerprint similarity exception with pytest.raises(Exception) as err8: # noqa: PT012, PT011 diff --git a/tests/featurize/test_core.py b/tests/featurize/test_core.py index 720fd632..41f4ec2c 100644 --- a/tests/featurize/test_core.py +++ b/tests/featurize/test_core.py @@ -669,9 +669,8 @@ def test_lobsterpy_featurize_exception(self): _ = self.featurize_mp1249_json.get_df() assert ( - str(err.value) == "Path provided for Lobster calc directory seems incorrect." - "It does not contain COHPCAR.lobster, ICOHPLIST.lobster, POSCAR and " - "CHARGE.lobster files needed for automatic analysis using LobsterPy" + str(err.value) == "Files ['POSCAR', 'COHPCAR.lobster', 'ICOHPLIST.lobster', 'CHARGE.lobster'] " + "not found in ..." 
def test_get_structure_path(tmp_path):
    """
    Check that get_structure_path resolves a structure file to a Path.

    First a plain POSCAR is unpacked into a temporary directory and resolved.
    Then the fixture directory, which holds gzipped POSCAR and POSCAR.lobster
    files, is resolved and the selected structure must not contain Zn
    (presumably only the POSCAR.lobster fixture does -- confirm against the test data).
    """
    # Unpack the gzipped VASP structure so that an uncompressed POSCAR exists.
    with gzip.open(TestDir / "test_data/test_structure_path_handling/POSCAR.gz", "rb") as packed:
        (tmp_path / "POSCAR").write_bytes(packed.read())

    assert isinstance(get_structure_path(lobster_path=tmp_path), Path)

    resolved = get_structure_path(lobster_path=TestDir / "test_data/test_structure_path_handling")
    assert isinstance(resolved, Path)

    symbols = [species.symbol for species in Structure.from_file(resolved).elements]
    assert "Zn" not in symbols


def test_get_file_paths(tmp_path):
    """
    Check that get_file_paths returns a mapping of str keys to Path values.

    The lookup is exercised once against gzipped fixture files and once against
    uncompressed copies unpacked into a temporary directory.
    """
    from_zipped = get_file_paths(
        path_to_lobster_calc=TestDir / "test_data/BaTaO2N1",
        requested_files=["poscar", "cohpcar", "charge", "icohplist"],
    )
    assert all(isinstance(name, str) for name in from_zipped)
    assert all(isinstance(location, Path) for location in from_zipped.values())

    # Create uncompressed copies of the two COHP files.
    for file in ["COHPCAR.lobster", "ICOHPLIST.lobster"]:
        with gzip.open(TestDir / f"test_data/BaTaO2N1/{file}.gz", "rb") as packed:
            (tmp_path / file).write_bytes(packed.read())

    from_unzipped = get_file_paths(path_to_lobster_calc=tmp_path, requested_files=["cohpcar", "icohplist"])
    assert all(isinstance(name, str) for name in from_unzipped)
    assert all(isinstance(location, Path) for location in from_unzipped.values())