Skip to content

Commit

Permalink
Merge pull request #242 from kaueltzen/structure_path
Browse files Browse the repository at this point in the history
[WIP] File path handling in featurizer module
  • Loading branch information
JaGeo authored Feb 26, 2024
2 parents f37797b + 8969adc commit 6916d28
Show file tree
Hide file tree
Showing 9 changed files with 324 additions and 403 deletions.
1 change: 1 addition & 0 deletions lobsterpy/featurize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
# Distributed under the terms of a BSD 3-Clause "New" or "Revised" License

"""This package provides the modules for featurzing Lobster data ready for ML studies."""
from .utils import get_file_paths, get_structure_path
482 changes: 142 additions & 340 deletions lobsterpy/featurize/batch.py

Large diffs are not rendered by default.

69 changes: 14 additions & 55 deletions lobsterpy/featurize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import numpy as np
import pandas as pd
from mendeleev import element
from monty.os.path import zpath
from numpy import ndarray
from pymatgen.core.structure import Structure
from pymatgen.electronic_structure.cohp import CompleteCohp
Expand All @@ -25,6 +24,8 @@

from lobsterpy.cohp.analyze import Analysis

from . import get_file_paths

warnings.filterwarnings("ignore")


Expand Down Expand Up @@ -268,54 +269,19 @@ def get_lobsterpy_cba_dict(path_to_lobster_calc: str | Path, bonds: str, orbital
Returns a dictionary with lobster summarized bonding analysis data
"""
dir_name = Path(str(path_to_lobster_calc))

# check if files are compressed (.gz) and update file paths
req_files_lobsterpy = {
"structure_path": "POSCAR",
"cohpcar_path": "COHPCAR.lobster",
"icohplist_path": "ICOHPLIST.lobster",
"charge_path": "CHARGE.lobster",
}

for file, default_value in req_files_lobsterpy.items():
if file == "structure_path":
for filename in [default_value, "POSCAR.lobster"]:
poscar_path = dir_name / filename
req_files_lobsterpy[file] = poscar_path # type: ignore
if not poscar_path.exists():
gz_file_path = Path(zpath(poscar_path))
if gz_file_path.exists():
req_files_lobsterpy[file] = gz_file_path # type: ignore
break
else:
file_path = dir_name / default_value
req_files_lobsterpy[file] = file_path # type: ignore
if not file_path.exists():
gz_file_path = Path(zpath(file_path))
if gz_file_path.exists():
req_files_lobsterpy[file] = gz_file_path # type: ignore
else:
raise Exception(
"Path provided for Lobster calc directory seems incorrect."
"It does not contain COHPCAR.lobster, ICOHPLIST.lobster, POSCAR and "
"CHARGE.lobster files needed for automatic analysis using LobsterPy"
)

cohpcar_path = req_files_lobsterpy.get("cohpcar_path")
charge_path = req_files_lobsterpy.get("charge_path")
structure_path = req_files_lobsterpy.get("structure_path")
icohplist_path = req_files_lobsterpy.get("icohplist_path")
file_paths = get_file_paths(
path_to_lobster_calc=path_to_lobster_calc, requested_files=["poscar", "cohpcar", "icohplist", "charge"]
)

which_bonds = bonds.replace("-", "_")
bond_type = f"{which_bonds}_bonds"

try:
analyse = Analysis(
path_to_poscar=str(structure_path),
path_to_icohplist=str(icohplist_path),
path_to_cohpcar=str(cohpcar_path),
path_to_charge=str(charge_path),
path_to_poscar=str(file_paths.get("poscar")),
path_to_icohplist=str(file_paths.get("icohplist")),
path_to_cohpcar=str(file_paths.get("cohpcar")),
path_to_charge=str(file_paths.get("charge")),
summed_spins=False, # we will always use spin polarization here
cutoff_icohp=0.10,
which_bonds=which_bonds,
Expand All @@ -326,24 +292,17 @@ def get_lobsterpy_cba_dict(path_to_lobster_calc: str | Path, bonds: str, orbital
except ValueError:
data = {bond_type: {"lobsterpy_data": {}}}

madelung_energies_path = dir_name / "MadelungEnergies.lobster"
# check if .gz file exists and update Madelung Energies path
if not madelung_energies_path.exists():
gz_file_path = Path(zpath(madelung_energies_path))
if gz_file_path.exists():
madelung_energies_path = gz_file_path

if madelung_energies_path.exists():
madelung_obj = MadelungEnergies(filename=str(madelung_energies_path))
try:
madelung_path = get_file_paths(path_to_lobster_calc=path_to_lobster_calc, requested_files=["madelung"])
madelung_obj = MadelungEnergies(filename=str(madelung_path.get("madelung")))

madelung_energies = {
"Mulliken": madelung_obj.madelungenergies_Mulliken,
"Loewdin": madelung_obj.madelungenergies_Loewdin,
"Ewald_splitting": madelung_obj.ewald_splitting,
}
data["madelung_energies"] = madelung_energies

else:
except Exception:
warnings.warn(
"MadelungEnergies.lobster file not found in Lobster calc directory provided"
" Will set Madelung Energies for crystal structure values to NaN"
Expand Down Expand Up @@ -972,7 +931,7 @@ def _calc_ionicity(self) -> float:
structure = Structure.from_file(self.path_to_structure)

if self.charge_type.lower() not in ["mulliken", "loewdin"]:
raise ValueError("Please check the requested charge_type. Possible options are `Mulliken` or `Loewdin`")
raise ValueError("Please check the requested charge_type. Possible options are `mulliken` or `loewdin`")

ch_veff = []
tol = 1e-6
Expand Down
86 changes: 86 additions & 0 deletions lobsterpy/featurize/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Copyright (c) lobsterpy development team
# Distributed under the terms of a BSD 3-Clause "New" or "Revised" License

"""This package provides the modules for featurzing Lobster data ready for ML studies."""
from __future__ import annotations

from pathlib import Path

from monty.os.path import zpath


def get_file_paths(
    path_to_lobster_calc: str | Path = "", requested_files: list[str] | None = None, use_lso_dos: bool = True
) -> dict:
    """
    Get file paths for LobsterPy featurizations, raise Exception if not all of requested paths exist.

    For each requested file the plain path inside the calc directory is preferred.
    If it does not exist, the path returned by ``zpath`` for it is tried instead.

    :param path_to_lobster_calc: path to root LOBSTER calc directory
    :param requested_files: files to return paths for. Known keys are "poscar",
        "cohpcar", "icohplist", "cobicar", "icobilist", "coopcar", "icooplist",
        "charge", "madelung" and "doscar". Any other entry is treated as a literal
        file name inside the calc directory. Defaults to no files.
    :param use_lso_dos: solely required for BatchDosFeaturizer.
        Will force featurizer to use DOSCAR.LSO.lobster instead of DOSCAR.lobster.
    :return: dict that assigns each item of requested_files its path
    :raises Exception: if at least one requested file could not be found; the
        message lists the names of all missing files.
    """
    default_values = {
        "poscar": "POSCAR",
        "cohpcar": "COHPCAR.lobster",
        "icohplist": "ICOHPLIST.lobster",
        "cobicar": "COBICAR.lobster",
        "icobilist": "ICOBILIST.lobster",
        "coopcar": "COOPCAR.lobster",
        "icooplist": "ICOOPLIST.lobster",
        "charge": "CHARGE.lobster",
        "madelung": "MadelungEnergies.lobster",
        "doscar": ("DOSCAR.LSO.lobster" if use_lso_dos else "DOSCAR.lobster"),
    }

    lobster_path = Path(path_to_lobster_calc)
    file_paths = {}
    missing_files = []

    # ``None`` replaces the former mutable ``[]`` default; both mean "no files requested".
    for file in requested_files or []:
        if file == "poscar":
            # The structure file has two accepted names, so its lookup is delegated.
            try:
                file_paths[file] = get_structure_path(lobster_path=lobster_path)
            except Exception:
                # Any failure of the structure lookup is reported as a missing POSCAR.
                missing_files.append(default_values["poscar"])
            continue

        # Keys without a default file name are used as the file name itself.
        file_str = default_values.get(file, file)
        file_path = lobster_path / file_str
        if not file_path.exists():
            # Fall back to whatever zpath resolves for this file.
            file_path = Path(zpath(file_path))

        if file_path.exists():
            file_paths[file] = file_path
        else:
            # Report the resolved name; indexing default_values here raised
            # KeyError for keys that are not part of the defaults.
            missing_files.append(file_str)

    if missing_files:
        raise Exception(f"Files {missing_files} not found in {lobster_path.name}.")

    return file_paths


def get_structure_path(lobster_path: str | Path) -> Path:
    """
    Search iteratively for (unzipped / zipped) structure file.

    POSCAR is prioritized over POSCAR.lobster. For each name the plain path is
    checked first, then the path returned by ``zpath`` for it.

    :param lobster_path: path to root LOBSTER calc directory
    :return: path to structure file
    :raises FileNotFoundError: if none of the candidate structure files exists
    """
    # Accept plain strings as well as Path objects.
    lobster_path = Path(lobster_path)

    for filename in ("POSCAR", "POSCAR.lobster"):
        poscar_path = lobster_path / filename
        if poscar_path.exists():
            return poscar_path
        gz_file_path = Path(zpath(poscar_path))
        if gz_file_path.exists():
            return gz_file_path

    # FileNotFoundError is a subclass of Exception, so callers that catch
    # Exception keep working while getting a meaningful message.
    raise FileNotFoundError(f"No structure file (POSCAR or POSCAR.lobster) found in {lobster_path}.")
24 changes: 20 additions & 4 deletions tests/featurize/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,6 +675,22 @@ def test_batch_dos_featurizer_lso(self):

class TestExceptions:
def test_batch_summary_featurizer_exception(self):
with pytest.raises(ValueError) as err0: # noqa: PT012, PT011
self.summary_featurize_with_json_ex = BatchSummaryFeaturizer(
path_to_lobster_calcs=TestDir / "test_data/Featurizer_test_data/Lobster_calcs_exceptions/1/",
bonds="all",
feature_type="nonbonding",
include_cobi_data=True,
include_coop_data=True,
e_range=[-15, 0],
)

_ = self.summary_featurize_with_json_ex.get_df()

assert str(err0.value) == (
"Parameter feature_type set to nonbonding but must be in ['bonding', 'antibonding', 'overall']."
)

with pytest.raises(Exception) as err1: # noqa: PT012, PT011
self.summary_featurize_with_json_ex = BatchSummaryFeaturizer(
path_to_lobster_calcs=TestDir / "test_data/Featurizer_test_data/Lobster_calcs_exceptions/1/",
Expand All @@ -687,7 +703,7 @@ def test_batch_summary_featurizer_exception(self):

_ = self.summary_featurize_with_json_ex.get_df()

assert str(err1.value) == "COBICAR.lobster or ICOBILIST.lobster file not found in mp-2176"
assert str(err1.value) == "Files ['COBICAR.lobster', 'ICOBILIST.lobster'] not found in mp-2176."

with pytest.raises(Exception) as err2: # noqa: PT012, PT011
self.summary_featurize_with_json_ex2 = BatchSummaryFeaturizer(
Expand All @@ -701,7 +717,7 @@ def test_batch_summary_featurizer_exception(self):

_ = self.summary_featurize_with_json_ex2.get_df()

assert str(err2.value) == "COOPCAR.lobster or ICOOPLIST.lobster file not found in mp-1000"
assert str(err2.value) == "Files ['COOPCAR.lobster', 'ICOOPLIST.lobster'] not found in mp-1000."

# COXX exception
with pytest.raises(Exception) as err3: # noqa: PT012, PT011
Expand All @@ -711,7 +727,7 @@ def test_batch_summary_featurizer_exception(self):
path_to_lobster_calc=self.raise_coxx_exception.path_to_lobster_calcs
)

assert str(err3.value) == "COHPCAR.lobster or POSCAR or ICOHPLIST.lobster file not found in JSONS"
assert str(err3.value) == "Files ['POSCAR', 'COHPCAR.lobster', 'ICOHPLIST.lobster'] not found in JSONS."

# Charges exception
with pytest.raises(Exception) as err4: # noqa: PT012, PT011
Expand All @@ -721,7 +737,7 @@ def test_batch_summary_featurizer_exception(self):
path_to_lobster_calc=self.raise_ch_exception.path_to_lobster_calcs
)

assert str(err4.value) == "CHARGE.lobster or POSCAR not found in JSONS"
assert str(err4.value) == "Files ['POSCAR', 'CHARGE.lobster'] not found in JSONS."

# Fingerprint similarity exception
with pytest.raises(Exception) as err8: # noqa: PT012, PT011
Expand Down
7 changes: 3 additions & 4 deletions tests/featurize/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,9 +669,8 @@ def test_lobsterpy_featurize_exception(self):
_ = self.featurize_mp1249_json.get_df()

assert (
str(err.value) == "Path provided for Lobster calc directory seems incorrect."
"It does not contain COHPCAR.lobster, ICOHPLIST.lobster, POSCAR and "
"CHARGE.lobster files needed for automatic analysis using LobsterPy"
str(err.value) == "Files ['POSCAR', 'COHPCAR.lobster', 'ICOHPLIST.lobster', 'CHARGE.lobster'] "
"not found in ..."
)

with pytest.raises(Exception) as err: # noqa: PT012, PT011
Expand Down Expand Up @@ -702,7 +701,7 @@ def test_featurize_charges(self):

_ = self.featurize_cdf_charge.get_df()

assert str(err.value) == "Please check the requested charge_type. Possible options are `Mulliken` or `Loewdin`"
assert str(err.value) == "Please check the requested charge_type. Possible options are `mulliken` or `loewdin`"

def test_featurize_coxx(self):
with pytest.raises(Exception) as err: # noqa: PT012, PT011
Expand Down
58 changes: 58 additions & 0 deletions tests/featurize/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import gzip
import shutil
from pathlib import Path

from pymatgen.core import Structure

from lobsterpy.featurize import get_file_paths, get_structure_path

CurrentDir = Path(__file__).absolute().parent
TestDir = CurrentDir / "../"


def test_get_structure_path(tmp_path):
    """
    Check that get_structure_path returns a Path to a structure file.

    Covers a directory holding only an unzipped POSCAR and the reference
    directory, where the VASP file is expected to be read in case both
    LOBSTER and VASP structure files are present.
    """
    # Unpack the reference POSCAR.gz into the temporary directory as a plain POSCAR.
    with gzip.open(TestDir / "test_data/test_structure_path_handling/POSCAR.gz", "rb") as src:
        with open(tmp_path / "POSCAR", "wb") as dst:
            shutil.copyfileobj(src, dst)

    from_tmp_dir = get_structure_path(lobster_path=tmp_path)
    assert isinstance(from_tmp_dir, Path)

    from_reference_dir = get_structure_path(lobster_path=TestDir / "test_data/test_structure_path_handling")
    assert isinstance(from_reference_dir, Path)

    # The structure that was picked up must not contain Zn.
    symbols = [el.symbol for el in Structure.from_file(from_reference_dir).elements]
    assert "Zn" not in symbols


def test_get_file_paths(tmp_path):
    """
    Check that get_file_paths() returns a dict mapping str keys to Path values.

    Runs once against the reference calc directory and once against a temporary
    directory holding unzipped copies of COHPCAR.lobster and ICOHPLIST.lobster.
    """
    from_reference_dir = get_file_paths(
        path_to_lobster_calc=TestDir / "test_data/BaTaO2N1",
        requested_files=["poscar", "cohpcar", "charge", "icohplist"],
    )
    for name in from_reference_dir:
        assert isinstance(name, str)
        assert isinstance(from_reference_dir[name], Path)

    # Unpack the two zipped reference files into the temporary directory.
    for file in ("COHPCAR.lobster", "ICOHPLIST.lobster"):
        with gzip.open(TestDir / f"test_data/BaTaO2N1/{file}.gz", "rb") as src:
            with open(tmp_path / file, "wb") as dst:
                shutil.copyfileobj(src, dst)

    from_tmp_dir = get_file_paths(path_to_lobster_calc=tmp_path, requested_files=["cohpcar", "icohplist"])
    for name in from_tmp_dir:
        assert isinstance(name, str)
        assert isinstance(from_tmp_dir[name], Path)
Binary file not shown.
Binary file not shown.

0 comments on commit 6916d28

Please sign in to comment.