Skip to content

Commit

Permalink
Make defect naming handling more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
kavanase committed Nov 27, 2024
1 parent 4dbb970 commit f12115f
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 19 deletions.
17 changes: 9 additions & 8 deletions doped/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1239,17 +1239,18 @@ def __repr__(self):
from doped.utils.parsing import _get_bulk_supercell

bulk_supercell = _get_bulk_supercell(self)
if bulk_supercell is not None:
formula = bulk_supercell.composition.get_reduced_formula_and_factor(iupac_ordering=True)[0]
else:
try:
defect_name = self.defect.name
try:
defect_name = self.defect.name
if bulk_supercell is not None:
formula = bulk_supercell.composition.get_reduced_formula_and_factor(iupac_ordering=True)[0]
else:
formula = self.defect.structure.composition.get_reduced_formula_and_factor(
iupac_ordering=True
)[0]
except AttributeError:
defect_name = "unknown"
formula = "unknown"
except AttributeError:
defect_name = "unknown"
formula = "unknown"

properties, methods = _doped_obj_properties_methods(self)
return (
f"doped DefectEntry: {self.name}, with bulk composition: {formula} and defect: {defect_name}. "
Expand Down
46 changes: 35 additions & 11 deletions doped/generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,28 @@ def _get_neutral_defect_entry(
return neutral_defect_entry


def _check_if_name_subset(long_name: str, poss_subset_name: str):
"""
Check if the longer name is a superset of the shorter name, where
underscores are used as a name delimiter as in ``doped``.
Previously used ``startswith`` to compare, but caused issues
with e.g. ``v_Cl`` and ``v_C`` defects in the same material.
"""
subset_num_underscores = poss_subset_name.count("_")
for i in range(subset_num_underscores + 1):
long_name_part = long_name.split("_")[i]
if "." in long_name_part and any(char.isdigit() for char in long_name_part):
# closest site info, use startswith for this as we no longer use delimiters
if not long_name_part.startswith(poss_subset_name.split("_")[i]):
return False

elif long_name_part != poss_subset_name.split("_")[i]:
return False

return True


def name_defect_entries(
defect_entries: list[Union[DefectEntry, Defect]],
element_list: Optional[list[str]] = None,
Expand All @@ -499,7 +521,7 @@ def name_defect_entries(
defect_entries (list): List of ``DefectEntry`` or ``Defect`` objects to name.
element_list (list):
Sorted list of elements in the host structure, so that
closest_site_info returns deterministic results (in case two
``closest_site_info`` returns deterministic results (in case two
different elements located at the same distance from defect site).
Default is None.
symm_ops (list):
Expand All @@ -516,7 +538,7 @@ def get_shorter_name(full_defect_name, split_number):
return full_defect_name.rsplit("_", split_number)[0]

def get_matching_names(defect_naming_dict, defect_name):
return [name for name in defect_naming_dict if name.startswith(defect_name)]
return [name for name in defect_naming_dict if _check_if_name_subset(name, defect_name)]

def handle_unique_match(defect_naming_dict, matching_names, split_number):
if len(matching_names) == 1:
Expand Down Expand Up @@ -555,7 +577,7 @@ def handle_multiple_matches(defect_naming_dict, full_defect_name, defect_entry,
except IndexError:
return handle_repeated_name(defect_naming_dict, full_defect_name)

if not any(name.startswith(full_defect_name) for name in defect_naming_dict):
if not any(_check_if_name_subset(name, full_defect_name) for name in defect_naming_dict):
return defect_naming_dict, full_defect_name

if n == 3: # if still not unique after 3rd nearest neighbour, just use alphabetical indexing
Expand All @@ -570,7 +592,9 @@ def handle_repeated_name(defect_naming_dict, full_defect_name):
defect_naming_dict[f"{name}a"] = prev_defect_entry
defect_name = f"{full_defect_name}b"
break
if full_defect_name == name[:-1]:
if full_defect_name == name[:-1] and not Element("H").is_valid_symbol(name.split("_")[-1]):
# if name is a subset barring the last letter, and last underscore split is not an Element
# (i.e. ``v_Cl`` not being matched with ``v_C``)
last_letters = [name[-1] for name in defect_naming_dict if name[:-1] == full_defect_name]
last_letters.sort()
new_letter = chr(ord(last_letters[-1]) + 1)
Expand All @@ -595,14 +619,14 @@ def handle_repeated_name(defect_naming_dict, full_defect_name):
full_defect_name = get_defect_name_from_defect(defect, element_list, symm_ops)
split_number = 1 if defect.defect_type == core.DefectType.Interstitial else 2
shorter_defect_name = get_shorter_name(full_defect_name, split_number)
if not any(name.startswith(shorter_defect_name) for name in defect_naming_dict):
if not any(_check_if_name_subset(name, shorter_defect_name) for name in defect_naming_dict):
defect_naming_dict[shorter_defect_name] = defect_entry
continue

matching_shorter_names = get_matching_names(defect_naming_dict, shorter_defect_name)
defect_naming_dict = handle_unique_match(defect_naming_dict, matching_shorter_names, split_number)
shorter_defect_name = get_shorter_name(full_defect_name, split_number - 1)
if not any(name.startswith(shorter_defect_name) for name in defect_naming_dict):
if not any(_check_if_name_subset(name, shorter_defect_name) for name in defect_naming_dict):
defect_naming_dict[shorter_defect_name] = defect_entry
continue

Expand All @@ -611,7 +635,7 @@ def handle_repeated_name(defect_naming_dict, full_defect_name):
defect_naming_dict, matching_shorter_names, split_number - 1
)
shorter_defect_name = get_shorter_name(full_defect_name, split_number - 2)
if not any(name.startswith(shorter_defect_name) for name in defect_naming_dict):
if not any(_check_if_name_subset(name, shorter_defect_name) for name in defect_naming_dict):
defect_naming_dict[shorter_defect_name] = defect_entry
continue

Expand Down Expand Up @@ -1785,14 +1809,14 @@ def _defect_generator_info(self):
charges = [
name.rsplit("_", 1)[1]
for name in self.defect_entries
if name.startswith(f"{defect_name}_")
if _check_if_name_subset(name, defect_name)
] # so e.g. Te_i_m1 doesn't match with Te_i_m1b
# convert list of strings to one string with comma-separated charges
charges = "[" + ",".join(charges) + "]"
defect_entry = next(
entry
for name, entry in self.defect_entries.items()
if name.startswith(defect_name)
if _check_if_name_subset(name, defect_name)
)
frac_coords_string = (
"N/A"
Expand Down Expand Up @@ -1837,7 +1861,7 @@ def _process_name_and_charge_states_and_get_matching_entries(
charge_states = [round(charge_states)]

matching_entry_names_wout_charge = [
name for name in self.defect_entries if name.startswith(defect_entry_name)
name for name in self.defect_entries if _check_if_name_subset(name, defect_entry_name)
]
if not match_charge_states:
return charge_states, matching_entry_names_wout_charge
Expand Down Expand Up @@ -1877,7 +1901,7 @@ def add_charge_states(self, defect_entry_name: str, charge_states: Union[list, i
previous_defect_entry = next(
entry
for name, entry in self.defect_entries.items()
if name.startswith(defect_entry_name_wout_charge)
if _check_if_name_subset(name, defect_entry_name_wout_charge)
)
for charge in charge_states:
defect_entry = copy.deepcopy(previous_defect_entry)
Expand Down

0 comments on commit f12115f

Please sign in to comment.