Skip to content

Commit

Permalink
Improve guess_defect_position and add test
Browse files Browse the repository at this point in the history
  • Loading branch information
kavanase committed Nov 1, 2024
1 parent 016d994 commit 3b01655
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 45 deletions.
112 changes: 68 additions & 44 deletions doped/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from monty.json import MontyDecoder
from monty.serialization import dumpfn, loadfn
from pymatgen.analysis.defects import core
from pymatgen.analysis.defects.finder import cosine_similarity, get_site_vecs
from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
from pymatgen.core.sites import PeriodicSite
from pymatgen.core.structure import Structure
Expand All @@ -26,14 +27,13 @@
from pymatgen.io.vasp.inputs import Poscar
from pymatgen.io.vasp.outputs import Procar, Vasprun
from pymatgen.util.typing import PathLike
from scipy.stats import zscore
from tqdm import tqdm

from doped import _doped_obj_properties_methods, _ignore_pmg_warnings
from doped.core import DefectEntry, guess_and_set_oxi_states_with_timeout
from doped.generation import get_defect_name_from_defect, get_defect_name_from_entry, name_defect_entries
from doped.thermodynamics import DefectThermodynamics
from doped.utils.efficiency import get_voronoi_nodes
from doped.utils.efficiency import _parse_site_species_str, get_voronoi_nodes
from doped.utils.parsing import (
_compare_incar_tags,
_compare_kpoints,
Expand Down Expand Up @@ -347,58 +347,82 @@ def defect_from_structures(

def guess_defect_position(defect_supercell: Structure) -> np.ndarray[float]:
"""
Given an input defect supercell, guess the position of the defect by
identifying outlier sites (in terms of minimum interatomic distances) and
return their mean position.
This is done by calculating the normalised z-scores (``scipy.stats.zscore``)
of the minimum interatomic distances for each site in the defect supercell
(separated by and normalised within each species), and taking sites with >50%
of the maximum z-score (i.e. deviation) as outliers, then returning the mean
of their cartesian coordinates (accounting for PBC).
These coordinates are unlikely to _directly_ match the defect position, but
should provide a good estimate in most cases.
Guess the position (in Cartesian coordinates) of a defect in an input
defect supercell, without a bulk/reference supercell.
This is achieved by computing cosine dissimilarities between site SOAP
vectors (and the mean SOAP vectors for each species) and then determining
the centre of mass of sites, weighted by the squared cosine dissimilarities.
For accurate defect site determination, the ``defect_from_structure``
function (or underlying code) is preferred. These coordinates are unlikely
to _directly_ match the defect position (especially in the presence of random
noise), but should provide a pretty good estimate in most cases. If the
defect is an extrinsic interstitial/substitution, then this will identify
the exact defect site.
Args:
defect_supercell (Structure):
Defect supercell structure.
Returns:
np.ndarray[float]: Guessed mean position of the defect.
np.ndarray[float]: Guessed position of the defect in **Cartesian** coordinates.
"""
element_idx_dict = { # distance matrix broken down by species
element: [i for i, site in enumerate(defect_supercell) if site.specie.symbol == str(element)]
for element in defect_supercell.composition.elements

# Note from profiling: This function is pretty fast (e.g. ~25 s for ~1000 frames of a ~100-atom
# supercell on SK's 2021 MacBook Pro), but the main bottleneck is SOAP vector creation,
# if we ever needed to accelerate
def cos_dissimilarity(vec1, vec2):
return 1 - cosine_similarity(vec1, vec2)

# if there is only one site of a particular element in the defect supercell, then we guess it as the
# defect site (extrinsic substitution/interstitial):
i_elt_dict = {
i: _parse_site_species_str(site, wout_charge=True) for i, site in enumerate(defect_supercell.sites)
}
for elt in defect_supercell.composition.elements:
if list(i_elt_dict.values()).count(elt.symbol) == 1:
return defect_supercell.sites[list(i_elt_dict.values()).index(elt.symbol)].coords

soap_vecs = [site_vec.vec for site_vec in get_site_vecs(defect_supercell)]
elt_mean_soap_vec_dict = {
elt.symbol: np.mean(
[soap_vec for i, soap_vec in enumerate(soap_vecs) if i_elt_dict[i] == elt.symbol],
axis=0,
)
for elt in defect_supercell.composition.elements
}
zscores = np.zeros(len(defect_supercell))

distance_matrix = defect_supercell.distance_matrix
np.fill_diagonal(distance_matrix, np.inf) # set diagonal to np.inf to ignore self-distances of 0

for site_indices in element_idx_dict.values():
element_dist_matrix = distance_matrix[:, site_indices] # (N_of_that_element, N_sites) matrix
min_interatomic_distances_per_atom = np.min(element_dist_matrix, axis=0) # min along columns
zscores[site_indices] = zscore(min_interatomic_distances_per_atom)

abs_zscores = np.abs(zscores)
max_z = np.max(abs_zscores)
outliers = np.where(abs_zscores > 0.5 * max_z) # all sites above 50% of max z (normalised deviation)

# get centre of mass of outliers:
# for this, we need to account for PBC, so take the first outlier, then for the rest, take the image
# which is closest to the first outlier, then use the Cartesian coordinates of this group to get CoM:
first_outlier = defect_supercell.sites[outliers[0][0]]
clustered_outlier_coords = [first_outlier.coords]
for outlier_idx in outliers[0][1:]:
image = first_outlier.distance_and_image(defect_supercell.sites[outlier_idx])[1]
clustered_outlier_coords.append(
cos_dissimilarities = [
cos_dissimilarity(soap_vecs[i], elt_mean_soap_vec_dict[i_elt]) for i, i_elt in i_elt_dict.items()
]

rel_cos_dissimilarities = np.zeros(len(defect_supercell))
for elt in elt_mean_soap_vec_dict:
indices = [i for i, i_elt in i_elt_dict.items() if i_elt == elt]
avg_cos_dissimilarity = np.mean([cos_dissimilarities[i] for i in indices])
rel_cos_dissimilarities[indices] = np.array(cos_dissimilarities)[indices] / avg_cos_dissimilarity

largest_outlier = defect_supercell.sites[
np.where(rel_cos_dissimilarities == np.max(rel_cos_dissimilarities))[0][0]
]
cos_diss_frac_coords_dict = {np.max(rel_cos_dissimilarities): largest_outlier.frac_coords}
for i, site in enumerate(defect_supercell.sites):
if not np.all(site.frac_coords == largest_outlier.frac_coords):
image = largest_outlier.distance_and_image(site)[1]
cos_diss_frac_coords_dict[rel_cos_dissimilarities[i]] = site.frac_coords + image

cos_diss_coords_dict = dict(
zip(
cos_diss_frac_coords_dict.keys(),
defect_supercell.lattice.get_cartesian_coords(
defect_supercell.sites[outlier_idx].frac_coords + image
)
np.array(list(cos_diss_frac_coords_dict.values()))
),
)

return np.mean(clustered_outlier_coords, axis=0)
)
return np.average( # weighted centre of mass
np.array(list(cos_diss_coords_dict.values())),
axis=0,
weights=np.array(list(cos_diss_coords_dict.keys())) ** 2,
)


def defect_name_from_structures(bulk_structure: Structure, defect_structure: Structure):
Expand Down
2 changes: 1 addition & 1 deletion doped/utils/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ def get_coords_and_idx_of_species(structure, species_name, frac_coords=True):
coords = []
idx = []
for i, site in enumerate(structure):
if _parse_site_species_str(site) == species_name:
if _parse_site_species_str(site, wout_charge=True) == species_name:
coords.append(site.frac_coords if frac_coords else site.coords)
idx.append(i)

Expand Down
15 changes: 15 additions & 0 deletions tests/test_thermodynamics.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from pymatgen.core.composition import Composition
from pymatgen.electronic_structure.dos import FermiDos

from doped.analysis import guess_defect_position
from doped.generation import _sort_defect_entries
from doped.thermodynamics import DefectThermodynamics, get_fermi_dos, scissor_dos
from doped.utils.parsing import _get_defect_supercell_bulk_site_coords, get_vasprun
Expand Down Expand Up @@ -412,6 +413,20 @@ def _check_defect_thermo(
for warning in w
)

# test defect position guessing:
print("Checking defect position guessing")
guessed_def_pos_deviations = [
entry.defect_supercell_site.distance_and_image_from_frac_coords(
entry.bulk_supercell.lattice.get_fractional_coords(
guess_defect_position(entry.defect_supercell)
)
)[0]
for entry in defect_thermo.defect_entries.values()
]
print(np.mean(guessed_def_pos_deviations))
first_entry = next(iter(defect_thermo.defect_entries.values()))
assert np.mean(guessed_def_pos_deviations) < np.max(first_entry.bulk_supercell.lattice.abc) * 0.2

def _check_CdTe_example_dist_tol(self, defect_thermo, num_grouped_defects):
print(f"Testing CdTe updated dist_tol: {defect_thermo.dist_tol}")
tl_df = defect_thermo.get_transition_levels()
Expand Down

0 comments on commit 3b01655

Please sign in to comment.