Skip to content

Commit

Permalink
feat: support for visium descendants in obs['assay_ontology_term_id'] (
Browse files Browse the repository at this point in the history
…#1148)

Co-authored-by: Evan Molinelli <[email protected]>
  • Loading branch information
ejmolinelli and Evan Molinelli authored Dec 6, 2024
1 parent b517b86 commit 9f84b18
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 12 deletions.
13 changes: 9 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def __init__(self, ignore_labels=False):
# keys will be one of gencode.SupportedOrganisms
self.gene_checkers = dict()

def reset(self):
def reset(self, hi_res_size: Optional[int] = None, true_mat_size: Optional[int] = None):
self.errors = []
self.warnings = []
self.is_valid = False
Expand All @@ -76,6 +76,8 @@ def reset(self):
self.is_spatial = None
self.is_visium = None
self.is_visium_and_is_single_true = None
self._hires_max_dimension_size = hi_res_size
self._visium_and_is_single_true_matrix_size = true_mat_size

# Matrix (e.g., X, raw.X, ...) number non-zero cache
self.number_non_zero = dict()
Expand All @@ -99,6 +101,7 @@ def visium_and_is_single_true_matrix_size(self) -> Optional[int]:
if bool(
self.adata.obs["assay_ontology_term_id"]
.apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True))
.astype(bool)
.any()
):
self._visium_error_suffix = f"{ERROR_SUFFIX_VISIUM_11M} and {ERROR_SUFFIX_IS_SINGLE}"
Expand All @@ -118,6 +121,7 @@ def hires_max_dimension_size(self) -> Optional[int]:
if bool(
self.adata.obs["assay_ontology_term_id"]
.apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True))
.astype(bool)
.any()
):
self._visium_error_suffix = ERROR_SUFFIX_VISIUM_11M
Expand Down Expand Up @@ -172,7 +176,7 @@ def _is_supported_spatial_assay(self) -> bool:
try:
_spatial = (
self._is_visium_including_descendants()
or self.adata.obs.assay_ontology_term_id.isin([ASSAY_SLIDE_SEQV2]).any()
or self.adata.obs.assay_ontology_term_id.isin([ASSAY_SLIDE_SEQV2]).astype(bool).any()
)
self.is_spatial = bool(_spatial)
except AttributeError:
Expand Down Expand Up @@ -1981,6 +1985,7 @@ def _is_visium_including_descendants(self) -> bool:
self.adata.obs[_assay_key]
.astype("string")
.apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True))
.astype(bool)
.any()
)

Expand Down Expand Up @@ -2099,15 +2104,15 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None) -> bo
:rtype bool
"""
logger.info("Starting validation...")
# Re-start errors in case a new h5ad is being validated
self.reset()

if h5ad_path:
logger.debug("Reading the h5ad file...")
self.adata = read_h5ad(h5ad_path)
self.h5ad_path = h5ad_path
self._validate_encoding_version()
logger.debug("Successfully read the h5ad file")
# Reset errors, warnings and cached state in case a new h5ad is being validated
self.reset()

# Fetches schema def for latest major schema version
self._set_schema_def()
Expand Down
54 changes: 46 additions & 8 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from cellxgene_schema.validate import (
ASSAY_VISIUM_11M,
ERROR_SUFFIX_IS_SINGLE,
ERROR_SUFFIX_SPATIAL,
ERROR_SUFFIX_VISIUM,
ERROR_SUFFIX_VISIUM_11M,
ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE,
Expand Down Expand Up @@ -85,8 +86,8 @@ def validator_with_spatial_and_is_single_false(validator) -> Validator:
@pytest.fixture
def validator_with_visium_assay(validator) -> Validator:
validator.adata = examples.adata_visium.copy()
validator._visium_and_is_single_true_matrix_size = 2
validator._hires_max_dimension_size = None
validator.reset(None, None)

return validator


Expand Down Expand Up @@ -207,6 +208,7 @@ def test_raw_values__invalid_spatial(self, validator_with_visium_assay, invalid_

validator = validator_with_visium_assay
validator.adata.raw.X[0, 1] = invalid_value
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == [
"ERROR: All non-zero values in raw matrix must be positive integers of type numpy.float32.",
Expand Down Expand Up @@ -247,7 +249,8 @@ def test_raw_values__contains_zero_row_in_tissue_1(self, validator_with_visium_a
Raw Matrix contains a row with all zeros and in_tissue is 1, but no values are in_tissue 0.
"""

validator = validator_with_visium_assay
validator: Validator = validator_with_visium_assay
validator.reset(None, 2)
validator.adata.obs["in_tissue"] = 1
validator.adata.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32)
validator.adata.raw.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32)
Expand All @@ -265,6 +268,7 @@ def test_raw_values__contains_zero_row_in_tissue_1_mixed_in_tissue_values(self,
validator: Validator = validator_with_visium_assay
validator.adata.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32)
validator.adata.raw.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32)
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == [
"ERROR: Each observation with obs['in_tissue'] == 1 must have at least one "
Expand All @@ -286,6 +290,7 @@ def test_raw_values__contains_all_zero_rows_in_tissue_0(self, validator_with_vis
)
validator.adata.raw = validator.adata.copy()
validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True)
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == [
"ERROR: If obs['in_tissue'] contains at least one value 0, then there must be at least "
Expand All @@ -304,6 +309,7 @@ def test_raw_values__contains_some_zero_rows_in_tissue_0(self, validator_with_vi
validator.adata.obs["cell_type_ontology_term_id"] = "unknown"
validator.adata.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32)
validator.adata.raw.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32)
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == []

Expand All @@ -328,8 +334,6 @@ def test_raw_values__invalid_visium_and_is_single_true_row_length(
validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id

# hires image size must be present in order to validate the raw.
validator._visium_and_is_single_true_matrix_size = None
validator._hires_max_dimension_size = image_size
validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = numpy.zeros(
(1, image_size, 3), dtype=numpy.uint8
)
Expand Down Expand Up @@ -640,15 +644,40 @@ def test_assay_ontology_term_id__as_categorical(self, validator_with_visium_assa
validator: Validator = validator_with_visium_assay

# check encoding as string
validator._check_spatial_obs()
validator.reset(None, 2)
validator._check_spatial()
validator._validate_raw()
assert validator.errors == []
validator.reset()

# force encoding as 'categorical'
validator.reset(None, 2)
validator.adata.obs["assay_ontology_term_id"] = validator.adata.obs["assay_ontology_term_id"].astype("category")
validator._check_spatial_obs()
validator._check_spatial()
validator._validate_raw()
assert validator.errors == []

@pytest.mark.parametrize(
    "assay_ontology_term_id, all_same",
    [("EFO:0010961", True), ("EFO:0030062", True), ("EFO:0022860", True), ("EFO:0008995", False)],
)
def test_assay_ontology_term_id__all_same(self, validator_with_visium_assay, assay_ontology_term_id, all_same):
    """
    Spatial assays (descendants of Visium Spatial Gene Expression, or Slide-SeqV2) require all values in the
    column to be identical.

    The column is filled with the parametrized assay and then one row is overwritten with a different term,
    so the column always holds mixed values. ``all_same`` states whether the "must contain the same value"
    error is expected for that assay.
    """
    validator: Validator = validator_with_visium_assay

    # Mix values: every row gets the parametrized assay, then the first row gets a different one.
    # Use a single .loc assignment rather than chained `obs[col].iloc[0] = ...`, which may write to a
    # temporary copy and leave the column unchanged.
    validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id
    validator.adata.obs.loc[validator.adata.obs.index[0], "assay_ontology_term_id"] = "EFO:0010183"

    # Check whether the mixed column triggers the uniformity error.
    validator._check_spatial_obs()
    expected_error = f"When {ERROR_SUFFIX_SPATIAL}, all observations must contain the same value."
    if all_same:
        assert expected_error in validator.errors
    else:
        # The original assertion compared the error list against itself and could never fail.
        assert expected_error not in validator.errors

def test_cell_type_ontology_term_id_invalid_term(self, validator_with_adata):
validator = validator_with_adata
validator.adata.obs.loc[validator.adata.obs.index[0], "cell_type_ontology_term_id"] = "EFO:0000001"
Expand Down Expand Up @@ -1698,6 +1727,7 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata):

# Second row should have identical donor id + genetic ancestry values, so this should pass validation
validator.adata.obs.iloc[1] = validator.adata.obs.iloc[0].values

validator.validate_adata()
assert validator.errors == []

Expand All @@ -1708,11 +1738,13 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata):
validator.adata.obs["genetic_ancestry_Indigenous_American"] = [0.0, 0.0]
validator.adata.obs["genetic_ancestry_Oceanian"] = [0.0, 0.0]
validator.adata.obs["genetic_ancestry_South_Asian"] = [0.0, 0.0]
validator.reset(None, 2)
validator.validate_adata()
assert len(validator.errors) > 0

# Change the donor id back to two different donor id's. Now, this should pass validation
validator.adata.obs["donor_id"] = original_donor_id_column
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == []

Expand Down Expand Up @@ -1795,6 +1827,7 @@ def test_feature_is_filtered(self, validator_with_adata):
X[i, 0] = 0
X[0, 0] = 1

validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == [
"ERROR: Some features are 'True' in 'feature_is_filtered' of dataframe 'var', "
Expand All @@ -1804,6 +1837,7 @@ def test_feature_is_filtered(self, validator_with_adata):

# Test that feature_is_filtered is a bool and not a string
var["feature_is_filtered"] = "string"
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == [
"ERROR: Column 'feature_is_filtered' in dataframe 'var' must be boolean, not 'object'."
Expand Down Expand Up @@ -2383,6 +2417,7 @@ def test_obsm_values_nan(self, validator_with_visium_assay, key):

# Check embedding has any NaN
obsm[key][0:100, 1] = numpy.nan
validator.reset(None, 2)
validator.validate_adata()

if key != "spatial":
Expand All @@ -2393,6 +2428,7 @@ def test_obsm_values_nan(self, validator_with_visium_assay, key):
# Check embedding has all NaNs
all_nan = numpy.full(obsm[key].shape, numpy.nan)
obsm[key] = all_nan
validator.reset(None, 2)
validator.validate_adata()
if key != "spatial":
assert validator.errors == [f"ERROR: adata.obsm['{key}'] contains all NaN values."]
Expand All @@ -2419,6 +2455,7 @@ def test_obsm_values_no_X_embedding__visium_dataset(self, validator_with_visium_
validator = validator_with_visium_assay
validator.adata.uns["default_embedding"] = "spatial"
del validator.adata.obsm["X_umap"]
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == []
assert validator.is_spatial is True
Expand Down Expand Up @@ -2522,6 +2559,7 @@ def test_obsm_key_name_whitespace(self, validator_with_adata):

del obsm["X_ umap"]
obsm["u m a p"] = obsm["X_umap"]
validator.reset(None, 2)
validator.validate_adata()
assert validator.errors == [
"ERROR: Embedding key in 'adata.obsm' u m a p does not match the regex pattern ^[a-zA-Z][a-zA-Z0-9_.-]*$."
Expand Down

0 comments on commit 9f84b18

Please sign in to comment.