Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More robust name validation #703

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning][].

## [0.x.x] - 2024-xx-xx

### Changed

- Validation of element and column names allowing `_-.` and alphanumeric characters #624

### Minor

- Added `clip: bool = False` parameter to `polygon_query()` #670
Expand Down
15 changes: 15 additions & 0 deletions docs/design_doc.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,21 @@ _SpatialData_ follows the OME-NGFF specifications whenever possible and therefor
- Any `Element` MAY be annotated by `Tables`; also `Shapes` and `Points` MAY contain annotations within themselves as additional dataframe columns (e.g. intensity of point spread function of a each point, or gene id).
- `Tables` CAN NOT be annotated by other `Tables`.

#### Naming

Names of SpatialData elements must fulfill certain restrictions to ensure robust storage and compatibility:

- MUST NOT be the empty string ``.
- MUST only contain alphanumeric characters or hyphens `-`, dots `.`, underscores `_`. Alphanumeric includes letters from different alphabets and number-like symbols, but excludes whitespace, slashes and other symbols.
- MUST NOT be the full strings `.` or `..`, which would have special meanings as file paths.
- MUST NOT start with double underscores `__`.
- MUST NOT only differ in character case, to avoid name collisions on case-insensitive systems.

In tables, the above restrictions apply to the column names of `obs` and `var`, and to the key names of the `obsm`,
`obsp`, `varm`, `varp`, `uns`, `layers` slots (for example `adata.obs['has space']` and `adata.layers['.']` are not
allowed).
Additionally, dataframes in tables MUST NOT have a column named `_index`, which is reserved.

#### Images

Images of a sample. Should conform to the [OME-NGFF concept of an image](https://ngff.openmicroscopy.org/latest/#image-layout).
Expand Down
27 changes: 14 additions & 13 deletions src/spatialdata/_core/_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from dask.dataframe import DataFrame as DaskDataFrame
from geopandas import GeoDataFrame

from spatialdata._core.validation import check_key_is_case_insensitively_unique, check_valid_name
from spatialdata._types import Raster_T
from spatialdata.models import (
Image2DModel,
Expand All @@ -30,30 +31,30 @@ def __init__(self, shared_keys: set[str | None]) -> None:
self._shared_keys = shared_keys
super().__init__()

@staticmethod
def _check_valid_name(name: str) -> None:
if not isinstance(name, str):
raise TypeError(f"Name must be a string, not {type(name).__name__}.")
if len(name) == 0:
raise ValueError("Name cannot be an empty string.")
if not all(c.isalnum() or c in "_-" for c in name):
raise ValueError("Name must contain only alphanumeric characters, underscores, and hyphens.")
def _add_shared_key(self, key: str) -> None:
self._shared_keys.add(key)

def _remove_shared_key(self, key: str) -> None:
self._shared_keys.remove(key)

@staticmethod
def _check_key(key: str, element_keys: Iterable[str], shared_keys: set[str | None]) -> None:
Elements._check_valid_name(key)
check_valid_name(key)
if key in element_keys:
warn(f"Key `{key}` already exists. Overwriting it in-memory.", UserWarning, stacklevel=2)
else:
if key in shared_keys:
raise KeyError(f"Key `{key}` already exists.")
try:
check_key_is_case_insensitively_unique(key, shared_keys)
except ValueError as e:
# Validation raises ValueError, but inappropriate mapping key must raise KeyError.
raise KeyError(*e.args) from e

def __setitem__(self, key: str, value: Any) -> None:
self._shared_keys.add(key)
self._add_shared_key(key)
super().__setitem__(key, value)

def __delitem__(self, key: str) -> None:
self._shared_keys.remove(key)
self._remove_shared_key(key)
super().__delitem__(key)


Expand Down
37 changes: 20 additions & 17 deletions src/spatialdata/_core/spatialdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
from xarray import DataArray

from spatialdata._core._elements import Images, Labels, Points, Shapes, Tables
from spatialdata._core.validation import (
check_all_keys_case_insensitively_unique,
check_target_region_column_symmetry,
check_valid_name,
validate_table_attr_keys,
)
from spatialdata._logging import logger
from spatialdata._types import ArrayLike, Raster_T
from spatialdata._utils import _deprecation_alias, _error_message_add_element
Expand All @@ -33,7 +39,6 @@
PointsModel,
ShapesModel,
TableModel,
check_target_region_column_symmetry,
get_model,
get_table_keys,
)
Expand Down Expand Up @@ -1115,6 +1120,12 @@ def _validate_can_safely_write_to_path(
" the current Zarr store." + WORKAROUND
)

def _validate_all_elements(self) -> None:
for element_type, element_name, element in self.gen_elements():
check_valid_name(element_name)
if element_type == "tables":
validate_table_attr_keys(element)

def write(
self,
file_path: str | Path,
Expand Down Expand Up @@ -1150,6 +1161,7 @@ def write(
if isinstance(file_path, str):
file_path = Path(file_path)
self._validate_can_safely_write_to_path(file_path, overwrite=overwrite)
self._validate_all_elements()

store = parse_url(file_path, mode="w").store
_ = zarr.group(store=store, overwrite=overwrite)
Expand Down Expand Up @@ -1244,9 +1256,7 @@ def write_element(
self.write_element(name, overwrite=overwrite)
return

from spatialdata._core._elements import Elements

Elements._check_valid_name(element_name)
check_valid_name(element_name)
self._validate_element_names_are_unique()
element = self.get(element_name)
if element is None:
Expand All @@ -1265,6 +1275,8 @@ def write_element(
break
if element_type is None:
raise ValueError(f"Element with name {element_name} not found in SpatialData object.")
if element_type == "tables":
validate_table_attr_keys(element)

self._check_element_not_on_disk_with_different_type(element_type=element_type, element_name=element_name)

Expand Down Expand Up @@ -1316,10 +1328,9 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None:
self.delete_element_from_disk(name)
return

from spatialdata._core._elements import Elements
from spatialdata._io._utils import _backed_elements_contained_in_path

Elements._check_valid_name(element_name)
check_valid_name(element_name)

if self.path is None:
raise ValueError("The SpatialData object is not backed by a Zarr store.")
Expand Down Expand Up @@ -1451,10 +1462,8 @@ def write_transformations(self, element_name: str | None = None) -> None:
element_name
The name of the element to write. If None, write the transformations of all elements.
"""
from spatialdata._core._elements import Elements

if element_name is not None:
Elements._check_valid_name(element_name)
check_valid_name(element_name)

# recursively write the transformation for all the SpatialElement
if element_name is None:
Expand Down Expand Up @@ -1541,10 +1550,8 @@ def write_metadata(self, element_name: str | None = None, consolidate_metadata:
-----
When using the methods `write()` and `write_element()`, the metadata is written automatically.
"""
from spatialdata._core._elements import Elements

if element_name is not None:
Elements._check_valid_name(element_name)
check_valid_name(element_name)

self.write_transformations(element_name)
# TODO: write .uns['spatialdata_attrs'] metadata for AnnData.
Expand Down Expand Up @@ -1994,11 +2001,7 @@ def _validate_element_names_are_unique(self) -> None:
ValueError
If the element names are not unique.
"""
element_names = set()
for _, element_name, _ in self.gen_elements():
if element_name in element_names:
raise ValueError(f"Element name {element_name!r} is not unique.")
element_names.add(element_name)
check_all_keys_case_insensitively_unique([name for _, name, _ in self.gen_elements()])

def _find_element(self, element_name: str) -> tuple[str, str, SpatialElement | AnnData]:
"""
Expand Down
Loading
Loading