From 3ba2646c1ff0c6261c49ef64af2d854662139000 Mon Sep 17 00:00:00 2001 From: rly Date: Sat, 10 Aug 2024 15:35:08 -0700 Subject: [PATCH] Check DynamicTableRegion data is in bounds --- CHANGELOG.md | 4 ++- src/hdmf/common/table.py | 48 ++++++++++++++++++++------ src/hdmf/container.py | 13 +++++++ tests/unit/common/test_linkedtables.py | 2 +- tests/unit/common/test_table.py | 28 +++++++++++++++ 5 files changed, 83 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a6369094..fd7d765d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,9 @@ ### Enhancements - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157) - +- Added a check when setting or adding data to a `DynamicTableRegion` or setting the `table` attribute of a `DynamicTableRegion` + that the data values are in bounds of the linked table. This can be turned off for + `DynamicTableRegion.__init__` using the keyword argument `validate_data=False`. @rly [#1168](https://github.com/hdmf-dev/hdmf/pull/1168) ## HDMF 3.14.3 (July 29, 2024) ### Enhancements diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 2e90b0cdf..91aee84a1 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -1360,17 +1360,37 @@ class DynamicTableRegion(VectorData): 'table', ) + MAX_ROWS_TO_VALIDATE_INIT = int(1e3) + @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, {'name': 'data', 'type': ('array_data', 'data'), 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'}, {'name': 'description', 'type': str, 'doc': 'a description of what this region represents'}, {'name': 'table', 'type': DynamicTable, 'doc': 'the DynamicTable this region applies to', 'default': None}, + {'name': 'validate_data', 'type': bool, + 'doc': 'whether to validate the data is in bounds of the linked table', 'default': True}, allow_positional=AllowPositional.WARNING) def __init__(self, **kwargs): - t = popargs('table', kwargs) + t, validate_data = popargs('table', 'validate_data', kwargs) + data = getargs('data', kwargs) + if validate_data: + self._validate_index_in_range(data, t) + super().__init__(**kwargs) - self.table = t + if t is not None: # set the table attribute using fields to avoid another validation in the setter + self.fields['table'] = t + + def _validate_index_in_range(self, data, table): + """If the length of data is small, and if data contains an index that is out of bounds, then raise an error. + If the object is being constructed from a file, raise a warning instead to ensure invalid data can still be + read. + """ + if table and len(data) <= self.MAX_ROWS_TO_VALIDATE_INIT: + for val in data: + if val >= len(table) or val < 0: + error_msg = f"DynamicTableRegion index {val} is out of bounds for {type(table)} '{table.name}'." + self._error_on_new_warn_on_construct(error_msg, error_cls=IndexError) @property def table(self): @@ -1378,24 +1398,26 @@ def table(self): return self.fields.get('table') @table.setter - def table(self, val): + def table(self, table): """ - Set the table this DynamicTableRegion should be pointing to + Set the table this DynamicTableRegion should be pointing to. - :param val: The DynamicTable this DynamicTableRegion should be pointing to + This will validate all data elements in this DynamicTableRegion to ensure they are within bounds. + + :param table: The DynamicTable this DynamicTableRegion should be pointing to :raises: AttributeError if table is already in fields :raises: IndexError if the current indices are out of bounds for the new table given by val """ - if val is None: + if table is None: return if 'table' in self.fields: msg = "can't set attribute 'table' -- already set" raise AttributeError(msg) - dat = self.data - if isinstance(dat, DataIO): - dat = dat.data - self.fields['table'] = val + + self.fields['table'] = table + for val in self.data: + self._validate_new_data_element(val) def __getitem__(self, arg): return self.get(arg) @@ -1540,6 +1562,12 @@ def _validate_on_set_parent(self): warn(msg, stacklevel=2) return super()._validate_on_set_parent() + def _validate_new_data_element(self, arg): + """Validate that the new index is within bounds of the table. Raises an IndexError if not.""" + if self.table and (arg >= len(self.table) or arg < 0): + raise IndexError(f"DynamicTableRegion index {arg} is out of bounds for " + f"{type(self.table)} '{self.table.name}'.") + def _uint_precision(elements): """ Calculate the uint precision needed to encode a set of elements """ diff --git a/src/hdmf/container.py b/src/hdmf/container.py index 287809406..a2cf2214e 100644 --- a/src/hdmf/container.py +++ b/src/hdmf/container.py @@ -567,6 +567,19 @@ def _validate_on_set_parent(self): """ pass + def _error_on_new_warn_on_construct(self, error_msg: str, error_cls: type = ValueError): + """Raise a ValueError when a check is violated on instance creation. + To ensure backwards compatibility, this method throws a warning + instead of raising an error when reading from a file, ensuring that + files with invalid data can be read. If error_msg is set to None + the function will simply return without further action. + """ + if error_msg is None: + return + if not self._in_construct_mode: + raise error_cls(error_msg) + warn(error_msg) + class Container(AbstractContainer): """A container that can contain other containers and has special functionality for printing.""" diff --git a/tests/unit/common/test_linkedtables.py b/tests/unit/common/test_linkedtables.py index 3c1c63170..0244ad41e 100644 --- a/tests/unit/common/test_linkedtables.py +++ b/tests/unit/common/test_linkedtables.py @@ -445,7 +445,7 @@ def test_to_hierarchical_dataframe_indexed_dtr_on_last_level(self): p1 = DynamicTable(name='parent_table', description='parent_table', columns=[VectorData(name='p1', description='p1', data=np.arange(3)), dtr_p1, vi_dtr_p1]) # Super-parent table - dtr_sp = DynamicTableRegion(name='sl1', description='sl1', data=np.arange(4), table=p1) + dtr_sp = DynamicTableRegion(name='sl1', description='sl1', data=[0, 1, 2, 0], table=p1) vi_dtr_sp = VectorIndex(name='sl1_index', data=[1, 2, 3], target=dtr_sp) spt = DynamicTable(name='super_parent_table', description='super_parent_table', columns=[VectorData(name='sp1', description='sp1', data=np.arange(3)), dtr_sp, vi_dtr_sp]) diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index 00b3c14a3..63c48672e 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -1287,6 +1287,34 @@ def test_no_df_nested(self): with self.assertRaisesWith(ValueError, msg): dynamic_table_region.get(0, df=False, index=False) + def test_init_out_of_bounds(self): + table = self.with_columns_and_data() + with self.assertRaises(IndexError): + DynamicTableRegion(name='dtr', data=[0, 1, 2, 2, 5], description='desc', table=table) + + def test_init_out_of_bounds_long(self): + table = self.with_columns_and_data() + data = np.ones(DynamicTableRegion.MAX_ROWS_TO_VALIDATE_INIT+1, dtype=int)*5 + dtr = DynamicTableRegion(name='dtr', data=data, description='desc', table=table) + assert dtr.data is data # no exception raised + + def test_init_out_of_bounds_no_validate(self): + table = self.with_columns_and_data() + dtr = DynamicTableRegion(name='dtr', data=[0, 1, 5], description='desc', table=table, validate_data=False) + self.assertEqual(dtr.data, [0, 1, 5]) # no exception raised + + def test_add_row_out_of_bounds(self): + table = self.with_columns_and_data() + dtr = DynamicTableRegion(name='dtr', data=[0, 1, 2, 2], description='desc', table=table) + with self.assertRaises(IndexError): + dtr.add_row(5) + + def test_set_table_out_of_bounds(self): + table = self.with_columns_and_data() + dtr = DynamicTableRegion(name='dtr', data=[0, 1, 5], description='desc') + with self.assertRaises(IndexError): + dtr.table = table + class DynamicTableRegionRoundTrip(H5RoundTripMixin, TestCase):