diff --git a/backend_py/primary/primary/services/summary_delta_vectors.py b/backend_py/primary/primary/services/summary_delta_vectors.py index 484d88dd9..e480160d8 100644 --- a/backend_py/primary/primary/services/summary_delta_vectors.py +++ b/backend_py/primary/primary/services/summary_delta_vectors.py @@ -4,7 +4,7 @@ import pyarrow.compute as pc import numpy as np -from .utils.summary_vector_table_helpers import validate_summary_vector_table_pa +from primary.services.service_exceptions import InvalidDataError, Service @dataclass @@ -16,6 +16,33 @@ class RealizationDeltaVector: unit: str +def _validate_summary_vector_table_pa( + vector_table: pa.Table, vector_name: str, service: Service = Service.GENERAL +) -> None: + """ + Check if the pyarrow vector table is valid. + + Expect the pyarrow single vector table to contain the following columns: DATE, REAL, vector_name. + + Raises InvalidDataError if the table does not contain the expected columns. + """ + expected_columns = {"DATE", "REAL", vector_name} + actual_columns = set(vector_table.column_names) + if actual_columns != expected_columns: + unexpected_columns = actual_columns - expected_columns + raise InvalidDataError(f"Unexpected columns in table {unexpected_columns}", service) + + # Validate table column types + if vector_table.field("DATE").type != pa.timestamp("ms"): + raise InvalidDataError( + f'DATE column must be of type timestamp(ms), but got {vector_table.field("DATE").type}', service + ) + if vector_table.field("REAL").type != pa.int16(): + raise InvalidDataError("REAL column must be of type int16", service) + if vector_table.field(vector_name).type != pa.float32(): + raise InvalidDataError(f"{vector_name} column must be of type float32", service) + + def create_delta_vector_table( compare_vector_table: pa.Table, reference_vector_table: pa.Table, vector_name: str ) -> pa.Table: @@ -33,8 +60,8 @@ def create_delta_vector_table( `Note`: Pre-processing of DATE-columns, e.g. resampling, should be done before calling this function. """ - validate_summary_vector_table_pa(compare_vector_table, vector_name) - validate_summary_vector_table_pa(reference_vector_table, vector_name) + _validate_summary_vector_table_pa(compare_vector_table, vector_name) + _validate_summary_vector_table_pa(reference_vector_table, vector_name) joined_vector_table = compare_vector_table.join( reference_vector_table, keys=["DATE", "REAL"], join_type="inner", right_suffix="_reference" @@ -60,7 +87,7 @@ def create_realization_delta_vector_list( """ Create a list of RealizationDeltaVector from the delta vector table. """ - validate_summary_vector_table_pa(delta_vector_table, vector_name) + _validate_summary_vector_table_pa(delta_vector_table, vector_name) real_arr_np = delta_vector_table.column("REAL").to_numpy() unique_reals, first_occurrence_idx, real_counts = np.unique(real_arr_np, return_index=True, return_counts=True) diff --git a/backend_py/primary/primary/services/sumo_access/summary_access.py b/backend_py/primary/primary/services/sumo_access/summary_access.py index de5052ac7..bb4a7acc7 100644 --- a/backend_py/primary/primary/services/sumo_access/summary_access.py +++ b/backend_py/primary/primary/services/sumo_access/summary_access.py @@ -16,7 +16,6 @@ sort_table_on_real_then_date, is_date_column_monotonically_increasing, ) -from primary.services.utils.summary_vector_table_helpers import validate_summary_vector_table_pa from primary.services.service_exceptions import ( Service, NoDataError, @@ -163,9 +162,6 @@ async def get_vector_async( ) -> List[RealizationVector]: table, vector_metadata = await self.get_vector_table_async(vector_name, resampling_frequency, realizations) - # Verify that columns are as we expect - validate_summary_vector_table_pa(table, vector_name, Service.SUMO) - real_arr_np = table.column("REAL").to_numpy() unique_reals, first_occurrence_idx, real_counts = np.unique(real_arr_np, return_index=True, return_counts=True) diff --git a/backend_py/primary/primary/services/utils/summary_vector_table_helpers.py b/backend_py/primary/primary/services/utils/summary_vector_table_helpers.py deleted file mode 100644 index 4dcc8e43e..000000000 --- a/backend_py/primary/primary/services/utils/summary_vector_table_helpers.py +++ /dev/null @@ -1,31 +0,0 @@ -import pyarrow as pa - -from primary.services.service_exceptions import InvalidDataError, Service - - -def validate_summary_vector_table_pa( - vector_table: pa.Table, vector_name: str, service: Service = Service.GENERAL -) -> None: - """ - Check if the pyarrow vector table is valid. - - Expect the pyarrow single vector table to contain the following columns: DATE, REAL, vector_name. - - Raises InvalidDataError if the table does not contain the expected columns. - """ - expected_columns = {"DATE", "REAL", vector_name} - actual_columns = set(vector_table.column_names) - if actual_columns != expected_columns: - unexpected_columns = actual_columns - expected_columns - raise InvalidDataError(f"Unexpected columns in table {unexpected_columns}", service) - - # Validate table column types - - if vector_table.field("DATE").type != pa.timestamp("ms"): - raise InvalidDataError( - f'DATE column must be of type timestamp(ms), but got {vector_table.field("DATE").type}', service - ) - if vector_table.field("REAL").type != pa.int16(): - raise InvalidDataError("REAL column must be of type int16", service) - if vector_table.field(vector_name).type != pa.float32(): - raise InvalidDataError(f"{vector_name} column must be of type float32", service) diff --git a/backend_py/primary/tests/unit/services/test_summary_delta_vectors.py b/backend_py/primary/tests/unit/services/test_summary_delta_vectors.py index 046f234fc..07287c4ac 100644 --- a/backend_py/primary/tests/unit/services/test_summary_delta_vectors.py +++ b/backend_py/primary/tests/unit/services/test_summary_delta_vectors.py @@ -1,7 +1,13 @@ +import pytest import pyarrow as pa -from primary.services.summary_delta_vectors import create_delta_vector_table -from primary.services.summary_delta_vectors import create_realization_delta_vector_list, RealizationDeltaVector +from primary.services.service_exceptions import InvalidDataError, Service +from primary.services.summary_delta_vectors import ( + create_delta_vector_table, + create_realization_delta_vector_list, + RealizationDeltaVector, + _validate_summary_vector_table_pa, +) VECTOR_TABLE_SCHEMA = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16()), ("vector", pa.float32())]) @@ -116,3 +122,71 @@ def test_create_realization_delta_vector_list_with_empty_table(): # Validate the result assert result == expected_result + + +def test_validate_summary_vector_table_pa_valid(): + vector_name = "VECTOR" + data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7.0, 8.0, 9.0]} + schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16()), (vector_name, pa.float32())]) + table = pa.Table.from_pydict(data, schema=schema) + try: + _validate_summary_vector_table_pa(table, vector_name) + except InvalidDataError: + pytest.fail("validate_summary_vector_table_pa raised InvalidDataError unexpectedly!") + + +def test_validate_summary_vector_table_pa_missing_column(): + vector_name = "VECTOR" + data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6]} + schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16())]) + table = pa.Table.from_pydict(data, schema=schema) + with pytest.raises(InvalidDataError): + _validate_summary_vector_table_pa(table, vector_name) + + +def test_validate_summary_vector_table_pa_unexpected_column(): + vector_name = "VECTOR" + data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7.0, 8.0, 9.0], "EXTRA": [10.0, 11.0, 12.0]} + schema = pa.schema( + [("DATE", pa.timestamp("ms")), ("REAL", pa.int16()), (vector_name, pa.float32()), ("EXTRA", pa.float32())] + ) + table = pa.Table.from_pydict(data, schema=schema) + with pytest.raises(InvalidDataError): + _validate_summary_vector_table_pa(table, vector_name) + + +def test_validate_summary_vector_table_pa_invalid_date_type(): + vector_name = "VECTOR" + data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7.0, 8.0, 9.0]} + schema = pa.schema([("DATE", pa.int32()), ("REAL", pa.int16()), (vector_name, pa.float32())]) + table = pa.Table.from_pydict(data, schema=schema) + with pytest.raises(InvalidDataError): + _validate_summary_vector_table_pa(table, vector_name) + + +def test_validate_summary_vector_table_pa_invalid_real_type(): + vector_name = "VECTOR" + data = {"DATE": [1, 2, 3], "REAL": [4.0, 5.0, 6.0], vector_name: [7.0, 8.0, 9.0]} + schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.float32()), (vector_name, pa.float32())]) + table = pa.Table.from_pydict(data, schema=schema) + with pytest.raises(InvalidDataError): + _validate_summary_vector_table_pa(table, vector_name) + + +def test_validate_summary_vector_table_pa_invalid_vector_type(): + vector_name = "VECTOR" + data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7, 8, 9]} + schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16()), (vector_name, pa.int32())]) + table = pa.Table.from_pydict(data, schema=schema) + with pytest.raises(InvalidDataError): + _validate_summary_vector_table_pa(table, vector_name) + + +def test_validate_summary_vector_table_pa_sumo_service(): + vector_name = "VECTOR" + data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6]} + schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16())]) + table = pa.Table.from_pydict(data, schema=schema) + with pytest.raises(InvalidDataError) as excinfo: + _validate_summary_vector_table_pa(table, vector_name, Service.SUMO) + assert excinfo.value.service == Service.SUMO diff --git a/backend_py/primary/tests/unit/services/utils/test_summary_vector_table_helpers.py b/backend_py/primary/tests/unit/services/utils/test_summary_vector_table_helpers.py deleted file mode 100644 index c0fa35b82..000000000 --- a/backend_py/primary/tests/unit/services/utils/test_summary_vector_table_helpers.py +++ /dev/null @@ -1,72 +0,0 @@ -import pytest -import pyarrow as pa -from primary.services.service_exceptions import InvalidDataError, Service -from primary.services.utils.summary_vector_table_helpers import validate_summary_vector_table_pa - - -def test_validate_summary_vector_table_pa_valid(): - vector_name = "VECTOR" - data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7.0, 8.0, 9.0]} - schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16()), (vector_name, pa.float32())]) - table = pa.Table.from_pydict(data, schema=schema) - try: - validate_summary_vector_table_pa(table, vector_name) - except InvalidDataError: - pytest.fail("validate_summary_vector_table_pa raised InvalidDataError unexpectedly!") - - -def test_validate_summary_vector_table_pa_missing_column(): - vector_name = "VECTOR" - data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6]} - schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16())]) - table = pa.Table.from_pydict(data, schema=schema) - with pytest.raises(InvalidDataError): - validate_summary_vector_table_pa(table, vector_name) - - -def test_validate_summary_vector_table_pa_unexpected_column(): - vector_name = "VECTOR" - data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7.0, 8.0, 9.0], "EXTRA": [10.0, 11.0, 12.0]} - schema = pa.schema( - [("DATE", pa.timestamp("ms")), ("REAL", pa.int16()), (vector_name, pa.float32()), ("EXTRA", pa.float32())] - ) - table = pa.Table.from_pydict(data, schema=schema) - with pytest.raises(InvalidDataError): - validate_summary_vector_table_pa(table, vector_name) - - -def test_validate_summary_vector_table_pa_invalid_date_type(): - vector_name = "VECTOR" - data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7.0, 8.0, 9.0]} - schema = pa.schema([("DATE", pa.int32()), ("REAL", pa.int16()), (vector_name, pa.float32())]) - table = pa.Table.from_pydict(data, schema=schema) - with pytest.raises(InvalidDataError): - validate_summary_vector_table_pa(table, vector_name) - - -def test_validate_summary_vector_table_pa_invalid_real_type(): - vector_name = "VECTOR" - data = {"DATE": [1, 2, 3], "REAL": [4.0, 5.0, 6.0], vector_name: [7.0, 8.0, 9.0]} - schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.float32()), (vector_name, pa.float32())]) - table = pa.Table.from_pydict(data, schema=schema) - with pytest.raises(InvalidDataError): - validate_summary_vector_table_pa(table, vector_name) - - -def test_validate_summary_vector_table_pa_invalid_vector_type(): - vector_name = "VECTOR" - data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6], vector_name: [7, 8, 9]} - schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16()), (vector_name, pa.int32())]) - table = pa.Table.from_pydict(data, schema=schema) - with pytest.raises(InvalidDataError): - validate_summary_vector_table_pa(table, vector_name) - - -def test_validate_summary_vector_table_pa_sumo_service(): - vector_name = "VECTOR" - data = {"DATE": [1, 2, 3], "REAL": [4, 5, 6]} - schema = pa.schema([("DATE", pa.timestamp("ms")), ("REAL", pa.int16())]) - table = pa.Table.from_pydict(data, schema=schema) - with pytest.raises(InvalidDataError) as excinfo: - validate_summary_vector_table_pa(table, vector_name, Service.SUMO) - assert excinfo.value.service == Service.SUMO