From d55523e5a44c9bcf2b81c42f882a64504f3c8186 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 17 Dec 2024 13:29:38 +1100 Subject: [PATCH 01/29] fix: add check for missing/invalid args in project --- map2loop/project.py | 85 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index 84aa0eea..ff319745 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -18,7 +18,7 @@ gdal.UseExceptions() import geopandas import beartype -from beartype.typing import Union, List +from beartype.typing import Union, List, Dict, Any import pathlib import numpy import pandas @@ -75,7 +75,7 @@ def __init__( loop_project_filename: str = "", overwrite_loopprojectfile: bool = False, **kwargs, - ): + ): """ The initialiser for the map2loop project @@ -119,6 +119,28 @@ def __init__( TypeError: Type of bounding_box not a dict or tuple ValueError: use_australian_state_data not in state list ['WA', 'SA', 'QLD', 'NSW', 'TAS', 'VIC', 'ACT', 'NT'] """ + + # Throw error if unexpected keyword arguments are passed to project + allowed_kwargs = {"metadata_filename"} + for key in kwargs.keys(): + if key not in allowed_kwargs: + logger.error( + f"Unexpected keyword argument '{key}' passed to Project. Allowed keywords: {', '.join(allowed_kwargs)}." + ) + raise TypeError( + f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding." + ) + + # make sure all the needed arguments are provided + self.validate_required_inputs( + bounding_box=bounding_box, + working_projection=working_projection, + geology_filename=geology_filename, + structure_filename=structure_filename, + dtm_filename=dtm_filename, + config_dictionary=config_dictionary, + config_filename=config_filename, + ) self._error_state = ErrorState.NONE self._error_state_msg = "" self.verbose_level = verbose_level @@ -233,6 +255,58 @@ def __init__( if len(kwargs): logger.warning(f"Unused keyword arguments: {kwargs}") + @beartype.beartype + def validate_required_inputs( + self, + bounding_box: Dict[str, Union[float, int]], + working_projection: str, + geology_filename: str, + structure_filename: str, + dtm_filename: str, + config_filename: str = None, + config_dictionary: Dict[str, Any] = {}, + ) -> None: + + required_inputs = { + "bounding_box": bounding_box, + "working_projection": working_projection, + "geology_filename": geology_filename, + "structure_filename": structure_filename, + "dtm_filename": dtm_filename, + } + + # Check for missing required inputs in project + missing_inputs = [key for key, value in required_inputs.items() if not value] + + if missing_inputs: + missing_list = ", ".join(missing_inputs) + logger.error( + f"Project construction is missing required inputs: {missing_list}. " + "Please add them to the Project()." + ) + raise ValueError( + f"Project construction is missing required inputs: {missing_list}. " + "Please add them to the Project()." + ) + + # Either config_filename or config_dictionary must be provided (but not both or neither) + if not config_filename and not config_dictionary: + logger.error( + "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + ) + raise ValueError( + "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + ) + if config_filename and config_dictionary: + logger.error( + "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." 
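+                # Illustrative failure (hypothetical arguments): calling
+                #   Project(..., config_filename="cfg.json", config_dictionary={...})
+                # lands here, while supplying exactly one of the two passes this check.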
+ ) + raise ValueError( + "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." + ) + + + # Getters and Setters @beartype.beartype def set_ignore_lithology_codes(self, codes: list): @@ -734,9 +808,10 @@ def save_into_projectfile(self): logger.info('Saving data into loop project file') if not self.loop_filename: logger.info('No loop project file specified, creating a new one') - self.loop_filename = os.path.join( - self.map_data.tmp_path, os.path.basename(self.map_data.tmp_path) + ".loop3d" - ) + output_dir = pathlib.Path.cwd() + output_dir.mkdir(parents=True, exist_ok=True) + filename = "new_project.loop3d" + self.loop_filename = str(output_dir / filename) file_exists = os.path.isfile(self.loop_filename) From 596ddb3aa60b2a4c529d24cd1447658b8ff3092c Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 17 Dec 2024 13:30:47 +1100 Subject: [PATCH 02/29] chore: add mention to issue --- map2loop/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/project.py b/map2loop/project.py index ff319745..6883d79b 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -269,7 +269,7 @@ def validate_required_inputs( required_inputs = { "bounding_box": bounding_box, - "working_projection": working_projection, + "working_projection": working_projection, # this may be removed when fix is added for https://github.com/Loop3D/map2loop/issues/103 "geology_filename": geology_filename, "structure_filename": structure_filename, "dtm_filename": dtm_filename, From 3345bbec278343c29875911484efd9724df91593 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 15:53:52 +1100 Subject: [PATCH 03/29] chore: clear warnings --- map2loop/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index 6883d79b..41f84447 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -292,10 +292,10 @@ def validate_required_inputs( # Either config_filename or config_dictionary must be provided (but not both or neither) if not config_filename and not config_dictionary: logger.error( - "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + "A config file is required to run map2loop - use either 'config_filename' or 'config_dictionary' to initialise the project." ) raise ValueError( - "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + "A config file is required to run map2loop - use either 'config_filename' or 'config_dictionary' to initialise the project." ) if config_filename and config_dictionary: logger.error( From b1a94bd3e013a1fd566884b49c5f70dea85b8d3a Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 15:55:32 +1100 Subject: [PATCH 04/29] chore: make warnings clear --- map2loop/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index 41f84447..e9b4d919 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -299,10 +299,10 @@ def validate_required_inputs( ) if config_filename and config_dictionary: logger.error( - "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." + "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one config." ) raise ValueError( - "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." + "Both 'config_filename' and 'config_dictionary' were provided. 
Please specify only one config." ) From 1bb6e4787f7d789c24e0f06bb2052f9e0e2a53e3 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 15:56:21 +1100 Subject: [PATCH 05/29] chore: make warnings clear --- map2loop/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/project.py b/map2loop/project.py index e9b4d919..d4fce9a4 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -128,7 +128,7 @@ def __init__( f"Unexpected keyword argument '{key}' passed to Project. Allowed keywords: {', '.join(allowed_kwargs)}." ) raise TypeError( - f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding." + f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding with map2loop processing" ) # make sure all the needed arguments are provided From 27ea51df28a9fe62b13b25a8cb5be7c6f333cc9c Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:09:21 +1100 Subject: [PATCH 06/29] tests: bypass the necessary dataset requirement --- .../test_ignore_codes_setters_getters.py | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/project/test_ignore_codes_setters_getters.py b/tests/project/test_ignore_codes_setters_getters.py index 4cebdba7..34de34a4 100644 --- a/tests/project/test_ignore_codes_setters_getters.py +++ b/tests/project/test_ignore_codes_setters_getters.py @@ -2,6 +2,7 @@ from map2loop.project import Project from map2loop.m2l_enums import Datatype import map2loop +from unittest.mock import patch # Sample test function for lithology and fault ignore codes @@ -21,24 +22,25 @@ def test_set_get_ignore_codes(): "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, "geology": {"unitname_column": "unitname", "alt_unitname_column": "code"}, } - - project = Project( - working_projection='EPSG:28350', - bounding_box=bbox_3d, - geology_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') - ), - fault_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/faults.geojson') - ), - dtm_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') - ), - config_dictionary=config_dictionary, - ) + with patch.object(Project, 'validate_required_inputs', return_value=None): + project = Project( + working_projection='EPSG:28350', + bounding_box=bbox_3d, + geology_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') + ), + fault_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/faults.geojson') + ), + dtm_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') + ), + config_dictionary=config_dictionary, + structure_filename="", + ) # Define test ignore codes for lithology and faults lithology_codes = ["cover", "Fortescue_Group", "A_FO_od"] From c9da10dba3c4614b7e272fd72eb90864771862b1 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:09:52 +1100 Subject: [PATCH 07/29] fix: skip required file checks if using loop server data --- map2loop/project.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index d4fce9a4..f6de1941 100644 --- a/map2loop/project.py +++ 
b/map2loop/project.py @@ -132,15 +132,16 @@ def __init__( ) # make sure all the needed arguments are provided - self.validate_required_inputs( - bounding_box=bounding_box, - working_projection=working_projection, - geology_filename=geology_filename, - structure_filename=structure_filename, - dtm_filename=dtm_filename, - config_dictionary=config_dictionary, - config_filename=config_filename, - ) + if not use_australian_state_data: # this check has to skip if using Loop server data + self.validate_required_inputs( + bounding_box=bounding_box, + working_projection=working_projection, + geology_filename=geology_filename, + structure_filename=structure_filename, + dtm_filename=dtm_filename, + config_dictionary=config_dictionary, + config_filename=config_filename, + ) self._error_state = ErrorState.NONE self._error_state_msg = "" self.verbose_level = verbose_level From 713abffcbeb0f098641898dbc76c798b67127c56 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:33:00 +1100 Subject: [PATCH 08/29] fix: make the config check through project --- map2loop/config.py | 19 ++++++------------- map2loop/project.py | 3 ++- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/map2loop/config.py b/map2loop/config.py index 48d017d3..2a0efdf7 100644 --- a/map2loop/config.py +++ b/map2loop/config.py @@ -103,9 +103,6 @@ def update_from_dictionary(self, dictionary: dict, lower: bool = True): # make sure dictionary doesn't contain legacy keys self.check_for_legacy_keys(dictionary) - # make sure it has the minimum requirements - self.validate_config_dictionary(dictionary) - if "structure" in dictionary: self.structure_config.update(dictionary["structure"]) for key in dictionary["structure"].keys(): @@ -218,25 +215,20 @@ def update_from_file( @beartype.beartype def validate_config_dictionary(self, config_dict: dict) -> None: - """ - Validate the structure and keys of the configuration dictionary. - - Args: - config_dict (dict): The config dictionary to validate. - - Raises: - ValueError: If the dictionary does not meet the minimum requirements for ma2p2loop. - """ required_keys = { "structure": {"dipdir_column", "dip_column"}, "geology": {"unitname_column", "alt_unitname_column"}, } + # Loop over "structure" and "geology" for section, keys in required_keys.items(): + + # 1) Check that "section" exists if section not in config_dict: logger.error(f"Missing required section '{section}' in config dictionary.") raise ValueError(f"Missing required section '{section}' in config dictionary.") - + + # 2) Check that each required key is in config_dict[section] for key in keys: if key not in config_dict[section]: logger.error( @@ -246,6 +238,7 @@ def validate_config_dictionary(self, config_dict: dict) -> None: f"Missing required key '{key}' for '{section}' section of the config dictionary." 
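+                        # For reference, a minimal dictionary that satisfies this check
+                        # (column names here are placeholders, not required values):
+                        #   {"structure": {"dipdir_column": "azimuth", "dip_column": "dip"},
+                        #    "geology": {"unitname_column": "unitname", "alt_unitname_column": "code"}}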
) + @beartype.beartype def check_for_legacy_keys(self, config_dict: dict) -> None: diff --git a/map2loop/project.py b/map2loop/project.py index f6de1941..6e4303c6 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -142,6 +142,7 @@ def __init__( config_dictionary=config_dictionary, config_filename=config_filename, ) + self._error_state = ErrorState.NONE self._error_state_msg = "" self.verbose_level = verbose_level @@ -230,12 +231,12 @@ def __init__( self.map_data.set_config_filename(config_filename) if config_dictionary != {}: + self.map_data.config.validate_config_dictionary(config_dictionary) self.map_data.config.update_from_dictionary(config_dictionary) if clut_filename != "": self.map_data.set_colour_filename(clut_filename) - # Load all data (both shape and raster) self.map_data.load_all_map_data() From b52d54014fc493846ff3a068f8dc8b00bb5b0f82 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:48:18 +1100 Subject: [PATCH 09/29] tests: add tests for config checks --- tests/project/test_config_arguments.py | 150 +++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 tests/project/test_config_arguments.py diff --git a/tests/project/test_config_arguments.py b/tests/project/test_config_arguments.py new file mode 100644 index 00000000..229bcabc --- /dev/null +++ b/tests/project/test_config_arguments.py @@ -0,0 +1,150 @@ +import pytest +import pathlib +from unittest.mock import patch +from map2loop.project import Project +from map2loop.m2l_enums import Datatype +import map2loop + +# ------------------------------------------------------------------------------ +# Common fixtures or helper data (bounding box, minimal filenames, etc.) +# ------------------------------------------------------------------------------ + +@pytest.fixture +def minimal_bounding_box(): + return { + "minx": 515687.31005864, + "miny": 7493446.76593407, + "maxx": 562666.860106543, + "maxy": 7521273.57407786, + "base": -3200, + "top": 3000, + } + +@pytest.fixture +def geology_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') + ) + +@pytest.fixture +def structure_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/structure.geojson') + ) + +@pytest.fixture +def dtm_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') + ) + +@pytest.fixture +def valid_config_dictionary(): + """ + A valid config dictionary that meets the 'structure' and 'geology' requirements + """ + return { + "structure": { + "dipdir_column": "azimuth2", + "dip_column": "dip" + }, + "geology": { + "unitname_column": "unitname", + "alt_unitname_column": "code", + } + } + + + +# 1) config_filename and config_dictionary both present should raise ValueError +def test_config_filename_and_dictionary_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + + with pytest.raises(ValueError, match="Both 'config_filename' and 'config_dictionary' were provided"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_filename="dummy_config.json", + config_dictionary=valid_config_dictionary, + ) + +# 2) No config_filename or config_dictionary should raise ValueError +def test_no_config_provided_raises_error( + 
minimal_bounding_box, geology_file, dtm_file, structure_file +): + + with pytest.raises(ValueError, match="A config file is required to run map2loop"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + ) + +# 3) Passing an unexpected argument should raise TypeError +def test_unexpected_argument_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + + with pytest.raises(TypeError, match="unexpected keyword argument 'config_file'"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=valid_config_dictionary, + config_file="wrong_kwarg.json", + ) + +# 4) Dictionary missing a required key should raise ValueError + +def test_dictionary_missing_required_key_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file +): + + invalid_dictionary = { + "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, + "geology": {"unitname_column": "unitname"} # alt_unitname_column missing + } + + with pytest.raises(ValueError, match="Missing required key 'alt_unitname_column' for 'geology'"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=invalid_dictionary, + ) + +# 5) All good => The Project should be created without errors +def test_good_config_runs_successfully( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + project = None + try: + project = Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=valid_config_dictionary, + ) + except Exception as e: + pytest.fail(f"Project initialization raised an unexpected exception: {e}") + + assert project is not None, "Project was not created." 
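+    # The assertions below spot-check that values from the input dictionary were
+    # carried through, unchanged, into the parsed Config object.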
+    assert project.map_data.config.structure_config["dipdir_column"] == "azimuth2"
+    assert project.map_data.config.structure_config["dip_column"] == "dip"
+    assert project.map_data.config.geology_config["unitname_column"] == "unitname"
+    assert project.map_data.config.geology_config["alt_unitname_column"] == "code"
\ No newline at end of file

From 632287237e2f3fc231c51b7605c2ec0d6fd526a3 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Tue, 7 Jan 2025 05:51:37 +0000
Subject: [PATCH 10/29] style: style fixes by ruff and autoformatting by black

---
 tests/project/test_config_arguments.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/project/test_config_arguments.py b/tests/project/test_config_arguments.py
index 229bcabc..53e6ce35 100644
--- a/tests/project/test_config_arguments.py
+++ b/tests/project/test_config_arguments.py
@@ -1,8 +1,6 @@
 import pytest
 import pathlib
-from unittest.mock import patch
 from map2loop.project import Project
-from map2loop.m2l_enums import Datatype
 import map2loop

 # ------------------------------------------------------------------------------

From 554ed6aace57e047fb4696820b1894ea49384db6 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Wed, 8 Jan 2025 08:33:26 +1100
Subject: [PATCH 11/29] fix: actually use libmamba on build

---
 .github/workflows/conda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index 1a9949f1..47cf32e0 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -28,7 +28,7 @@ jobs:
         shell: bash -l {0}
         run: |
           conda install -c conda-forge conda-build scikit-build-core numpy anaconda-client conda-libmamba-solver -y
-          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}}
+          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}} --solver=libmamba
           anaconda upload --label main conda/*/*.tar.bz2

     - name: upload artifacts

From f9752b2511714777b829c2418fdc039154e825af Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Wed, 8 Jan 2025 08:42:07 +1100
Subject: [PATCH 12/29] revert previous commit

---
 .github/workflows/conda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index 47cf32e0..bd9b0f28 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -28,7 +28,7 @@ jobs:
         shell: bash -l {0}
         run: |
           conda install -c conda-forge conda-build scikit-build-core numpy anaconda-client conda-libmamba-solver -y
-          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}} --solver=libmamba
+          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}}
           anaconda upload --label main conda/*/*.tar.bz2

     - name: upload artifacts

From 9be3ac6b2e4f9bbe7e62720eeffdf6d704ec3a15 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Wed, 8 Jan 2025 12:12:12 +1100
Subject: [PATCH 13/29] fix: add init commit

---
 map2loop/data_checks.py | 444 ++++++++++++++++++++++++++++++++++++++++
 map2loop/mapdata.py     | 315 +++++++++++++---------------
 2 files changed, 586 insertions(+), 173 deletions(-)
 create mode 100644 map2loop/data_checks.py

diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py
new file mode 100644
index 00000000..66ec94e6
--- /dev/null
+++ b/map2loop/data_checks.py
@@ -0,0 +1,444 @@
+#internal imports
+from .m2l_enums import Datatype
+
+#external imports
+import beartype as beartype
+from 
beartype.typing import Tuple +import geopandas +import shapely +import pandas + +from .logging import getLogger +logger = getLogger(__name__) + +@beartype.beartype +def check_geology_fields_validity(mapdata) -> tuple[bool, str]: + #TODO (AR) - add check for gaps in geology data + """ + Validate the columns in GEOLOGY geodataframe + + Several checks to ensure that the geology data: + - Is loaded and valid. + - Contains required columns with appropriate types and no missing or blank values. + - Has optional columns with valid types, if present. + - Does not contain duplicate in IDs. + - Ensures the geometry column has valid geometries. + + Returns: + Tuple[bool, str]: A tuple indicating success (False) or failure (True) + """ + # Check if geology data is loaded and valid + if ( + mapdata.raw_data[Datatype.GEOLOGY] is None + or type(mapdata.raw_data[Datatype.GEOLOGY]) is not geopandas.GeoDataFrame + ): + logger.error("GEOLOGY data is not loaded or is not a valid GeoDataFrame") + return (True, "GEOLOGY data is not loaded or is not a valid GeoDataFrame") + + geology_data = mapdata.raw_data[Datatype.GEOLOGY] + config = mapdata.config.geology_config + + # 1. Check geometry validity - tested & working + if not geology_data.geometry.is_valid.all(): + logger.error("Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in datatype GEOLOGY") + + # # 2. Required Columns & are they str, and then empty or null? + required_columns = [config["unitname_column"], config["alt_unitname_column"]] + for col in required_columns: + if col not in geology_data.columns: + logger.error(f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") + return (True, f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") + if not geology_data[col].apply(lambda x: isinstance(x, str)).all(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") + return (True, f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") + if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + return (True, f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + + # # 3. Optional Columns + optional_string_columns = [ + "group_column", "supergroup_column", "description_column", + "rocktype_column", "alt_rocktype_column", + ] + + for key in optional_string_columns: + if key in config and config[key] in geology_data.columns: + if not geology_data[config[key]].apply(lambda x: isinstance(x, str)).all(): + logger.warning( + f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-string values. " + "Map2loop processing might not work as expected." 
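+                    # (warn-only: optional descriptive columns never abort the run,
+                    # unlike the required unit-name columns checked above)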
+ ) + + optional_numeric_columns = ["minage_column", "maxage_column", "objectid_column"] + for key in optional_numeric_columns: + if key in config and config[key] in geology_data.columns: + if not geology_data[config[key]].apply(lambda x: isinstance(x, (int, float))).all(): + logger.warning( + f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-numeric values. " + "Map2loop processing might not work as expected." + ) + + # # 4. Check for duplicates in ID + if "objectid_column" in config and config["objectid_column"] in geology_data.columns: + objectid_values = geology_data[config["objectid_column"]] + + # Check for None, NaN, or other null-like values + if objectid_values.isnull().any(): + logger.error( + f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values. Ensure all values are valid and non-null." + ) + return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values.") + + # Check for duplicate values + if objectid_values.duplicated().any(): + logger.error( + f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column'). Please make sure that the column contains unique values." + ) + return (True, f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column').") + + # Check for uniqueness + if not objectid_values.is_unique: + logger.error( + f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values. Ensure all values are unique." + ) + return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values.") + + + # 5. Check for NaNs/blanks in optional fields with warnings + warning_fields = [ + "group_column", "supergroup_column", "description_column", + "rocktype_column", "minage_column", "maxage_column", + ] + for key in warning_fields: + col = config.get(key) + if col and col in geology_data.columns: + # Check if column contains string values before applying `.str` + if pandas.api.types.is_string_dtype(geology_data[col]): + if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): + logger.warning( + f"Datatype GEOLOGY: NaN or blank values found in optional column '{col}' (config key: '{key}')." + ) + else: + # Non-string columns, check only for NaN values + if geology_data[col].isnull().any(): + logger.warning( + f"Datatype GEOLOGY: NaN values found in optional column '{col}' (config key: '{key}')." + ) + + + logger.info("Geology fields validation passed.") + return (False, "") + +@beartype.beartype +def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: + """ + Validate the structure data for required and optional fields. + + Performs the following checks: + - Ensures the structure map is loaded, valid, and contains at least two structures. + - Validates the geometry column + - Checks required numeric columns (`dip_column`, `dipdir_column`) for existence, dtype, range, and null values. + - Checks optional string columns (`description_column`, `overturned_column`) for type and null/empty values. + - Validates the optional numeric `objectid_column` for type, null values, and duplicates. + + Returns: + Tuple[bool, str]: A tuple where the first value indicates if validation failed (True = failed), + and the second value provides a message describing the issue. 
+ """ + + # Check type and size of loaded structure map + if ( + mapdata.raw_data[Datatype.STRUCTURE] is None + or type(mapdata.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame + ): + logger.warning("Structure map is not loaded or valid") + return (True, "Structure map is not loaded or valid") + + if len(mapdata.raw_data[Datatype.STRUCTURE]) < 2: + logger.warning( + "Datatype STRUCTURE: map does with not enough orientations to complete calculations (need at least 2), projection may be inconsistent" + ) + + structure_data = mapdata.raw_data[Datatype.STRUCTURE] + config = mapdata.config.structure_config + + # 1. Check geometry validity + if not structure_data.geometry.is_valid.all(): + logger.error("datatype STRUCTURE: Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in datatype STRUCTURE") + + # 2. Check mandatory numeric columns + required_columns = [config["dipdir_column"], config["dip_column"]] + for col in required_columns: + if col not in structure_data.columns: + logger.error(f"DDatatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") + return (True, f"Datatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") + if not structure_data[col].apply(lambda x: isinstance(x, (int, float))).all(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") + return (True, f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") + if structure_data[col].isnull().any(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + return (True, f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + + if config["dip_column"] in structure_data.columns: + invalid_dip = ~((structure_data[config["dip_column"]] >= 0) & (structure_data[config["dip_column"]] <= 90)) + if invalid_dip.any(): + logger.warning( + f"Datatype STRUCTURE: Column '{config['dip_column']}' has values that are not between 0 and 90 degrees. Is this intentional?" + ) + + if config["dipdir_column"] in structure_data.columns: + invalid_dipdir = ~((structure_data[config["dipdir_column"]] >= 0) & (structure_data[config["dipdir_column"]] <= 360)) + if invalid_dipdir.any(): + logger.warning( + f"Datatype STRUCTURE: Column '{config['dipdir_column']}' has values that are not between 0 and 360 degrees. Is this intentional?" + ) + + # check validity of optional string columns + optional_string_columns = ["description_column", "overturned_column"] + for key in optional_string_columns: + if key in config and config[key] in structure_data.columns: + column_name = config[key] + if not structure_data[column_name].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.warning( + f"Datatype STRUCTURE: Optional column with config key: '{key}' contains non-string values. " + "Map2loop processing might not work as expected." 
+ ) + if structure_data[column_name].isnull().any() or structure_data[column_name].str.strip().eq("").any(): + logger.warning( + f"Datatype STRUCTURE: Optional column config key: '{key}' contains NaN, empty, or null values. " + "Map2loop processing might not work as expected." + ) + + # check ID column for type, null values, and duplicates + optional_numeric_column_key = "objectid_column" + optional_numeric_column = config.get(optional_numeric_column_key) + + if optional_numeric_column: + if optional_numeric_column in structure_data.columns: + # Check for non-integer values + if not structure_data[optional_numeric_column].apply(lambda x: isinstance(x, int) or pandas.isnull(x)).all(): + logger.error( + f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values. Rectify this, or remove this column from the config - map2loop will generate a new ID." + ) + return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values.") + # Check for NaN + if structure_data[optional_numeric_column].isnull().any(): + logger.error( + f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values. Rectify this, or remove this column from the config - map2loop will generate a new ID." + ) + return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values.") + # Check for duplicates + if structure_data[optional_numeric_column].duplicated().any(): + logger.error( + f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values. Rectify this, or remove this column from the config - map2loop will generate a new ID." + ) + return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values.") + + return (False, "") + +@beartype.beartype +def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: + + # Check type of loaded fault map + if ( + mapdata.raw_data[Datatype.FAULT] is None + or type(mapdata.raw_data[Datatype.FAULT]) is not geopandas.GeoDataFrame + ): + logger.warning("Fault map is not loaded or valid") + return (True, "Fault map is not loaded or valid") + + fault_data = mapdata.raw_data[Datatype.FAULT] + config = mapdata.config.fault_config + + # Check geometry + if not fault_data.geometry.is_valid.all(): + logger.error("datatype FAULT: Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in FAULT data.") + + # Check for LineString or MultiLineString geometries + if not fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): + invalid_types = fault_data[~fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] + logger.error( + f"FAULT data contains invalid geometry types. 
Rows with invalid geometry types: {invalid_types.index.tolist()}" + ) + return (True, "FAULT data contains geometries that are not LineString or MultiLineString.") + + # Check "structtype_column" if it exists + if "structtype_column" in config: + structtype_column = config["structtype_column"] + + # Ensure the column exists in the data + if structtype_column not in fault_data.columns: + logger.warning( + f"Datatype FAULT: '{structtype_column}' (config key: 'structtype_column') is missing from the fault data. Consider removing that key from the config" + ) + else: + # Check if all entries in the column are strings + if not fault_data[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.error( + f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." + ) + return (True, f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") + + # Warn about empty or null cells + if fault_data[structtype_column].isnull().any() or fault_data[structtype_column].str.strip().eq("").any(): + logger.warning( + f"Datatype FAULT: Column '{structtype_column}' contains NaN, empty, or blank values. Processing might not work as expected." + ) + + # Check if "fault_text" is defined and contained in the column + fault_text = config.get("fault_text", None) + + # Check if the structtype_column exists in the fault_data + if structtype_column not in fault_data.columns: + logger.warning( + f"Datatype FAULT: The column '{structtype_column}' is not present in the fault data." + ) + + else: + if not fault_data[structtype_column].str.contains(fault_text).any(): + logger.error( + f"Datatype FAULT: The 'fault_text' value '{fault_text}' is not found in column '{structtype_column}'. Project might end up with no faults" + ) + + #checks on name column + name_column = config.get("name_column") + if name_column not in fault_data.columns: + logger.warning( + f"Datatype FAULT: Column '{name_column}' (config key 'name_column') is missing from the fault data." + "Please ensure it is present, or remove that key from the config." + ) + + if name_column and name_column in fault_data.columns: + # Check if the column contains non-string values + if not fault_data[name_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.error( + f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains non-string values. Ensure all values are valid strings." + ) + return (True, f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains non-string values.") + + # Check for NaN values + if fault_data[name_column].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains NaN or empty values. This may affect processing." + ) + + # Check for duplicate values + if fault_data[name_column].duplicated().any(): + logger.warning( + f"Datatype FAULT: Column '{name_column}' contains duplicate values. This may affect processing." 
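+                # (a name can legitimately repeat where one fault is digitised as
+                # several segments, so duplicates only warn rather than fail)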
+ ) + + # dips & strikes + # Check for dips and dip directions + strike_dips_columns = ["dip_column", "dipdir_column"] + + for key in strike_dips_columns: + column_name = config.get(key) + if column_name: # Only proceed if the config has this key + if column_name in fault_data.columns: + + #coerce to numeric + fault_data[column_name] = pandas.to_numeric(fault_data[column_name], errors='coerce') + + # Check if the column contains only numeric values + if not fault_data[column_name].apply(lambda x: isinstance(x, (int, float)) or pandas.isnull(x)).all(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) must contain only numeric values. Please ensure the column is numeric." + ) + + # Check for NaN or empty values + if fault_data[column_name].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) contains NaN or empty values. This may affect processing." + ) + + # Check range constraints + if key == "dip_column": + # Dips must be between 0 and 90 + invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 90)) + if invalid_values.any(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 90]. Was this intentional?" + ) + elif key == "dipdir_column": + # Dip directions must be between 0 and 360 + invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 360)) + if invalid_values.any(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 360]. Was this intentional?" + ) + else: + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) is missing from the fault data. Please ensure the column name is correct, or otherwise remove that key from the config." + ) + + + # dip estimates + dip_estimate_column = config.get("dip_estimate_column") + valid_directions = [ + "north_east", "south_east", "south_west", "north_west", + "north", "east", "south", "west" + ] + + if dip_estimate_column: + if dip_estimate_column in fault_data.columns: + # Ensure all values are in the set of valid directions or are NaN + invalid_values = fault_data[dip_estimate_column][ + ~fault_data[dip_estimate_column].apply(lambda x: x in valid_directions or pandas.isnull(x)) + ] + + if not invalid_values.empty: + logger.error( + f"Datatype FAULT: Column '{dip_estimate_column}' contains invalid values not in the set of allowed dip estimates: {valid_directions}." + ) + return ( + True, + f"Datatype FAULT: Column '{dip_estimate_column}' contains invalid values. Allowed values: {valid_directions}.", + ) + + # Warn if there are NaN or empty values + if fault_data[dip_estimate_column].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{dip_estimate_column}' contains NaN or empty values. This may affect processing." + ) + else: + logger.error( + f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data. Please ensure the column name is correct or remove that key from the config." 
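+                # (unlike a missing dip/dipdir column, which only warns above, a
+                # configured dip-estimate column that is absent is treated as fatal)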
+ ) + return (True, f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data.") + + # Check ID column + id_column = config.get("objectid_column") + + if id_column: + if id_column in fault_data.columns: + # Check for non-integer values + # Attempt to coerce the ID column to integers because WA data says so (ARodrigues) + fault_data[id_column] = pandas.to_numeric(fault_data[id_column], errors='coerce') + + # Check if all values are integers or null after coercion + if not fault_data[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all(): + logger.warning( + f"Datatype FAULT: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs." + ) + + # Check for NaN values + if fault_data[id_column].isnull().any(): + logger.warning( + f"Datatype FAULT: ID column '{id_column}' contains NaN or null values. Rectify this or remove the key from the config to auto-generate IDs." + ) + + # Check for duplicates + if fault_data[id_column].duplicated().any(): + logger.error( + f"Datatype FAULT: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs." + ) + + return (False, "") diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index 4ef0be89..a1bb3ae3 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -3,6 +3,7 @@ from .config import Config from .aus_state_urls import AustraliaStateUrls from .utils import generate_random_hex_colors, calculate_minimum_fault_length +from .data_checks import check_geology_fields_validity, check_structure_fields_validity, check_fault_fields_validity # external imports import geopandas @@ -19,7 +20,7 @@ import beartype import os from io import BytesIO -from typing import Union +from typing import Union, Tuple import tempfile @@ -689,154 +690,42 @@ def check_map(self, datatype: Datatype): The datatype to check """ func = None + #check and parse geology data if datatype == Datatype.GEOLOGY: + validity_check, message = check_geology_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype GEOLOGY data validation failed: {message}") + return func = self.parse_geology_map + + #check and parse structure data elif datatype == Datatype.STRUCTURE: + validity_check, message = check_structure_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype STRUCTURE data validation failed: {message}") + return func = self.parse_structure_map + + #check and parse fault data elif datatype == Datatype.FAULT: + validity_check, message = check_fault_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype FAULT data validation failed: {message}") + return func = self.parse_fault_map - elif datatype == Datatype.FOLD: - func = self.parse_fold_map + elif datatype == Datatype.FAULT_ORIENTATION: func = self.parse_fault_orientations + + #check and parse fold data + elif datatype == Datatype.FOLD: + func = self.parse_fold_map + if func: error, message = func() if error: logger.error(message) - @beartype.beartype - def parse_fault_orientations(self) -> tuple: - """ - Parse the fault orientations shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.FAULT_ORIENTATION] is None - or type(self.raw_data[Datatype.FAULT_ORIENTATION]) is not geopandas.GeoDataFrame - ): - logger.warning("Fault orientation shapefile is not 
loaded or valid") - return (True, "Fault orientation shapefile is not loaded or valid") - - # Create new geodataframe - fault_orientations = geopandas.GeoDataFrame( - self.raw_data[Datatype.FAULT_ORIENTATION]["geometry"] - ) - - config = self.config.fault_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - if config["orientation_type"] == "strike": - fault_orientations["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - fault_orientations["DIPDIR"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dipdir_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dipdir_column '{config['dipdir_column']}'" - ) - - if config["dip_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["DIP"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dip_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dip_column '{config['dip_column']}'" - ) - - # TODO LG would it be worthwhile adding a description column for faults? - # it would be possible to parse out the fault displacement, type, slip direction - # if this was stored in the descriptions? - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["ID"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["objectid_column"] - ] - else: - fault_orientations["ID"] = numpy.arange(len(fault_orientations)) - self.data[Datatype.FAULT_ORIENTATION] = fault_orientations - return (False, "") - - @beartype.beartype - def parse_structure_map(self) -> tuple: - """ - Parse the structure shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.STRUCTURE] is None - or type(self.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame - ): - logger.warning("Structure map is not loaded or valid") - return (True, "Structure map is not loaded or valid") - - if len(self.raw_data[Datatype.STRUCTURE]) < 2: - logger.warning( - "Stucture map does not enough orientations to complete calculations (need at least 2), projection may be inconsistent" - ) - - # Create new geodataframe - structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"]) - config = self.config.structure_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]: - if config["orientation_type"] == "strike": - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]] - else: - print(f"Structure map does not contain dipdir_column '{config['dipdir_column']}'") - - # Ensure all DIPDIR values are within [0, 360] - structure["DIPDIR"] = structure["DIPDIR"] % 360.0 - - if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]] - else: - print(f"Structure map does not contain dip_column '{config['dip_column']}'") - - # Add bedding and overturned booleans - if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["OVERTURNED"] = ( - self.raw_data[Datatype.STRUCTURE][config["overturned_column"]] - .astype(str) - 
.str.contains(config["overturned_text"]) - ) - else: - structure["OVERTURNED"] = False - - if config["description_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["BEDDING"] = ( - self.raw_data[Datatype.STRUCTURE][config["description_column"]] - .astype(str) - .str.contains(config["bedding_text"]) - ) - else: - structure["BEDDING"] = False - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["ID"] = self.raw_data[Datatype.STRUCTURE][config["objectid_column"]] - else: - structure["ID"] = numpy.arange(len(structure)) - - self.data[Datatype.STRUCTURE] = structure - return (False, "") - @beartype.beartype def parse_geology_map(self) -> tuple: """ @@ -845,13 +734,6 @@ def parse_geology_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded geology map - if ( - self.raw_data[Datatype.GEOLOGY] is None - or type(self.raw_data[Datatype.GEOLOGY]) is not geopandas.GeoDataFrame - ): - logger.warning("Geology map is not loaded or valid") - return (True, "Geology map is not loaded or valid") # Create new geodataframe geology = geopandas.GeoDataFrame(self.raw_data[Datatype.GEOLOGY]["geometry"]) @@ -862,22 +744,11 @@ def parse_geology_map(self) -> tuple: geology["UNITNAME"] = self.raw_data[Datatype.GEOLOGY][config["unitname_column"]].astype( str ) - else: - msg = f"Geology map does not contain unitname_column {config['unitname_column']}" - print(msg) - logger.warning(msg) - return (True, msg) + if config["alt_unitname_column"] in self.raw_data[Datatype.GEOLOGY]: geology["CODE"] = self.raw_data[Datatype.GEOLOGY][config["alt_unitname_column"]].astype( str ) - else: - msg = ( - f"Geology map does not contain alt_unitname_column {config['alt_unitname_column']}" - ) - print(msg) - logger.warning(msg) - return (True, msg) # Parse group and supergroup columns if config["group_column"] in self.raw_data[Datatype.GEOLOGY]: @@ -947,11 +818,9 @@ def parse_geology_map(self) -> tuple: else: geology["ID"] = numpy.arange(len(geology)) - # TODO: Check for duplicates in "ID" # TODO: Check that the exploded geology has more than 1 unit # Do we need to explode the geometry at this stage for geology/faults/folds??? # If not subsequent classes will need to be able to deal with them - # TODO: Check for Nans or blanks in "UNITNAME", "GROUP", "SUPERGROUP", "DESCRIPTION", "CODE", "ROCKTYPE" # Strip out whitespace (/n /t) and '-', ',', '?' 
from "UNITNAME", "CODE" "GROUP" "SUPERGROUP" geology["UNITNAME"] = geology["UNITNAME"].str.replace("[ -/?]", "_", regex=True) geology["CODE"] = geology["CODE"].str.replace("[ -/?]", "_", regex=True) @@ -970,12 +839,62 @@ def parse_geology_map(self) -> tuple: return (False, "") @beartype.beartype - def get_minimum_fault_length(self) -> Union[float, int, None]: + def parse_structure_map(self) -> tuple: """ - Get the minimum fault length + Parse the structure shapefile data into a consistent format + + Returns: + tuple: A tuple of (bool: success/fail, str: failure message) """ - return self.minimum_fault_length + # Create new geodataframe + structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"]) + config = self.config.structure_config + + # Parse dip direction and dip columns + if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]: + if config["orientation_type"] == "strike": + structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( + lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 + ) + else: + structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]] + + # Ensure all DIPDIR values are within [0, 360] + structure["DIPDIR"] = structure["DIPDIR"] % 360.0 + + if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]] + + + # Add bedding and overturned booleans + if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["OVERTURNED"] = ( + self.raw_data[Datatype.STRUCTURE][config["overturned_column"]] + .astype(str) + .str.contains(config["overturned_text"]) + ) + else: + structure["OVERTURNED"] = False + + if config["description_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["BEDDING"] = ( + self.raw_data[Datatype.STRUCTURE][config["description_column"]] + .astype(str) + .str.contains(config["bedding_text"]) + ) + else: + structure["BEDDING"] = False + + # Add object id + if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["ID"] = self.raw_data[Datatype.STRUCTURE][config["objectid_column"]] + else: + structure["ID"] = numpy.arange(len(structure)) + + self.data[Datatype.STRUCTURE] = structure + return (False, "") + @beartype.beartype def parse_fault_map(self) -> tuple: @@ -985,14 +904,6 @@ def parse_fault_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded fault map - if ( - self.raw_data[Datatype.FAULT] is None - or type(self.raw_data[Datatype.FAULT]) is not geopandas.GeoDataFrame - ): - logger.warning("Fault map is not loaded or valid") - return (True, "Fault map is not loaded or valid") - # Create a new geodataframe faults = geopandas.GeoDataFrame(self.raw_data[Datatype.FAULT]["geometry"]) @@ -1005,11 +916,12 @@ def parse_fault_map(self) -> tuple: self.minimum_fault_length = calculate_minimum_fault_length( bbox=self.bounding_box, area_percentage=0.05 ) - + logger.info(f"Calculated minimum fault length - {self.minimum_fault_length}") + # crop faults = faults.loc[faults.geometry.length >= self.minimum_fault_length] - - if config["structtype_column"] in self.raw_data[Datatype.FAULT]: + + if config["structtype_column"] in self.raw_data[Datatype.FAULT]: faults["FEATURE"] = self.raw_data[Datatype.FAULT][config["structtype_column"]] faults = faults[faults["FEATURE"].astype(str).str.contains(config["fault_text"])] if self.verbose_level > VerboseLevel.NONE: @@ -1039,7 +951,7 @@ def parse_fault_map(self) -> tuple: 
# Filter the DataFrame to remove rows where 'NAME' is in the existing_codes if existing_codes: faults = faults[~faults["NAME"].isin(existing_codes)] - logger.info(f"The following codes were found and removed: {existing_codes}") + logger.info(f"The following faults were found and removed as per the config: {existing_codes}") else: logger.info("None of the fault ignore codes exist in the original fault data.") pass @@ -1126,6 +1038,63 @@ def parse_fault_map(self) -> tuple: return (False, "") + @beartype.beartype + def parse_fault_orientations(self) -> tuple: + """ + Parse the fault orientations shapefile data into a consistent format + + Returns: + tuple: A tuple of (bool: success/fail, str: failure message) + """ + # Check type and size of loaded structure map + + + # Create new geodataframe + fault_orientations = geopandas.GeoDataFrame( + self.raw_data[Datatype.FAULT_ORIENTATION]["geometry"] + ) + + config = self.config.fault_config + + # Parse dip direction and dip columns + if config["dipdir_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + if config["orientation_type"] == "strike": + fault_orientations["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( + lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 + ) + else: + fault_orientations["DIPDIR"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["dipdir_column"] + ] + else: + print( + f"Fault orientation shapefile does not contain dipdir_column '{config['dipdir_column']}'" + ) + + if config["dip_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + fault_orientations["DIP"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["dip_column"] + ] + else: + print( + f"Fault orientation shapefile does not contain dip_column '{config['dip_column']}'" + ) + + # TODO LG would it be worthwhile adding a description column for faults? + # it would be possible to parse out the fault displacement, type, slip direction + # if this was stored in the descriptions? 
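+        # Orientations without a configured ID column fall back to a generated
+        # 0..n-1 index below, mirroring the behaviour of parse_structure_map.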
+ + # Add object id + if config["objectid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + fault_orientations["ID"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["objectid_column"] + ] + else: + fault_orientations["ID"] = numpy.arange(len(fault_orientations)) + self.data[Datatype.FAULT_ORIENTATION] = fault_orientations + return (False, "") + + @beartype.beartype def parse_fold_map(self) -> tuple: """ @@ -1682,4 +1651,4 @@ def STRUCTURE(self): @property def FAULT(self): - return self.get_map_data(Datatype.FAULT) + return self.get_map_data(Datatype.FAULT) \ No newline at end of file From 29466fe3986d5c8e224b0334ffbd6946bfe0cad8 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 01:12:34 +0000 Subject: [PATCH 14/29] style: style fixes by ruff and autoformatting by black --- map2loop/mapdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index a1bb3ae3..4f37b27f 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -20,7 +20,7 @@ import beartype import os from io import BytesIO -from typing import Union, Tuple +from typing import Union import tempfile From 848f31cd2b67728468bc681e838caa855e59701d Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 12:13:51 +1100 Subject: [PATCH 15/29] tests: add tests for data_checks for each datatype --- tests/mapdata/test_input_data_faults.py | 106 +++++++++++ tests/mapdata/test_input_data_geology.py | 202 +++++++++++++++++++++ tests/mapdata/test_input_data_structure.py | 136 ++++++++++++++ 3 files changed, 444 insertions(+) create mode 100644 tests/mapdata/test_input_data_faults.py create mode 100644 tests/mapdata/test_input_data_geology.py create mode 100644 tests/mapdata/test_input_data_structure.py diff --git a/tests/mapdata/test_input_data_faults.py b/tests/mapdata/test_input_data_faults.py new file mode 100644 index 00000000..cf668889 --- /dev/null +++ b/tests/mapdata/test_input_data_faults.py @@ -0,0 +1,106 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.m2l_enums import Datatype +from map2loop.data_checks import check_fault_fields_validity + +@pytest.mark.parametrize( + "fault_data, fault_config, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["Fault A", "Fault B"], + "ID": [1, 2] + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + False, + "" + ), + # Invalid geometry + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]) # Invalid geometry + ], + "FEATURE": ["Fault A", "Fault B"], + "ID": [1, 2] + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + True, + "Invalid geometries found in FAULT data." + ), + + # Non-string FEATURE column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": [5, 2], + "ID": [1, 2] + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + True, + "Datatype FAULT: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." 
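+            # note: expected messages are compared with '==' in the assertion at the
+            # end of this file, so they must match the data_checks.py wording exactly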
+ ), + # Invalid values in DIP estimate column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ["Zuleika", "Zuleika"], + "ID": [1, 2], + "DIP": [70, 50], + "STRIKE": [150, None], + "DEC": ["north_east", "southt"], + }, + { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": "NAME", + "dip_column": "DIP", + "dipdir_column": "STRIKE", + "dip_estimate_column": "DEC" + }, + True, + "Datatype FAULT: Column 'DEC' contains invalid values. Allowed values: ['north_east', 'south_east', 'south_west', 'north_west', 'north', 'east', 'south', 'west']." + ), + ], + ids=[ + "Valid fault data", + "Invalid geometry", + "Non-string FEATURE column", + "Invalid DIP estimate column" + ] +) +def test_check_fault_fields_validity(fault_data, fault_config, expected_validity, expected_message): + # Dynamically create the mock config for this test case + class MockConfig: + def __init__(self, config): + self.fault_config = config + + # Create a GeoDataFrame + fault_gdf = gpd.GeoDataFrame(fault_data, crs="EPSG:4326") + + # Instantiate the MapData class with the dynamic mock config and data + map_data = MapData() + map_data.config = MockConfig(fault_config) + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.FAULT] = fault_gdf + + # Test the check_fault_fields_validity function + validity_check, message = check_fault_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message diff --git a/tests/mapdata/test_input_data_geology.py b/tests/mapdata/test_input_data_geology.py new file mode 100644 index 00000000..4b083e76 --- /dev/null +++ b/tests/mapdata/test_input_data_geology.py @@ -0,0 +1,202 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.data_checks import check_geology_fields_validity + +# Datatype Enum +class Datatype: + GEOLOGY = 0 + +# Config +class MockConfig: + def __init__(self): + self.geology_config = { + "unitname_column": "UNITNAME", + "alt_unitname_column": "CODE", + "group_column": "GROUP", + "supergroup_column": "SUPERGROUP", + "description_column": "DESCRIPTION", + "rocktype_column": "ROCKTYPE1", + "alt_rocktype_column": "ROCKTYPE2", + "minage_column": "MIN_AGE", + "maxage_column": "MAX_AGE", + "objectid_column": "ID", + "ignore_lithology_codes": [], + } + +@pytest.mark.parametrize( + "geology_data, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + False, + "", + ), + # Invalid geometry + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)])], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Invalid geometries found in datatype GEOLOGY", + ), + # Missing required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + 
"UNITNAME": ["Sandstone"], + # "CODE": ["SST"], # Missing required column + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: Required column with config key: 'alt_unitname_column' is missing from geology data.", + ), + # Non-string value in required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": [2], # Non-string value + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: Column 'alt_unitname_column' must contain only string values. Please check that the column contains only string values.", + ), + # NaN or blank value in required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": [""], # Blank value + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: NaN or blank values found in required column 'unitname_column'. Please double check the column for blank values.", + ), + # Duplicate ID values + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", "df"], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 1], # Duplicate ID + }, + True, + "Datatype GEOLOGY: Duplicate values found in column 'ID' (config key: 'objectid_column').", + ), + # nan in id + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", "df"], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, None], + }, + True, + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains NaN or null values.", + ), + # nan in unit name + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", None], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 1], # Duplicate ID + }, + True, + "Datatype GEOLOGY: Column 'unitname_column' must contain only string values. 
Please check that the column contains only string values.", + ), + ], +) + + + +def test_check_geology_fields_validity(geology_data, expected_validity, expected_message): + # Create a GeoDataFrame + geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + + # Instantiate the MapData class with the mock config and data + map_data = MapData() + map_data.config = MockConfig() + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + + # Test the check_geology_fields_validity function + validity_check, message = check_geology_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message \ No newline at end of file diff --git a/tests/mapdata/test_input_data_structure.py b/tests/mapdata/test_input_data_structure.py new file mode 100644 index 00000000..d96441fc --- /dev/null +++ b/tests/mapdata/test_input_data_structure.py @@ -0,0 +1,136 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.data_checks import check_structure_fields_validity + +# Datatype Enum +class Datatype: + STRUCTURE = 1 + +# Config +class MockConfig: + def __init__(self): + self.structure_config = { + "dipdir_column": "DIPDIR", + "dip_column": "DIP", + "description_column": "DESCRIPTION", + "overturned_column": "OVERTURNED", + "objectid_column": "ID", + } + +@pytest.mark.parametrize( + "structure_data, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + False, + "", + ), + # Invalid geometry + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]) # Invalid geometry + ], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Invalid geometries found in datatype STRUCTURE", + ), + # Missing required column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + # "DIPDIR": [45.0, 135.0], # Missing required column + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Datatype STRUCTURE: Required column with config key: 'dipdir_column' is missing from structure data.", + ), + # Non-numeric value in numeric column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": ["A", "B"], # Non-numeric value + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Datatype STRUCTURE: Column 'dipdir_column' must contain only numeric values. Please check that the column contains only numeric values.", + ), + # NaN or blank value in required column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": [None, 3], # NaN value + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Datatype STRUCTURE: NaN or blank values found in required column 'dipdir_column'. 
Please double check the column for blank values.", + ), + # Duplicate ID column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 1] # Duplicate ID + }, + True, + "Datatype STRUCTURE: ID column 'ID' (config key: 'objectid_column') contains duplicate values.", + ), + ], +) +def test_check_structure_fields_validity(structure_data, expected_validity, expected_message): + # Create a GeoDataFrame + structure_gdf = gpd.GeoDataFrame(structure_data, crs="EPSG:4326") + + # Instantiate the MapData class with the mock config and data + map_data = MapData() + map_data.config = MockConfig() + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.STRUCTURE] = structure_gdf + + # Test the check_structure_fields_validity function + validity_check, message = check_structure_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message From c68ce9a004761a6b8551d0869723b0ae1583d516 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 12:26:53 +1100 Subject: [PATCH 16/29] fix: actually abort the process if validation fails --- map2loop/mapdata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index 4f37b27f..60ee159c 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -694,24 +694,24 @@ def check_map(self, datatype: Datatype): if datatype == Datatype.GEOLOGY: validity_check, message = check_geology_fields_validity(mapdata = self) if validity_check: - logger.error(f"Datatype GEOLOGY data validation failed: {message}") - return + logger.error(f"Datatype GEOLOGY - data validation failed: {message}") + raise ValueError(f"Datatype GEOLOGY - data validation failed: {message}") func = self.parse_geology_map #check and parse structure data elif datatype == Datatype.STRUCTURE: validity_check, message = check_structure_fields_validity(mapdata = self) if validity_check: - logger.error(f"Datatype STRUCTURE data validation failed: {message}") - return + logger.error(f"Datatype STRUCTURE - data validation failed: {message}") + raise ValueError(f"Datatype STRUCTURE - data validation failed: {message}") func = self.parse_structure_map #check and parse fault data elif datatype == Datatype.FAULT: validity_check, message = check_fault_fields_validity(mapdata = self) if validity_check: - logger.error(f"Datatype FAULT data validation failed: {message}") - return + logger.error(f"Datatype FAULT - data validation failed: {message}") + raise ValueError(f"Datatype FAULT - data validation failed: {message}") func = self.parse_fault_map elif datatype == Datatype.FAULT_ORIENTATION: From 06ff5510bcfbd1d24dc028f855155379a2098707 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 13:28:39 +1100 Subject: [PATCH 17/29] fix: add config check step to project --- map2loop/config.py | 25 ------------------------- map2loop/data_checks.py | 27 +++++++++++++++++++++++++++ map2loop/project.py | 5 ++++- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/map2loop/config.py b/map2loop/config.py index 2a0efdf7..e5467e95 100644 --- a/map2loop/config.py +++ b/map2loop/config.py @@ -213,31 +213,6 @@ def update_from_file( err_string += "Check the contents for mismatched quotes or brackets!" 
raise Exception(err_string) - @beartype.beartype - def validate_config_dictionary(self, config_dict: dict) -> None: - required_keys = { - "structure": {"dipdir_column", "dip_column"}, - "geology": {"unitname_column", "alt_unitname_column"}, - } - - # Loop over "structure" and "geology" - for section, keys in required_keys.items(): - - # 1) Check that "section" exists - if section not in config_dict: - logger.error(f"Missing required section '{section}' in config dictionary.") - raise ValueError(f"Missing required section '{section}' in config dictionary.") - - # 2) Check that each required key is in config_dict[section] - for key in keys: - if key not in config_dict[section]: - logger.error( - f"Missing required key '{key}' for '{section}' section of the config dictionary." - ) - raise ValueError( - f"Missing required key '{key}' for '{section}' section of the config dictionary." - ) - @beartype.beartype def check_for_legacy_keys(self, config_dict: dict) -> None: diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 66ec94e6..a254de16 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -442,3 +442,30 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: ) return (False, "") + + +@beartype.beartype +def validate_config_dictionary(config_dict: dict) -> None: + + required_keys = { + "structure": {"dipdir_column", "dip_column"}, + "geology": {"unitname_column", "alt_unitname_column"}, + } + + # Loop over "structure" and "geology" + for section, keys in required_keys.items(): + + # 1) Check that "section" exists + if section not in config_dict: + logger.error(f"Missing required section '{section}' in config dictionary.") + raise ValueError(f"Missing required section '{section}' in config dictionary.") + + # 2) Check that each required key is in config_dict[section] + for key in keys: + if key not in config_dict[section]: + logger.error( + f"Missing required key '{key}' for '{section}' section of the config dictionary." + ) + raise ValueError( + f"Missing required key '{key}' for '{section}' section of the config dictionary." 
+ ) \ No newline at end of file diff --git a/map2loop/project.py b/map2loop/project.py index 6e4303c6..ca4ecf8d 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -11,6 +11,7 @@ from .stratigraphic_column import StratigraphicColumn from .deformation_history import DeformationHistory from .map2model_wrapper import Map2ModelWrapper +from .data_checks import validate_config_dictionary # external imports import LoopProjectFile as LPF @@ -231,8 +232,10 @@ def __init__( self.map_data.set_config_filename(config_filename) if config_dictionary != {}: - self.map_data.config.validate_config_dictionary(config_dictionary) + validate_config_dictionary(config_dictionary) self.map_data.config.update_from_dictionary(config_dictionary) + # print(self.map_data.config) + # self.map_data.config.validate_config_dictionary(config_dictionary) if clut_filename != "": self.map_data.set_colour_filename(clut_filename) From 6529bf8427668273d8f963ffc27742fef49e1151 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 14:02:12 +1100 Subject: [PATCH 18/29] fix: add extra checks for config dictionary --- map2loop/config.py | 30 +----------------- map2loop/data_checks.py | 69 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 32 deletions(-) diff --git a/map2loop/config.py b/map2loop/config.py index e5467e95..edff2e20 100644 --- a/map2loop/config.py +++ b/map2loop/config.py @@ -100,8 +100,6 @@ def update_from_dictionary(self, dictionary: dict, lower: bool = True): Args: dictionary (dict): The dictionary to update from """ - # make sure dictionary doesn't contain legacy keys - self.check_for_legacy_keys(dictionary) if "structure" in dictionary: self.structure_config.update(dictionary["structure"]) @@ -211,30 +209,4 @@ def update_from_file( else: err_string += "Please check the file exists and is accessible then\n" err_string += "Check the contents for mismatched quotes or brackets!" - raise Exception(err_string) - - - @beartype.beartype - def check_for_legacy_keys(self, config_dict: dict) -> None: - - legacy_keys = { - "otype", "dd", "d", "sf", "bedding", "bo", "btype", "gi", "c", "u", - "g", "g2", "ds", "min", "max", "r1", "r2", "sill", "intrusive", "volcanic", - "f", "fdipnull", "fdipdip_flag", "fdipdir", "fdip", "fdipest", - "fdipest_vals", "n", "ff", "t", "syn" - } - - # Recursively search for keys in the dictionary - def check_keys(d: dict, parent_key=""): - for key, value in d.items(): - if key in legacy_keys: - logger.error( - f"Legacy key found in config - '{key}' at '{parent_key + key}'. Please use the new config format. Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" - ) - raise ValueError( - f"Legacy key found in config - '{key}' at '{parent_key + key}'. Please use the new config format. 
Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" - ) - if isinstance(value, dict): - check_keys(value, parent_key=f"{parent_key}{key}.") - - check_keys(config_dict) \ No newline at end of file + raise Exception(err_string) \ No newline at end of file diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index a254de16..83b9f1c0 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -447,6 +447,7 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: @beartype.beartype def validate_config_dictionary(config_dict: dict) -> None: + # 1) check mandatory keys for "structure" and "geology" required_keys = { "structure": {"dipdir_column", "dip_column"}, "geology": {"unitname_column", "alt_unitname_column"}, @@ -463,9 +464,71 @@ def validate_config_dictionary(config_dict: dict) -> None: # 2) Check that each required key is in config_dict[section] for key in keys: if key not in config_dict[section]: + logger.error(f"Missing required key '{key}' for '{section}' section of the config dictionary.") + raise ValueError(f"Missing required key '{key}' for '{section}' section of the config dictionary.") + + # 2) check for legacy keys first: + legacy_keys = { + "otype", "dd", "d", "sf", "bedding", "bo", "btype", "gi", "c", "u", + "g", "g2", "ds", "min", "max", "r1", "r2", "sill", "intrusive", "volcanic", + "f", "fdipnull", "fdipdip_flag", "fdipdir", "fdip", "fdipest", + "fdipest_vals", "n", "ff", "t", "syn" + } + + def check_keys(d: dict, parent_key=""): + for key, value in d.items(): + if key in legacy_keys: logger.error( - f"Missing required key '{key}' for '{section}' section of the config dictionary." + f"Legacy key found in config - '{key}' at '{parent_key}'. Please use the new config format. Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" ) raise ValueError( - f"Missing required key '{key}' for '{section}' section of the config dictionary." - ) \ No newline at end of file + f"Legacy key found in config - '{key}' at '{parent_key}'. Please use the new config format. 
Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" + ) + if isinstance(value, dict): + check_keys(value, parent_key=f"{parent_key}{key}.") + + check_keys(config_dict) + + # 3) check if all keys are valid: + allowed_keys_by_section = { + "structure": { + "orientation_type", "dipdir_column", "dip_column", + "description_column", "bedding_text", "overturned_column", "overturned_text", + "objectid_column", "desciption_column", + }, + "geology": { + "unitname_column", "alt_unitname_column", "group_column", + "supergroup_column", "description_column", "minage_column", + "maxage_column", "rocktype_column", "alt_rocktype_column", + "sill_text", "intrusive_text", "volcanic_text", "objectid_column", "ignore_lithology_codes", + }, + "fault": { + "structtype_column", "fault_text", "dip_null_value", + "dipdir_flag", "dipdir_column", "dip_column", "orientation_type", + "dipestimate_column", "dipestimate_text", "name_column", + "objectid_column", "minimum_fault_length", "ignore_fault_codes", + }, + "fold": { + "structtype_column", "fold_text", "description_column", + "synform_text", "foldname_column","objectid_column", + }, + } + + for section_name, section_dict in config_dict.items(): + # check section + if section_name not in allowed_keys_by_section: + logger.error(f"Unrecognized section '{section_name}' in config dictionary.") + raise ValueError(f"Unrecognized section '{section_name}' in config dictionary.") + + # check keys + allowed_keys = allowed_keys_by_section[section_name] + for key in section_dict.keys(): + if key not in allowed_keys: + logger.error(f"Key '{key}' is not an allowed key in the '{section_name}' section.") + raise ValueError(f"Key '{key}' is not an allowed key in the '{section_name}' section.") + + # 4) check if minimum fault length is a number + mfl = config_dict.get("fault", {}).get("minimum_fault_length", None) + if mfl is not None and not isinstance(mfl, (int, float)): + logger.error("minimum_fault_length must be a number.") + raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}") \ No newline at end of file From ce444892d6c87602ae9abbd82d820e9ba95dfbe0 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 14:10:56 +1100 Subject: [PATCH 19/29] tests: add config test and reorganise --- tests/data_checks/test_config.py | 135 ++++++++++++++++++ .../test_input_data_faults.py | 0 .../test_input_data_geology.py | 0 .../test_input_data_structure.py | 0 4 files changed, 135 insertions(+) create mode 100644 tests/data_checks/test_config.py rename tests/{mapdata => data_checks}/test_input_data_faults.py (100%) rename tests/{mapdata => data_checks}/test_input_data_geology.py (100%) rename tests/{mapdata => data_checks}/test_input_data_structure.py (100%) diff --git a/tests/data_checks/test_config.py b/tests/data_checks/test_config.py new file mode 100644 index 00000000..cc6f01ba --- /dev/null +++ b/tests/data_checks/test_config.py @@ -0,0 +1,135 @@ +import pytest +from map2loop.data_checks import validate_config_dictionary + + +@pytest.fixture +def valid_config(): + return { + "structure": { + "orientation_type": "dip direction", + "dipdir_column": "azimuth", + "dip_column": "inclinatn", + "description_column": "DESCRIPTION", + "bedding_text": "bed", + "overturned_column": "no_col", + "overturned_text": "blah", + "objectid_column": "geographic", + "desciption_column": "sub_type" + }, + "geology": { + "unitname_column": "formatted_", + "alt_unitname_column": "abbreviate", + "group_column": "no_col", + 
"supergroup_column": "interpreta", + "description_column": "text_descr", + "minage_column": "no_col", + "maxage_column": "no_col", + "rocktype_column": "rank", + "alt_rocktype_column": "type", + "sill_text": "sill", + "intrusive_text": "intrusion", + "volcanic_text": "volc", + "objectid_column": "ID", + "ignore_lithology_codes": ["cover"] + }, + "fault": { + "structtype_column": "featuretyp", + "fault_text": "s", + "dip_null_value": "0", + "dipdir_flag": "num", + "dipdir_column": "no_col", + "dip_column": "no_col", + "orientation_type": "dip direction", + "dipestimate_column": "no_col", + "dipestimate_text": "no_col", + "name_column": "no_col", + "objectid_column": "geographic", + "minimum_fault_length": 100.0, + "ignore_fault_codes": [] + }, + "fold": { + "structtype_column": "featuretyp", + "fold_text": "fold", + "description_column": "no_col", + "synform_text": "syn", + "foldname_column": "NAME", + "objectid_column": "geographic" + } + } + + +def test_valid_config_no_errors(valid_config): + # Should not raise any error + validate_config_dictionary(valid_config) + + +def test_missing_required_section(valid_config): + + config_missing_structure = dict(valid_config) + del config_missing_structure["structure"] # remove required section + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_missing_structure) + assert "Missing required section 'structure'" in str(exc_info.value) + + +def test_missing_required_key(valid_config): + + config_missing_dip = dict(valid_config) + + del config_missing_dip["structure"]["dip_column"] # remove required key + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_missing_dip) + assert "Missing required key 'dip_column' for 'structure'" in str(exc_info.value) + + +def test_unrecognized_section(valid_config): + + config_extra_section = dict(valid_config) + config_extra_section["random_section"] = {"random_key": "random_value"} + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_extra_section) + assert "Unrecognized section 'random_section'" in str(exc_info.value) + + +def test_unrecognized_key_in_section(valid_config): + + config_extra_key = dict(valid_config) + config_extra_key["structure"]["random_key"] = "random_value" + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_extra_key) + assert "Key 'random_key' is not an allowed key in the 'structure' section." in str(exc_info.value) + + +def test_legacy_key_detected(valid_config): + + config_with_legacy = dict(valid_config) + config_with_legacy["structure"]["otype"] = "legacy_value" # 'otype' --> legacy key + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_with_legacy) + assert "Legacy key found in config - 'otype'" in str(exc_info.value) + + +def test_minimum_fault_length_wrong_type(valid_config): + + config_wrong_mfl = dict(valid_config) + config_wrong_mfl["fault"]["minimum_fault_length"] = "one_hundred" # invalid type + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_wrong_mfl) + assert "minimum_fault_length must be a number" in str(exc_info.value) + + +def test_minimum_fault_length_missing(valid_config): + """ + Remove minimum_fault_length entirely. That should be fine (None -> no check). 
+ """ + config_no_mfl = dict(valid_config) + del config_no_mfl["fault"]["minimum_fault_length"] + + # Should not raise any error, as it's optional + validate_config_dictionary(config_no_mfl) + diff --git a/tests/mapdata/test_input_data_faults.py b/tests/data_checks/test_input_data_faults.py similarity index 100% rename from tests/mapdata/test_input_data_faults.py rename to tests/data_checks/test_input_data_faults.py diff --git a/tests/mapdata/test_input_data_geology.py b/tests/data_checks/test_input_data_geology.py similarity index 100% rename from tests/mapdata/test_input_data_geology.py rename to tests/data_checks/test_input_data_geology.py diff --git a/tests/mapdata/test_input_data_structure.py b/tests/data_checks/test_input_data_structure.py similarity index 100% rename from tests/mapdata/test_input_data_structure.py rename to tests/data_checks/test_input_data_structure.py From 0a85eac3fe89600ccdf31e8ef983d50febe66dec Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 16:47:53 +1100 Subject: [PATCH 20/29] fix: add data checks for fold data and update tests accordingly --- map2loop/data_checks.py | 154 +++++++++++++++++++++- map2loop/mapdata.py | 12 +- tests/data_checks/test_input_data_fold.py | 112 ++++++++++++++++ 3 files changed, 270 insertions(+), 8 deletions(-) create mode 100644 tests/data_checks/test_input_data_fold.py diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 83b9f1c0..0954b08f 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -13,7 +13,7 @@ @beartype.beartype def check_geology_fields_validity(mapdata) -> tuple[bool, str]: - #TODO (AR) - add check for gaps in geology data + #TODO (AR) - add check for gaps in geology data (inspo here: https://medium.com/@achm.firmansyah/an-approach-for-checking-overlaps-and-gaps-in-polygons-using-geopandas-ebd6606e7f70 ) """ Validate the columns in GEOLOGY geodataframe @@ -42,7 +42,16 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: if not geology_data.geometry.is_valid.all(): logger.error("Invalid geometries found. Please fix those before proceeding with map2loop processing") return (True, "Invalid geometries found in datatype GEOLOGY") + + # Check if all geometries are Polygon or MultiPolygon + if not geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon))).all(): + invalid_types = geology_data[~geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon)))] + logger.error( + f"datatype GEOLOGY: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" + ) + return (True, "Invalid geometry types found in datatype GEOLOGY. All geometries must be Polygon or MultiPolygon.") + # # 2. Required Columns & are they str, and then empty or null? required_columns = [config["unitname_column"], config["alt_unitname_column"]] for col in required_columns: @@ -170,6 +179,14 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: logger.error("datatype STRUCTURE: Invalid geometries found. Please fix those before proceeding with map2loop processing") return (True, "Invalid geometries found in datatype STRUCTURE") + # Check if all geometries are Points + if not structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point)).all(): + invalid_types = structure_data[~structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point))] + logger.error( + f"datatype STRUCTURE: Invalid geometry types found. 
Rows with invalid types: {invalid_types.index.tolist()}" + ) + return (True, "Invalid geometry types found in datatype STRUCTURE. All geometries must be Points.") + # 2. Check mandatory numeric columns required_columns = [config["dipdir_column"], config["dip_column"]] for col in required_columns: @@ -443,6 +460,141 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: return (False, "") +@beartype.beartype +def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: + # Check type of loaded fold map + if ( + mapdata.raw_data[Datatype.FOLD] is None + or type(mapdata.raw_data[Datatype.FOLD]) is not geopandas.GeoDataFrame + ): + logger.warning("Fold map is not loaded or valid") + return (True, "Fold map is not loaded or valid") + + folds = mapdata.raw_data[Datatype.FOLD] + config = mapdata.config.fold_config + + # Debugging: Print column names in the fold_data + logger.debug(f"Fold data columns: {folds.columns.tolist()}") + + # Check geometry + if not folds.geometry.is_valid.all(): + logger.error("datatype FOLD: Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in FOLD data.") + + # Check for LineString or MultiLineString geometries + if not folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): + invalid_types = folds[~folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] + logger.error( + f"datatype FOLD: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" + ) + return (True, "Invalid geometry types found in FOLD data.") + + # Check "structtype_column" if it exists + if "structtype_column" in config: + structtype_column = config["structtype_column"] + + # Ensure the column exists in the data + if structtype_column not in folds.columns: + logger.warning( + f"Datatype FOLD: '{structtype_column}' (config key: 'structtype_column') is missing from the fold data. Consider removing that key from the config" + ) + return (True, f"Column '{structtype_column}' is missing from the fold data.") + else: + # Check if all entries in the column are strings + if not folds[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.error( + f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." + ) + return (True, f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") + + # Warn about empty or null cells + if folds[structtype_column].isnull().any() or folds[structtype_column].str.strip().eq("").any(): + logger.warning( + f"Datatype FOLD: Column '{structtype_column}' contains NaN, empty, or blank values. Processing might not work as expected." + ) + + # Check if "fold_text" is defined and contained in the column + fold_text = config.get("fold_text", None) + if fold_text: + + # check if fold text is a string + if not isinstance(fold_text, str): + logger.error("Datatype FOLD: 'fold_text' must be a string. Please ensure it is defined correctly in the config.") + return (True, "Datatype FOLD: 'fold_text' must be a string.") + #check if it exists in the column strtype + if not folds[structtype_column].str.contains(fold_text, na=False).any(): + logger.error(f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'. 
This may impact processing.")
+                return (True, f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'.")
+
+    # check synform_text
+    synform_text = config.get("synform_text", None)
+    if synform_text:
+        # Check if synform_text is a string
+        if not isinstance(synform_text, str):
+            logger.error("Datatype FOLD: 'synform_text' must be a string. Please ensure it is defined correctly in the config.")
+            return (True, "Datatype FOLD: 'synform_text' must be a string.")
+        # Check if it exists in the structtype_column
+        if not folds[structtype_column].str.contains(synform_text, na=False).any():
+            logger.warning(
+                f"Datatype FOLD: The 'synform_text' value '{synform_text}' is not found in column '{structtype_column}'. This may impact processing."
+            )
+
+    # check description column
+    description_column = config.get("description_column", None)
+    if description_column:
+        # Ensure the column exists in the data
+        if description_column not in folds.columns:
+            logger.warning(
+                f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') is missing from the fold data. Consider removing that key from the config."
+            )
+        else:
+            # Check if all entries in the column are strings
+            if not folds[description_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all():
+                logger.error(
+                    f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') contains non-string values. Please ensure all values in this column are strings."
+                )
+                return (True, f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') contains non-string values.")
+
+            # Warn about empty or null cells
+            if folds[description_column].isnull().any() or folds[description_column].str.strip().eq("").any():
+                logger.warning(
+                    f"Datatype FOLD: Column '{description_column}' contains NaN, empty, or blank values. Processing might not work as expected."
+                )
+
+
+    # Check ID column
+    id_column = config.get("objectid_column")
+
+    if id_column:
+        if id_column in folds.columns:
+            # Attempt to coerce the ID column to numeric
+            folds[id_column] = pandas.to_numeric(folds[id_column], errors='coerce')
+
+            # Check if all values are integers or null after coercion
+            if not folds[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all():
+                logger.warning(
+                    f"Datatype FOLD: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs."
+                )
+
+            # Check for NaN values
+            if folds[id_column].isnull().any():
+                logger.warning(
+                    f"Datatype FOLD: ID column '{id_column}' contains NaN or null values (non-numeric entries are coerced to NaN by this check). Rectify this or remove the key from the config to auto-generate IDs."
+                )
+
+            # Check for duplicate values
+            if folds[id_column].duplicated().any():
+                logger.error(
+                    f"Datatype FOLD: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs."
+                )
+                return (True, f"Datatype FOLD: ID column '{id_column}' contains duplicate values.")
+        else:
+            logger.warning(
+                f"Datatype FOLD: ID column '{id_column}' is missing from the fold data. Ensure the column name is correct or remove the key from the config."
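+                # only a warning: the ID can be auto-generated downstream when the
+                # key is removed from the config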
+ ) + + return (False, "") + @beartype.beartype def validate_config_dictionary(config_dict: dict) -> None: diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index 60ee159c..e9fbb302 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -3,7 +3,7 @@ from .config import Config from .aus_state_urls import AustraliaStateUrls from .utils import generate_random_hex_colors, calculate_minimum_fault_length -from .data_checks import check_geology_fields_validity, check_structure_fields_validity, check_fault_fields_validity +from .data_checks import check_geology_fields_validity, check_structure_fields_validity, check_fault_fields_validity, check_fold_fields_validity # external imports import geopandas @@ -719,6 +719,10 @@ def check_map(self, datatype: Datatype): #check and parse fold data elif datatype == Datatype.FOLD: + validity_check, message = check_fold_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype FOLD - data validation failed: {message}") + raise ValueError(f"Datatype FOLD - data validation failed: {message}") func = self.parse_fold_map if func: @@ -1103,12 +1107,6 @@ def parse_fold_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded fold map - if ( - self.raw_data[Datatype.FOLD] is None - or type(self.raw_data[Datatype.FOLD]) is not geopandas.GeoDataFrame - ): - return (True, "Fold map is not loaded or valid") # Create new geodataframe folds = geopandas.GeoDataFrame(self.raw_data[Datatype.FOLD]["geometry"]) diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py new file mode 100644 index 00000000..cb92ad2e --- /dev/null +++ b/tests/data_checks/test_input_data_fold.py @@ -0,0 +1,112 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.m2l_enums import Datatype +from map2loop.data_checks import check_fold_fields_validity + +@pytest.mark.parametrize( + "fold_data, fold_config, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + False, + "" + ), + # Missing geometry + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), # Invalid type + shapely.geometry.LineString([(0, 0), (1, 1)]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Invalid geometry types found in FOLD data." + ), + # Non-string FEATURE column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": [123, 456], # Invalid type + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Datatype FOLD: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." 
+ ), + # Missing ID column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + False, + "" + ), + # Duplicate ID values + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 1], # Duplicate values + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Datatype FOLD: ID column 'ID' contains duplicate values." + ), + ], + ids=[ + "Valid fold data", + "Invalid geometry", + "Non-string FEATURE column", + "Missing ID column", + "Duplicate ID values" + ] +) +def test_check_fold_fields_validity(fold_data, fold_config, expected_validity, expected_message): + # Dynamically create the mock config for this test case + class MockConfig: + def __init__(self, config): + self.fold_config = config + + # Create a GeoDataFrame + fold_gdf = gpd.GeoDataFrame(fold_data, crs="EPSG:4326") + + # Instantiate the MapData class with the dynamic mock config and data + map_data = MapData() + map_data.config = MockConfig(fold_config) + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.FOLD] = fold_gdf + + # Test the check_fold_fields_validity function + validity_check, message = check_fold_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message From 273b82d3988a5747350ea0fa19c63a23db913322 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 15:36:48 +1100 Subject: [PATCH 21/29] typos from merging --- map2loop/mapdata.py | 159 +++++--------------------------------------- 1 file changed, 16 insertions(+), 143 deletions(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index de13de7c..ab6376a7 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -730,146 +730,6 @@ def check_map(self, datatype: Datatype): if error: logger.error(message) - @beartype.beartype - def parse_fault_orientations(self) -> tuple: - """ - Parse the fault orientations shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.FAULT_ORIENTATION] is None - or type(self.raw_data[Datatype.FAULT_ORIENTATION]) is not geopandas.GeoDataFrame - ): - logger.warning("Fault orientation shapefile is not loaded or valid") - return (True, "Fault orientation shapefile is not loaded or valid") - - # Create new geodataframe - fault_orientations = geopandas.GeoDataFrame( - self.raw_data[Datatype.FAULT_ORIENTATION]["geometry"] - ) - - config = self.config.fault_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - if config["orientation_type"] == "strike": - fault_orientations["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - fault_orientations["DIPDIR"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dipdir_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dipdir_column 
'{config['dipdir_column']}'" - ) - - if config["dip_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["DIP"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dip_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dip_column '{config['dip_column']}'" - ) - - # TODO LG would it be worthwhile adding a description column for faults? - # it would be possible to parse out the fault displacement, type, slip direction - # if this was stored in the descriptions? - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["ID"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["objectid_column"] - ] - else: - fault_orientations["ID"] = numpy.arange(len(fault_orientations)) - self.data[Datatype.FAULT_ORIENTATION] = fault_orientations - - if config["featureid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["featureId"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["featureid_column"] - ] - else: - fault_orientations["featureId"] = numpy.arange(len(fault_orientations)) - - return (False, "") - - @beartype.beartype - def parse_structure_map(self) -> tuple: - """ - Parse the structure shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.STRUCTURE] is None - or type(self.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame - ): - logger.warning("Structure map is not loaded or valid") - return (True, "Structure map is not loaded or valid") - - if len(self.raw_data[Datatype.STRUCTURE]) < 2: - logger.warning( - "Stucture map does not enough orientations to complete calculations (need at least 2), projection may be inconsistent" - ) - - # Create new geodataframe - structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"]) - config = self.config.structure_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]: - if config["orientation_type"] == "strike": - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]] - else: - print(f"Structure map does not contain dipdir_column '{config['dipdir_column']}'") - - # Ensure all DIPDIR values are within [0, 360] - structure["DIPDIR"] = structure["DIPDIR"] % 360.0 - - if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]] - else: - print(f"Structure map does not contain dip_column '{config['dip_column']}'") - - # Add bedding and overturned booleans - if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["OVERTURNED"] = ( - self.raw_data[Datatype.STRUCTURE][config["overturned_column"]] - .astype(str) - .str.contains(config["overturned_text"]) - ) - else: - structure["OVERTURNED"] = False - - if config["description_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["BEDDING"] = ( - self.raw_data[Datatype.STRUCTURE][config["description_column"]] - .astype(str) - .str.contains(config["bedding_text"]) - ) - else: - structure["BEDDING"] = False - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["ID"] = 
self.raw_data[Datatype.STRUCTURE][config["objectid_column"]] - else: - structure["ID"] = numpy.arange(len(structure)) - - self.data[Datatype.STRUCTURE] = structure - return (False, "") @beartype.beartype def parse_geology_map(self) -> tuple: @@ -1183,7 +1043,7 @@ def parse_fault_map(self) -> tuple: return (False, "") - @beartype.beartype + @beartype.beartype def parse_fault_orientations(self) -> tuple: """ Parse the fault orientations shapefile data into a consistent format @@ -1192,7 +1052,12 @@ def parse_fault_orientations(self) -> tuple: tuple: A tuple of (bool: success/fail, str: failure message) """ # Check type and size of loaded structure map - + if ( + self.raw_data[Datatype.FAULT_ORIENTATION] is None + or type(self.raw_data[Datatype.FAULT_ORIENTATION]) is not geopandas.GeoDataFrame + ): + logger.warning("Fault orientation shapefile is not loaded or valid") + return (True, "Fault orientation shapefile is not loaded or valid") # Create new geodataframe fault_orientations = geopandas.GeoDataFrame( @@ -1237,9 +1102,17 @@ def parse_fault_orientations(self) -> tuple: else: fault_orientations["ID"] = numpy.arange(len(fault_orientations)) self.data[Datatype.FAULT_ORIENTATION] = fault_orientations + + if config["featureid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + fault_orientations["featureId"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["featureid_column"] + ] + else: + fault_orientations["featureId"] = numpy.arange(len(fault_orientations)) + return (False, "") - + @beartype.beartype def parse_fold_map(self) -> tuple: """ From 04b43f1fd2dbcf02aacd0b60225aebaf4cb897d8 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 15:38:07 +1100 Subject: [PATCH 22/29] chore: another typo --- map2loop/mapdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index ab6376a7..dd710a99 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -1043,7 +1043,7 @@ def parse_fault_map(self) -> tuple: return (False, "") - @beartype.beartype + @beartype.beartype def parse_fault_orientations(self) -> tuple: """ Parse the fault orientations shapefile data into a consistent format From fc2e393bcc51c9ca72d3ed90c9818e576191dbac Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 15:58:32 +1100 Subject: [PATCH 23/29] fix: remove kwargs from project --- map2loop/project.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index ca4ecf8d..d752bcb3 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -35,7 +35,7 @@ class Project(object): """ The main entry point into using map2loop - Attiributes + Attributes ----------- verbose_level: m2l_enums.VerboseLevel A selection that defines how much console logging is output @@ -75,7 +75,6 @@ def __init__( save_pre_checked_map_data: bool = False, loop_project_filename: str = "", overwrite_loopprojectfile: bool = False, - **kwargs, ): """ The initialiser for the map2loop project @@ -121,17 +120,6 @@ def __init__( ValueError: use_australian_state_data not in state list ['WA', 'SA', 'QLD', 'NSW', 'TAS', 'VIC', 'ACT', 'NT'] """ - # Throw error if unexpected keyword arguments are passed to project - allowed_kwargs = {"metadata_filename"} - for key in kwargs.keys(): - if key not in allowed_kwargs: - logger.error( - f"Unexpected keyword argument '{key}' passed to Project. Allowed keywords: {', '.join(allowed_kwargs)}." 
- ) - raise TypeError( - f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding with map2loop processing" - ) - # make sure all the needed arguments are provided if not use_australian_state_data: # this check has to skip if using Loop server data self.validate_required_inputs( @@ -170,11 +158,6 @@ def __init__( self.fold_samples = pandas.DataFrame(columns=["ID", "X", "Y", "Z", "featureId"]) self.geology_samples = pandas.DataFrame(columns=["ID", "X", "Y", "Z", "featureId"]) - - # Check for alternate config filenames in kwargs - if "metadata_filename" in kwargs and config_filename == "": - config_filename = kwargs["metadata_filename"] - # Sanity check on working projection parameter if issubclass(type(working_projection), str) or issubclass(type(working_projection), int): self.map_data.set_working_projection(working_projection) @@ -257,8 +240,6 @@ def __init__( self.stratigraphic_column.populate(self.map_data.get_map_data(Datatype.GEOLOGY)) self.deformation_history.populate(self.map_data.get_map_data(Datatype.FAULT)) - if len(kwargs): - logger.warning(f"Unused keyword arguments: {kwargs}") @beartype.beartype def validate_required_inputs( From e919f941be609c52eb065572d17041f30094d8a7 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 16:44:13 +1100 Subject: [PATCH 24/29] chore: refactor geometry checks --- map2loop/data_checks.py | 129 ++++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 0954b08f..68ca13b3 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -3,7 +3,7 @@ #external imports import beartype as beartype -from beartype.typing import Tuple +from beartype.typing import Tuple, List import geopandas import shapely import pandas @@ -38,21 +38,16 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: geology_data = mapdata.raw_data[Datatype.GEOLOGY] config = mapdata.config.geology_config - # 1. Check geometry validity - tested & working - if not geology_data.geometry.is_valid.all(): - logger.error("Invalid geometries found. Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in datatype GEOLOGY") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=geology_data, + expected_geom_types=[shapely.Polygon, shapely.MultiPolygon], + datatype_name="GEOLOGY" + ) + if failed: + return (failed, message) - # Check if all geometries are Polygon or MultiPolygon - if not geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon))).all(): - invalid_types = geology_data[~geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon)))] - logger.error( - f"datatype GEOLOGY: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" - ) - return (True, "Invalid geometry types found in datatype GEOLOGY. All geometries must be Polygon or MultiPolygon.") - - - # # 2. Required Columns & are they str, and then empty or null? + # # 3. Required Columns & are they str, and then empty or null? 
required_columns = [config["unitname_column"], config["alt_unitname_column"]] for col in required_columns: if col not in geology_data.columns: @@ -141,6 +136,7 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: logger.info("Geology fields validation passed.") return (False, "") + @beartype.beartype def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: """ @@ -174,18 +170,14 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: structure_data = mapdata.raw_data[Datatype.STRUCTURE] config = mapdata.config.structure_config - # 1. Check geometry validity - if not structure_data.geometry.is_valid.all(): - logger.error("datatype STRUCTURE: Invalid geometries found. Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in datatype STRUCTURE") - - # Check if all geometries are Points - if not structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point)).all(): - invalid_types = structure_data[~structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point))] - logger.error( - f"datatype STRUCTURE: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" - ) - return (True, "Invalid geometry types found in datatype STRUCTURE. All geometries must be Points.") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=structure_data, + expected_geom_types=[shapely.Point, shapely.MultiPoint], + datatype_name="STRUCTURE" + ) + if failed: + return (failed, message) # 2. Check mandatory numeric columns required_columns = [config["dipdir_column"], config["dip_column"]] @@ -273,18 +265,14 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: fault_data = mapdata.raw_data[Datatype.FAULT] config = mapdata.config.fault_config - # Check geometry - if not fault_data.geometry.is_valid.all(): - logger.error("datatype FAULT: Invalid geometries found. Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in FAULT data.") - - # Check for LineString or MultiLineString geometries - if not fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): - invalid_types = fault_data[~fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] - logger.error( - f"FAULT data contains invalid geometry types. Rows with invalid geometry types: {invalid_types.index.tolist()}" - ) - return (True, "FAULT data contains geometries that are not LineString or MultiLineString.") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=fault_data, + expected_geom_types=[shapely.LineString, shapely.MultiLineString], + datatype_name="FAULT" + ) + if failed: + return (failed, message) # Check "structtype_column" if it exists if "structtype_column" in config: @@ -476,18 +464,14 @@ def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: # Debugging: Print column names in the fold_data logger.debug(f"Fold data columns: {folds.columns.tolist()}") - # Check geometry - if not folds.geometry.is_valid.all(): - logger.error("datatype FOLD: Invalid geometries found. 
Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in FOLD data.") - - # Check for LineString or MultiLineString geometries - if not folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): - invalid_types = folds[~folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] - logger.error( - f"datatype FOLD: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" - ) - return (True, "Invalid geometry types found in FOLD data.") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=folds, + expected_geom_types=[shapely.LineString, shapely.MultiLineString], + datatype_name="FOLD" + ) + if failed: + return (failed, message) # Check "structtype_column" if it exists if "structtype_column" in config: @@ -683,4 +667,45 @@ def check_keys(d: dict, parent_key=""): mfl = config_dict.get("fault", {}).get("minimum_fault_length", None) if mfl is not None and not isinstance(mfl, (int, float)): logger.error("minimum_fault_length must be a number.") - raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}") \ No newline at end of file + raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}") + + +def validate_geometry( + geodata: geopandas.GeoDataFrame, + expected_geom_types: List[type], + datatype_name: str +) -> Tuple[bool, str]: + """ + Validates the geometry column of a GeoDataFrame. + + Parameters: + geodata (gpd.GeoDataFrame): The GeoDataFrame to validate. + expected_geom_types (List[type]): A list of expected Shapely geometry types. + datatype_name (str): A string representing the datatype being validated (e.g., "GEOLOGY"). + + Returns: + Tuple[bool, str]: A tuple where the first element is a boolean indicating if validation failed, + and the second element is an error message if failed. + """ + # 1. Check if all geometries are valid + if not geodata.geometry.is_valid.all(): + logger.error(f"Invalid geometries found in datatype {datatype_name}. Please fix them before proceeding.") + return True, f"Invalid geometries found in datatype {datatype_name}." + + # 2. Check if all geometries are of the expected types + if not geodata.geometry.apply(lambda geom: isinstance(geom, tuple(expected_geom_types))).all(): + invalid_types = geodata[~geodata.geometry.apply(lambda geom: isinstance(geom, tuple(expected_geom_types)))] + invalid_indices = invalid_types.index.tolist() + expected_types_names = ', '.join([geom_type.__name__ for geom_type in expected_geom_types]) + logger.error( + f"Datatype {datatype_name}: Invalid geometry types found. Expected types: {expected_types_names}. " + f"Rows with invalid types: {invalid_indices}" + ) + return True, ( + f"Invalid geometry types found in datatype {datatype_name}. " + f"All geometries must be {expected_types_names}." 
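The is_valid gate in validate_geometry is what the updated fold test later in this series relies on: a zero-length LineString is an invalid geometry in shapely/GEOS, as is a self-intersecting polygon. A quick standalone illustration (outputs reflect shapely 2.x behaviour):

    import shapely

    print(shapely.LineString([(0, 0), (0, 0)]).is_valid)                # False - zero-length line
    print(shapely.Polygon([(0, 0), (1, 1), (1, 0), (0, 1)]).is_valid)   # False - self-intersecting "bowtie"
    print(shapely.LineString([(0, 0), (1, 1)]).is_valid)                # True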
+ )
+
+    # If all checks pass
+    logger.debug(f"Geometry validation passed for datatype {datatype_name}.")
+    return False, ""
\ No newline at end of file

From 000226238e6b1f756c6d89a336450e27f15ccbb6 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Mon, 13 Jan 2025 17:19:34 +1100
Subject: [PATCH 25/29] chore: update tests for geometry refactor

---
 map2loop/data_checks.py                     | 6 +++---
 tests/data_checks/test_input_data_faults.py | 2 +-
 tests/data_checks/test_input_data_fold.py   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py
index 68ca13b3..73aee9e9 100644
--- a/map2loop/data_checks.py
+++ b/map2loop/data_checks.py
@@ -669,7 +669,7 @@ def check_keys(d: dict, parent_key=""):
         logger.error("minimum_fault_length must be a number.")
         raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}")

-
+@beartype.beartype
 def validate_geometry(
@@ -690,7 +690,7 @@ def validate_geometry(
     # 1. Check if all geometries are valid
     if not geodata.geometry.is_valid.all():
         logger.error(f"Invalid geometries found in datatype {datatype_name}. Please fix them before proceeding.")
-        return True, f"Invalid geometries found in datatype {datatype_name}."
+        return True, f"Invalid geometries found in datatype {datatype_name}"
@@ -707,5 +707,5 @@ def validate_geometry(
     # If all checks pass
-    logger.debug(f"Geometry validation passed for datatype {datatype_name}.")
+    logger.debug(f"Geometry validation passed for datatype {datatype_name}")
     return False, ""
\ No newline at end of file
diff --git a/tests/data_checks/test_input_data_faults.py b/tests/data_checks/test_input_data_faults.py
index cf668889..2f70176a 100644
--- a/tests/data_checks/test_input_data_faults.py
+++ b/tests/data_checks/test_input_data_faults.py
@@ -34,7 +34,7 @@
        },
        {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"},
        True,
-       "Invalid geometries found in FAULT data."
+       "Invalid geometries found in datatype FAULT"
    ),

    # Non-string FEATURE column
diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py
index cb92ad2e..75051e56 100644
--- a/tests/data_checks/test_input_data_fold.py
+++ b/tests/data_checks/test_input_data_fold.py
@@ -27,7 +27,7 @@
    (
        {
            "geometry": [
-               shapely.geometry.Point(0, 0),  # Invalid type
+               shapely.geometry.LineString([(0,0), (0,0)]),  # Invalid (zero-length) geometry
                shapely.geometry.LineString([(0, 0), (1, 1)])
            ],
            "FEATURE": ["fold A", "fold B"],
            "ID": [1, 2],
            "description": ["desc1", "desc2"]
        },
        {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"},
        True,
-       "Invalid geometry types found in FOLD data."
+ "Invalid geometries found in datatype FOLD" ), # Non-string FEATURE column ( From b066a2218b8d57dd46e15e6d07438c95da145012 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 11:41:52 +1100 Subject: [PATCH 26/29] chore: refactor id checks --- map2loop/data_checks.py | 203 +++++++++--------- tests/data_checks/test_input_data_fold.py | 2 +- tests/data_checks/test_input_data_geology.py | 4 +- .../data_checks/test_input_data_structure.py | 2 +- 4 files changed, 102 insertions(+), 109 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 73aee9e9..4d769f1d 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -85,31 +85,16 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: "Map2loop processing might not work as expected." ) - # # 4. Check for duplicates in ID - if "objectid_column" in config and config["objectid_column"] in geology_data.columns: - objectid_values = geology_data[config["objectid_column"]] + # # 4. check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=geology_data, + config=config, + id_config_key="objectid_column", + geodata_name="GEOLOGY") - # Check for None, NaN, or other null-like values - if objectid_values.isnull().any(): - logger.error( - f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values. Ensure all values are valid and non-null." - ) - return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values.") - - # Check for duplicate values - if objectid_values.duplicated().any(): - logger.error( - f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column'). Please make sure that the column contains unique values." - ) - return (True, f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column').") - - # Check for uniqueness - if not objectid_values.is_unique: - logger.error( - f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values. Ensure all values are unique." - ) - return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values.") - + if id_validation_failed: + return (id_validation_failed, id_message) # 5. Check for NaNs/blanks in optional fields with warnings warning_fields = [ @@ -224,31 +209,17 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: "Map2loop processing might not work as expected." ) - # check ID column for type, null values, and duplicates - optional_numeric_column_key = "objectid_column" - optional_numeric_column = config.get(optional_numeric_column_key) - - if optional_numeric_column: - if optional_numeric_column in structure_data.columns: - # Check for non-integer values - if not structure_data[optional_numeric_column].apply(lambda x: isinstance(x, int) or pandas.isnull(x)).all(): - logger.error( - f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values. Rectify this, or remove this column from the config - map2loop will generate a new ID." 
- ) - return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values.") - # Check for NaN - if structure_data[optional_numeric_column].isnull().any(): - logger.error( - f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values. Rectify this, or remove this column from the config - map2loop will generate a new ID." - ) - return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values.") - # Check for duplicates - if structure_data[optional_numeric_column].duplicated().any(): - logger.error( - f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values. Rectify this, or remove this column from the config - map2loop will generate a new ID." - ) - return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values.") - + # check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=structure_data, + config=config, + id_config_key="objectid_column", + geodata_name="STRUCTURE") + + if id_validation_failed: + return (id_validation_failed, id_message) + return (False, "") @beartype.beartype @@ -419,32 +390,17 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: ) return (True, f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data.") - # Check ID column - id_column = config.get("objectid_column") - if id_column: - if id_column in fault_data.columns: - # Check for non-integer values - # Attempt to coerce the ID column to integers because WA data says so (ARodrigues) - fault_data[id_column] = pandas.to_numeric(fault_data[id_column], errors='coerce') - - # Check if all values are integers or null after coercion - if not fault_data[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all(): - logger.warning( - f"Datatype FAULT: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for NaN values - if fault_data[id_column].isnull().any(): - logger.warning( - f"Datatype FAULT: ID column '{id_column}' contains NaN or null values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for duplicates - if fault_data[id_column].duplicated().any(): - logger.error( - f"Datatype FAULT: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs." - ) + # # 4. 
check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=fault_data, + config=config, + id_config_key="objectid_column", + geodata_name="FAULT") + + if id_validation_failed: + return (id_validation_failed, id_message) return (False, "") @@ -546,36 +502,16 @@ def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: ) - # Check ID column - id_column = config.get("objectid_column") - - if id_column: - if id_column in folds.columns: - # Attempt to coerce the ID column to numeric - folds[id_column] = pandas.to_numeric(folds[id_column], errors='coerce') - - # Check if all values are integers or null after coercion - if not folds[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all(): - logger.warning( - f"Datatype FOLD: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for NaN values - if folds[id_column].isnull().any(): - logger.warning( - f"Datatype FOLD: ID column '{id_column}' contains NaN or null or str values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for duplicate values - if folds[id_column].duplicated().any(): - logger.error( - f"Datatype FOLD: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs." - ) - return (True, f"Datatype FOLD: ID column '{id_column}' contains duplicate values.") - else: - logger.warning( - f"Datatype FOLD: ID column '{id_column}' is missing from the fold data. Ensure the column name is correct or remove the key from the config." - ) + # # 4. check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=folds, + config=config, + id_config_key="objectid_column", + geodata_name="FOLD") + + if id_validation_failed: + return (id_validation_failed, id_message) return (False, "") @@ -708,4 +644,61 @@ def validate_geometry( # If all checks pass logger.debug(f"Geometry validation passed for datatype {datatype_name}") - return False, "" \ No newline at end of file + return False, "" + + +@beartype.beartype +def validate_id_column( + geodata: geopandas.GeoDataFrame, + config: dict, + id_config_key: str, + geodata_name: str +) -> Tuple[bool, str]: + + # Retrieve the ID column name from the configuration + id_column = config.get(id_config_key) + + if not id_column: + error_msg = f"Configuration key '{id_config_key}' is missing." + logger.error(error_msg) + return (True, error_msg) + + if id_column in geodata.columns: + geodata[id_column] = pandas.to_numeric(geodata[id_column], errors='coerce') + + # Check for non-numeric values (which are now NaN after coercion) + if geodata[id_column].isnull().any(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains non-numeric or NaN values. " + "Please rectify the values, or remove this key from the config dictionary to let map2loop assign IDs." + ) + logger.error(error_msg) + return (True, error_msg) + + if not (geodata[id_column] == geodata[id_column].astype(int)).all(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains non-integer values." + ) + logger.error(error_msg) + return (True, error_msg) + + if geodata[id_column].duplicated().any(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains duplicate values." 
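The whole-number test above compares the coerced series with its int-cast copy; equality everywhere means the IDs are integral even when stored as floats. For example:

    import pandas

    ids = pandas.Series([1.0, 2.0, 3.5])
    print((ids == ids.astype(int)).all())  # False - 3.5 is not integral

    ids_ok = pandas.Series([1.0, 2.0, 3.0])
    print((ids_ok == ids_ok.astype(int)).all())  # True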
+ ) + logger.error(error_msg) + return (True, error_msg) + + + elif id_column not in geodata.columns: + msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') is missing from the data. " + "Map2loop will automatically generate IDs." + ) + logger.warning(msg) + + return (False, "") \ No newline at end of file diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py index 75051e56..740039b8 100644 --- a/tests/data_checks/test_input_data_fold.py +++ b/tests/data_checks/test_input_data_fold.py @@ -80,7 +80,7 @@ }, {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, True, - "Datatype FOLD: ID column 'ID' contains duplicate values." + "Datatype FOLD: Column 'ID' (config key: 'objectid_column') contains duplicate values." ), ], ids=[ diff --git a/tests/data_checks/test_input_data_geology.py b/tests/data_checks/test_input_data_geology.py index 4b083e76..517e6fc8 100644 --- a/tests/data_checks/test_input_data_geology.py +++ b/tests/data_checks/test_input_data_geology.py @@ -137,7 +137,7 @@ def __init__(self): "ID": [1, 1], # Duplicate ID }, True, - "Datatype GEOLOGY: Duplicate values found in column 'ID' (config key: 'objectid_column').", + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains duplicate values.", ), # nan in id ( @@ -158,7 +158,7 @@ def __init__(self): "ID": [1, None], }, True, - "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains NaN or null values.", + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains non-numeric or NaN values. Please rectify the values, or remove this key from the config dictionary to let map2loop assign IDs.", ), # nan in unit name ( diff --git a/tests/data_checks/test_input_data_structure.py b/tests/data_checks/test_input_data_structure.py index d96441fc..ceabdcaf 100644 --- a/tests/data_checks/test_input_data_structure.py +++ b/tests/data_checks/test_input_data_structure.py @@ -116,7 +116,7 @@ def __init__(self): "ID": [1, 1] # Duplicate ID }, True, - "Datatype STRUCTURE: ID column 'ID' (config key: 'objectid_column') contains duplicate values.", + "Datatype STRUCTURE: Column 'ID' (config key: 'objectid_column') contains duplicate values.", ), ], ) From 9001a08729ee3b584cc985bacc9362a2ca9b9b42 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 14:20:46 +1100 Subject: [PATCH 27/29] chore: refactor mandatory fields for str and geo --- map2loop/data_checks.py | 134 ++++++++++++++---- tests/data_checks/test_input_data_geology.py | 8 +- .../data_checks/test_input_data_structure.py | 6 +- 3 files changed, 113 insertions(+), 35 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 4d769f1d..b71ef6e8 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -47,20 +47,22 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: if failed: return (failed, message) - # # 3. Required Columns & are they str, and then empty or null? 
- required_columns = [config["unitname_column"], config["alt_unitname_column"]] - for col in required_columns: - if col not in geology_data.columns: - logger.error(f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") - return (True, f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") - if not geology_data[col].apply(lambda x: isinstance(x, str)).all(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") - return (True, f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") - if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") - return (True, f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + + # check required columns in geology + required_columns = ["unitname_column", "alt_unitname_column"] + + validation_failed, message = validate_required_columns( + geodata=geology_data, + config=config, + required_columns=required_columns, + expected_type=str, + check_blank=True, + datatype_name="GEOLOGY" + ) + if validation_failed: + return (validation_failed, message) + + # # 3. Optional Columns optional_string_columns = [ @@ -164,20 +166,19 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: if failed: return (failed, message) - # 2. Check mandatory numeric columns - required_columns = [config["dipdir_column"], config["dip_column"]] - for col in required_columns: - if col not in structure_data.columns: - logger.error(f"DDatatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") - return (True, f"Datatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") - if not structure_data[col].apply(lambda x: isinstance(x, (int, float))).all(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") - return (True, f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") - if structure_data[col].isnull().any(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") - return (True, f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. 
Please double check the column for blank values.") + + # check required columns in structure (numeric dips & dip dir) + required_columns = ["dipdir_column", "dip_column"] + validation_failed, message = validate_required_columns( + geodata=structure_data, + config=config, + required_columns=required_columns, + expected_type=(int, float), + check_blank=False, + datatype_name="STRUCTURE" + ) + if validation_failed: + return (validation_failed, message) if config["dip_column"] in structure_data.columns: invalid_dip = ~((structure_data[config["dip_column"]] >= 0) & (structure_data[config["dip_column"]] <= 90)) @@ -701,4 +702,81 @@ def validate_id_column( ) logger.warning(msg) + return (False, "") + +from beartype.typing import List, Type, Tuple, Union + +def validate_required_columns( + geodata: geopandas.GeoDataFrame, + config: dict, + required_columns: List[str], + expected_type: Union[Type, Tuple[Type, ...]], + check_blank: bool = False, + datatype_name: str = "UNKNOWN" +) -> Tuple[bool, str]: + """ + Validate required columns in a GeoDataFrame. + + This function checks whether required columns exist, have the expected data types, + and contain no null or (optionally) blank values. + + Args: + geodata (geopandas.GeoDataFrame): The GeoDataFrame to validate. + config (dict): Configuration dictionary mapping config keys to column names. + required_columns (List[str]): List of config keys for required columns. + expected_type (Type or Tuple[Type, ...]): Expected data type(s) for the columns. + check_blank (bool, optional): Whether to check for blank (empty) strings. Defaults to False. + datatype_name (str, optional): Name of the datatype being validated (for logging). Defaults to "UNKNOWN". + + Returns: + Tuple[bool, str]: (True, error_message) if validation fails, else (False, ""). + """ + for config_key in required_columns: + column_name = config.get(config_key) + + if not column_name: + error_msg = ( + f"Configuration key '{config_key}' is missing for datatype '{datatype_name}'." + ) + logger.error(error_msg) + return (True, error_msg) + + if column_name not in geodata.columns: + error_msg = ( + f"Datatype {datatype_name.upper()}: Required column with config key '{config_key}' " + f"(column: '{column_name}') is missing from the data." + ) + logger.error(error_msg) + return (True, error_msg) + + # Check data type + if not geodata[column_name].apply(lambda x: isinstance(x, expected_type)).all(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"must contain only {expected_type if isinstance(expected_type, type) else 'numeric'} values." + ) + logger.error(error_msg) + return (True, error_msg) + + # Check for null values + if geodata[column_name].isnull().any(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"contains null values. Please ensure all values are present." + ) + logger.error(error_msg) + return (True, error_msg) + + # Optionally check for blank strings + if check_blank and issubclass(expected_type, str): + if geodata[column_name].str.strip().eq("").any(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"contains blank (empty) values. Please ensure all values are populated." 
+ ) + logger.error(error_msg) + return (True, error_msg) + + # If all required columns pass validation + logger.info(f"Datatype {datatype_name.upper()}: All required columns validated successfully.") return (False, "") \ No newline at end of file diff --git a/tests/data_checks/test_input_data_geology.py b/tests/data_checks/test_input_data_geology.py index 517e6fc8..4ae58ec2 100644 --- a/tests/data_checks/test_input_data_geology.py +++ b/tests/data_checks/test_input_data_geology.py @@ -80,7 +80,7 @@ def __init__(self): "ID": [1], }, True, - "Datatype GEOLOGY: Required column with config key: 'alt_unitname_column' is missing from geology data.", + "Datatype GEOLOGY: Required column with config key 'alt_unitname_column' (column: 'CODE') is missing from the data.", ), # Non-string value in required column ( @@ -98,7 +98,7 @@ def __init__(self): "ID": [1], }, True, - "Datatype GEOLOGY: Column 'alt_unitname_column' must contain only string values. Please check that the column contains only string values.", + "Datatype GEOLOGY: Column 'alt_unitname_column' (column: 'CODE') must contain only values.", ), # NaN or blank value in required column ( @@ -116,7 +116,7 @@ def __init__(self): "ID": [1], }, True, - "Datatype GEOLOGY: NaN or blank values found in required column 'unitname_column'. Please double check the column for blank values.", + "Datatype GEOLOGY: Column 'unitname_column' (column: 'UNITNAME') contains blank (empty) values. Please ensure all values are populated.", ), # Duplicate ID values ( @@ -179,7 +179,7 @@ def __init__(self): "ID": [1, 1], # Duplicate ID }, True, - "Datatype GEOLOGY: Column 'unitname_column' must contain only string values. Please check that the column contains only string values.", + "Datatype GEOLOGY: Column 'unitname_column' (column: 'UNITNAME') must contain only values.", ), ], ) diff --git a/tests/data_checks/test_input_data_structure.py b/tests/data_checks/test_input_data_structure.py index ceabdcaf..60db846c 100644 --- a/tests/data_checks/test_input_data_structure.py +++ b/tests/data_checks/test_input_data_structure.py @@ -68,7 +68,7 @@ def __init__(self): "ID": [1, 2] }, True, - "Datatype STRUCTURE: Required column with config key: 'dipdir_column' is missing from structure data.", + "Datatype STRUCTURE: Required column with config key 'dipdir_column' (column: 'DIPDIR') is missing from the data.", ), # Non-numeric value in numeric column ( @@ -84,7 +84,7 @@ def __init__(self): "ID": [1, 2] }, True, - "Datatype STRUCTURE: Column 'dipdir_column' must contain only numeric values. Please check that the column contains only numeric values.", + "Datatype STRUCTURE: Column 'dipdir_column' (column: 'DIPDIR') must contain only numeric values.", ), # NaN or blank value in required column ( @@ -100,7 +100,7 @@ def __init__(self): "ID": [1, 2] }, True, - "Datatype STRUCTURE: NaN or blank values found in required column 'dipdir_column'. Please double check the column for blank values.", + "Datatype STRUCTURE: Column 'dipdir_column' (column: 'DIPDIR') contains null values. 
Please ensure all values are present.", ), # Duplicate ID column ( From 652281cbccdd5fc47cec3ebf9ff00a72883deb44 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 15:16:52 +1100 Subject: [PATCH 28/29] finalise refactor --- map2loop/data_checks.py | 580 ++++++++++++-------- tests/data_checks/test_input_data_faults.py | 2 +- tests/data_checks/test_input_data_fold.py | 2 +- 3 files changed, 342 insertions(+), 242 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index b71ef6e8..b36349ca 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -3,7 +3,7 @@ #external imports import beartype as beartype -from beartype.typing import Tuple, List +from beartype.typing import Tuple, Optional, List, Dict, Type, Union import geopandas import shapely import pandas @@ -51,7 +51,7 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: # check required columns in geology required_columns = ["unitname_column", "alt_unitname_column"] - validation_failed, message = validate_required_columns( + failed, message = validate_required_columns( geodata=geology_data, config=config, required_columns=required_columns, @@ -59,66 +59,52 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: check_blank=True, datatype_name="GEOLOGY" ) - if validation_failed: - return (validation_failed, message) - + if failed: + return (failed, message) - - # # 3. Optional Columns + # check optional columns optional_string_columns = [ "group_column", "supergroup_column", "description_column", "rocktype_column", "alt_rocktype_column", ] - for key in optional_string_columns: - if key in config and config[key] in geology_data.columns: - if not geology_data[config[key]].apply(lambda x: isinstance(x, str)).all(): - logger.warning( - f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-string values. " - "Map2loop processing might not work as expected." - ) - - optional_numeric_columns = ["minage_column", "maxage_column", "objectid_column"] - for key in optional_numeric_columns: - if key in config and config[key] in geology_data.columns: - if not geology_data[config[key]].apply(lambda x: isinstance(x, (int, float))).all(): - logger.warning( - f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-numeric values. " - "Map2loop processing might not work as expected." - ) + string_warnings = validate_optional_columns( + geodata=geology_data, + config=config, + optional_columns=optional_string_columns, + expected_type=str, + check_blank=True, + datatype_name="GEOLOGY" + ) + ### only emit warnings for optional columns + for warning in string_warnings: + logger.warning(warning) + + # 5. Validate Optional Numeric Columns + optional_numeric_columns = ["minage_column", "maxage_column"] + numeric_warnings = validate_optional_columns( + geodata=geology_data, + config=config, + optional_columns=optional_numeric_columns, + expected_type=(int, float), + check_blank=False, + datatype_name="GEOLOGY" + ) + + ### only emit warnings for optional columns + for warning in numeric_warnings: + logger.warning(warning) # # 4. check ID column if "objectid_column" in config: - id_validation_failed, id_message = validate_id_column( + failed, message = validate_id_column( geodata=geology_data, config=config, id_config_key="objectid_column", geodata_name="GEOLOGY") - if id_validation_failed: - return (id_validation_failed, id_message) - - # 5. 
Check for NaNs/blanks in optional fields with warnings - warning_fields = [ - "group_column", "supergroup_column", "description_column", - "rocktype_column", "minage_column", "maxage_column", - ] - for key in warning_fields: - col = config.get(key) - if col and col in geology_data.columns: - # Check if column contains string values before applying `.str` - if pandas.api.types.is_string_dtype(geology_data[col]): - if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): - logger.warning( - f"Datatype GEOLOGY: NaN or blank values found in optional column '{col}' (config key: '{key}')." - ) - else: - # Non-string columns, check only for NaN values - if geology_data[col].isnull().any(): - logger.warning( - f"Datatype GEOLOGY: NaN values found in optional column '{col}' (config key: '{key}')." - ) - + if failed: + return (failed, message) logger.info("Geology fields validation passed.") return (False, "") @@ -169,7 +155,7 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: # check required columns in structure (numeric dips & dip dir) required_columns = ["dipdir_column", "dip_column"] - validation_failed, message = validate_required_columns( + failed, message = validate_required_columns( geodata=structure_data, config=config, required_columns=required_columns, @@ -177,49 +163,46 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: check_blank=False, datatype_name="STRUCTURE" ) - if validation_failed: - return (validation_failed, message) - - if config["dip_column"] in structure_data.columns: - invalid_dip = ~((structure_data[config["dip_column"]] >= 0) & (structure_data[config["dip_column"]] <= 90)) - if invalid_dip.any(): - logger.warning( - f"Datatype STRUCTURE: Column '{config['dip_column']}' has values that are not between 0 and 90 degrees. Is this intentional?" - ) + if failed: + return (failed, message) - if config["dipdir_column"] in structure_data.columns: - invalid_dipdir = ~((structure_data[config["dipdir_column"]] >= 0) & (structure_data[config["dipdir_column"]] <= 360)) - if invalid_dipdir.any(): - logger.warning( - f"Datatype STRUCTURE: Column '{config['dipdir_column']}' has values that are not between 0 and 360 degrees. Is this intentional?" - ) + # 4. Validate Dip and Dip Direction value ranges + dip_columns = ["dip_column", "dipdir_column"] + dip_validation_failed, dip_message = validate_dip_columns( + geodata=structure_data, + config=config, + dip_columns=dip_columns, + datatype_name="STRUCTURE", + allow_nulls=False # Dip and dipdir cannot have nulls in structure data + ) + if dip_validation_failed: + logger.warning(dip_message) - # check validity of optional string columns + # check optional columns optional_string_columns = ["description_column", "overturned_column"] - for key in optional_string_columns: - if key in config and config[key] in structure_data.columns: - column_name = config[key] - if not structure_data[column_name].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): - logger.warning( - f"Datatype STRUCTURE: Optional column with config key: '{key}' contains non-string values. " - "Map2loop processing might not work as expected." - ) - if structure_data[column_name].isnull().any() or structure_data[column_name].str.strip().eq("").any(): - logger.warning( - f"Datatype STRUCTURE: Optional column config key: '{key}' contains NaN, empty, or null values. " - "Map2loop processing might not work as expected." 
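Unlike the required-column checks, the optional-column path only collects warnings and never fails the load. A self-contained sketch of that warn-but-continue pattern (the function and column names here are illustrative, not the map2loop implementation):

    import logging
    import pandas

    logger = logging.getLogger("map2loop_sketch")

    def collect_optional_warnings(df: pandas.DataFrame, columns: list) -> list:
        # Gather messages instead of raising; the caller decides what to log.
        warnings = []
        for col in columns:
            if col in df.columns and df[col].isnull().any():
                warnings.append(f"Optional column '{col}' contains null values.")
        return warnings

    df = pandas.DataFrame({"description": ["granite", None]})
    for message in collect_optional_warnings(df, ["description", "rocktype"]):
        logger.warning(message)  # only 'description' warns; 'rocktype' is simply absent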
- ) + string_warnings = validate_optional_columns( + geodata=structure_data, + config=config, + optional_columns=optional_string_columns, + expected_type=str, + check_blank=True, + datatype_name="STRUCTURE" + ) + + ## only emit warnings for optional columns + for warning in string_warnings: + logger.warning(warning) # check ID column if "objectid_column" in config: - id_validation_failed, id_message = validate_id_column( + failed, id_message = validate_id_column( geodata=structure_data, config=config, id_config_key="objectid_column", geodata_name="STRUCTURE") - if id_validation_failed: - return (id_validation_failed, id_message) + if failed: + return (failed, id_message) return (False, "") @@ -246,43 +229,21 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: if failed: return (failed, message) - # Check "structtype_column" if it exists - if "structtype_column" in config: - structtype_column = config["structtype_column"] - - # Ensure the column exists in the data - if structtype_column not in fault_data.columns: - logger.warning( - f"Datatype FAULT: '{structtype_column}' (config key: 'structtype_column') is missing from the fault data. Consider removing that key from the config" - ) - else: - # Check if all entries in the column are strings - if not fault_data[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): - logger.error( - f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." - ) - return (True, f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") - - # Warn about empty or null cells - if fault_data[structtype_column].isnull().any() or fault_data[structtype_column].str.strip().eq("").any(): - logger.warning( - f"Datatype FAULT: Column '{structtype_column}' contains NaN, empty, or blank values. Processing might not work as expected." - ) - - # Check if "fault_text" is defined and contained in the column - fault_text = config.get("fault_text", None) - - # Check if the structtype_column exists in the fault_data - if structtype_column not in fault_data.columns: - logger.warning( - f"Datatype FAULT: The column '{structtype_column}' is not present in the fault data." - ) - - else: - if not fault_data[structtype_column].str.contains(fault_text).any(): - logger.error( - f"Datatype FAULT: The 'fault_text' value '{fault_text}' is not found in column '{structtype_column}'. Project might end up with no faults" - ) + # # Check "structtype_column" if it exists + text_keys = { + "fault_text": "fault_text" + } + structtype_validation_failed, structtype_message = validate_structtype_column( + geodata=fault_data, + config=config, + datatype_name="FAULT", + required=True, # Assuming structtype_column is required in FAULT + text_keys=text_keys + ) + if structtype_validation_failed: + return (structtype_validation_failed, structtype_message) + + #checks on name column name_column = config.get("name_column") @@ -312,50 +273,17 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: f"Datatype FAULT: Column '{name_column}' contains duplicate values. This may affect processing." 
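The dip screening that follows uses pandas' Series.between; inclusive='both' keeps the boundary values 0 and 90 legal. A standalone example:

    import pandas

    dips = pandas.Series([10.0, 45.0, 95.0])
    out_of_range = ~dips.between(0, 90, inclusive="both")
    print(out_of_range.any())           # True
    print(dips[out_of_range].tolist())  # [95.0]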
) - # dips & strikes - # Check for dips and dip directions - strike_dips_columns = ["dip_column", "dipdir_column"] - - for key in strike_dips_columns: - column_name = config.get(key) - if column_name: # Only proceed if the config has this key - if column_name in fault_data.columns: - - #coerce to numeric - fault_data[column_name] = pandas.to_numeric(fault_data[column_name], errors='coerce') - - # Check if the column contains only numeric values - if not fault_data[column_name].apply(lambda x: isinstance(x, (int, float)) or pandas.isnull(x)).all(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) must contain only numeric values. Please ensure the column is numeric." - ) - - # Check for NaN or empty values - if fault_data[column_name].isnull().any(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) contains NaN or empty values. This may affect processing." - ) - - # Check range constraints - if key == "dip_column": - # Dips must be between 0 and 90 - invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 90)) - if invalid_values.any(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 90]. Was this intentional?" - ) - elif key == "dipdir_column": - # Dip directions must be between 0 and 360 - invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 360)) - if invalid_values.any(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 360]. Was this intentional?" - ) - else: - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) is missing from the fault data. Please ensure the column name is correct, or otherwise remove that key from the config." - ) - + # # dips & strikes + dip_columns = ["dip_column", "dipdir_column"] + dip_validation_failed, dip_message = validate_dip_columns( + geodata=fault_data, + config=config, + dip_columns=dip_columns, + datatype_name="FAULT", + allow_nulls=True # Dip fields can be empty + ) + if dip_validation_failed: + logger.warning(dip_message) # dip estimates dip_estimate_column = config.get("dip_estimate_column") @@ -430,55 +358,20 @@ def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: if failed: return (failed, message) - # Check "structtype_column" if it exists - if "structtype_column" in config: - structtype_column = config["structtype_column"] - - # Ensure the column exists in the data - if structtype_column not in folds.columns: - logger.warning( - f"Datatype FOLD: '{structtype_column}' (config key: 'structtype_column') is missing from the fold data. Consider removing that key from the config" - ) - return (True, f"Column '{structtype_column}' is missing from the fold data.") - else: - # Check if all entries in the column are strings - if not folds[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): - logger.error( - f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." - ) - return (True, f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") - - # Warn about empty or null cells - if folds[structtype_column].isnull().any() or folds[structtype_column].str.strip().eq("").any(): - logger.warning( - f"Datatype FOLD: Column '{structtype_column}' contains NaN, empty, or blank values. 
Processing might not work as expected." - ) - - # Check if "fold_text" is defined and contained in the column - fold_text = config.get("fold_text", None) - if fold_text: - - # check if fold text is a string - if not isinstance(fold_text, str): - logger.error("Datatype FOLD: 'fold_text' must be a string. Please ensure it is defined correctly in the config.") - return (True, "Datatype FOLD: 'fold_text' must be a string.") - #check if it exists in the column strtype - if not folds[structtype_column].str.contains(fold_text, na=False).any(): - logger.error(f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'. This may impact processing.") - return (True, f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'.") - - # check synform_text - synform_text = config.get("synform_text", None) - if synform_text: - # Check if synform_text is a string - if not isinstance(synform_text, str): - logger.error("Datatype FOLD: 'synform_text' must be a string. Please ensure it is defined correctly in the config.") - return (True, "Datatype FOLD: 'synform_text' must be a string.") - # Check if it exists in the structtype_column - if not folds[structtype_column].str.contains(synform_text, na=False).any(): - logger.warning( - f"Datatype FOLD: The 'synform_text' value '{synform_text}' is not found in column '{structtype_column}'. This may impact processing." - ) + ## check structtype column if it exists + text_keys = { + "fold_text": "fold_text", + "synform_text": "synform_text" + } + structtype_validation_failed, structtype_message = validate_structtype_column( + geodata=folds, + config=config, + datatype_name="FOLD", + required=True, # Assuming structtype_column is required in FOLD + text_keys=text_keys + ) + if structtype_validation_failed: + return (structtype_validation_failed, structtype_message) # check description column description_column = config.get("description_column", None) @@ -612,18 +505,7 @@ def validate_geometry( expected_geom_types: List[type], datatype_name: str ) -> Tuple[bool, str]: - """ - Validates the geometry column of a GeoDataFrame. - - Parameters: - geodata (gpd.GeoDataFrame): The GeoDataFrame to validate. - expected_geom_types (List[type]): A list of expected Shapely geometry types. - datatype_name (str): A string representing the datatype being validated (e.g., "GEOLOGY"). - Returns: - Tuple[bool, str]: A tuple where the first element is a boolean indicating if validation failed, - and the second element is an error message if failed. - """ # 1. Check if all geometries are valid if not geodata.geometry.is_valid.all(): logger.error(f"Invalid geometries found in datatype {datatype_name}. Please fix them before proceeding.") @@ -704,8 +586,7 @@ def validate_id_column( return (False, "") -from beartype.typing import List, Type, Tuple, Union - +@beartype.beartype def validate_required_columns( geodata: geopandas.GeoDataFrame, config: dict, @@ -714,23 +595,7 @@ def validate_required_columns( check_blank: bool = False, datatype_name: str = "UNKNOWN" ) -> Tuple[bool, str]: - """ - Validate required columns in a GeoDataFrame. - - This function checks whether required columns exist, have the expected data types, - and contain no null or (optionally) blank values. - Args: - geodata (geopandas.GeoDataFrame): The GeoDataFrame to validate. - config (dict): Configuration dictionary mapping config keys to column names. - required_columns (List[str]): List of config keys for required columns. 
- expected_type (Type or Tuple[Type, ...]): Expected data type(s) for the columns. - check_blank (bool, optional): Whether to check for blank (empty) strings. Defaults to False. - datatype_name (str, optional): Name of the datatype being validated (for logging). Defaults to "UNKNOWN". - - Returns: - Tuple[bool, str]: (True, error_message) if validation fails, else (False, ""). - """ for config_key in required_columns: column_name = config.get(config_key) @@ -779,4 +644,239 @@ def validate_required_columns( # If all required columns pass validation logger.info(f"Datatype {datatype_name.upper()}: All required columns validated successfully.") + return (False, "") + + + +def validate_optional_columns( + geodata: geopandas.GeoDataFrame, + config: Dict[str, str], + optional_columns: List[str], + expected_type: Union[Type, Tuple[Type, ...]], + check_blank: bool = False, + datatype_name: str = "UNKNOWN" +) -> List[str]: + + warnings = [] + + for config_key in optional_columns: + column_name = config.get(config_key) + + if not column_name: + warning_msg = ( + f"Configuration key '{config_key}' is missing for datatype '{datatype_name}'. " + f"Optional column validation for this key is skipped." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + continue + + if column_name in geodata.columns: + # Type Check + if not geodata[column_name].apply(lambda x: isinstance(x, expected_type) or pandas.isnull(x)).all(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + f"(config key: '{config_key}') contains values that are not of type " + f"{expected_type if isinstance(expected_type, type) else expected_type}. " + "Map2loop processing might not work as expected." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + + # Blank String Check (if applicable) + if check_blank and issubclass(expected_type, str): + if geodata[column_name].str.strip().eq("").any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + f"(config key: '{config_key}') contains blank (empty) string values. " + "Map2loop processing might not work as expected." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + + # Null Value Check + if geodata[column_name].isnull().any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + f"(config key: '{config_key}') contains NaN or null values. " + "Map2loop processing might not work as expected." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + + # else: + # warning_msg = ( + # f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + # f"(config key: '{config_key}') is missing from the data. " + # ) + ####### this might be taking it a bit too far + + # logger.info(warning_msg) + # warnings.append(warning_msg) + + return warnings + + +@beartype.beartype +def validate_dip_columns( + geodata: geopandas.GeoDataFrame, + config: Dict[str, str], + dip_columns: List[str], + datatype_name: str = "UNKNOWN", + allow_nulls: bool = False +) -> Tuple[bool, str]: + + validation_failed = False + messages = [] + + # Define fixed ranges + fixed_ranges = { + "dip_column": (0, 90), + "dipdir_column": (0, 360) + } + + for key in dip_columns: + column_name = config.get(key) + if not column_name and datatype_name == "STRUCTURE": # noly mandatory for structure, not faults! + warning_msg = ( + f"Configuration key '{key}' is missing for datatype '{datatype_name}'. " + f"Dip column validation for this key is skipped." 
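The fixed_ranges table in validate_dip_columns is a small dispatch pattern: .get() with a (None, None) default makes any key without a declared range skip the bounds check entirely. Sketched in isolation (the plunge_column key is hypothetical):

    fixed_ranges = {
        "dip_column": (0, 90),
        "dipdir_column": (0, 360),
    }

    for key in ("dip_column", "dipdir_column", "plunge_column"):
        min_val, max_val = fixed_ranges.get(key, (None, None))
        if min_val is not None and max_val is not None:
            print(f"{key}: values must fall within [{min_val}, {max_val}]")
        else:
            print(f"{key}: no declared range, bounds check skipped")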
+ ) + logger.warning(warning_msg) + messages.append(warning_msg) + validation_failed = True + continue + + if column_name in geodata.columns: + # Coerce to numeric + geodata[column_name] = pandas.to_numeric(geodata[column_name], errors='coerce') + + # Check for non-numeric or NaN values + if geodata[column_name].isnull().any(): + if not allow_nulls: + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{column_name}' " + f"(config key: '{key}') contains non-numeric or NaN values." + ) + logger.warning(warning_msg) + messages.append(warning_msg) + validation_failed = True + + # Check if all values are numeric + if not geodata[column_name].apply(lambda x: isinstance(x, (int, float)) or pandas.isnull(x)).all(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{column_name}' " + f"(config key: '{key}') must contain only numeric values." + ) + logger.warning(warning_msg) + messages.append(warning_msg) + validation_failed = True + + # Range validation + min_val, max_val = fixed_ranges.get(key, (None, None)) + if min_val is not None and max_val is not None: + invalid_values = ~geodata[column_name].between(min_val, max_val, inclusive='both') + if invalid_values.any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{column_name}' " + f"(config key: '{key}') contains values outside the range [{min_val}, {max_val}]. " + "Is this intentional?" + ) + logger.warning(warning_msg) + messages.append(warning_msg) + + summary_message = "\n".join(messages) + return (validation_failed, summary_message) + + +@beartype.beartype +def validate_structtype_column( + geodata: geopandas.GeoDataFrame, + config: Dict[str, str], + datatype_name: str, + required: bool = True, + text_keys: Optional[Dict[str, str]] = None +) -> Tuple[bool, str]: + + structtype_key = "structtype_column" + structtype_column = config.get(structtype_key) + + if not structtype_column: + if required: + error_msg = ( + f"Configuration key '{structtype_key}' is missing for datatype '{datatype_name}'. " + f"Validation for 'structtype_column' is skipped." + ) + logger.warning(error_msg) + return (True, error_msg) + else: + warning_msg = ( + f"Configuration key '{structtype_key}' is missing for datatype '{datatype_name}'. " + f"Optional 'structtype_column' validation is skipped." + ) + logger.warning(warning_msg) + return (False, "") + + if structtype_column not in geodata.columns: + if required: + error_msg = ( + f"Datatype {datatype_name.upper()}: '{structtype_column}' (config key: '{structtype_key}') " + f"is missing from the data. Consider removing that key from the config." + ) + logger.error(error_msg) + return (True, error_msg) + else: + warning_msg = ( + f"Datatype {datatype_name.upper()}: '{structtype_column}' (config key: '{structtype_key}') " + f"is missing from the data. Consider removing that key from the config." + ) + logger.warning(warning_msg) + return (False, "") + + # Check if all entries are strings or nulls + if not geodata[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{structtype_column}' " + f"(config key: '{structtype_key}') contains non-string values. " + "Please ensure all values in this column are strings." 
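The fault_text/fold_text lookups further down depend on str.contains with na=False, which treats missing cells as non-matches rather than propagating NaN into the boolean mask. For instance:

    import pandas

    features = pandas.Series(["Fault zone", "dyke", None])
    print(features.str.contains("Fault", na=False).any())  # True
    print(features.str.contains("fold", na=False).any())   # False - no folds present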
+ ) + logger.error(error_msg) + return (True, error_msg) + + # Warn about empty or null cells + if geodata[structtype_column].isnull().any() or geodata[structtype_column].str.strip().eq("").any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{structtype_column}' contains NaN, empty, or blank values. " + "Processing might not work as expected." + ) + logger.warning(warning_msg) + + # Check for specific text keys + if text_keys: + for text_key, config_key in text_keys.items(): + text_value = config.get(config_key, None) + if text_value: + if not isinstance(text_value, str): + error_msg = ( + f"Datatype {datatype_name.upper()}: '{config_key}' must be a string. " + "Please ensure it is defined correctly in the config." + ) + logger.error(error_msg) + return (True, error_msg) + + if not geodata[structtype_column].str.contains(text_value, na=False).any(): + if text_key == "synform_text": + warning_msg = ( + f"Datatype {datatype_name.upper()}: The '{text_key}' value '{text_value}' is not found in column '{structtype_column}'. " + "This may impact processing." + ) + logger.warning(warning_msg) + else: + error_msg = ( + f"Datatype {datatype_name.upper()}: The '{text_key}' value '{text_value}' is not found in column '{structtype_column}'. " + "Project might end up with no faults." + ) + logger.error(error_msg) + return (True, error_msg) + return (False, "") \ No newline at end of file diff --git a/tests/data_checks/test_input_data_faults.py b/tests/data_checks/test_input_data_faults.py index 2f70176a..6c594d72 100644 --- a/tests/data_checks/test_input_data_faults.py +++ b/tests/data_checks/test_input_data_faults.py @@ -49,7 +49,7 @@ }, {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, True, - "Datatype FAULT: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." + "Datatype FAULT: Column 'FEATURE' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." ), # Invalid values in DIP estimate column ( diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py index 740039b8..e20b97d4 100644 --- a/tests/data_checks/test_input_data_fold.py +++ b/tests/data_checks/test_input_data_fold.py @@ -51,7 +51,7 @@ }, {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, True, - "Datatype FOLD: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." + "Datatype FAULT: Column 'FOLD' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." ), # Missing ID column ( From d1f67a9e64fb09fc2b3e13351f23af9148e2eb5a Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 15:32:09 +1100 Subject: [PATCH 29/29] chore: finalise details --- tests/data_checks/test_input_data_fold.py | 2 +- tests/project/test_ignore_codes_setters_getters.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py index e20b97d4..94834894 100644 --- a/tests/data_checks/test_input_data_fold.py +++ b/tests/data_checks/test_input_data_fold.py @@ -51,7 +51,7 @@ }, {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, True, - "Datatype FAULT: Column 'FOLD' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." 
+ "Datatype FOLD: Column 'FEATURE' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." ), # Missing ID column ( diff --git a/tests/project/test_ignore_codes_setters_getters.py b/tests/project/test_ignore_codes_setters_getters.py index 34de34a4..0674ca95 100644 --- a/tests/project/test_ignore_codes_setters_getters.py +++ b/tests/project/test_ignore_codes_setters_getters.py @@ -21,6 +21,7 @@ def test_set_get_ignore_codes(): config_dictionary = { "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, "geology": {"unitname_column": "unitname", "alt_unitname_column": "code"}, + "fault": {'structtype_column': 'feature', 'fault_text': 'Fault'}, } with patch.object(Project, 'validate_required_inputs', return_value=None): project = Project(