From d55523e5a44c9bcf2b81c42f882a64504f3c8186 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 17 Dec 2024 13:29:38 +1100 Subject: [PATCH 01/29] fix: add check for missing/invalid args in project --- map2loop/project.py | 85 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index 84aa0eea..ff319745 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -18,7 +18,7 @@ gdal.UseExceptions() import geopandas import beartype -from beartype.typing import Union, List +from beartype.typing import Union, List, Dict, Any import pathlib import numpy import pandas @@ -75,7 +75,7 @@ def __init__( loop_project_filename: str = "", overwrite_loopprojectfile: bool = False, **kwargs, - ): + ): """ The initialiser for the map2loop project @@ -119,6 +119,28 @@ def __init__( TypeError: Type of bounding_box not a dict or tuple ValueError: use_australian_state_data not in state list ['WA', 'SA', 'QLD', 'NSW', 'TAS', 'VIC', 'ACT', 'NT'] """ + + # Throw error if unexpected keyword arguments are passed to project + allowed_kwargs = {"metadata_filename"} + for key in kwargs.keys(): + if key not in allowed_kwargs: + logger.error( + f"Unexpected keyword argument '{key}' passed to Project. Allowed keywords: {', '.join(allowed_kwargs)}." + ) + raise TypeError( + f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding." + ) + + # make sure all the needed arguments are provided + self.validate_required_inputs( + bounding_box=bounding_box, + working_projection=working_projection, + geology_filename=geology_filename, + structure_filename=structure_filename, + dtm_filename=dtm_filename, + config_dictionary=config_dictionary, + config_filename=config_filename, + ) self._error_state = ErrorState.NONE self._error_state_msg = "" self.verbose_level = verbose_level @@ -233,6 +255,58 @@ def __init__( if len(kwargs): logger.warning(f"Unused keyword arguments: {kwargs}") + @beartype.beartype + def validate_required_inputs( + self, + bounding_box: Dict[str, Union[float, int]], + working_projection: str, + geology_filename: str, + structure_filename: str, + dtm_filename: str, + config_filename: str = None, + config_dictionary: Dict[str, Any] = {}, + ) -> None: + + required_inputs = { + "bounding_box": bounding_box, + "working_projection": working_projection, + "geology_filename": geology_filename, + "structure_filename": structure_filename, + "dtm_filename": dtm_filename, + } + + # Check for missing required inputs in project + missing_inputs = [key for key, value in required_inputs.items() if not value] + + if missing_inputs: + missing_list = ", ".join(missing_inputs) + logger.error( + f"Project construction is missing required inputs: {missing_list}. " + "Please add them to the Project()." + ) + raise ValueError( + f"Project construction is missing required inputs: {missing_list}. " + "Please add them to the Project()." + ) + + # Either config_filename or config_dictionary must be provided (but not both or neither) + if not config_filename and not config_dictionary: + logger.error( + "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + ) + raise ValueError( + "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + ) + if config_filename and config_dictionary: + logger.error( + "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." 
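+                # Illustrative failure (hypothetical arguments): calling
+                #   Project(..., config_filename="cfg.json", config_dictionary={...})
+                # lands here, while supplying exactly one of the two passes this check.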
+ ) + raise ValueError( + "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." + ) + + + # Getters and Setters @beartype.beartype def set_ignore_lithology_codes(self, codes: list): @@ -734,9 +808,10 @@ def save_into_projectfile(self): logger.info('Saving data into loop project file') if not self.loop_filename: logger.info('No loop project file specified, creating a new one') - self.loop_filename = os.path.join( - self.map_data.tmp_path, os.path.basename(self.map_data.tmp_path) + ".loop3d" - ) + output_dir = pathlib.Path.cwd() + output_dir.mkdir(parents=True, exist_ok=True) + filename = "new_project.loop3d" + self.loop_filename = str(output_dir / filename) file_exists = os.path.isfile(self.loop_filename) From 596ddb3aa60b2a4c529d24cd1447658b8ff3092c Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 17 Dec 2024 13:30:47 +1100 Subject: [PATCH 02/29] chore: add mention to issue --- map2loop/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/project.py b/map2loop/project.py index ff319745..6883d79b 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -269,7 +269,7 @@ def validate_required_inputs( required_inputs = { "bounding_box": bounding_box, - "working_projection": working_projection, + "working_projection": working_projection, # this may be removed when fix is added for https://github.com/Loop3D/map2loop/issues/103 "geology_filename": geology_filename, "structure_filename": structure_filename, "dtm_filename": dtm_filename, From 3345bbec278343c29875911484efd9724df91593 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 15:53:52 +1100 Subject: [PATCH 03/29] chore: clear warnings --- map2loop/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index 6883d79b..41f84447 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -292,10 +292,10 @@ def validate_required_inputs( # Either config_filename or config_dictionary must be provided (but not both or neither) if not config_filename and not config_dictionary: logger.error( - "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + "A config file is required to run map2loop - use either 'config_filename' or 'config_dictionary' to initialise the project." ) raise ValueError( - "Either 'config_filename' or 'config_dictionary' must be provided to initialize the Project." + "A config file is required to run map2loop - use either 'config_filename' or 'config_dictionary' to initialise the project." ) if config_filename and config_dictionary: logger.error( From b1a94bd3e013a1fd566884b49c5f70dea85b8d3a Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 15:55:32 +1100 Subject: [PATCH 04/29] chore: make warnings clear --- map2loop/project.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index 41f84447..e9b4d919 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -299,10 +299,10 @@ def validate_required_inputs( ) if config_filename and config_dictionary: logger.error( - "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." + "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one config." ) raise ValueError( - "Both 'config_filename' and 'config_dictionary' were provided. Please specify only one." + "Both 'config_filename' and 'config_dictionary' were provided. 
Please specify only one config." ) From 1bb6e4787f7d789c24e0f06bb2052f9e0e2a53e3 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 15:56:21 +1100 Subject: [PATCH 05/29] chore: make warnings clear --- map2loop/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/project.py b/map2loop/project.py index e9b4d919..d4fce9a4 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -128,7 +128,7 @@ def __init__( f"Unexpected keyword argument '{key}' passed to Project. Allowed keywords: {', '.join(allowed_kwargs)}." ) raise TypeError( - f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding." + f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding with map2loop processing" ) # make sure all the needed arguments are provided From 27ea51df28a9fe62b13b25a8cb5be7c6f333cc9c Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:09:21 +1100 Subject: [PATCH 06/29] tests: bypass the necessary dataset requirement --- .../test_ignore_codes_setters_getters.py | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/project/test_ignore_codes_setters_getters.py b/tests/project/test_ignore_codes_setters_getters.py index 4cebdba7..34de34a4 100644 --- a/tests/project/test_ignore_codes_setters_getters.py +++ b/tests/project/test_ignore_codes_setters_getters.py @@ -2,6 +2,7 @@ from map2loop.project import Project from map2loop.m2l_enums import Datatype import map2loop +from unittest.mock import patch # Sample test function for lithology and fault ignore codes @@ -21,24 +22,25 @@ def test_set_get_ignore_codes(): "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, "geology": {"unitname_column": "unitname", "alt_unitname_column": "code"}, } - - project = Project( - working_projection='EPSG:28350', - bounding_box=bbox_3d, - geology_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') - ), - fault_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/faults.geojson') - ), - dtm_filename=str( - pathlib.Path(map2loop.__file__).parent - / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') - ), - config_dictionary=config_dictionary, - ) + with patch.object(Project, 'validate_required_inputs', return_value=None): + project = Project( + working_projection='EPSG:28350', + bounding_box=bbox_3d, + geology_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') + ), + fault_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/faults.geojson') + ), + dtm_filename=str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') + ), + config_dictionary=config_dictionary, + structure_filename="", + ) # Define test ignore codes for lithology and faults lithology_codes = ["cover", "Fortescue_Group", "A_FO_od"] From c9da10dba3c4614b7e272fd72eb90864771862b1 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:09:52 +1100 Subject: [PATCH 07/29] fix: skip required file checks if using loop server data --- map2loop/project.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index d4fce9a4..f6de1941 100644 --- a/map2loop/project.py +++ 
b/map2loop/project.py @@ -132,15 +132,16 @@ def __init__( ) # make sure all the needed arguments are provided - self.validate_required_inputs( - bounding_box=bounding_box, - working_projection=working_projection, - geology_filename=geology_filename, - structure_filename=structure_filename, - dtm_filename=dtm_filename, - config_dictionary=config_dictionary, - config_filename=config_filename, - ) + if not use_australian_state_data: # this check has to skip if using Loop server data + self.validate_required_inputs( + bounding_box=bounding_box, + working_projection=working_projection, + geology_filename=geology_filename, + structure_filename=structure_filename, + dtm_filename=dtm_filename, + config_dictionary=config_dictionary, + config_filename=config_filename, + ) self._error_state = ErrorState.NONE self._error_state_msg = "" self.verbose_level = verbose_level From 713abffcbeb0f098641898dbc76c798b67127c56 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:33:00 +1100 Subject: [PATCH 08/29] fix: make the config check through project --- map2loop/config.py | 19 ++++++------------- map2loop/project.py | 3 ++- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/map2loop/config.py b/map2loop/config.py index 48d017d3..2a0efdf7 100644 --- a/map2loop/config.py +++ b/map2loop/config.py @@ -103,9 +103,6 @@ def update_from_dictionary(self, dictionary: dict, lower: bool = True): # make sure dictionary doesn't contain legacy keys self.check_for_legacy_keys(dictionary) - # make sure it has the minimum requirements - self.validate_config_dictionary(dictionary) - if "structure" in dictionary: self.structure_config.update(dictionary["structure"]) for key in dictionary["structure"].keys(): @@ -218,25 +215,20 @@ def update_from_file( @beartype.beartype def validate_config_dictionary(self, config_dict: dict) -> None: - """ - Validate the structure and keys of the configuration dictionary. - - Args: - config_dict (dict): The config dictionary to validate. - - Raises: - ValueError: If the dictionary does not meet the minimum requirements for ma2p2loop. - """ required_keys = { "structure": {"dipdir_column", "dip_column"}, "geology": {"unitname_column", "alt_unitname_column"}, } + # Loop over "structure" and "geology" for section, keys in required_keys.items(): + + # 1) Check that "section" exists if section not in config_dict: logger.error(f"Missing required section '{section}' in config dictionary.") raise ValueError(f"Missing required section '{section}' in config dictionary.") - + + # 2) Check that each required key is in config_dict[section] for key in keys: if key not in config_dict[section]: logger.error( @@ -246,6 +238,7 @@ def validate_config_dictionary(self, config_dict: dict) -> None: f"Missing required key '{key}' for '{section}' section of the config dictionary." 
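+                        # For reference, a minimal dictionary that satisfies this check
+                        # (column names here are placeholders, not required values):
+                        #   {"structure": {"dipdir_column": "azimuth", "dip_column": "dip"},
+                        #    "geology": {"unitname_column": "unitname", "alt_unitname_column": "code"}}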
) + @beartype.beartype def check_for_legacy_keys(self, config_dict: dict) -> None: diff --git a/map2loop/project.py b/map2loop/project.py index f6de1941..6e4303c6 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -142,6 +142,7 @@ def __init__( config_dictionary=config_dictionary, config_filename=config_filename, ) + self._error_state = ErrorState.NONE self._error_state_msg = "" self.verbose_level = verbose_level @@ -230,12 +231,12 @@ def __init__( self.map_data.set_config_filename(config_filename) if config_dictionary != {}: + self.map_data.config.validate_config_dictionary(config_dictionary) self.map_data.config.update_from_dictionary(config_dictionary) if clut_filename != "": self.map_data.set_colour_filename(clut_filename) - # Load all data (both shape and raster) self.map_data.load_all_map_data() From b52d54014fc493846ff3a068f8dc8b00bb5b0f82 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 7 Jan 2025 16:48:18 +1100 Subject: [PATCH 09/29] tests: add tests for config checks --- tests/project/test_config_arguments.py | 150 +++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 tests/project/test_config_arguments.py diff --git a/tests/project/test_config_arguments.py b/tests/project/test_config_arguments.py new file mode 100644 index 00000000..229bcabc --- /dev/null +++ b/tests/project/test_config_arguments.py @@ -0,0 +1,150 @@ +import pytest +import pathlib +from unittest.mock import patch +from map2loop.project import Project +from map2loop.m2l_enums import Datatype +import map2loop + +# ------------------------------------------------------------------------------ +# Common fixtures or helper data (bounding box, minimal filenames, etc.) +# ------------------------------------------------------------------------------ + +@pytest.fixture +def minimal_bounding_box(): + return { + "minx": 515687.31005864, + "miny": 7493446.76593407, + "maxx": 562666.860106543, + "maxy": 7521273.57407786, + "base": -3200, + "top": 3000, + } + +@pytest.fixture +def geology_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/geology.geojson') + ) + +@pytest.fixture +def structure_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/structure.geojson') + ) + +@pytest.fixture +def dtm_file(): + return str( + pathlib.Path(map2loop.__file__).parent + / pathlib.Path('_datasets/geodata_files/hamersley/dtm_rp.tif') + ) + +@pytest.fixture +def valid_config_dictionary(): + """ + A valid config dictionary that meets the 'structure' and 'geology' requirements + """ + return { + "structure": { + "dipdir_column": "azimuth2", + "dip_column": "dip" + }, + "geology": { + "unitname_column": "unitname", + "alt_unitname_column": "code", + } + } + + + +# 1) config_filename and config_dictionary both present should raise ValueError +def test_config_filename_and_dictionary_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + + with pytest.raises(ValueError, match="Both 'config_filename' and 'config_dictionary' were provided"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_filename="dummy_config.json", + config_dictionary=valid_config_dictionary, + ) + +# 2) No config_filename or config_dictionary should raise ValueError +def test_no_config_provided_raises_error( + 
minimal_bounding_box, geology_file, dtm_file, structure_file +): + + with pytest.raises(ValueError, match="A config file is required to run map2loop"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + ) + +# 3) Passing an unexpected argument should raise TypeError +def test_unexpected_argument_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + + with pytest.raises(TypeError, match="unexpected keyword argument 'config_file'"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=valid_config_dictionary, + config_file="wrong_kwarg.json", + ) + +# 4) Dictionary missing a required key should raise ValueError + +def test_dictionary_missing_required_key_raises_error( + minimal_bounding_box, geology_file, dtm_file, structure_file +): + + invalid_dictionary = { + "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, + "geology": {"unitname_column": "unitname"} # alt_unitname_column missing + } + + with pytest.raises(ValueError, match="Missing required key 'alt_unitname_column' for 'geology'"): + Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=invalid_dictionary, + ) + +# 5) All good => The Project should be created without errors +def test_good_config_runs_successfully( + minimal_bounding_box, geology_file, dtm_file, structure_file, valid_config_dictionary +): + project = None + try: + project = Project( + bounding_box=minimal_bounding_box, + working_projection="EPSG:28350", + geology_filename=geology_file, + dtm_filename=dtm_file, + structure_filename=structure_file, + config_dictionary=valid_config_dictionary, + ) + except Exception as e: + pytest.fail(f"Project initialization raised an unexpected exception: {e}") + + assert project is not None, "Project was not created." 
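+    # The assertions below spot-check that values from the input dictionary were
+    # carried through, unchanged, into the parsed Config object.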
+    assert project.map_data.config.structure_config["dipdir_column"] == "azimuth2"
+    assert project.map_data.config.structure_config["dip_column"] == "dip"
+    assert project.map_data.config.geology_config["unitname_column"] == "unitname"
+    assert project.map_data.config.geology_config["alt_unitname_column"] == "code"
\ No newline at end of file

From 632287237e2f3fc231c51b7605c2ec0d6fd526a3 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Tue, 7 Jan 2025 05:51:37 +0000
Subject: [PATCH 10/29] style: style fixes by ruff and autoformatting by black

---
 tests/project/test_config_arguments.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/project/test_config_arguments.py b/tests/project/test_config_arguments.py
index 229bcabc..53e6ce35 100644
--- a/tests/project/test_config_arguments.py
+++ b/tests/project/test_config_arguments.py
@@ -1,8 +1,6 @@
 import pytest
 import pathlib
-from unittest.mock import patch
 from map2loop.project import Project
-from map2loop.m2l_enums import Datatype
 import map2loop

 # ------------------------------------------------------------------------------

From 554ed6aace57e047fb4696820b1894ea49384db6 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Wed, 8 Jan 2025 08:33:26 +1100
Subject: [PATCH 11/29] fix: actually use libmamba on build

---
 .github/workflows/conda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index 1a9949f1..47cf32e0 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -28,7 +28,7 @@ jobs:
         shell: bash -l {0}
         run: |
           conda install -c conda-forge conda-build scikit-build-core numpy anaconda-client conda-libmamba-solver -y
-          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}}
+          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}} --solver=libmamba
           anaconda upload --label main conda/*/*.tar.bz2

     - name: upload artifacts

From f9752b2511714777b829c2418fdc039154e825af Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Wed, 8 Jan 2025 08:42:07 +1100
Subject: [PATCH 12/29] revert previous commit

---
 .github/workflows/conda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml
index 47cf32e0..bd9b0f28 100644
--- a/.github/workflows/conda.yml
+++ b/.github/workflows/conda.yml
@@ -28,7 +28,7 @@ jobs:
         shell: bash -l {0}
         run: |
           conda install -c conda-forge conda-build scikit-build-core numpy anaconda-client conda-libmamba-solver -y
-          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}} --solver=libmamba
+          conda build -c conda-forge -c loop3d --output-folder conda conda --python ${{matrix.python-version}}
           anaconda upload --label main conda/*/*.tar.bz2

     - name: upload artifacts

From 9be3ac6b2e4f9bbe7e62720eeffdf6d704ec3a15 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Wed, 8 Jan 2025 12:12:12 +1100
Subject: [PATCH 13/29] fix: add init commit

---
 map2loop/data_checks.py | 444 ++++++++++++++++++++++++++++++++++++++++
 map2loop/mapdata.py     | 315 +++++++++++++---------------
 2 files changed, 586 insertions(+), 173 deletions(-)
 create mode 100644 map2loop/data_checks.py

diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py
new file mode 100644
index 00000000..66ec94e6
--- /dev/null
+++ b/map2loop/data_checks.py
@@ -0,0 +1,444 @@
+#internal imports
+from .m2l_enums import Datatype
+
+#external imports
+import beartype as beartype
+from 
beartype.typing import Tuple +import geopandas +import shapely +import pandas + +from .logging import getLogger +logger = getLogger(__name__) + +@beartype.beartype +def check_geology_fields_validity(mapdata) -> tuple[bool, str]: + #TODO (AR) - add check for gaps in geology data + """ + Validate the columns in GEOLOGY geodataframe + + Several checks to ensure that the geology data: + - Is loaded and valid. + - Contains required columns with appropriate types and no missing or blank values. + - Has optional columns with valid types, if present. + - Does not contain duplicate in IDs. + - Ensures the geometry column has valid geometries. + + Returns: + Tuple[bool, str]: A tuple indicating success (False) or failure (True) + """ + # Check if geology data is loaded and valid + if ( + mapdata.raw_data[Datatype.GEOLOGY] is None + or type(mapdata.raw_data[Datatype.GEOLOGY]) is not geopandas.GeoDataFrame + ): + logger.error("GEOLOGY data is not loaded or is not a valid GeoDataFrame") + return (True, "GEOLOGY data is not loaded or is not a valid GeoDataFrame") + + geology_data = mapdata.raw_data[Datatype.GEOLOGY] + config = mapdata.config.geology_config + + # 1. Check geometry validity - tested & working + if not geology_data.geometry.is_valid.all(): + logger.error("Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in datatype GEOLOGY") + + # # 2. Required Columns & are they str, and then empty or null? + required_columns = [config["unitname_column"], config["alt_unitname_column"]] + for col in required_columns: + if col not in geology_data.columns: + logger.error(f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") + return (True, f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") + if not geology_data[col].apply(lambda x: isinstance(x, str)).all(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") + return (True, f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") + if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + return (True, f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + + # # 3. Optional Columns + optional_string_columns = [ + "group_column", "supergroup_column", "description_column", + "rocktype_column", "alt_rocktype_column", + ] + + for key in optional_string_columns: + if key in config and config[key] in geology_data.columns: + if not geology_data[config[key]].apply(lambda x: isinstance(x, str)).all(): + logger.warning( + f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-string values. " + "Map2loop processing might not work as expected." 
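+                    # (warn-only: optional descriptive columns never abort the run,
+                    # unlike the required unit-name columns checked above)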
+ ) + + optional_numeric_columns = ["minage_column", "maxage_column", "objectid_column"] + for key in optional_numeric_columns: + if key in config and config[key] in geology_data.columns: + if not geology_data[config[key]].apply(lambda x: isinstance(x, (int, float))).all(): + logger.warning( + f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-numeric values. " + "Map2loop processing might not work as expected." + ) + + # # 4. Check for duplicates in ID + if "objectid_column" in config and config["objectid_column"] in geology_data.columns: + objectid_values = geology_data[config["objectid_column"]] + + # Check for None, NaN, or other null-like values + if objectid_values.isnull().any(): + logger.error( + f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values. Ensure all values are valid and non-null." + ) + return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values.") + + # Check for duplicate values + if objectid_values.duplicated().any(): + logger.error( + f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column'). Please make sure that the column contains unique values." + ) + return (True, f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column').") + + # Check for uniqueness + if not objectid_values.is_unique: + logger.error( + f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values. Ensure all values are unique." + ) + return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values.") + + + # 5. Check for NaNs/blanks in optional fields with warnings + warning_fields = [ + "group_column", "supergroup_column", "description_column", + "rocktype_column", "minage_column", "maxage_column", + ] + for key in warning_fields: + col = config.get(key) + if col and col in geology_data.columns: + # Check if column contains string values before applying `.str` + if pandas.api.types.is_string_dtype(geology_data[col]): + if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): + logger.warning( + f"Datatype GEOLOGY: NaN or blank values found in optional column '{col}' (config key: '{key}')." + ) + else: + # Non-string columns, check only for NaN values + if geology_data[col].isnull().any(): + logger.warning( + f"Datatype GEOLOGY: NaN values found in optional column '{col}' (config key: '{key}')." + ) + + + logger.info("Geology fields validation passed.") + return (False, "") + +@beartype.beartype +def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: + """ + Validate the structure data for required and optional fields. + + Performs the following checks: + - Ensures the structure map is loaded, valid, and contains at least two structures. + - Validates the geometry column + - Checks required numeric columns (`dip_column`, `dipdir_column`) for existence, dtype, range, and null values. + - Checks optional string columns (`description_column`, `overturned_column`) for type and null/empty values. + - Validates the optional numeric `objectid_column` for type, null values, and duplicates. + + Returns: + Tuple[bool, str]: A tuple where the first value indicates if validation failed (True = failed), + and the second value provides a message describing the issue. 
+ """ + + # Check type and size of loaded structure map + if ( + mapdata.raw_data[Datatype.STRUCTURE] is None + or type(mapdata.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame + ): + logger.warning("Structure map is not loaded or valid") + return (True, "Structure map is not loaded or valid") + + if len(mapdata.raw_data[Datatype.STRUCTURE]) < 2: + logger.warning( + "Datatype STRUCTURE: map does with not enough orientations to complete calculations (need at least 2), projection may be inconsistent" + ) + + structure_data = mapdata.raw_data[Datatype.STRUCTURE] + config = mapdata.config.structure_config + + # 1. Check geometry validity + if not structure_data.geometry.is_valid.all(): + logger.error("datatype STRUCTURE: Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in datatype STRUCTURE") + + # 2. Check mandatory numeric columns + required_columns = [config["dipdir_column"], config["dip_column"]] + for col in required_columns: + if col not in structure_data.columns: + logger.error(f"DDatatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") + return (True, f"Datatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") + if not structure_data[col].apply(lambda x: isinstance(x, (int, float))).all(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") + return (True, f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") + if structure_data[col].isnull().any(): + config_key = [k for k, v in config.items() if v == col][0] + logger.error(f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + return (True, f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + + if config["dip_column"] in structure_data.columns: + invalid_dip = ~((structure_data[config["dip_column"]] >= 0) & (structure_data[config["dip_column"]] <= 90)) + if invalid_dip.any(): + logger.warning( + f"Datatype STRUCTURE: Column '{config['dip_column']}' has values that are not between 0 and 90 degrees. Is this intentional?" + ) + + if config["dipdir_column"] in structure_data.columns: + invalid_dipdir = ~((structure_data[config["dipdir_column"]] >= 0) & (structure_data[config["dipdir_column"]] <= 360)) + if invalid_dipdir.any(): + logger.warning( + f"Datatype STRUCTURE: Column '{config['dipdir_column']}' has values that are not between 0 and 360 degrees. Is this intentional?" + ) + + # check validity of optional string columns + optional_string_columns = ["description_column", "overturned_column"] + for key in optional_string_columns: + if key in config and config[key] in structure_data.columns: + column_name = config[key] + if not structure_data[column_name].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.warning( + f"Datatype STRUCTURE: Optional column with config key: '{key}' contains non-string values. " + "Map2loop processing might not work as expected." 
+ ) + if structure_data[column_name].isnull().any() or structure_data[column_name].str.strip().eq("").any(): + logger.warning( + f"Datatype STRUCTURE: Optional column config key: '{key}' contains NaN, empty, or null values. " + "Map2loop processing might not work as expected." + ) + + # check ID column for type, null values, and duplicates + optional_numeric_column_key = "objectid_column" + optional_numeric_column = config.get(optional_numeric_column_key) + + if optional_numeric_column: + if optional_numeric_column in structure_data.columns: + # Check for non-integer values + if not structure_data[optional_numeric_column].apply(lambda x: isinstance(x, int) or pandas.isnull(x)).all(): + logger.error( + f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values. Rectify this, or remove this column from the config - map2loop will generate a new ID." + ) + return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values.") + # Check for NaN + if structure_data[optional_numeric_column].isnull().any(): + logger.error( + f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values. Rectify this, or remove this column from the config - map2loop will generate a new ID." + ) + return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values.") + # Check for duplicates + if structure_data[optional_numeric_column].duplicated().any(): + logger.error( + f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values. Rectify this, or remove this column from the config - map2loop will generate a new ID." + ) + return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values.") + + return (False, "") + +@beartype.beartype +def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: + + # Check type of loaded fault map + if ( + mapdata.raw_data[Datatype.FAULT] is None + or type(mapdata.raw_data[Datatype.FAULT]) is not geopandas.GeoDataFrame + ): + logger.warning("Fault map is not loaded or valid") + return (True, "Fault map is not loaded or valid") + + fault_data = mapdata.raw_data[Datatype.FAULT] + config = mapdata.config.fault_config + + # Check geometry + if not fault_data.geometry.is_valid.all(): + logger.error("datatype FAULT: Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in FAULT data.") + + # Check for LineString or MultiLineString geometries + if not fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): + invalid_types = fault_data[~fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] + logger.error( + f"FAULT data contains invalid geometry types. 
Rows with invalid geometry types: {invalid_types.index.tolist()}" + ) + return (True, "FAULT data contains geometries that are not LineString or MultiLineString.") + + # Check "structtype_column" if it exists + if "structtype_column" in config: + structtype_column = config["structtype_column"] + + # Ensure the column exists in the data + if structtype_column not in fault_data.columns: + logger.warning( + f"Datatype FAULT: '{structtype_column}' (config key: 'structtype_column') is missing from the fault data. Consider removing that key from the config" + ) + else: + # Check if all entries in the column are strings + if not fault_data[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.error( + f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." + ) + return (True, f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") + + # Warn about empty or null cells + if fault_data[structtype_column].isnull().any() or fault_data[structtype_column].str.strip().eq("").any(): + logger.warning( + f"Datatype FAULT: Column '{structtype_column}' contains NaN, empty, or blank values. Processing might not work as expected." + ) + + # Check if "fault_text" is defined and contained in the column + fault_text = config.get("fault_text", None) + + # Check if the structtype_column exists in the fault_data + if structtype_column not in fault_data.columns: + logger.warning( + f"Datatype FAULT: The column '{structtype_column}' is not present in the fault data." + ) + + else: + if not fault_data[structtype_column].str.contains(fault_text).any(): + logger.error( + f"Datatype FAULT: The 'fault_text' value '{fault_text}' is not found in column '{structtype_column}'. Project might end up with no faults" + ) + + #checks on name column + name_column = config.get("name_column") + if name_column not in fault_data.columns: + logger.warning( + f"Datatype FAULT: Column '{name_column}' (config key 'name_column') is missing from the fault data." + "Please ensure it is present, or remove that key from the config." + ) + + if name_column and name_column in fault_data.columns: + # Check if the column contains non-string values + if not fault_data[name_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.error( + f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains non-string values. Ensure all values are valid strings." + ) + return (True, f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains non-string values.") + + # Check for NaN values + if fault_data[name_column].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{name_column}' (config key 'name_column') contains NaN or empty values. This may affect processing." + ) + + # Check for duplicate values + if fault_data[name_column].duplicated().any(): + logger.warning( + f"Datatype FAULT: Column '{name_column}' contains duplicate values. This may affect processing." 
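+                # (a name can legitimately repeat where one fault is digitised as
+                # several segments, so duplicates only warn rather than fail)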
+ ) + + # dips & strikes + # Check for dips and dip directions + strike_dips_columns = ["dip_column", "dipdir_column"] + + for key in strike_dips_columns: + column_name = config.get(key) + if column_name: # Only proceed if the config has this key + if column_name in fault_data.columns: + + #coerce to numeric + fault_data[column_name] = pandas.to_numeric(fault_data[column_name], errors='coerce') + + # Check if the column contains only numeric values + if not fault_data[column_name].apply(lambda x: isinstance(x, (int, float)) or pandas.isnull(x)).all(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) must contain only numeric values. Please ensure the column is numeric." + ) + + # Check for NaN or empty values + if fault_data[column_name].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) contains NaN or empty values. This may affect processing." + ) + + # Check range constraints + if key == "dip_column": + # Dips must be between 0 and 90 + invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 90)) + if invalid_values.any(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 90]. Was this intentional?" + ) + elif key == "dipdir_column": + # Dip directions must be between 0 and 360 + invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 360)) + if invalid_values.any(): + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 360]. Was this intentional?" + ) + else: + logger.warning( + f"Datatype FAULT: Column '{column_name}' (config key {key}) is missing from the fault data. Please ensure the column name is correct, or otherwise remove that key from the config." + ) + + + # dip estimates + dip_estimate_column = config.get("dip_estimate_column") + valid_directions = [ + "north_east", "south_east", "south_west", "north_west", + "north", "east", "south", "west" + ] + + if dip_estimate_column: + if dip_estimate_column in fault_data.columns: + # Ensure all values are in the set of valid directions or are NaN + invalid_values = fault_data[dip_estimate_column][ + ~fault_data[dip_estimate_column].apply(lambda x: x in valid_directions or pandas.isnull(x)) + ] + + if not invalid_values.empty: + logger.error( + f"Datatype FAULT: Column '{dip_estimate_column}' contains invalid values not in the set of allowed dip estimates: {valid_directions}." + ) + return ( + True, + f"Datatype FAULT: Column '{dip_estimate_column}' contains invalid values. Allowed values: {valid_directions}.", + ) + + # Warn if there are NaN or empty values + if fault_data[dip_estimate_column].isnull().any(): + logger.warning( + f"Datatype FAULT: Column '{dip_estimate_column}' contains NaN or empty values. This may affect processing." + ) + else: + logger.error( + f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data. Please ensure the column name is correct or remove that key from the config." 
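+                # (unlike a missing dip/dipdir column, which only warns above, a
+                # configured dip-estimate column that is absent is treated as fatal)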
+ ) + return (True, f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data.") + + # Check ID column + id_column = config.get("objectid_column") + + if id_column: + if id_column in fault_data.columns: + # Check for non-integer values + # Attempt to coerce the ID column to integers because WA data says so (ARodrigues) + fault_data[id_column] = pandas.to_numeric(fault_data[id_column], errors='coerce') + + # Check if all values are integers or null after coercion + if not fault_data[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all(): + logger.warning( + f"Datatype FAULT: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs." + ) + + # Check for NaN values + if fault_data[id_column].isnull().any(): + logger.warning( + f"Datatype FAULT: ID column '{id_column}' contains NaN or null values. Rectify this or remove the key from the config to auto-generate IDs." + ) + + # Check for duplicates + if fault_data[id_column].duplicated().any(): + logger.error( + f"Datatype FAULT: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs." + ) + + return (False, "") diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index 4ef0be89..a1bb3ae3 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -3,6 +3,7 @@ from .config import Config from .aus_state_urls import AustraliaStateUrls from .utils import generate_random_hex_colors, calculate_minimum_fault_length +from .data_checks import check_geology_fields_validity, check_structure_fields_validity, check_fault_fields_validity # external imports import geopandas @@ -19,7 +20,7 @@ import beartype import os from io import BytesIO -from typing import Union +from typing import Union, Tuple import tempfile @@ -689,154 +690,42 @@ def check_map(self, datatype: Datatype): The datatype to check """ func = None + #check and parse geology data if datatype == Datatype.GEOLOGY: + validity_check, message = check_geology_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype GEOLOGY data validation failed: {message}") + return func = self.parse_geology_map + + #check and parse structure data elif datatype == Datatype.STRUCTURE: + validity_check, message = check_structure_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype STRUCTURE data validation failed: {message}") + return func = self.parse_structure_map + + #check and parse fault data elif datatype == Datatype.FAULT: + validity_check, message = check_fault_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype FAULT data validation failed: {message}") + return func = self.parse_fault_map - elif datatype == Datatype.FOLD: - func = self.parse_fold_map + elif datatype == Datatype.FAULT_ORIENTATION: func = self.parse_fault_orientations + + #check and parse fold data + elif datatype == Datatype.FOLD: + func = self.parse_fold_map + if func: error, message = func() if error: logger.error(message) - @beartype.beartype - def parse_fault_orientations(self) -> tuple: - """ - Parse the fault orientations shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.FAULT_ORIENTATION] is None - or type(self.raw_data[Datatype.FAULT_ORIENTATION]) is not geopandas.GeoDataFrame - ): - logger.warning("Fault orientation shapefile is not 
loaded or valid") - return (True, "Fault orientation shapefile is not loaded or valid") - - # Create new geodataframe - fault_orientations = geopandas.GeoDataFrame( - self.raw_data[Datatype.FAULT_ORIENTATION]["geometry"] - ) - - config = self.config.fault_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - if config["orientation_type"] == "strike": - fault_orientations["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - fault_orientations["DIPDIR"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dipdir_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dipdir_column '{config['dipdir_column']}'" - ) - - if config["dip_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["DIP"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dip_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dip_column '{config['dip_column']}'" - ) - - # TODO LG would it be worthwhile adding a description column for faults? - # it would be possible to parse out the fault displacement, type, slip direction - # if this was stored in the descriptions? - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["ID"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["objectid_column"] - ] - else: - fault_orientations["ID"] = numpy.arange(len(fault_orientations)) - self.data[Datatype.FAULT_ORIENTATION] = fault_orientations - return (False, "") - - @beartype.beartype - def parse_structure_map(self) -> tuple: - """ - Parse the structure shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.STRUCTURE] is None - or type(self.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame - ): - logger.warning("Structure map is not loaded or valid") - return (True, "Structure map is not loaded or valid") - - if len(self.raw_data[Datatype.STRUCTURE]) < 2: - logger.warning( - "Stucture map does not enough orientations to complete calculations (need at least 2), projection may be inconsistent" - ) - - # Create new geodataframe - structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"]) - config = self.config.structure_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]: - if config["orientation_type"] == "strike": - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]] - else: - print(f"Structure map does not contain dipdir_column '{config['dipdir_column']}'") - - # Ensure all DIPDIR values are within [0, 360] - structure["DIPDIR"] = structure["DIPDIR"] % 360.0 - - if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]] - else: - print(f"Structure map does not contain dip_column '{config['dip_column']}'") - - # Add bedding and overturned booleans - if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["OVERTURNED"] = ( - self.raw_data[Datatype.STRUCTURE][config["overturned_column"]] - .astype(str) - 
.str.contains(config["overturned_text"]) - ) - else: - structure["OVERTURNED"] = False - - if config["description_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["BEDDING"] = ( - self.raw_data[Datatype.STRUCTURE][config["description_column"]] - .astype(str) - .str.contains(config["bedding_text"]) - ) - else: - structure["BEDDING"] = False - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["ID"] = self.raw_data[Datatype.STRUCTURE][config["objectid_column"]] - else: - structure["ID"] = numpy.arange(len(structure)) - - self.data[Datatype.STRUCTURE] = structure - return (False, "") - @beartype.beartype def parse_geology_map(self) -> tuple: """ @@ -845,13 +734,6 @@ def parse_geology_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded geology map - if ( - self.raw_data[Datatype.GEOLOGY] is None - or type(self.raw_data[Datatype.GEOLOGY]) is not geopandas.GeoDataFrame - ): - logger.warning("Geology map is not loaded or valid") - return (True, "Geology map is not loaded or valid") # Create new geodataframe geology = geopandas.GeoDataFrame(self.raw_data[Datatype.GEOLOGY]["geometry"]) @@ -862,22 +744,11 @@ def parse_geology_map(self) -> tuple: geology["UNITNAME"] = self.raw_data[Datatype.GEOLOGY][config["unitname_column"]].astype( str ) - else: - msg = f"Geology map does not contain unitname_column {config['unitname_column']}" - print(msg) - logger.warning(msg) - return (True, msg) + if config["alt_unitname_column"] in self.raw_data[Datatype.GEOLOGY]: geology["CODE"] = self.raw_data[Datatype.GEOLOGY][config["alt_unitname_column"]].astype( str ) - else: - msg = ( - f"Geology map does not contain alt_unitname_column {config['alt_unitname_column']}" - ) - print(msg) - logger.warning(msg) - return (True, msg) # Parse group and supergroup columns if config["group_column"] in self.raw_data[Datatype.GEOLOGY]: @@ -947,11 +818,9 @@ def parse_geology_map(self) -> tuple: else: geology["ID"] = numpy.arange(len(geology)) - # TODO: Check for duplicates in "ID" # TODO: Check that the exploded geology has more than 1 unit # Do we need to explode the geometry at this stage for geology/faults/folds??? # If not subsequent classes will need to be able to deal with them - # TODO: Check for Nans or blanks in "UNITNAME", "GROUP", "SUPERGROUP", "DESCRIPTION", "CODE", "ROCKTYPE" # Strip out whitespace (/n /t) and '-', ',', '?' 
from "UNITNAME", "CODE" "GROUP" "SUPERGROUP" geology["UNITNAME"] = geology["UNITNAME"].str.replace("[ -/?]", "_", regex=True) geology["CODE"] = geology["CODE"].str.replace("[ -/?]", "_", regex=True) @@ -970,12 +839,62 @@ def parse_geology_map(self) -> tuple: return (False, "") @beartype.beartype - def get_minimum_fault_length(self) -> Union[float, int, None]: + def parse_structure_map(self) -> tuple: """ - Get the minimum fault length + Parse the structure shapefile data into a consistent format + + Returns: + tuple: A tuple of (bool: success/fail, str: failure message) """ - return self.minimum_fault_length + # Create new geodataframe + structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"]) + config = self.config.structure_config + + # Parse dip direction and dip columns + if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]: + if config["orientation_type"] == "strike": + structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( + lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 + ) + else: + structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]] + + # Ensure all DIPDIR values are within [0, 360] + structure["DIPDIR"] = structure["DIPDIR"] % 360.0 + + if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]] + + + # Add bedding and overturned booleans + if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["OVERTURNED"] = ( + self.raw_data[Datatype.STRUCTURE][config["overturned_column"]] + .astype(str) + .str.contains(config["overturned_text"]) + ) + else: + structure["OVERTURNED"] = False + + if config["description_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["BEDDING"] = ( + self.raw_data[Datatype.STRUCTURE][config["description_column"]] + .astype(str) + .str.contains(config["bedding_text"]) + ) + else: + structure["BEDDING"] = False + + # Add object id + if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]: + structure["ID"] = self.raw_data[Datatype.STRUCTURE][config["objectid_column"]] + else: + structure["ID"] = numpy.arange(len(structure)) + + self.data[Datatype.STRUCTURE] = structure + return (False, "") + @beartype.beartype def parse_fault_map(self) -> tuple: @@ -985,14 +904,6 @@ def parse_fault_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded fault map - if ( - self.raw_data[Datatype.FAULT] is None - or type(self.raw_data[Datatype.FAULT]) is not geopandas.GeoDataFrame - ): - logger.warning("Fault map is not loaded or valid") - return (True, "Fault map is not loaded or valid") - # Create a new geodataframe faults = geopandas.GeoDataFrame(self.raw_data[Datatype.FAULT]["geometry"]) @@ -1005,11 +916,12 @@ def parse_fault_map(self) -> tuple: self.minimum_fault_length = calculate_minimum_fault_length( bbox=self.bounding_box, area_percentage=0.05 ) - + logger.info(f"Calculated minimum fault length - {self.minimum_fault_length}") + # crop faults = faults.loc[faults.geometry.length >= self.minimum_fault_length] - - if config["structtype_column"] in self.raw_data[Datatype.FAULT]: + + if config["structtype_column"] in self.raw_data[Datatype.FAULT]: faults["FEATURE"] = self.raw_data[Datatype.FAULT][config["structtype_column"]] faults = faults[faults["FEATURE"].astype(str).str.contains(config["fault_text"])] if self.verbose_level > VerboseLevel.NONE: @@ -1039,7 +951,7 @@ def parse_fault_map(self) -> tuple: 
# Filter the DataFrame to remove rows where 'NAME' is in the existing_codes if existing_codes: faults = faults[~faults["NAME"].isin(existing_codes)] - logger.info(f"The following codes were found and removed: {existing_codes}") + logger.info(f"The following faults were found and removed as per the config: {existing_codes}") else: logger.info("None of the fault ignore codes exist in the original fault data.") pass @@ -1126,6 +1038,63 @@ def parse_fault_map(self) -> tuple: return (False, "") + @beartype.beartype + def parse_fault_orientations(self) -> tuple: + """ + Parse the fault orientations shapefile data into a consistent format + + Returns: + tuple: A tuple of (bool: success/fail, str: failure message) + """ + # Check type and size of loaded structure map + + + # Create new geodataframe + fault_orientations = geopandas.GeoDataFrame( + self.raw_data[Datatype.FAULT_ORIENTATION]["geometry"] + ) + + config = self.config.fault_config + + # Parse dip direction and dip columns + if config["dipdir_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + if config["orientation_type"] == "strike": + fault_orientations["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( + lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 + ) + else: + fault_orientations["DIPDIR"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["dipdir_column"] + ] + else: + print( + f"Fault orientation shapefile does not contain dipdir_column '{config['dipdir_column']}'" + ) + + if config["dip_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + fault_orientations["DIP"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["dip_column"] + ] + else: + print( + f"Fault orientation shapefile does not contain dip_column '{config['dip_column']}'" + ) + + # TODO LG would it be worthwhile adding a description column for faults? + # it would be possible to parse out the fault displacement, type, slip direction + # if this was stored in the descriptions? 
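+        # Orientations without a configured ID column fall back to a generated
+        # 0..n-1 index below, mirroring the behaviour of parse_structure_map.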
+ + # Add object id + if config["objectid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + fault_orientations["ID"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["objectid_column"] + ] + else: + fault_orientations["ID"] = numpy.arange(len(fault_orientations)) + self.data[Datatype.FAULT_ORIENTATION] = fault_orientations + return (False, "") + + @beartype.beartype def parse_fold_map(self) -> tuple: """ @@ -1682,4 +1651,4 @@ def STRUCTURE(self): @property def FAULT(self): - return self.get_map_data(Datatype.FAULT) + return self.get_map_data(Datatype.FAULT) \ No newline at end of file From 29466fe3986d5c8e224b0334ffbd6946bfe0cad8 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 01:12:34 +0000 Subject: [PATCH 14/29] style: style fixes by ruff and autoformatting by black --- map2loop/mapdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index a1bb3ae3..4f37b27f 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -20,7 +20,7 @@ import beartype import os from io import BytesIO -from typing import Union, Tuple +from typing import Union import tempfile From 848f31cd2b67728468bc681e838caa855e59701d Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 12:13:51 +1100 Subject: [PATCH 15/29] tests: add tests for data_checks for each datatype --- tests/mapdata/test_input_data_faults.py | 106 +++++++++++ tests/mapdata/test_input_data_geology.py | 202 +++++++++++++++++++++ tests/mapdata/test_input_data_structure.py | 136 ++++++++++++++ 3 files changed, 444 insertions(+) create mode 100644 tests/mapdata/test_input_data_faults.py create mode 100644 tests/mapdata/test_input_data_geology.py create mode 100644 tests/mapdata/test_input_data_structure.py diff --git a/tests/mapdata/test_input_data_faults.py b/tests/mapdata/test_input_data_faults.py new file mode 100644 index 00000000..cf668889 --- /dev/null +++ b/tests/mapdata/test_input_data_faults.py @@ -0,0 +1,106 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.m2l_enums import Datatype +from map2loop.data_checks import check_fault_fields_validity + +@pytest.mark.parametrize( + "fault_data, fault_config, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["Fault A", "Fault B"], + "ID": [1, 2] + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + False, + "" + ), + # Invalid geometry + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]) # Invalid geometry + ], + "FEATURE": ["Fault A", "Fault B"], + "ID": [1, 2] + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + True, + "Invalid geometries found in FAULT data." + ), + + # Non-string FEATURE column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": [5, 2], + "ID": [1, 2] + }, + {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, + True, + "Datatype FAULT: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." 
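+            # note: expected messages are compared with '==' in the assertion at the
+            # end of this file, so they must match the data_checks.py wording exactly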
+ ), + # Invalid values in DIP estimate column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["Fault", "Fault"], + "NAME": ["Zuleika", "Zuleika"], + "ID": [1, 2], + "DIP": [70, 50], + "STRIKE": [150, None], + "DEC": ["north_east", "southt"], + }, + { + "structtype_column": "FEATURE", + "fault_text": "Fault", + "objectid_column": "ID", + "name_column": "NAME", + "dip_column": "DIP", + "dipdir_column": "STRIKE", + "dip_estimate_column": "DEC" + }, + True, + "Datatype FAULT: Column 'DEC' contains invalid values. Allowed values: ['north_east', 'south_east', 'south_west', 'north_west', 'north', 'east', 'south', 'west']." + ), + ], + ids=[ + "Valid fault data", + "Invalid geometry", + "Non-string FEATURE column", + "Invalid DIP estimate column" + ] +) +def test_check_fault_fields_validity(fault_data, fault_config, expected_validity, expected_message): + # Dynamically create the mock config for this test case + class MockConfig: + def __init__(self, config): + self.fault_config = config + + # Create a GeoDataFrame + fault_gdf = gpd.GeoDataFrame(fault_data, crs="EPSG:4326") + + # Instantiate the MapData class with the dynamic mock config and data + map_data = MapData() + map_data.config = MockConfig(fault_config) + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.FAULT] = fault_gdf + + # Test the check_fault_fields_validity function + validity_check, message = check_fault_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message diff --git a/tests/mapdata/test_input_data_geology.py b/tests/mapdata/test_input_data_geology.py new file mode 100644 index 00000000..4b083e76 --- /dev/null +++ b/tests/mapdata/test_input_data_geology.py @@ -0,0 +1,202 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.data_checks import check_geology_fields_validity + +# Datatype Enum +class Datatype: + GEOLOGY = 0 + +# Config +class MockConfig: + def __init__(self): + self.geology_config = { + "unitname_column": "UNITNAME", + "alt_unitname_column": "CODE", + "group_column": "GROUP", + "supergroup_column": "SUPERGROUP", + "description_column": "DESCRIPTION", + "rocktype_column": "ROCKTYPE1", + "alt_rocktype_column": "ROCKTYPE2", + "minage_column": "MIN_AGE", + "maxage_column": "MAX_AGE", + "objectid_column": "ID", + "ignore_lithology_codes": [], + } + +@pytest.mark.parametrize( + "geology_data, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + False, + "", + ), + # Invalid geometry + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)])], + "UNITNAME": ["Sandstone"], + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Invalid geometries found in datatype GEOLOGY", + ), + # Missing required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + 
"UNITNAME": ["Sandstone"], + # "CODE": ["SST"], # Missing required column + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: Required column with config key: 'alt_unitname_column' is missing from geology data.", + ), + # Non-string value in required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": ["Sandstone"], + "CODE": [2], # Non-string value + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: Column 'alt_unitname_column' must contain only string values. Please check that the column contains only string values.", + ), + # NaN or blank value in required column + ( + { + "geometry": [shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])], + "UNITNAME": [""], # Blank value + "CODE": ["SST"], + "GROUP": ["Sedimentary"], + "SUPERGROUP": ["Mesozoic"], + "DESCRIPTION": ["A type of sandstone"], + "ROCKTYPE1": ["Clastic"], + "ROCKTYPE2": ["Quartz"], + "MIN_AGE": [150.0], + "MAX_AGE": [200.0], + "ID": [1], + }, + True, + "Datatype GEOLOGY: NaN or blank values found in required column 'unitname_column'. Please double check the column for blank values.", + ), + # Duplicate ID values + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", "df"], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 1], # Duplicate ID + }, + True, + "Datatype GEOLOGY: Duplicate values found in column 'ID' (config key: 'objectid_column').", + ), + # nan in id + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", "df"], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, None], + }, + True, + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains NaN or null values.", + ), + # nan in unit name + ( + { + "geometry": [ + shapely.geometry.Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), + shapely.geometry.Polygon([(0, 0), (10, 0), (1, 1), (0, 10)]), + ], + "UNITNAME": ["fr", None], + "CODE": ["SST", "FGH"], + "GROUP": ["Sedimentary", "Ign"], + "SUPERGROUP": ["Mesozoic", "Arc"], + "DESCRIPTION": ["A", "B"], + "ROCKTYPE1": ["A", "B"], + "ROCKTYPE2": ["Quartz", "FDS"], + "MIN_AGE": [150.0, 200], + "MAX_AGE": [200.0, 250], + "ID": [1, 1], # Duplicate ID + }, + True, + "Datatype GEOLOGY: Column 'unitname_column' must contain only string values. 
Please check that the column contains only string values.", + ), + ], +) + + + +def test_check_geology_fields_validity(geology_data, expected_validity, expected_message): + # Create a GeoDataFrame + geology_gdf = gpd.GeoDataFrame(geology_data, crs="EPSG:4326") + + # Instantiate the MapData class with the mock config and data + map_data = MapData() + map_data.config = MockConfig() + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.GEOLOGY] = geology_gdf + + # Test the check_geology_fields_validity function + validity_check, message = check_geology_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message \ No newline at end of file diff --git a/tests/mapdata/test_input_data_structure.py b/tests/mapdata/test_input_data_structure.py new file mode 100644 index 00000000..d96441fc --- /dev/null +++ b/tests/mapdata/test_input_data_structure.py @@ -0,0 +1,136 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.data_checks import check_structure_fields_validity + +# Datatype Enum +class Datatype: + STRUCTURE = 1 + +# Config +class MockConfig: + def __init__(self): + self.structure_config = { + "dipdir_column": "DIPDIR", + "dip_column": "DIP", + "description_column": "DESCRIPTION", + "overturned_column": "OVERTURNED", + "objectid_column": "ID", + } + +@pytest.mark.parametrize( + "structure_data, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + False, + "", + ), + # Invalid geometry + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Polygon([(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]) # Invalid geometry + ], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Invalid geometries found in datatype STRUCTURE", + ), + # Missing required column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + # "DIPDIR": [45.0, 135.0], # Missing required column + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Datatype STRUCTURE: Required column with config key: 'dipdir_column' is missing from structure data.", + ), + # Non-numeric value in numeric column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": ["A", "B"], # Non-numeric value + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Datatype STRUCTURE: Column 'dipdir_column' must contain only numeric values. Please check that the column contains only numeric values.", + ), + # NaN or blank value in required column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": [None, 3], # NaN value + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 2] + }, + True, + "Datatype STRUCTURE: NaN or blank values found in required column 'dipdir_column'. 
Please double check the column for blank values.", + ), + # Duplicate ID column + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), + shapely.geometry.Point(1, 1) + ], + "DIPDIR": [45.0, 135.0], + "DIP": [30.0, 45.0], + "DESCRIPTION": ["Description1", "Description2"], + "OVERTURNED": ["Yes", "No"], + "ID": [1, 1] # Duplicate ID + }, + True, + "Datatype STRUCTURE: ID column 'ID' (config key: 'objectid_column') contains duplicate values.", + ), + ], +) +def test_check_structure_fields_validity(structure_data, expected_validity, expected_message): + # Create a GeoDataFrame + structure_gdf = gpd.GeoDataFrame(structure_data, crs="EPSG:4326") + + # Instantiate the MapData class with the mock config and data + map_data = MapData() + map_data.config = MockConfig() + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.STRUCTURE] = structure_gdf + + # Test the check_structure_fields_validity function + validity_check, message = check_structure_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message From c68ce9a004761a6b8551d0869723b0ae1583d516 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 12:26:53 +1100 Subject: [PATCH 16/29] fix: actually abort the process if validation fails --- map2loop/mapdata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index 4f37b27f..60ee159c 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -694,24 +694,24 @@ def check_map(self, datatype: Datatype): if datatype == Datatype.GEOLOGY: validity_check, message = check_geology_fields_validity(mapdata = self) if validity_check: - logger.error(f"Datatype GEOLOGY data validation failed: {message}") - return + logger.error(f"Datatype GEOLOGY - data validation failed: {message}") + raise ValueError(f"Datatype GEOLOGY - data validation failed: {message}") func = self.parse_geology_map #check and parse structure data elif datatype == Datatype.STRUCTURE: validity_check, message = check_structure_fields_validity(mapdata = self) if validity_check: - logger.error(f"Datatype STRUCTURE data validation failed: {message}") - return + logger.error(f"Datatype STRUCTURE - data validation failed: {message}") + raise ValueError(f"Datatype STRUCTURE - data validation failed: {message}") func = self.parse_structure_map #check and parse fault data elif datatype == Datatype.FAULT: validity_check, message = check_fault_fields_validity(mapdata = self) if validity_check: - logger.error(f"Datatype FAULT data validation failed: {message}") - return + logger.error(f"Datatype FAULT - data validation failed: {message}") + raise ValueError(f"Datatype FAULT - data validation failed: {message}") func = self.parse_fault_map elif datatype == Datatype.FAULT_ORIENTATION: From 06ff5510bcfbd1d24dc028f855155379a2098707 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 13:28:39 +1100 Subject: [PATCH 17/29] fix: add config check step to project --- map2loop/config.py | 25 ------------------------- map2loop/data_checks.py | 27 +++++++++++++++++++++++++++ map2loop/project.py | 5 ++++- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/map2loop/config.py b/map2loop/config.py index 2a0efdf7..e5467e95 100644 --- a/map2loop/config.py +++ b/map2loop/config.py @@ -213,31 +213,6 @@ def update_from_file( err_string += "Check the contents for mismatched quotes or brackets!" 
raise Exception(err_string) - @beartype.beartype - def validate_config_dictionary(self, config_dict: dict) -> None: - required_keys = { - "structure": {"dipdir_column", "dip_column"}, - "geology": {"unitname_column", "alt_unitname_column"}, - } - - # Loop over "structure" and "geology" - for section, keys in required_keys.items(): - - # 1) Check that "section" exists - if section not in config_dict: - logger.error(f"Missing required section '{section}' in config dictionary.") - raise ValueError(f"Missing required section '{section}' in config dictionary.") - - # 2) Check that each required key is in config_dict[section] - for key in keys: - if key not in config_dict[section]: - logger.error( - f"Missing required key '{key}' for '{section}' section of the config dictionary." - ) - raise ValueError( - f"Missing required key '{key}' for '{section}' section of the config dictionary." - ) - @beartype.beartype def check_for_legacy_keys(self, config_dict: dict) -> None: diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 66ec94e6..a254de16 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -442,3 +442,30 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: ) return (False, "") + + +@beartype.beartype +def validate_config_dictionary(config_dict: dict) -> None: + + required_keys = { + "structure": {"dipdir_column", "dip_column"}, + "geology": {"unitname_column", "alt_unitname_column"}, + } + + # Loop over "structure" and "geology" + for section, keys in required_keys.items(): + + # 1) Check that "section" exists + if section not in config_dict: + logger.error(f"Missing required section '{section}' in config dictionary.") + raise ValueError(f"Missing required section '{section}' in config dictionary.") + + # 2) Check that each required key is in config_dict[section] + for key in keys: + if key not in config_dict[section]: + logger.error( + f"Missing required key '{key}' for '{section}' section of the config dictionary." + ) + raise ValueError( + f"Missing required key '{key}' for '{section}' section of the config dictionary." 
+ ) \ No newline at end of file diff --git a/map2loop/project.py b/map2loop/project.py index 6e4303c6..ca4ecf8d 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -11,6 +11,7 @@ from .stratigraphic_column import StratigraphicColumn from .deformation_history import DeformationHistory from .map2model_wrapper import Map2ModelWrapper +from .data_checks import validate_config_dictionary # external imports import LoopProjectFile as LPF @@ -231,8 +232,10 @@ def __init__( self.map_data.set_config_filename(config_filename) if config_dictionary != {}: - self.map_data.config.validate_config_dictionary(config_dictionary) + validate_config_dictionary(config_dictionary) self.map_data.config.update_from_dictionary(config_dictionary) + # print(self.map_data.config) + # self.map_data.config.validate_config_dictionary(config_dictionary) if clut_filename != "": self.map_data.set_colour_filename(clut_filename) From 6529bf8427668273d8f963ffc27742fef49e1151 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 14:02:12 +1100 Subject: [PATCH 18/29] fix: add extra checks for config dictionary --- map2loop/config.py | 30 +----------------- map2loop/data_checks.py | 69 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 32 deletions(-) diff --git a/map2loop/config.py b/map2loop/config.py index e5467e95..edff2e20 100644 --- a/map2loop/config.py +++ b/map2loop/config.py @@ -100,8 +100,6 @@ def update_from_dictionary(self, dictionary: dict, lower: bool = True): Args: dictionary (dict): The dictionary to update from """ - # make sure dictionary doesn't contain legacy keys - self.check_for_legacy_keys(dictionary) if "structure" in dictionary: self.structure_config.update(dictionary["structure"]) @@ -211,30 +209,4 @@ def update_from_file( else: err_string += "Please check the file exists and is accessible then\n" err_string += "Check the contents for mismatched quotes or brackets!" - raise Exception(err_string) - - - @beartype.beartype - def check_for_legacy_keys(self, config_dict: dict) -> None: - - legacy_keys = { - "otype", "dd", "d", "sf", "bedding", "bo", "btype", "gi", "c", "u", - "g", "g2", "ds", "min", "max", "r1", "r2", "sill", "intrusive", "volcanic", - "f", "fdipnull", "fdipdip_flag", "fdipdir", "fdip", "fdipest", - "fdipest_vals", "n", "ff", "t", "syn" - } - - # Recursively search for keys in the dictionary - def check_keys(d: dict, parent_key=""): - for key, value in d.items(): - if key in legacy_keys: - logger.error( - f"Legacy key found in config - '{key}' at '{parent_key + key}'. Please use the new config format. Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" - ) - raise ValueError( - f"Legacy key found in config - '{key}' at '{parent_key + key}'. Please use the new config format. 
Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" - ) - if isinstance(value, dict): - check_keys(value, parent_key=f"{parent_key}{key}.") - - check_keys(config_dict) \ No newline at end of file + raise Exception(err_string) \ No newline at end of file diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index a254de16..83b9f1c0 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -447,6 +447,7 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: @beartype.beartype def validate_config_dictionary(config_dict: dict) -> None: + # 1) check mandatory keys for "structure" and "geology" required_keys = { "structure": {"dipdir_column", "dip_column"}, "geology": {"unitname_column", "alt_unitname_column"}, @@ -463,9 +464,71 @@ def validate_config_dictionary(config_dict: dict) -> None: # 2) Check that each required key is in config_dict[section] for key in keys: if key not in config_dict[section]: + logger.error(f"Missing required key '{key}' for '{section}' section of the config dictionary.") + raise ValueError(f"Missing required key '{key}' for '{section}' section of the config dictionary.") + + # 2) check for legacy keys first: + legacy_keys = { + "otype", "dd", "d", "sf", "bedding", "bo", "btype", "gi", "c", "u", + "g", "g2", "ds", "min", "max", "r1", "r2", "sill", "intrusive", "volcanic", + "f", "fdipnull", "fdipdip_flag", "fdipdir", "fdip", "fdipest", + "fdipest_vals", "n", "ff", "t", "syn" + } + + def check_keys(d: dict, parent_key=""): + for key, value in d.items(): + if key in legacy_keys: logger.error( - f"Missing required key '{key}' for '{section}' section of the config dictionary." + f"Legacy key found in config - '{key}' at '{parent_key}'. Please use the new config format. Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" ) raise ValueError( - f"Missing required key '{key}' for '{section}' section of the config dictionary." - ) \ No newline at end of file + f"Legacy key found in config - '{key}' at '{parent_key}'. Please use the new config format. 
Use map2loop.utils.update_from_legacy_file to convert between the formats if needed" + ) + if isinstance(value, dict): + check_keys(value, parent_key=f"{parent_key}{key}.") + + check_keys(config_dict) + + # 3) check if all keys are valid: + allowed_keys_by_section = { + "structure": { + "orientation_type", "dipdir_column", "dip_column", + "description_column", "bedding_text", "overturned_column", "overturned_text", + "objectid_column", "desciption_column", + }, + "geology": { + "unitname_column", "alt_unitname_column", "group_column", + "supergroup_column", "description_column", "minage_column", + "maxage_column", "rocktype_column", "alt_rocktype_column", + "sill_text", "intrusive_text", "volcanic_text", "objectid_column", "ignore_lithology_codes", + }, + "fault": { + "structtype_column", "fault_text", "dip_null_value", + "dipdir_flag", "dipdir_column", "dip_column", "orientation_type", + "dipestimate_column", "dipestimate_text", "name_column", + "objectid_column", "minimum_fault_length", "ignore_fault_codes", + }, + "fold": { + "structtype_column", "fold_text", "description_column", + "synform_text", "foldname_column","objectid_column", + }, + } + + for section_name, section_dict in config_dict.items(): + # check section + if section_name not in allowed_keys_by_section: + logger.error(f"Unrecognized section '{section_name}' in config dictionary.") + raise ValueError(f"Unrecognized section '{section_name}' in config dictionary.") + + # check keys + allowed_keys = allowed_keys_by_section[section_name] + for key in section_dict.keys(): + if key not in allowed_keys: + logger.error(f"Key '{key}' is not an allowed key in the '{section_name}' section.") + raise ValueError(f"Key '{key}' is not an allowed key in the '{section_name}' section.") + + # 4) check if minimum fault length is a number + mfl = config_dict.get("fault", {}).get("minimum_fault_length", None) + if mfl is not None and not isinstance(mfl, (int, float)): + logger.error("minimum_fault_length must be a number.") + raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}") \ No newline at end of file From ce444892d6c87602ae9abbd82d820e9ba95dfbe0 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 14:10:56 +1100 Subject: [PATCH 19/29] tests: add config test and reorganise --- tests/data_checks/test_config.py | 135 ++++++++++++++++++ .../test_input_data_faults.py | 0 .../test_input_data_geology.py | 0 .../test_input_data_structure.py | 0 4 files changed, 135 insertions(+) create mode 100644 tests/data_checks/test_config.py rename tests/{mapdata => data_checks}/test_input_data_faults.py (100%) rename tests/{mapdata => data_checks}/test_input_data_geology.py (100%) rename tests/{mapdata => data_checks}/test_input_data_structure.py (100%) diff --git a/tests/data_checks/test_config.py b/tests/data_checks/test_config.py new file mode 100644 index 00000000..cc6f01ba --- /dev/null +++ b/tests/data_checks/test_config.py @@ -0,0 +1,135 @@ +import pytest +from map2loop.data_checks import validate_config_dictionary + + +@pytest.fixture +def valid_config(): + return { + "structure": { + "orientation_type": "dip direction", + "dipdir_column": "azimuth", + "dip_column": "inclinatn", + "description_column": "DESCRIPTION", + "bedding_text": "bed", + "overturned_column": "no_col", + "overturned_text": "blah", + "objectid_column": "geographic", + "desciption_column": "sub_type" + }, + "geology": { + "unitname_column": "formatted_", + "alt_unitname_column": "abbreviate", + "group_column": "no_col", + 
"supergroup_column": "interpreta", + "description_column": "text_descr", + "minage_column": "no_col", + "maxage_column": "no_col", + "rocktype_column": "rank", + "alt_rocktype_column": "type", + "sill_text": "sill", + "intrusive_text": "intrusion", + "volcanic_text": "volc", + "objectid_column": "ID", + "ignore_lithology_codes": ["cover"] + }, + "fault": { + "structtype_column": "featuretyp", + "fault_text": "s", + "dip_null_value": "0", + "dipdir_flag": "num", + "dipdir_column": "no_col", + "dip_column": "no_col", + "orientation_type": "dip direction", + "dipestimate_column": "no_col", + "dipestimate_text": "no_col", + "name_column": "no_col", + "objectid_column": "geographic", + "minimum_fault_length": 100.0, + "ignore_fault_codes": [] + }, + "fold": { + "structtype_column": "featuretyp", + "fold_text": "fold", + "description_column": "no_col", + "synform_text": "syn", + "foldname_column": "NAME", + "objectid_column": "geographic" + } + } + + +def test_valid_config_no_errors(valid_config): + # Should not raise any error + validate_config_dictionary(valid_config) + + +def test_missing_required_section(valid_config): + + config_missing_structure = dict(valid_config) + del config_missing_structure["structure"] # remove required section + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_missing_structure) + assert "Missing required section 'structure'" in str(exc_info.value) + + +def test_missing_required_key(valid_config): + + config_missing_dip = dict(valid_config) + + del config_missing_dip["structure"]["dip_column"] # remove required key + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_missing_dip) + assert "Missing required key 'dip_column' for 'structure'" in str(exc_info.value) + + +def test_unrecognized_section(valid_config): + + config_extra_section = dict(valid_config) + config_extra_section["random_section"] = {"random_key": "random_value"} + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_extra_section) + assert "Unrecognized section 'random_section'" in str(exc_info.value) + + +def test_unrecognized_key_in_section(valid_config): + + config_extra_key = dict(valid_config) + config_extra_key["structure"]["random_key"] = "random_value" + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_extra_key) + assert "Key 'random_key' is not an allowed key in the 'structure' section." in str(exc_info.value) + + +def test_legacy_key_detected(valid_config): + + config_with_legacy = dict(valid_config) + config_with_legacy["structure"]["otype"] = "legacy_value" # 'otype' --> legacy key + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_with_legacy) + assert "Legacy key found in config - 'otype'" in str(exc_info.value) + + +def test_minimum_fault_length_wrong_type(valid_config): + + config_wrong_mfl = dict(valid_config) + config_wrong_mfl["fault"]["minimum_fault_length"] = "one_hundred" # invalid type + + with pytest.raises(ValueError) as exc_info: + validate_config_dictionary(config_wrong_mfl) + assert "minimum_fault_length must be a number" in str(exc_info.value) + + +def test_minimum_fault_length_missing(valid_config): + """ + Remove minimum_fault_length entirely. That should be fine (None -> no check). 
+ """ + config_no_mfl = dict(valid_config) + del config_no_mfl["fault"]["minimum_fault_length"] + + # Should not raise any error, as it's optional + validate_config_dictionary(config_no_mfl) + diff --git a/tests/mapdata/test_input_data_faults.py b/tests/data_checks/test_input_data_faults.py similarity index 100% rename from tests/mapdata/test_input_data_faults.py rename to tests/data_checks/test_input_data_faults.py diff --git a/tests/mapdata/test_input_data_geology.py b/tests/data_checks/test_input_data_geology.py similarity index 100% rename from tests/mapdata/test_input_data_geology.py rename to tests/data_checks/test_input_data_geology.py diff --git a/tests/mapdata/test_input_data_structure.py b/tests/data_checks/test_input_data_structure.py similarity index 100% rename from tests/mapdata/test_input_data_structure.py rename to tests/data_checks/test_input_data_structure.py From 0a85eac3fe89600ccdf31e8ef983d50febe66dec Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Wed, 8 Jan 2025 16:47:53 +1100 Subject: [PATCH 20/29] fix: add data checks for fold data and update tests accordingly --- map2loop/data_checks.py | 154 +++++++++++++++++++++- map2loop/mapdata.py | 12 +- tests/data_checks/test_input_data_fold.py | 112 ++++++++++++++++ 3 files changed, 270 insertions(+), 8 deletions(-) create mode 100644 tests/data_checks/test_input_data_fold.py diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 83b9f1c0..0954b08f 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -13,7 +13,7 @@ @beartype.beartype def check_geology_fields_validity(mapdata) -> tuple[bool, str]: - #TODO (AR) - add check for gaps in geology data + #TODO (AR) - add check for gaps in geology data (inspo here: https://medium.com/@achm.firmansyah/an-approach-for-checking-overlaps-and-gaps-in-polygons-using-geopandas-ebd6606e7f70 ) """ Validate the columns in GEOLOGY geodataframe @@ -42,7 +42,16 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: if not geology_data.geometry.is_valid.all(): logger.error("Invalid geometries found. Please fix those before proceeding with map2loop processing") return (True, "Invalid geometries found in datatype GEOLOGY") + + # Check if all geometries are Polygon or MultiPolygon + if not geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon))).all(): + invalid_types = geology_data[~geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon)))] + logger.error( + f"datatype GEOLOGY: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" + ) + return (True, "Invalid geometry types found in datatype GEOLOGY. All geometries must be Polygon or MultiPolygon.") + # # 2. Required Columns & are they str, and then empty or null? required_columns = [config["unitname_column"], config["alt_unitname_column"]] for col in required_columns: @@ -170,6 +179,14 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: logger.error("datatype STRUCTURE: Invalid geometries found. Please fix those before proceeding with map2loop processing") return (True, "Invalid geometries found in datatype STRUCTURE") + # Check if all geometries are Points + if not structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point)).all(): + invalid_types = structure_data[~structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point))] + logger.error( + f"datatype STRUCTURE: Invalid geometry types found. 
Rows with invalid types: {invalid_types.index.tolist()}" + ) + return (True, "Invalid geometry types found in datatype STRUCTURE. All geometries must be Points.") + # 2. Check mandatory numeric columns required_columns = [config["dipdir_column"], config["dip_column"]] for col in required_columns: @@ -443,6 +460,141 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: return (False, "") +@beartype.beartype +def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: + # Check type of loaded fold map + if ( + mapdata.raw_data[Datatype.FOLD] is None + or type(mapdata.raw_data[Datatype.FOLD]) is not geopandas.GeoDataFrame + ): + logger.warning("Fold map is not loaded or valid") + return (True, "Fold map is not loaded or valid") + + folds = mapdata.raw_data[Datatype.FOLD] + config = mapdata.config.fold_config + + # Debugging: Print column names in the fold_data + logger.debug(f"Fold data columns: {folds.columns.tolist()}") + + # Check geometry + if not folds.geometry.is_valid.all(): + logger.error("datatype FOLD: Invalid geometries found. Please fix those before proceeding with map2loop processing") + return (True, "Invalid geometries found in FOLD data.") + + # Check for LineString or MultiLineString geometries + if not folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): + invalid_types = folds[~folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] + logger.error( + f"datatype FOLD: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" + ) + return (True, "Invalid geometry types found in FOLD data.") + + # Check "structtype_column" if it exists + if "structtype_column" in config: + structtype_column = config["structtype_column"] + + # Ensure the column exists in the data + if structtype_column not in folds.columns: + logger.warning( + f"Datatype FOLD: '{structtype_column}' (config key: 'structtype_column') is missing from the fold data. Consider removing that key from the config" + ) + return (True, f"Column '{structtype_column}' is missing from the fold data.") + else: + # Check if all entries in the column are strings + if not folds[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + logger.error( + f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." + ) + return (True, f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") + + # Warn about empty or null cells + if folds[structtype_column].isnull().any() or folds[structtype_column].str.strip().eq("").any(): + logger.warning( + f"Datatype FOLD: Column '{structtype_column}' contains NaN, empty, or blank values. Processing might not work as expected." + ) + + # Check if "fold_text" is defined and contained in the column + fold_text = config.get("fold_text", None) + if fold_text: + + # check if fold text is a string + if not isinstance(fold_text, str): + logger.error("Datatype FOLD: 'fold_text' must be a string. Please ensure it is defined correctly in the config.") + return (True, "Datatype FOLD: 'fold_text' must be a string.") + #check if it exists in the column strtype + if not folds[structtype_column].str.contains(fold_text, na=False).any(): + logger.error(f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'. 
This may impact processing.")
+                return (True, f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'.")
+
+    # check synform_text
+    synform_text = config.get("synform_text", None)
+    if synform_text:
+        # Check if synform_text is a string
+        if not isinstance(synform_text, str):
+            logger.error("Datatype FOLD: 'synform_text' must be a string. Please ensure it is defined correctly in the config.")
+            return (True, "Datatype FOLD: 'synform_text' must be a string.")
+        # Check if it exists in the structtype_column
+        if not folds[structtype_column].str.contains(synform_text, na=False).any():
+            logger.warning(
+                f"Datatype FOLD: The 'synform_text' value '{synform_text}' is not found in column '{structtype_column}'. This may impact processing."
+            )
+
+    # check description column
+    description_column = config.get("description_column", None)
+    if description_column:
+        # Ensure the column exists in the data
+        if description_column not in folds.columns:
+            logger.warning(
+                f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') is missing from the fold data. Consider removing that key from the config."
+            )
+        else:
+            # Check if all entries in the column are strings
+            if not folds[description_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all():
+                logger.error(
+                    f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') contains non-string values. Please ensure all values in this column are strings."
+                )
+                return (True, f"Datatype FOLD: Column '{description_column}' (config key: 'description_column') contains non-string values.")
+
+            # Warn about empty or null cells
+            if folds[description_column].isnull().any() or folds[description_column].str.strip().eq("").any():
+                logger.warning(
+                    f"Datatype FOLD: Column '{description_column}' contains NaN, empty, or blank values. Processing might not work as expected."
+                )
+
+
+    # Check ID column
+    id_column = config.get("objectid_column")
+
+    if id_column:
+        if id_column in folds.columns:
+            # Attempt to coerce the ID column to numeric
+            folds[id_column] = pandas.to_numeric(folds[id_column], errors='coerce')
+
+            # Check if all values are integers or null after coercion
+            if not folds[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all():
+                logger.warning(
+                    f"Datatype FOLD: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs."
+                )
+
+            # Check for NaN values
+            if folds[id_column].isnull().any():
+                logger.warning(
+                    f"Datatype FOLD: ID column '{id_column}' contains NaN or null values (non-numeric entries are coerced to NaN by this check). Rectify this or remove the key from the config to auto-generate IDs."
+                )
+
+            # Check for duplicate values
+            if folds[id_column].duplicated().any():
+                logger.error(
+                    f"Datatype FOLD: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs."
+                )
+                return (True, f"Datatype FOLD: ID column '{id_column}' contains duplicate values.")
+        else:
+            logger.warning(
+                f"Datatype FOLD: ID column '{id_column}' is missing from the fold data. Ensure the column name is correct or remove the key from the config."
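+                # only a warning: the ID can be auto-generated downstream when the
+                # key is removed from the config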
+ ) + + return (False, "") + @beartype.beartype def validate_config_dictionary(config_dict: dict) -> None: diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index 60ee159c..e9fbb302 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -3,7 +3,7 @@ from .config import Config from .aus_state_urls import AustraliaStateUrls from .utils import generate_random_hex_colors, calculate_minimum_fault_length -from .data_checks import check_geology_fields_validity, check_structure_fields_validity, check_fault_fields_validity +from .data_checks import check_geology_fields_validity, check_structure_fields_validity, check_fault_fields_validity, check_fold_fields_validity # external imports import geopandas @@ -719,6 +719,10 @@ def check_map(self, datatype: Datatype): #check and parse fold data elif datatype == Datatype.FOLD: + validity_check, message = check_fold_fields_validity(mapdata = self) + if validity_check: + logger.error(f"Datatype FOLD - data validation failed: {message}") + raise ValueError(f"Datatype FOLD - data validation failed: {message}") func = self.parse_fold_map if func: @@ -1103,12 +1107,6 @@ def parse_fold_map(self) -> tuple: Returns: tuple: A tuple of (bool: success/fail, str: failure message) """ - # Check type of loaded fold map - if ( - self.raw_data[Datatype.FOLD] is None - or type(self.raw_data[Datatype.FOLD]) is not geopandas.GeoDataFrame - ): - return (True, "Fold map is not loaded or valid") # Create new geodataframe folds = geopandas.GeoDataFrame(self.raw_data[Datatype.FOLD]["geometry"]) diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py new file mode 100644 index 00000000..cb92ad2e --- /dev/null +++ b/tests/data_checks/test_input_data_fold.py @@ -0,0 +1,112 @@ +import pytest +import geopandas as gpd +import shapely.geometry +from map2loop.mapdata import MapData +from map2loop.m2l_enums import Datatype +from map2loop.data_checks import check_fold_fields_validity + +@pytest.mark.parametrize( + "fold_data, fold_config, expected_validity, expected_message", + [ + # Valid data + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + False, + "" + ), + # Missing geometry + ( + { + "geometry": [ + shapely.geometry.Point(0, 0), # Invalid type + shapely.geometry.LineString([(0, 0), (1, 1)]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Invalid geometry types found in FOLD data." + ), + # Non-string FEATURE column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": [123, 456], # Invalid type + "ID": [1, 2], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Datatype FOLD: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." 
+ ), + # Missing ID column + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + False, + "" + ), + # Duplicate ID values + ( + { + "geometry": [ + shapely.geometry.LineString([(0, 0), (1, 1)]), + shapely.geometry.MultiLineString([[(0, 0), (1, 1)], [(1, 1), (2, 2)]]) + ], + "FEATURE": ["fold A", "fold B"], + "ID": [1, 1], # Duplicate values + "description": ["desc1", "desc2"] + }, + {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, + True, + "Datatype FOLD: ID column 'ID' contains duplicate values." + ), + ], + ids=[ + "Valid fold data", + "Invalid geometry", + "Non-string FEATURE column", + "Missing ID column", + "Duplicate ID values" + ] +) +def test_check_fold_fields_validity(fold_data, fold_config, expected_validity, expected_message): + # Dynamically create the mock config for this test case + class MockConfig: + def __init__(self, config): + self.fold_config = config + + # Create a GeoDataFrame + fold_gdf = gpd.GeoDataFrame(fold_data, crs="EPSG:4326") + + # Instantiate the MapData class with the dynamic mock config and data + map_data = MapData() + map_data.config = MockConfig(fold_config) + map_data.raw_data = [None] * len(Datatype.__dict__) + map_data.raw_data[Datatype.FOLD] = fold_gdf + + # Test the check_fold_fields_validity function + validity_check, message = check_fold_fields_validity(map_data) + assert validity_check == expected_validity + assert message == expected_message From 273b82d3988a5747350ea0fa19c63a23db913322 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 15:36:48 +1100 Subject: [PATCH 21/29] typos from merging --- map2loop/mapdata.py | 159 +++++--------------------------------------- 1 file changed, 16 insertions(+), 143 deletions(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index de13de7c..ab6376a7 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -730,146 +730,6 @@ def check_map(self, datatype: Datatype): if error: logger.error(message) - @beartype.beartype - def parse_fault_orientations(self) -> tuple: - """ - Parse the fault orientations shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.FAULT_ORIENTATION] is None - or type(self.raw_data[Datatype.FAULT_ORIENTATION]) is not geopandas.GeoDataFrame - ): - logger.warning("Fault orientation shapefile is not loaded or valid") - return (True, "Fault orientation shapefile is not loaded or valid") - - # Create new geodataframe - fault_orientations = geopandas.GeoDataFrame( - self.raw_data[Datatype.FAULT_ORIENTATION]["geometry"] - ) - - config = self.config.fault_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - if config["orientation_type"] == "strike": - fault_orientations["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - fault_orientations["DIPDIR"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dipdir_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dipdir_column 
'{config['dipdir_column']}'" - ) - - if config["dip_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["DIP"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["dip_column"] - ] - else: - print( - f"Fault orientation shapefile does not contain dip_column '{config['dip_column']}'" - ) - - # TODO LG would it be worthwhile adding a description column for faults? - # it would be possible to parse out the fault displacement, type, slip direction - # if this was stored in the descriptions? - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["ID"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["objectid_column"] - ] - else: - fault_orientations["ID"] = numpy.arange(len(fault_orientations)) - self.data[Datatype.FAULT_ORIENTATION] = fault_orientations - - if config["featureid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: - fault_orientations["featureId"] = self.raw_data[Datatype.FAULT_ORIENTATION][ - config["featureid_column"] - ] - else: - fault_orientations["featureId"] = numpy.arange(len(fault_orientations)) - - return (False, "") - - @beartype.beartype - def parse_structure_map(self) -> tuple: - """ - Parse the structure shapefile data into a consistent format - - Returns: - tuple: A tuple of (bool: success/fail, str: failure message) - """ - # Check type and size of loaded structure map - if ( - self.raw_data[Datatype.STRUCTURE] is None - or type(self.raw_data[Datatype.STRUCTURE]) is not geopandas.GeoDataFrame - ): - logger.warning("Structure map is not loaded or valid") - return (True, "Structure map is not loaded or valid") - - if len(self.raw_data[Datatype.STRUCTURE]) < 2: - logger.warning( - "Stucture map does not enough orientations to complete calculations (need at least 2), projection may be inconsistent" - ) - - # Create new geodataframe - structure = geopandas.GeoDataFrame(self.raw_data[Datatype.STRUCTURE]["geometry"]) - config = self.config.structure_config - - # Parse dip direction and dip columns - if config["dipdir_column"] in self.raw_data[Datatype.STRUCTURE]: - if config["orientation_type"] == "strike": - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE].apply( - lambda row: (row[config["dipdir_column"]] + 90.0) % 360.0, axis=1 - ) - else: - structure["DIPDIR"] = self.raw_data[Datatype.STRUCTURE][config["dipdir_column"]] - else: - print(f"Structure map does not contain dipdir_column '{config['dipdir_column']}'") - - # Ensure all DIPDIR values are within [0, 360] - structure["DIPDIR"] = structure["DIPDIR"] % 360.0 - - if config["dip_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["DIP"] = self.raw_data[Datatype.STRUCTURE][config["dip_column"]] - else: - print(f"Structure map does not contain dip_column '{config['dip_column']}'") - - # Add bedding and overturned booleans - if config["overturned_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["OVERTURNED"] = ( - self.raw_data[Datatype.STRUCTURE][config["overturned_column"]] - .astype(str) - .str.contains(config["overturned_text"]) - ) - else: - structure["OVERTURNED"] = False - - if config["description_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["BEDDING"] = ( - self.raw_data[Datatype.STRUCTURE][config["description_column"]] - .astype(str) - .str.contains(config["bedding_text"]) - ) - else: - structure["BEDDING"] = False - - # Add object id - if config["objectid_column"] in self.raw_data[Datatype.STRUCTURE]: - structure["ID"] = 
self.raw_data[Datatype.STRUCTURE][config["objectid_column"]] - else: - structure["ID"] = numpy.arange(len(structure)) - - self.data[Datatype.STRUCTURE] = structure - return (False, "") @beartype.beartype def parse_geology_map(self) -> tuple: @@ -1183,7 +1043,7 @@ def parse_fault_map(self) -> tuple: return (False, "") - @beartype.beartype + @beartype.beartype def parse_fault_orientations(self) -> tuple: """ Parse the fault orientations shapefile data into a consistent format @@ -1192,7 +1052,12 @@ def parse_fault_orientations(self) -> tuple: tuple: A tuple of (bool: success/fail, str: failure message) """ # Check type and size of loaded structure map - + if ( + self.raw_data[Datatype.FAULT_ORIENTATION] is None + or type(self.raw_data[Datatype.FAULT_ORIENTATION]) is not geopandas.GeoDataFrame + ): + logger.warning("Fault orientation shapefile is not loaded or valid") + return (True, "Fault orientation shapefile is not loaded or valid") # Create new geodataframe fault_orientations = geopandas.GeoDataFrame( @@ -1237,9 +1102,17 @@ def parse_fault_orientations(self) -> tuple: else: fault_orientations["ID"] = numpy.arange(len(fault_orientations)) self.data[Datatype.FAULT_ORIENTATION] = fault_orientations + + if config["featureid_column"] in self.raw_data[Datatype.FAULT_ORIENTATION]: + fault_orientations["featureId"] = self.raw_data[Datatype.FAULT_ORIENTATION][ + config["featureid_column"] + ] + else: + fault_orientations["featureId"] = numpy.arange(len(fault_orientations)) + return (False, "") - + @beartype.beartype def parse_fold_map(self) -> tuple: """ From 04b43f1fd2dbcf02aacd0b60225aebaf4cb897d8 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 15:38:07 +1100 Subject: [PATCH 22/29] chore: another typo --- map2loop/mapdata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/map2loop/mapdata.py b/map2loop/mapdata.py index ab6376a7..dd710a99 100644 --- a/map2loop/mapdata.py +++ b/map2loop/mapdata.py @@ -1043,7 +1043,7 @@ def parse_fault_map(self) -> tuple: return (False, "") - @beartype.beartype + @beartype.beartype def parse_fault_orientations(self) -> tuple: """ Parse the fault orientations shapefile data into a consistent format From fc2e393bcc51c9ca72d3ed90c9818e576191dbac Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 15:58:32 +1100 Subject: [PATCH 23/29] fix: remove kwargs from project --- map2loop/project.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/map2loop/project.py b/map2loop/project.py index ca4ecf8d..d752bcb3 100644 --- a/map2loop/project.py +++ b/map2loop/project.py @@ -35,7 +35,7 @@ class Project(object): """ The main entry point into using map2loop - Attiributes + Attributes ----------- verbose_level: m2l_enums.VerboseLevel A selection that defines how much console logging is output @@ -75,7 +75,6 @@ def __init__( save_pre_checked_map_data: bool = False, loop_project_filename: str = "", overwrite_loopprojectfile: bool = False, - **kwargs, ): """ The initialiser for the map2loop project @@ -121,17 +120,6 @@ def __init__( ValueError: use_australian_state_data not in state list ['WA', 'SA', 'QLD', 'NSW', 'TAS', 'VIC', 'ACT', 'NT'] """ - # Throw error if unexpected keyword arguments are passed to project - allowed_kwargs = {"metadata_filename"} - for key in kwargs.keys(): - if key not in allowed_kwargs: - logger.error( - f"Unexpected keyword argument '{key}' passed to Project. Allowed keywords: {', '.join(allowed_kwargs)}." 
- ) - raise TypeError( - f"Project got an unexpected keyword argument '{key}' - please double-check this before proceeding with map2loop processing" - ) - # make sure all the needed arguments are provided if not use_australian_state_data: # this check has to skip if using Loop server data self.validate_required_inputs( @@ -170,11 +158,6 @@ def __init__( self.fold_samples = pandas.DataFrame(columns=["ID", "X", "Y", "Z", "featureId"]) self.geology_samples = pandas.DataFrame(columns=["ID", "X", "Y", "Z", "featureId"]) - - # Check for alternate config filenames in kwargs - if "metadata_filename" in kwargs and config_filename == "": - config_filename = kwargs["metadata_filename"] - # Sanity check on working projection parameter if issubclass(type(working_projection), str) or issubclass(type(working_projection), int): self.map_data.set_working_projection(working_projection) @@ -257,8 +240,6 @@ def __init__( self.stratigraphic_column.populate(self.map_data.get_map_data(Datatype.GEOLOGY)) self.deformation_history.populate(self.map_data.get_map_data(Datatype.FAULT)) - if len(kwargs): - logger.warning(f"Unused keyword arguments: {kwargs}") @beartype.beartype def validate_required_inputs( From e919f941be609c52eb065572d17041f30094d8a7 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Mon, 13 Jan 2025 16:44:13 +1100 Subject: [PATCH 24/29] chore: refactor geometry checks --- map2loop/data_checks.py | 129 ++++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 52 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 0954b08f..68ca13b3 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -3,7 +3,7 @@ #external imports import beartype as beartype -from beartype.typing import Tuple +from beartype.typing import Tuple, List import geopandas import shapely import pandas @@ -38,21 +38,16 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: geology_data = mapdata.raw_data[Datatype.GEOLOGY] config = mapdata.config.geology_config - # 1. Check geometry validity - tested & working - if not geology_data.geometry.is_valid.all(): - logger.error("Invalid geometries found. Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in datatype GEOLOGY") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=geology_data, + expected_geom_types=[shapely.Polygon, shapely.MultiPolygon], + datatype_name="GEOLOGY" + ) + if failed: + return (failed, message) - # Check if all geometries are Polygon or MultiPolygon - if not geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon))).all(): - invalid_types = geology_data[~geology_data.geometry.apply(lambda geom: isinstance(geom, (shapely.Polygon, shapely.MultiPolygon)))] - logger.error( - f"datatype GEOLOGY: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" - ) - return (True, "Invalid geometry types found in datatype GEOLOGY. All geometries must be Polygon or MultiPolygon.") - - - # # 2. Required Columns & are they str, and then empty or null? + # # 3. Required Columns & are they str, and then empty or null? 
required_columns = [config["unitname_column"], config["alt_unitname_column"]] for col in required_columns: if col not in geology_data.columns: @@ -141,6 +136,7 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: logger.info("Geology fields validation passed.") return (False, "") + @beartype.beartype def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: """ @@ -174,18 +170,14 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: structure_data = mapdata.raw_data[Datatype.STRUCTURE] config = mapdata.config.structure_config - # 1. Check geometry validity - if not structure_data.geometry.is_valid.all(): - logger.error("datatype STRUCTURE: Invalid geometries found. Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in datatype STRUCTURE") - - # Check if all geometries are Points - if not structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point)).all(): - invalid_types = structure_data[~structure_data.geometry.apply(lambda geom: isinstance(geom, shapely.Point))] - logger.error( - f"datatype STRUCTURE: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" - ) - return (True, "Invalid geometry types found in datatype STRUCTURE. All geometries must be Points.") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=structure_data, + expected_geom_types=[shapely.Point, shapely.MultiPoint], + datatype_name="STRUCTURE" + ) + if failed: + return (failed, message) # 2. Check mandatory numeric columns required_columns = [config["dipdir_column"], config["dip_column"]] @@ -273,18 +265,14 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: fault_data = mapdata.raw_data[Datatype.FAULT] config = mapdata.config.fault_config - # Check geometry - if not fault_data.geometry.is_valid.all(): - logger.error("datatype FAULT: Invalid geometries found. Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in FAULT data.") - - # Check for LineString or MultiLineString geometries - if not fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): - invalid_types = fault_data[~fault_data.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] - logger.error( - f"FAULT data contains invalid geometry types. Rows with invalid geometry types: {invalid_types.index.tolist()}" - ) - return (True, "FAULT data contains geometries that are not LineString or MultiLineString.") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=fault_data, + expected_geom_types=[shapely.LineString, shapely.MultiLineString], + datatype_name="FAULT" + ) + if failed: + return (failed, message) # Check "structtype_column" if it exists if "structtype_column" in config: @@ -476,18 +464,14 @@ def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: # Debugging: Print column names in the fold_data logger.debug(f"Fold data columns: {folds.columns.tolist()}") - # Check geometry - if not folds.geometry.is_valid.all(): - logger.error("datatype FOLD: Invalid geometries found. 
Please fix those before proceeding with map2loop processing") - return (True, "Invalid geometries found in FOLD data.") - - # Check for LineString or MultiLineString geometries - if not folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString))).all(): - invalid_types = folds[~folds.geometry.apply(lambda geom: isinstance(geom, (shapely.LineString, shapely.MultiLineString)))] - logger.error( - f"datatype FOLD: Invalid geometry types found. Rows with invalid types: {invalid_types.index.tolist()}" - ) - return (True, "Invalid geometry types found in FOLD data.") + # 2. Validate geometry + failed, message = validate_geometry( + geodata=folds, + expected_geom_types=[shapely.LineString, shapely.MultiLineString], + datatype_name="FOLD" + ) + if failed: + return (failed, message) # Check "structtype_column" if it exists if "structtype_column" in config: @@ -683,4 +667,45 @@ def check_keys(d: dict, parent_key=""): mfl = config_dict.get("fault", {}).get("minimum_fault_length", None) if mfl is not None and not isinstance(mfl, (int, float)): logger.error("minimum_fault_length must be a number.") - raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}") \ No newline at end of file + raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}") + + +def validate_geometry( + geodata: geopandas.GeoDataFrame, + expected_geom_types: List[type], + datatype_name: str +) -> Tuple[bool, str]: + """ + Validates the geometry column of a GeoDataFrame. + + Parameters: + geodata (gpd.GeoDataFrame): The GeoDataFrame to validate. + expected_geom_types (List[type]): A list of expected Shapely geometry types. + datatype_name (str): A string representing the datatype being validated (e.g., "GEOLOGY"). + + Returns: + Tuple[bool, str]: A tuple where the first element is a boolean indicating if validation failed, + and the second element is an error message if failed. + """ + # 1. Check if all geometries are valid + if not geodata.geometry.is_valid.all(): + logger.error(f"Invalid geometries found in datatype {datatype_name}. Please fix them before proceeding.") + return True, f"Invalid geometries found in datatype {datatype_name}." + + # 2. Check if all geometries are of the expected types + if not geodata.geometry.apply(lambda geom: isinstance(geom, tuple(expected_geom_types))).all(): + invalid_types = geodata[~geodata.geometry.apply(lambda geom: isinstance(geom, tuple(expected_geom_types)))] + invalid_indices = invalid_types.index.tolist() + expected_types_names = ', '.join([geom_type.__name__ for geom_type in expected_geom_types]) + logger.error( + f"Datatype {datatype_name}: Invalid geometry types found. Expected types: {expected_types_names}. " + f"Rows with invalid types: {invalid_indices}" + ) + return True, ( + f"Invalid geometry types found in datatype {datatype_name}. " + f"All geometries must be {expected_types_names}." 
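The is_valid gate in validate_geometry is what the updated fold test later in this series relies on: a zero-length LineString is an invalid geometry in shapely/GEOS, as is a self-intersecting polygon. A quick standalone illustration (outputs reflect shapely 2.x behaviour):

    import shapely

    print(shapely.LineString([(0, 0), (0, 0)]).is_valid)                # False - zero-length line
    print(shapely.Polygon([(0, 0), (1, 1), (1, 0), (0, 1)]).is_valid)   # False - self-intersecting "bowtie"
    print(shapely.LineString([(0, 0), (1, 1)]).is_valid)                # True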
+ )
+
+    # If all checks pass
+    logger.debug(f"Geometry validation passed for datatype {datatype_name}.")
+    return False, ""
\ No newline at end of file

From 000226238e6b1f756c6d89a336450e27f15ccbb6 Mon Sep 17 00:00:00 2001
From: AngRodrigues
Date: Mon, 13 Jan 2025 17:19:34 +1100
Subject: [PATCH 25/29] chore: update tests for geometry refactor

---
 map2loop/data_checks.py                     | 6 +++---
 tests/data_checks/test_input_data_faults.py | 2 +-
 tests/data_checks/test_input_data_fold.py   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py
index 68ca13b3..73aee9e9 100644
--- a/map2loop/data_checks.py
+++ b/map2loop/data_checks.py
@@ -669,7 +669,7 @@ def check_keys(d: dict, parent_key=""):
         logger.error("minimum_fault_length must be a number.")
         raise ValueError(f"minimum_fault_length must be a number, instead got: {type(mfl)}")

-
+@beartype.beartype
 def validate_geometry(
@@ -690,7 +690,7 @@ def validate_geometry(
     # 1. Check if all geometries are valid
     if not geodata.geometry.is_valid.all():
         logger.error(f"Invalid geometries found in datatype {datatype_name}. Please fix them before proceeding.")
-        return True, f"Invalid geometries found in datatype {datatype_name}."
+        return True, f"Invalid geometries found in datatype {datatype_name}"
@@ -707,5 +707,5 @@ def validate_geometry(
     # If all checks pass
-    logger.debug(f"Geometry validation passed for datatype {datatype_name}.")
+    logger.debug(f"Geometry validation passed for datatype {datatype_name}")
     return False, ""
\ No newline at end of file
diff --git a/tests/data_checks/test_input_data_faults.py b/tests/data_checks/test_input_data_faults.py
index cf668889..2f70176a 100644
--- a/tests/data_checks/test_input_data_faults.py
+++ b/tests/data_checks/test_input_data_faults.py
@@ -34,7 +34,7 @@
        },
        {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"},
        True,
-       "Invalid geometries found in FAULT data."
+       "Invalid geometries found in datatype FAULT"
    ),

    # Non-string FEATURE column
diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py
index cb92ad2e..75051e56 100644
--- a/tests/data_checks/test_input_data_fold.py
+++ b/tests/data_checks/test_input_data_fold.py
@@ -27,7 +27,7 @@
    (
        {
            "geometry": [
-               shapely.geometry.Point(0, 0),  # Invalid type
+               shapely.geometry.LineString([(0,0), (0,0)]),  # Invalid (zero-length) geometry
                shapely.geometry.LineString([(0, 0), (1, 1)])
            ],
            "FEATURE": ["fold A", "fold B"],
            "ID": [1, 2],
            "description": ["desc1", "desc2"]
        },
        {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"},
        True,
-       "Invalid geometry types found in FOLD data."
+ "Invalid geometries found in datatype FOLD" ), # Non-string FEATURE column ( From b066a2218b8d57dd46e15e6d07438c95da145012 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 11:41:52 +1100 Subject: [PATCH 26/29] chore: refactor id checks --- map2loop/data_checks.py | 203 +++++++++--------- tests/data_checks/test_input_data_fold.py | 2 +- tests/data_checks/test_input_data_geology.py | 4 +- .../data_checks/test_input_data_structure.py | 2 +- 4 files changed, 102 insertions(+), 109 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 73aee9e9..4d769f1d 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -85,31 +85,16 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: "Map2loop processing might not work as expected." ) - # # 4. Check for duplicates in ID - if "objectid_column" in config and config["objectid_column"] in geology_data.columns: - objectid_values = geology_data[config["objectid_column"]] + # # 4. check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=geology_data, + config=config, + id_config_key="objectid_column", + geodata_name="GEOLOGY") - # Check for None, NaN, or other null-like values - if objectid_values.isnull().any(): - logger.error( - f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values. Ensure all values are valid and non-null." - ) - return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains NaN or null values.") - - # Check for duplicate values - if objectid_values.duplicated().any(): - logger.error( - f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column'). Please make sure that the column contains unique values." - ) - return (True, f"Datatype GEOLOGY: Duplicate values found in column '{config['objectid_column']}' (config key: 'objectid_column').") - - # Check for uniqueness - if not objectid_values.is_unique: - logger.error( - f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values. Ensure all values are unique." - ) - return (True, f"Datatype GEOLOGY: Column '{config['objectid_column']}' (config key: 'objectid_column') contains non-unique values.") - + if id_validation_failed: + return (id_validation_failed, id_message) # 5. Check for NaNs/blanks in optional fields with warnings warning_fields = [ @@ -224,31 +209,17 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: "Map2loop processing might not work as expected." ) - # check ID column for type, null values, and duplicates - optional_numeric_column_key = "objectid_column" - optional_numeric_column = config.get(optional_numeric_column_key) - - if optional_numeric_column: - if optional_numeric_column in structure_data.columns: - # Check for non-integer values - if not structure_data[optional_numeric_column].apply(lambda x: isinstance(x, int) or pandas.isnull(x)).all(): - logger.error( - f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values. Rectify this, or remove this column from the config - map2loop will generate a new ID." 
- ) - return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains non-integer values.") - # Check for NaN - if structure_data[optional_numeric_column].isnull().any(): - logger.error( - f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values. Rectify this, or remove this column from the config - map2loop will generate a new ID." - ) - return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains NaN values.") - # Check for duplicates - if structure_data[optional_numeric_column].duplicated().any(): - logger.error( - f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values. Rectify this, or remove this column from the config - map2loop will generate a new ID." - ) - return (True, f"Datatype STRUCTURE: ID column '{optional_numeric_column}' (config key: '{optional_numeric_column_key}') contains duplicate values.") - + # check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=structure_data, + config=config, + id_config_key="objectid_column", + geodata_name="STRUCTURE") + + if id_validation_failed: + return (id_validation_failed, id_message) + return (False, "") @beartype.beartype @@ -419,32 +390,17 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: ) return (True, f"Datatype FAULT: Column '{dip_estimate_column}' is missing from the fault data.") - # Check ID column - id_column = config.get("objectid_column") - if id_column: - if id_column in fault_data.columns: - # Check for non-integer values - # Attempt to coerce the ID column to integers because WA data says so (ARodrigues) - fault_data[id_column] = pandas.to_numeric(fault_data[id_column], errors='coerce') - - # Check if all values are integers or null after coercion - if not fault_data[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all(): - logger.warning( - f"Datatype FAULT: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for NaN values - if fault_data[id_column].isnull().any(): - logger.warning( - f"Datatype FAULT: ID column '{id_column}' contains NaN or null values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for duplicates - if fault_data[id_column].duplicated().any(): - logger.error( - f"Datatype FAULT: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs." - ) + # # 4. 
check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=fault_data, + config=config, + id_config_key="objectid_column", + geodata_name="FAULT") + + if id_validation_failed: + return (id_validation_failed, id_message) return (False, "") @@ -546,36 +502,16 @@ def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: ) - # Check ID column - id_column = config.get("objectid_column") - - if id_column: - if id_column in folds.columns: - # Attempt to coerce the ID column to numeric - folds[id_column] = pandas.to_numeric(folds[id_column], errors='coerce') - - # Check if all values are integers or null after coercion - if not folds[id_column].apply(lambda x: pandas.isnull(x) or isinstance(x, int)).all(): - logger.warning( - f"Datatype FOLD: ID column '{id_column}' must contain only integer values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for NaN values - if folds[id_column].isnull().any(): - logger.warning( - f"Datatype FOLD: ID column '{id_column}' contains NaN or null or str values. Rectify this or remove the key from the config to auto-generate IDs." - ) - - # Check for duplicate values - if folds[id_column].duplicated().any(): - logger.error( - f"Datatype FOLD: ID column '{id_column}' contains duplicate values. Rectify this or remove the key from the config to auto-generate IDs." - ) - return (True, f"Datatype FOLD: ID column '{id_column}' contains duplicate values.") - else: - logger.warning( - f"Datatype FOLD: ID column '{id_column}' is missing from the fold data. Ensure the column name is correct or remove the key from the config." - ) + # # 4. check ID column + if "objectid_column" in config: + id_validation_failed, id_message = validate_id_column( + geodata=folds, + config=config, + id_config_key="objectid_column", + geodata_name="FOLD") + + if id_validation_failed: + return (id_validation_failed, id_message) return (False, "") @@ -708,4 +644,61 @@ def validate_geometry( # If all checks pass logger.debug(f"Geometry validation passed for datatype {datatype_name}") - return False, "" \ No newline at end of file + return False, "" + + +@beartype.beartype +def validate_id_column( + geodata: geopandas.GeoDataFrame, + config: dict, + id_config_key: str, + geodata_name: str +) -> Tuple[bool, str]: + + # Retrieve the ID column name from the configuration + id_column = config.get(id_config_key) + + if not id_column: + error_msg = f"Configuration key '{id_config_key}' is missing." + logger.error(error_msg) + return (True, error_msg) + + if id_column in geodata.columns: + geodata[id_column] = pandas.to_numeric(geodata[id_column], errors='coerce') + + # Check for non-numeric values (which are now NaN after coercion) + if geodata[id_column].isnull().any(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains non-numeric or NaN values. " + "Please rectify the values, or remove this key from the config dictionary to let map2loop assign IDs." + ) + logger.error(error_msg) + return (True, error_msg) + + if not (geodata[id_column] == geodata[id_column].astype(int)).all(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains non-integer values." + ) + logger.error(error_msg) + return (True, error_msg) + + if geodata[id_column].duplicated().any(): + error_msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') contains duplicate values." 
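The whole-number test above compares the coerced series with its int-cast copy; equality everywhere means the IDs are integral even when stored as floats. For example:

    import pandas

    ids = pandas.Series([1.0, 2.0, 3.5])
    print((ids == ids.astype(int)).all())  # False - 3.5 is not integral

    ids_ok = pandas.Series([1.0, 2.0, 3.0])
    print((ids_ok == ids_ok.astype(int)).all())  # True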
+ ) + logger.error(error_msg) + return (True, error_msg) + + + elif id_column not in geodata.columns: + msg = ( + f"Datatype {geodata_name}: Column '{id_column}' " + f"(config key: '{id_config_key}') is missing from the data. " + "Map2loop will automatically generate IDs." + ) + logger.warning(msg) + + return (False, "") \ No newline at end of file diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py index 75051e56..740039b8 100644 --- a/tests/data_checks/test_input_data_fold.py +++ b/tests/data_checks/test_input_data_fold.py @@ -80,7 +80,7 @@ }, {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, True, - "Datatype FOLD: ID column 'ID' contains duplicate values." + "Datatype FOLD: Column 'ID' (config key: 'objectid_column') contains duplicate values." ), ], ids=[ diff --git a/tests/data_checks/test_input_data_geology.py b/tests/data_checks/test_input_data_geology.py index 4b083e76..517e6fc8 100644 --- a/tests/data_checks/test_input_data_geology.py +++ b/tests/data_checks/test_input_data_geology.py @@ -137,7 +137,7 @@ def __init__(self): "ID": [1, 1], # Duplicate ID }, True, - "Datatype GEOLOGY: Duplicate values found in column 'ID' (config key: 'objectid_column').", + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains duplicate values.", ), # nan in id ( @@ -158,7 +158,7 @@ def __init__(self): "ID": [1, None], }, True, - "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains NaN or null values.", + "Datatype GEOLOGY: Column 'ID' (config key: 'objectid_column') contains non-numeric or NaN values. Please rectify the values, or remove this key from the config dictionary to let map2loop assign IDs.", ), # nan in unit name ( diff --git a/tests/data_checks/test_input_data_structure.py b/tests/data_checks/test_input_data_structure.py index d96441fc..ceabdcaf 100644 --- a/tests/data_checks/test_input_data_structure.py +++ b/tests/data_checks/test_input_data_structure.py @@ -116,7 +116,7 @@ def __init__(self): "ID": [1, 1] # Duplicate ID }, True, - "Datatype STRUCTURE: ID column 'ID' (config key: 'objectid_column') contains duplicate values.", + "Datatype STRUCTURE: Column 'ID' (config key: 'objectid_column') contains duplicate values.", ), ], ) From 9001a08729ee3b584cc985bacc9362a2ca9b9b42 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 14:20:46 +1100 Subject: [PATCH 27/29] chore: refactor mandatory fields for str and geo --- map2loop/data_checks.py | 134 ++++++++++++++---- tests/data_checks/test_input_data_geology.py | 8 +- .../data_checks/test_input_data_structure.py | 6 +- 3 files changed, 113 insertions(+), 35 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index 4d769f1d..b71ef6e8 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -47,20 +47,22 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: if failed: return (failed, message) - # # 3. Required Columns & are they str, and then empty or null? 
- required_columns = [config["unitname_column"], config["alt_unitname_column"]] - for col in required_columns: - if col not in geology_data.columns: - logger.error(f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") - return (True, f"Datatype GEOLOGY: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from geology data.") - if not geology_data[col].apply(lambda x: isinstance(x, str)).all(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") - return (True, f"Datatype GEOLOGY: Column '{config_key}' must contain only string values. Please check that the column contains only string values.") - if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") - return (True, f"Datatype GEOLOGY: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") + + # check required columns in geology + required_columns = ["unitname_column", "alt_unitname_column"] + + validation_failed, message = validate_required_columns( + geodata=geology_data, + config=config, + required_columns=required_columns, + expected_type=str, + check_blank=True, + datatype_name="GEOLOGY" + ) + if validation_failed: + return (validation_failed, message) + + # # 3. Optional Columns optional_string_columns = [ @@ -164,20 +166,19 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: if failed: return (failed, message) - # 2. Check mandatory numeric columns - required_columns = [config["dipdir_column"], config["dip_column"]] - for col in required_columns: - if col not in structure_data.columns: - logger.error(f"DDatatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") - return (True, f"Datatype STRUCTURE: Required column with config key: '{[k for k, v in config.items() if v == col][0]}' is missing from structure data.") - if not structure_data[col].apply(lambda x: isinstance(x, (int, float))).all(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") - return (True, f"Datatype STRUCTURE: Column '{config_key}' must contain only numeric values. Please check that the column contains only numeric values.") - if structure_data[col].isnull().any(): - config_key = [k for k, v in config.items() if v == col][0] - logger.error(f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. Please double check the column for blank values.") - return (True, f"Datatype STRUCTURE: NaN or blank values found in required column '{config_key}'. 
Please double check the column for blank values.") + + # check required columns in structure (numeric dips & dip dir) + required_columns = ["dipdir_column", "dip_column"] + validation_failed, message = validate_required_columns( + geodata=structure_data, + config=config, + required_columns=required_columns, + expected_type=(int, float), + check_blank=False, + datatype_name="STRUCTURE" + ) + if validation_failed: + return (validation_failed, message) if config["dip_column"] in structure_data.columns: invalid_dip = ~((structure_data[config["dip_column"]] >= 0) & (structure_data[config["dip_column"]] <= 90)) @@ -701,4 +702,81 @@ def validate_id_column( ) logger.warning(msg) + return (False, "") + +from beartype.typing import List, Type, Tuple, Union + +def validate_required_columns( + geodata: geopandas.GeoDataFrame, + config: dict, + required_columns: List[str], + expected_type: Union[Type, Tuple[Type, ...]], + check_blank: bool = False, + datatype_name: str = "UNKNOWN" +) -> Tuple[bool, str]: + """ + Validate required columns in a GeoDataFrame. + + This function checks whether required columns exist, have the expected data types, + and contain no null or (optionally) blank values. + + Args: + geodata (geopandas.GeoDataFrame): The GeoDataFrame to validate. + config (dict): Configuration dictionary mapping config keys to column names. + required_columns (List[str]): List of config keys for required columns. + expected_type (Type or Tuple[Type, ...]): Expected data type(s) for the columns. + check_blank (bool, optional): Whether to check for blank (empty) strings. Defaults to False. + datatype_name (str, optional): Name of the datatype being validated (for logging). Defaults to "UNKNOWN". + + Returns: + Tuple[bool, str]: (True, error_message) if validation fails, else (False, ""). + """ + for config_key in required_columns: + column_name = config.get(config_key) + + if not column_name: + error_msg = ( + f"Configuration key '{config_key}' is missing for datatype '{datatype_name}'." + ) + logger.error(error_msg) + return (True, error_msg) + + if column_name not in geodata.columns: + error_msg = ( + f"Datatype {datatype_name.upper()}: Required column with config key '{config_key}' " + f"(column: '{column_name}') is missing from the data." + ) + logger.error(error_msg) + return (True, error_msg) + + # Check data type + if not geodata[column_name].apply(lambda x: isinstance(x, expected_type)).all(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"must contain only {expected_type if isinstance(expected_type, type) else 'numeric'} values." + ) + logger.error(error_msg) + return (True, error_msg) + + # Check for null values + if geodata[column_name].isnull().any(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"contains null values. Please ensure all values are present." + ) + logger.error(error_msg) + return (True, error_msg) + + # Optionally check for blank strings + if check_blank and issubclass(expected_type, str): + if geodata[column_name].str.strip().eq("").any(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{config_key}' (column: '{column_name}') " + f"contains blank (empty) values. Please ensure all values are populated." 
+ ) + logger.error(error_msg) + return (True, error_msg) + + # If all required columns pass validation + logger.info(f"Datatype {datatype_name.upper()}: All required columns validated successfully.") return (False, "") \ No newline at end of file diff --git a/tests/data_checks/test_input_data_geology.py b/tests/data_checks/test_input_data_geology.py index 517e6fc8..4ae58ec2 100644 --- a/tests/data_checks/test_input_data_geology.py +++ b/tests/data_checks/test_input_data_geology.py @@ -80,7 +80,7 @@ def __init__(self): "ID": [1], }, True, - "Datatype GEOLOGY: Required column with config key: 'alt_unitname_column' is missing from geology data.", + "Datatype GEOLOGY: Required column with config key 'alt_unitname_column' (column: 'CODE') is missing from the data.", ), # Non-string value in required column ( @@ -98,7 +98,7 @@ def __init__(self): "ID": [1], }, True, - "Datatype GEOLOGY: Column 'alt_unitname_column' must contain only string values. Please check that the column contains only string values.", + "Datatype GEOLOGY: Column 'alt_unitname_column' (column: 'CODE') must contain only values.", ), # NaN or blank value in required column ( @@ -116,7 +116,7 @@ def __init__(self): "ID": [1], }, True, - "Datatype GEOLOGY: NaN or blank values found in required column 'unitname_column'. Please double check the column for blank values.", + "Datatype GEOLOGY: Column 'unitname_column' (column: 'UNITNAME') contains blank (empty) values. Please ensure all values are populated.", ), # Duplicate ID values ( @@ -179,7 +179,7 @@ def __init__(self): "ID": [1, 1], # Duplicate ID }, True, - "Datatype GEOLOGY: Column 'unitname_column' must contain only string values. Please check that the column contains only string values.", + "Datatype GEOLOGY: Column 'unitname_column' (column: 'UNITNAME') must contain only values.", ), ], ) diff --git a/tests/data_checks/test_input_data_structure.py b/tests/data_checks/test_input_data_structure.py index ceabdcaf..60db846c 100644 --- a/tests/data_checks/test_input_data_structure.py +++ b/tests/data_checks/test_input_data_structure.py @@ -68,7 +68,7 @@ def __init__(self): "ID": [1, 2] }, True, - "Datatype STRUCTURE: Required column with config key: 'dipdir_column' is missing from structure data.", + "Datatype STRUCTURE: Required column with config key 'dipdir_column' (column: 'DIPDIR') is missing from the data.", ), # Non-numeric value in numeric column ( @@ -84,7 +84,7 @@ def __init__(self): "ID": [1, 2] }, True, - "Datatype STRUCTURE: Column 'dipdir_column' must contain only numeric values. Please check that the column contains only numeric values.", + "Datatype STRUCTURE: Column 'dipdir_column' (column: 'DIPDIR') must contain only numeric values.", ), # NaN or blank value in required column ( @@ -100,7 +100,7 @@ def __init__(self): "ID": [1, 2] }, True, - "Datatype STRUCTURE: NaN or blank values found in required column 'dipdir_column'. Please double check the column for blank values.", + "Datatype STRUCTURE: Column 'dipdir_column' (column: 'DIPDIR') contains null values. 
Please ensure all values are present.", ), # Duplicate ID column ( From 652281cbccdd5fc47cec3ebf9ff00a72883deb44 Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 15:16:52 +1100 Subject: [PATCH 28/29] finalise refactor --- map2loop/data_checks.py | 580 ++++++++++++-------- tests/data_checks/test_input_data_faults.py | 2 +- tests/data_checks/test_input_data_fold.py | 2 +- 3 files changed, 342 insertions(+), 242 deletions(-) diff --git a/map2loop/data_checks.py b/map2loop/data_checks.py index b71ef6e8..b36349ca 100644 --- a/map2loop/data_checks.py +++ b/map2loop/data_checks.py @@ -3,7 +3,7 @@ #external imports import beartype as beartype -from beartype.typing import Tuple, List +from beartype.typing import Tuple, Optional, List, Dict, Type, Union import geopandas import shapely import pandas @@ -51,7 +51,7 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: # check required columns in geology required_columns = ["unitname_column", "alt_unitname_column"] - validation_failed, message = validate_required_columns( + failed, message = validate_required_columns( geodata=geology_data, config=config, required_columns=required_columns, @@ -59,66 +59,52 @@ def check_geology_fields_validity(mapdata) -> tuple[bool, str]: check_blank=True, datatype_name="GEOLOGY" ) - if validation_failed: - return (validation_failed, message) - + if failed: + return (failed, message) - - # # 3. Optional Columns + # check optional columns optional_string_columns = [ "group_column", "supergroup_column", "description_column", "rocktype_column", "alt_rocktype_column", ] - for key in optional_string_columns: - if key in config and config[key] in geology_data.columns: - if not geology_data[config[key]].apply(lambda x: isinstance(x, str)).all(): - logger.warning( - f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-string values. " - "Map2loop processing might not work as expected." - ) - - optional_numeric_columns = ["minage_column", "maxage_column", "objectid_column"] - for key in optional_numeric_columns: - if key in config and config[key] in geology_data.columns: - if not geology_data[config[key]].apply(lambda x: isinstance(x, (int, float))).all(): - logger.warning( - f"Datatype GEOLOGY: Optional column '{config[key]}' (config key: '{key}') contains non-numeric values. " - "Map2loop processing might not work as expected." - ) + string_warnings = validate_optional_columns( + geodata=geology_data, + config=config, + optional_columns=optional_string_columns, + expected_type=str, + check_blank=True, + datatype_name="GEOLOGY" + ) + ### only emit warnings for optional columns + for warning in string_warnings: + logger.warning(warning) + + # 5. Validate Optional Numeric Columns + optional_numeric_columns = ["minage_column", "maxage_column"] + numeric_warnings = validate_optional_columns( + geodata=geology_data, + config=config, + optional_columns=optional_numeric_columns, + expected_type=(int, float), + check_blank=False, + datatype_name="GEOLOGY" + ) + + ### only emit warnings for optional columns + for warning in numeric_warnings: + logger.warning(warning) # # 4. check ID column if "objectid_column" in config: - id_validation_failed, id_message = validate_id_column( + failed, message = validate_id_column( geodata=geology_data, config=config, id_config_key="objectid_column", geodata_name="GEOLOGY") - if id_validation_failed: - return (id_validation_failed, id_message) - - # 5. 
Check for NaNs/blanks in optional fields with warnings - warning_fields = [ - "group_column", "supergroup_column", "description_column", - "rocktype_column", "minage_column", "maxage_column", - ] - for key in warning_fields: - col = config.get(key) - if col and col in geology_data.columns: - # Check if column contains string values before applying `.str` - if pandas.api.types.is_string_dtype(geology_data[col]): - if geology_data[col].isnull().any() or geology_data[col].str.strip().eq("").any(): - logger.warning( - f"Datatype GEOLOGY: NaN or blank values found in optional column '{col}' (config key: '{key}')." - ) - else: - # Non-string columns, check only for NaN values - if geology_data[col].isnull().any(): - logger.warning( - f"Datatype GEOLOGY: NaN values found in optional column '{col}' (config key: '{key}')." - ) - + if failed: + return (failed, message) logger.info("Geology fields validation passed.") return (False, "") @@ -169,7 +155,7 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: # check required columns in structure (numeric dips & dip dir) required_columns = ["dipdir_column", "dip_column"] - validation_failed, message = validate_required_columns( + failed, message = validate_required_columns( geodata=structure_data, config=config, required_columns=required_columns, @@ -177,49 +163,46 @@ def check_structure_fields_validity(mapdata) -> Tuple[bool, str]: check_blank=False, datatype_name="STRUCTURE" ) - if validation_failed: - return (validation_failed, message) - - if config["dip_column"] in structure_data.columns: - invalid_dip = ~((structure_data[config["dip_column"]] >= 0) & (structure_data[config["dip_column"]] <= 90)) - if invalid_dip.any(): - logger.warning( - f"Datatype STRUCTURE: Column '{config['dip_column']}' has values that are not between 0 and 90 degrees. Is this intentional?" - ) + if failed: + return (failed, message) - if config["dipdir_column"] in structure_data.columns: - invalid_dipdir = ~((structure_data[config["dipdir_column"]] >= 0) & (structure_data[config["dipdir_column"]] <= 360)) - if invalid_dipdir.any(): - logger.warning( - f"Datatype STRUCTURE: Column '{config['dipdir_column']}' has values that are not between 0 and 360 degrees. Is this intentional?" - ) + # 4. Validate Dip and Dip Direction value ranges + dip_columns = ["dip_column", "dipdir_column"] + dip_validation_failed, dip_message = validate_dip_columns( + geodata=structure_data, + config=config, + dip_columns=dip_columns, + datatype_name="STRUCTURE", + allow_nulls=False # Dip and dipdir cannot have nulls in structure data + ) + if dip_validation_failed: + logger.warning(dip_message) - # check validity of optional string columns + # check optional columns optional_string_columns = ["description_column", "overturned_column"] - for key in optional_string_columns: - if key in config and config[key] in structure_data.columns: - column_name = config[key] - if not structure_data[column_name].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): - logger.warning( - f"Datatype STRUCTURE: Optional column with config key: '{key}' contains non-string values. " - "Map2loop processing might not work as expected." - ) - if structure_data[column_name].isnull().any() or structure_data[column_name].str.strip().eq("").any(): - logger.warning( - f"Datatype STRUCTURE: Optional column config key: '{key}' contains NaN, empty, or null values. " - "Map2loop processing might not work as expected." 
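Unlike the required-column checks, the optional-column path only collects warnings and never fails the load. A self-contained sketch of that warn-but-continue pattern (the function and column names here are illustrative, not the map2loop implementation):

    import logging
    import pandas

    logger = logging.getLogger("map2loop_sketch")

    def collect_optional_warnings(df: pandas.DataFrame, columns: list) -> list:
        # Gather messages instead of raising; the caller decides what to log.
        warnings = []
        for col in columns:
            if col in df.columns and df[col].isnull().any():
                warnings.append(f"Optional column '{col}' contains null values.")
        return warnings

    df = pandas.DataFrame({"description": ["granite", None]})
    for message in collect_optional_warnings(df, ["description", "rocktype"]):
        logger.warning(message)  # only 'description' warns; 'rocktype' is simply absent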
- ) + string_warnings = validate_optional_columns( + geodata=structure_data, + config=config, + optional_columns=optional_string_columns, + expected_type=str, + check_blank=True, + datatype_name="STRUCTURE" + ) + + ## only emit warnings for optional columns + for warning in string_warnings: + logger.warning(warning) # check ID column if "objectid_column" in config: - id_validation_failed, id_message = validate_id_column( + failed, id_message = validate_id_column( geodata=structure_data, config=config, id_config_key="objectid_column", geodata_name="STRUCTURE") - if id_validation_failed: - return (id_validation_failed, id_message) + if failed: + return (failed, id_message) return (False, "") @@ -246,43 +229,21 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: if failed: return (failed, message) - # Check "structtype_column" if it exists - if "structtype_column" in config: - structtype_column = config["structtype_column"] - - # Ensure the column exists in the data - if structtype_column not in fault_data.columns: - logger.warning( - f"Datatype FAULT: '{structtype_column}' (config key: 'structtype_column') is missing from the fault data. Consider removing that key from the config" - ) - else: - # Check if all entries in the column are strings - if not fault_data[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): - logger.error( - f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." - ) - return (True, f"Datatype FAULT: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") - - # Warn about empty or null cells - if fault_data[structtype_column].isnull().any() or fault_data[structtype_column].str.strip().eq("").any(): - logger.warning( - f"Datatype FAULT: Column '{structtype_column}' contains NaN, empty, or blank values. Processing might not work as expected." - ) - - # Check if "fault_text" is defined and contained in the column - fault_text = config.get("fault_text", None) - - # Check if the structtype_column exists in the fault_data - if structtype_column not in fault_data.columns: - logger.warning( - f"Datatype FAULT: The column '{structtype_column}' is not present in the fault data." - ) - - else: - if not fault_data[structtype_column].str.contains(fault_text).any(): - logger.error( - f"Datatype FAULT: The 'fault_text' value '{fault_text}' is not found in column '{structtype_column}'. Project might end up with no faults" - ) + # # Check "structtype_column" if it exists + text_keys = { + "fault_text": "fault_text" + } + structtype_validation_failed, structtype_message = validate_structtype_column( + geodata=fault_data, + config=config, + datatype_name="FAULT", + required=True, # Assuming structtype_column is required in FAULT + text_keys=text_keys + ) + if structtype_validation_failed: + return (structtype_validation_failed, structtype_message) + + #checks on name column name_column = config.get("name_column") @@ -312,50 +273,17 @@ def check_fault_fields_validity(mapdata) -> Tuple[bool, str]: f"Datatype FAULT: Column '{name_column}' contains duplicate values. This may affect processing." 
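The dip screening that follows uses pandas' Series.between; inclusive='both' keeps the boundary values 0 and 90 legal. A standalone example:

    import pandas

    dips = pandas.Series([10.0, 45.0, 95.0])
    out_of_range = ~dips.between(0, 90, inclusive="both")
    print(out_of_range.any())           # True
    print(dips[out_of_range].tolist())  # [95.0]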
) - # dips & strikes - # Check for dips and dip directions - strike_dips_columns = ["dip_column", "dipdir_column"] - - for key in strike_dips_columns: - column_name = config.get(key) - if column_name: # Only proceed if the config has this key - if column_name in fault_data.columns: - - #coerce to numeric - fault_data[column_name] = pandas.to_numeric(fault_data[column_name], errors='coerce') - - # Check if the column contains only numeric values - if not fault_data[column_name].apply(lambda x: isinstance(x, (int, float)) or pandas.isnull(x)).all(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) must contain only numeric values. Please ensure the column is numeric." - ) - - # Check for NaN or empty values - if fault_data[column_name].isnull().any(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) contains NaN or empty values. This may affect processing." - ) - - # Check range constraints - if key == "dip_column": - # Dips must be between 0 and 90 - invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 90)) - if invalid_values.any(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 90]. Was this intentional?" - ) - elif key == "dipdir_column": - # Dip directions must be between 0 and 360 - invalid_values = ~((fault_data[column_name] >= 0) & (fault_data[column_name] <= 360)) - if invalid_values.any(): - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) contains values outside the range [0, 360]. Was this intentional?" - ) - else: - logger.warning( - f"Datatype FAULT: Column '{column_name}' (config key {key}) is missing from the fault data. Please ensure the column name is correct, or otherwise remove that key from the config." - ) - + # # dips & strikes + dip_columns = ["dip_column", "dipdir_column"] + dip_validation_failed, dip_message = validate_dip_columns( + geodata=fault_data, + config=config, + dip_columns=dip_columns, + datatype_name="FAULT", + allow_nulls=True # Dip fields can be empty + ) + if dip_validation_failed: + logger.warning(dip_message) # dip estimates dip_estimate_column = config.get("dip_estimate_column") @@ -430,55 +358,20 @@ def check_fold_fields_validity(mapdata) -> Tuple[bool, str]: if failed: return (failed, message) - # Check "structtype_column" if it exists - if "structtype_column" in config: - structtype_column = config["structtype_column"] - - # Ensure the column exists in the data - if structtype_column not in folds.columns: - logger.warning( - f"Datatype FOLD: '{structtype_column}' (config key: 'structtype_column') is missing from the fold data. Consider removing that key from the config" - ) - return (True, f"Column '{structtype_column}' is missing from the fold data.") - else: - # Check if all entries in the column are strings - if not folds[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): - logger.error( - f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." - ) - return (True, f"Datatype FOLD: Column '{structtype_column}' (config key: 'structtype_column') contains non-string values.") - - # Warn about empty or null cells - if folds[structtype_column].isnull().any() or folds[structtype_column].str.strip().eq("").any(): - logger.warning( - f"Datatype FOLD: Column '{structtype_column}' contains NaN, empty, or blank values. 
Processing might not work as expected." - ) - - # Check if "fold_text" is defined and contained in the column - fold_text = config.get("fold_text", None) - if fold_text: - - # check if fold text is a string - if not isinstance(fold_text, str): - logger.error("Datatype FOLD: 'fold_text' must be a string. Please ensure it is defined correctly in the config.") - return (True, "Datatype FOLD: 'fold_text' must be a string.") - #check if it exists in the column strtype - if not folds[structtype_column].str.contains(fold_text, na=False).any(): - logger.error(f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'. This may impact processing.") - return (True, f"Datatype FOLD: The 'fold_text' value '{fold_text}' is not found in column '{structtype_column}'.") - - # check synform_text - synform_text = config.get("synform_text", None) - if synform_text: - # Check if synform_text is a string - if not isinstance(synform_text, str): - logger.error("Datatype FOLD: 'synform_text' must be a string. Please ensure it is defined correctly in the config.") - return (True, "Datatype FOLD: 'synform_text' must be a string.") - # Check if it exists in the structtype_column - if not folds[structtype_column].str.contains(synform_text, na=False).any(): - logger.warning( - f"Datatype FOLD: The 'synform_text' value '{synform_text}' is not found in column '{structtype_column}'. This may impact processing." - ) + ## check structtype column if it exists + text_keys = { + "fold_text": "fold_text", + "synform_text": "synform_text" + } + structtype_validation_failed, structtype_message = validate_structtype_column( + geodata=folds, + config=config, + datatype_name="FOLD", + required=True, # Assuming structtype_column is required in FOLD + text_keys=text_keys + ) + if structtype_validation_failed: + return (structtype_validation_failed, structtype_message) # check description column description_column = config.get("description_column", None) @@ -612,18 +505,7 @@ def validate_geometry( expected_geom_types: List[type], datatype_name: str ) -> Tuple[bool, str]: - """ - Validates the geometry column of a GeoDataFrame. - - Parameters: - geodata (gpd.GeoDataFrame): The GeoDataFrame to validate. - expected_geom_types (List[type]): A list of expected Shapely geometry types. - datatype_name (str): A string representing the datatype being validated (e.g., "GEOLOGY"). - Returns: - Tuple[bool, str]: A tuple where the first element is a boolean indicating if validation failed, - and the second element is an error message if failed. - """ # 1. Check if all geometries are valid if not geodata.geometry.is_valid.all(): logger.error(f"Invalid geometries found in datatype {datatype_name}. Please fix them before proceeding.") @@ -704,8 +586,7 @@ def validate_id_column( return (False, "") -from beartype.typing import List, Type, Tuple, Union - +@beartype.beartype def validate_required_columns( geodata: geopandas.GeoDataFrame, config: dict, @@ -714,23 +595,7 @@ def validate_required_columns( check_blank: bool = False, datatype_name: str = "UNKNOWN" ) -> Tuple[bool, str]: - """ - Validate required columns in a GeoDataFrame. - - This function checks whether required columns exist, have the expected data types, - and contain no null or (optionally) blank values. - Args: - geodata (geopandas.GeoDataFrame): The GeoDataFrame to validate. - config (dict): Configuration dictionary mapping config keys to column names. - required_columns (List[str]): List of config keys for required columns. 
- expected_type (Type or Tuple[Type, ...]): Expected data type(s) for the columns. - check_blank (bool, optional): Whether to check for blank (empty) strings. Defaults to False. - datatype_name (str, optional): Name of the datatype being validated (for logging). Defaults to "UNKNOWN". - - Returns: - Tuple[bool, str]: (True, error_message) if validation fails, else (False, ""). - """ for config_key in required_columns: column_name = config.get(config_key) @@ -779,4 +644,239 @@ def validate_required_columns( # If all required columns pass validation logger.info(f"Datatype {datatype_name.upper()}: All required columns validated successfully.") + return (False, "") + + + +def validate_optional_columns( + geodata: geopandas.GeoDataFrame, + config: Dict[str, str], + optional_columns: List[str], + expected_type: Union[Type, Tuple[Type, ...]], + check_blank: bool = False, + datatype_name: str = "UNKNOWN" +) -> List[str]: + + warnings = [] + + for config_key in optional_columns: + column_name = config.get(config_key) + + if not column_name: + warning_msg = ( + f"Configuration key '{config_key}' is missing for datatype '{datatype_name}'. " + f"Optional column validation for this key is skipped." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + continue + + if column_name in geodata.columns: + # Type Check + if not geodata[column_name].apply(lambda x: isinstance(x, expected_type) or pandas.isnull(x)).all(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + f"(config key: '{config_key}') contains values that are not of type " + f"{expected_type if isinstance(expected_type, type) else expected_type}. " + "Map2loop processing might not work as expected." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + + # Blank String Check (if applicable) + if check_blank and issubclass(expected_type, str): + if geodata[column_name].str.strip().eq("").any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + f"(config key: '{config_key}') contains blank (empty) string values. " + "Map2loop processing might not work as expected." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + + # Null Value Check + if geodata[column_name].isnull().any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + f"(config key: '{config_key}') contains NaN or null values. " + "Map2loop processing might not work as expected." + ) + logger.warning(warning_msg) + warnings.append(warning_msg) + + # else: + # warning_msg = ( + # f"Datatype {datatype_name.upper()}: Optional column '{column_name}' " + # f"(config key: '{config_key}') is missing from the data. " + # ) + ####### this might be taking it a bit too far + + # logger.info(warning_msg) + # warnings.append(warning_msg) + + return warnings + + +@beartype.beartype +def validate_dip_columns( + geodata: geopandas.GeoDataFrame, + config: Dict[str, str], + dip_columns: List[str], + datatype_name: str = "UNKNOWN", + allow_nulls: bool = False +) -> Tuple[bool, str]: + + validation_failed = False + messages = [] + + # Define fixed ranges + fixed_ranges = { + "dip_column": (0, 90), + "dipdir_column": (0, 360) + } + + for key in dip_columns: + column_name = config.get(key) + if not column_name and datatype_name == "STRUCTURE": # noly mandatory for structure, not faults! + warning_msg = ( + f"Configuration key '{key}' is missing for datatype '{datatype_name}'. " + f"Dip column validation for this key is skipped." 
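The fixed_ranges table in validate_dip_columns is a small dispatch pattern: .get() with a (None, None) default makes any key without a declared range skip the bounds check entirely. Sketched in isolation (the plunge_column key is hypothetical):

    fixed_ranges = {
        "dip_column": (0, 90),
        "dipdir_column": (0, 360),
    }

    for key in ("dip_column", "dipdir_column", "plunge_column"):
        min_val, max_val = fixed_ranges.get(key, (None, None))
        if min_val is not None and max_val is not None:
            print(f"{key}: values must fall within [{min_val}, {max_val}]")
        else:
            print(f"{key}: no declared range, bounds check skipped")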
+ ) + logger.warning(warning_msg) + messages.append(warning_msg) + validation_failed = True + continue + + if column_name in geodata.columns: + # Coerce to numeric + geodata[column_name] = pandas.to_numeric(geodata[column_name], errors='coerce') + + # Check for non-numeric or NaN values + if geodata[column_name].isnull().any(): + if not allow_nulls: + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{column_name}' " + f"(config key: '{key}') contains non-numeric or NaN values." + ) + logger.warning(warning_msg) + messages.append(warning_msg) + validation_failed = True + + # Check if all values are numeric + if not geodata[column_name].apply(lambda x: isinstance(x, (int, float)) or pandas.isnull(x)).all(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{column_name}' " + f"(config key: '{key}') must contain only numeric values." + ) + logger.warning(warning_msg) + messages.append(warning_msg) + validation_failed = True + + # Range validation + min_val, max_val = fixed_ranges.get(key, (None, None)) + if min_val is not None and max_val is not None: + invalid_values = ~geodata[column_name].between(min_val, max_val, inclusive='both') + if invalid_values.any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{column_name}' " + f"(config key: '{key}') contains values outside the range [{min_val}, {max_val}]. " + "Is this intentional?" + ) + logger.warning(warning_msg) + messages.append(warning_msg) + + summary_message = "\n".join(messages) + return (validation_failed, summary_message) + + +@beartype.beartype +def validate_structtype_column( + geodata: geopandas.GeoDataFrame, + config: Dict[str, str], + datatype_name: str, + required: bool = True, + text_keys: Optional[Dict[str, str]] = None +) -> Tuple[bool, str]: + + structtype_key = "structtype_column" + structtype_column = config.get(structtype_key) + + if not structtype_column: + if required: + error_msg = ( + f"Configuration key '{structtype_key}' is missing for datatype '{datatype_name}'. " + f"Validation for 'structtype_column' is skipped." + ) + logger.warning(error_msg) + return (True, error_msg) + else: + warning_msg = ( + f"Configuration key '{structtype_key}' is missing for datatype '{datatype_name}'. " + f"Optional 'structtype_column' validation is skipped." + ) + logger.warning(warning_msg) + return (False, "") + + if structtype_column not in geodata.columns: + if required: + error_msg = ( + f"Datatype {datatype_name.upper()}: '{structtype_column}' (config key: '{structtype_key}') " + f"is missing from the data. Consider removing that key from the config." + ) + logger.error(error_msg) + return (True, error_msg) + else: + warning_msg = ( + f"Datatype {datatype_name.upper()}: '{structtype_column}' (config key: '{structtype_key}') " + f"is missing from the data. Consider removing that key from the config." + ) + logger.warning(warning_msg) + return (False, "") + + # Check if all entries are strings or nulls + if not geodata[structtype_column].apply(lambda x: isinstance(x, str) or pandas.isnull(x)).all(): + error_msg = ( + f"Datatype {datatype_name.upper()}: Column '{structtype_column}' " + f"(config key: '{structtype_key}') contains non-string values. " + "Please ensure all values in this column are strings." 
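The fault_text/fold_text lookups further down depend on str.contains with na=False, which treats missing cells as non-matches rather than propagating NaN into the boolean mask. For instance:

    import pandas

    features = pandas.Series(["Fault zone", "dyke", None])
    print(features.str.contains("Fault", na=False).any())  # True
    print(features.str.contains("fold", na=False).any())   # False - no folds present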
+ ) + logger.error(error_msg) + return (True, error_msg) + + # Warn about empty or null cells + if geodata[structtype_column].isnull().any() or geodata[structtype_column].str.strip().eq("").any(): + warning_msg = ( + f"Datatype {datatype_name.upper()}: Column '{structtype_column}' contains NaN, empty, or blank values. " + "Processing might not work as expected." + ) + logger.warning(warning_msg) + + # Check for specific text keys + if text_keys: + for text_key, config_key in text_keys.items(): + text_value = config.get(config_key, None) + if text_value: + if not isinstance(text_value, str): + error_msg = ( + f"Datatype {datatype_name.upper()}: '{config_key}' must be a string. " + "Please ensure it is defined correctly in the config." + ) + logger.error(error_msg) + return (True, error_msg) + + if not geodata[structtype_column].str.contains(text_value, na=False).any(): + if text_key == "synform_text": + warning_msg = ( + f"Datatype {datatype_name.upper()}: The '{text_key}' value '{text_value}' is not found in column '{structtype_column}'. " + "This may impact processing." + ) + logger.warning(warning_msg) + else: + error_msg = ( + f"Datatype {datatype_name.upper()}: The '{text_key}' value '{text_value}' is not found in column '{structtype_column}'. " + "Project might end up with no faults." + ) + logger.error(error_msg) + return (True, error_msg) + return (False, "") \ No newline at end of file diff --git a/tests/data_checks/test_input_data_faults.py b/tests/data_checks/test_input_data_faults.py index 2f70176a..6c594d72 100644 --- a/tests/data_checks/test_input_data_faults.py +++ b/tests/data_checks/test_input_data_faults.py @@ -49,7 +49,7 @@ }, {"structtype_column": "FEATURE", "fault_text": "Fault", "objectid_column": "ID"}, True, - "Datatype FAULT: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." + "Datatype FAULT: Column 'FEATURE' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." ), # Invalid values in DIP estimate column ( diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py index 740039b8..e20b97d4 100644 --- a/tests/data_checks/test_input_data_fold.py +++ b/tests/data_checks/test_input_data_fold.py @@ -51,7 +51,7 @@ }, {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, True, - "Datatype FOLD: Column 'FEATURE' (config key: 'structtype_column') contains non-string values." + "Datatype FAULT: Column 'FOLD' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." ), # Missing ID column ( From d1f67a9e64fb09fc2b3e13351f23af9148e2eb5a Mon Sep 17 00:00:00 2001 From: AngRodrigues Date: Tue, 14 Jan 2025 15:32:09 +1100 Subject: [PATCH 29/29] chore: finalise details --- tests/data_checks/test_input_data_fold.py | 2 +- tests/project/test_ignore_codes_setters_getters.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/data_checks/test_input_data_fold.py b/tests/data_checks/test_input_data_fold.py index e20b97d4..94834894 100644 --- a/tests/data_checks/test_input_data_fold.py +++ b/tests/data_checks/test_input_data_fold.py @@ -51,7 +51,7 @@ }, {"structtype_column": "FEATURE", "fold_text": "fold", "objectid_column": "ID", "description_column": "description"}, True, - "Datatype FAULT: Column 'FOLD' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." 
+ "Datatype FOLD: Column 'FEATURE' (config key: 'structtype_column') contains non-string values. Please ensure all values in this column are strings." ), # Missing ID column ( diff --git a/tests/project/test_ignore_codes_setters_getters.py b/tests/project/test_ignore_codes_setters_getters.py index 34de34a4..0674ca95 100644 --- a/tests/project/test_ignore_codes_setters_getters.py +++ b/tests/project/test_ignore_codes_setters_getters.py @@ -21,6 +21,7 @@ def test_set_get_ignore_codes(): config_dictionary = { "structure": {"dipdir_column": "azimuth2", "dip_column": "dip"}, "geology": {"unitname_column": "unitname", "alt_unitname_column": "code"}, + "fault": {'structtype_column': 'feature', 'fault_text': 'Fault'}, } with patch.object(Project, 'validate_required_inputs', return_value=None): project = Project(