Merge pull request #164 from AustralianCancerDataNetwork/static-typing
Added static typing to `pydicer` modules
pchlap authored Feb 2, 2024
2 parents 8a371f2 + 27209d5 commit 9836a54
Showing 22 changed files with 264 additions and 126 deletions.
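The annotations follow one pattern throughout: parameters and return values gain type hints, with typing.Union covering arguments that accept several types. A minimal sketch of the style (the function below is hypothetical, invented only to illustrate the convention):

from pathlib import Path
from typing import Union

# Before this PR a signature such as this carried no annotations:
#     def load_series(series_path, force=True):
# After, each parameter and the return value are annotated, with Union
# used where an argument accepts more than one type:
def load_series(series_path: Union[str, Path], force: bool = True) -> dict:
    # Hypothetical helper; shows the annotation style only.
    return {"path": str(series_path), "force": force}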
16 changes: 11 additions & 5 deletions pydicer/config.py
@@ -15,7 +15,13 @@
"available in the .pydicer directory.",
"type": int,
"default": 0,
"choices": [logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR],
"choices": [
logging.NOTSET,
logging.DEBUG,
logging.INFO,
logging.WARNING,
logging.ERROR,
],
},
"for_fallback_linkage": {
"module": "general",
@@ -80,7 +86,6 @@
class PyDicerConfig:
class __PyDicerConfig: # pylint: disable=invalid-name
def __init__(self, working_dir=None):

if working_dir is None:
raise ValueError("working_dir must be set on config init")
self.working_dir = Path(working_dir)
@@ -128,7 +133,7 @@ def get_working_dir(self):
"""
return self.instance.working_dir

def get_config(self, name):
def get_config(self, name: str) -> object:
"""Get the value of the config item with the specified name
Args:
@@ -146,7 +151,7 @@ def get_config(self, name):

return self.instance.pydicer_config[name]

def set_config(self, name, value):
def set_config(self, name: str, value: object):
"""Set the value for the config with the given name
Args:
@@ -163,7 +168,8 @@ def set_config(self, name, value):

if not isinstance(value, PYDICER_CONFIG[name]["type"]) and not value is None:
raise ValueError(
f"Config {name} must be of type " f"{type(self.instance.pydicer_config[name])}"
f"Config {name} must be of type "
f"{type(self.instance.pydicer_config[name])}"
)

self.instance.pydicer_config[name] = value
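A short usage sketch of the typed accessors above (the working directory path and the bool type of for_fallback_linkage are assumptions; the key name itself appears in this diff):

from pydicer.config import PyDicerConfig

config = PyDicerConfig(working_dir="./my_project")  # raises ValueError if None

# get_config is now typed (name: str) -> object, and set_config
# (name: str, value: object) validates the value against
# PYDICER_CONFIG[name]["type"] as shown in the hunk above.
linkage = config.get_config("for_fallback_linkage")
config.set_config("for_fallback_linkage", True)  # assumes this key holds a bool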
16 changes: 11 additions & 5 deletions pydicer/convert/data.py
@@ -3,6 +3,8 @@
import copy
import shutil
from pathlib import Path
from typing import Union

import pandas as pd
import numpy as np
import SimpleITK as sitk
@@ -51,7 +53,7 @@
]


def get_object_type(sop_class_uid):
def get_object_type(sop_class_uid: str) -> str:
"""Get the type of the object (used for the output path)
Args:
@@ -69,7 +71,9 @@ def get_object_type(sop_class_uid):
return object_type


def handle_missing_slice(files, ignore_duplicates=False):
def handle_missing_slice(
files: Union[pd.DataFrame, list], ignore_duplicates: bool = False
) -> list:
"""function to interpolate missing slices in an image
Example usage:
@@ -98,6 +102,8 @@ def handle_missing_slice(files, ignore_duplicates=False):
Args:
df_files (pd.DataFrame|list): the DataFrame which was produced by PreprocessData
or list of filepaths to dicom slices
ignore_duplicates (bool, optional): specifies whether the function should ignore
duplicate slices when handling missing ones
Returns:
file_paths(list): a list of the interpolated file paths
@@ -231,7 +237,7 @@ def handle_missing_slice(files, ignore_duplicates=False):
return df_files.file_path.tolist()


def link_via_frame_of_reference(for_uid, df_preprocess):
def link_via_frame_of_reference(for_uid: str, df_preprocess: pd.DataFrame) -> pd.DataFrame:
"""Find the image series linked to this FOR
Args:
@@ -271,7 +277,7 @@ def __init__(self, working_directory="."):
self.pydicer_directory = working_directory.joinpath(PYDICER_DIR_NAME)
self.output_directory = working_directory.joinpath(CONVERTED_DIR_NAME)

def add_entry(self, entry):
def add_entry(self, entry: dict):
"""Add an entry of a converted data object to the patient's converted dataframe.
Args:
@@ -308,7 +314,7 @@ def add_entry(self, entry):
df_pat_data = df_pat_data.reset_index(drop=True)
df_pat_data.to_csv(converted_df_path)

def convert(self, patient=None, force=True):
def convert(self, patient: Union[str, list] = None, force: bool = True):
"""Converts the DICOM which was preprocessed into the pydicer output directory.
Args:
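A minimal sketch of calling the typed handle_missing_slice above (paths are placeholders; per the signature it accepts the preprocessed DataFrame or a list of DICOM slice paths and returns a list of file paths with missing slices interpolated):

from pydicer.convert.data import handle_missing_slice

# Placeholder paths for a CT series in which slice 003 is absent.
slice_paths = [
    "data/pat1/ct/slice_001.dcm",
    "data/pat1/ct/slice_002.dcm",
    "data/pat1/ct/slice_004.dcm",
]
# Interpolates the missing slice per the docstring; the exact output
# location of the generated file is not shown in this diff.
complete_paths = handle_missing_slice(slice_paths, ignore_duplicates=True)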
7 changes: 6 additions & 1 deletion pydicer/convert/headers.py
@@ -1,11 +1,16 @@
import logging
import json
from typing import Union
from pathlib import Path

import pydicom

logger = logging.getLogger(__name__)


def convert_dicom_headers(dcm_file, binary_path, json_file):
def convert_dicom_headers(
dcm_file: Union[str, Path], binary_path: str, json_file: Union[str, Path]
):
"""Save the DICOM Headers as a JSON file
Args:
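A sketch of the typed convert_dicom_headers above (paths are placeholders; the reading of binary_path as the value recorded in place of binary data is an assumption based on the docstring):

from pathlib import Path
from pydicer.convert.headers import convert_dicom_headers

convert_dicom_headers(
    dcm_file=Path("data/image.dcm"),      # source DICOM file
    binary_path="image.nii.gz",           # assumed: path recorded for binary data
    json_file=Path("data/headers.json"),  # destination for the JSON headers
)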
44 changes: 32 additions & 12 deletions pydicer/dataset/functions.py
@@ -7,7 +7,7 @@
logger = logging.getLogger(__name__)


def rt_latest_struct(df, **kwargs):
def rt_latest_struct(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
"""Select the latest Structure set and the image which it is linked to. You can specify keyword
arguments to for a match on any top level DICOM attributes. You may also supply lists of values
to these, one of which should match to select that series.
@@ -91,18 +91,24 @@ def rt_latest_struct(df, **kwargs):
keep_rows.append(struct_row.name) # Track index of row to keep

# Find the linked image
df_linked_img = df[df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid]
df_linked_img = df[
df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid
]

if len(df_linked_img) == 0:
logger.warning("No linked images found for structure: %s", struct_row.hashed_uid)
logger.warning(
"No linked images found for structure: %s", struct_row.hashed_uid
)
continue

keep_rows.append(df_linked_img.iloc[0].name) # Keep the index of the row of the image too
keep_rows.append(
df_linked_img.iloc[0].name
) # Keep the index of the row of the image too

return df.loc[keep_rows]


def rt_latest_dose(df, **kwargs):
def rt_latest_dose(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
"""Select the latest RTDOSE and the image, structure and plan which it is linked to. You can
specify keyword arguments to for a match on any top level DICOM attributes. You may also supply
lists of values to these, one of which should match to select that series.
@@ -191,16 +197,22 @@ def rt_latest_dose(df, **kwargs):
keep_rows.append(dose_row.name) # Track index of row of dose to keep

# Find the linked plan
df_linked_plan = df[df["sop_instance_uid"] == dose_row.referenced_sop_instance_uid]
df_linked_plan = df[
df["sop_instance_uid"] == dose_row.referenced_sop_instance_uid
]

if len(df_linked_plan) == 0:
logger.warning("No linked plans found for dose: %s", dose_row.sop_instance_uid)
logger.warning(
"No linked plans found for dose: %s", dose_row.sop_instance_uid
)
continue

# Find the linked structure set
plan_row = df_linked_plan.iloc[0]
keep_rows.append(plan_row.name) # Keep the index of the row of the plan
df_linked_struct = df[df["sop_instance_uid"] == plan_row.referenced_sop_instance_uid]
df_linked_struct = df[
df["sop_instance_uid"] == plan_row.referenced_sop_instance_uid
]

if len(df_linked_struct) == 0:
# Try to link via Frame of Reference instead
@@ -209,18 +221,26 @@
]

if len(df_linked_struct) == 0:
logger.warning("No structures found for plan: %s", plan_row.sop_instance_uid)
logger.warning(
"No structures found for plan: %s", plan_row.sop_instance_uid
)
continue

# Find the linked image
struct_row = df_linked_struct.iloc[0]
keep_rows.append(struct_row.name) # Keep the index of the row of the structure
df_linked_img = df[df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid]
df_linked_img = df[
df["sop_instance_uid"] == struct_row.referenced_sop_instance_uid
]

if len(df_linked_img) == 0:
logger.warning("No linked images found for structure: %s", struct_row.hashed_uid)
logger.warning(
"No linked images found for structure: %s", struct_row.hashed_uid
)
continue

keep_rows.append(df_linked_img.iloc[0].name) # Keep the index of the row of the image too
keep_rows.append(
df_linked_img.iloc[0].name
) # Keep the index of the row of the image too

return df.loc[keep_rows]
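A usage sketch for the selection functions above (the input DataFrame is assumed to be pydicer's converted-data frame containing the columns referenced in the code, such as sop_instance_uid and referenced_sop_instance_uid; keyword filters match top-level DICOM attributes per the docstrings):

import pandas as pd
from pydicer.dataset.functions import rt_latest_struct, rt_latest_dose

df = pd.read_csv("working/.pydicer/converted.csv")  # placeholder source

# Keep only the latest structure set (optionally filtered on a DICOM
# attribute) together with the image it references.
df_structs = rt_latest_struct(df, StudyDescription="Planning CT")

# Keep the latest dose plus its linked plan, structure set and image.
df_doses = rt_latest_dose(df)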
33 changes: 24 additions & 9 deletions pydicer/dataset/nnunet.py
@@ -45,7 +45,7 @@ def __init__(
nnunet_description: str = "",
dataset_name: str = CONVERTED_DIR_NAME,
image_modality: str = "CT",
mapping_id=DEFAULT_MAPPING_ID,
mapping_id: str = DEFAULT_MAPPING_ID,
):
"""Prepare a dataset to train models using nnUNet.
@@ -219,12 +219,16 @@ def check_duplicates_train_test(self):
"""

if len(self.training_cases) == 0:
raise SystemError("training_cases are empty, run split_dataset function first.")
raise SystemError(
"training_cases are empty, run split_dataset function first."
)

img_stats = []

df = read_converted_data(self.working_directory, dataset_name=self.dataset_name)
df_images = df[(df.modality == "CT") | (df.modality == "MR") | (df.modality == "PT")]
df_images = df[
(df.modality == "CT") | (df.modality == "MR") | (df.modality == "PT")
]

for case in self.training_cases + self.testing_cases:
df_pat = df_images[df_images.patient_id == case]
@@ -252,7 +256,9 @@ def check_duplicates_train_test(self):

# Check to see if we have any duplicate image spacing and sizes, if so inspect these
# further
duplicated_rows = df_img_stats.duplicated(subset=["spacing", "size"], keep=False)
duplicated_rows = df_img_stats.duplicated(
subset=["spacing", "size"], keep=False
)
df_img_stats["voxel_sum"] = df_img_stats.apply(
lambda row: sitk.GetArrayFromImage(sitk.ReadImage(row.img_path)).sum()
if row.name in duplicated_rows.index
@@ -342,7 +348,9 @@ def check_structure_names(self) -> pd.DataFrame:
print(f"Structure {s} is missing for patients: {missing_pats}")

incomplete_structures.append(s)
incomplete_patients += [p for p in missing_pats if not p in incomplete_patients]
incomplete_patients += [
p for p in missing_pats if not p in incomplete_patients
]

if incomplete_structures:
print(
@@ -383,7 +391,8 @@ def check_overlapping_structures(self):
structure_name_j = structure_names[sj]

structure_sum = (
structure_set[structure_name_i] + structure_set[structure_name_j]
structure_set[structure_name_i]
+ structure_set[structure_name_j]
)
arr = sitk.GetArrayFromImage(structure_sum)
if arr.max() > 1:
@@ -444,7 +453,9 @@ def prepare_dataset(self) -> Path:
"""

if len(self.training_cases) == 0:
raise SystemError("training_cases are empty, run split_dataset function first.")
raise SystemError(
"training_cases are empty, run split_dataset function first."
)

# First check that all cases (in training set) have the structures which are to be learnt
df_structures = self.check_structure_names()
@@ -571,7 +582,9 @@ def generate_training_scripts(
raise FileNotFoundError(
"Ensure that the folder in which to generate the script exists."
)
script_path = script_directory.joinpath(f"train_{self.nnunet_id}_{self.nnunet_name}.sh")
script_path = script_directory.joinpath(
f"train_{self.nnunet_id}_{self.nnunet_name}.sh"
)

if isinstance(folds, str):
folds = [folds]
@@ -637,7 +650,9 @@ def train(self, script_directory: Union[str, Path] = ".", in_screen: bool = True
"""
# Make sure the script folder exists
script_directory = Path(script_directory)
script_path = script_directory.joinpath(f"train_{self.nnunet_id}_{self.nnunet_name}.sh")
script_path = script_directory.joinpath(
f"train_{self.nnunet_id}_{self.nnunet_name}.sh"
)

if not script_path.exists():
raise FileNotFoundError(
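The methods touched in this file form a dataset-preparation workflow; a hedged end-to-end sketch follows (the class name NNUNetDataset and the constructor arguments working_directory, nnunet_id and nnunet_name are assumptions inferred from the attributes used in the hunks above):

from pydicer.dataset.nnunet import NNUNetDataset  # class name assumed

prep = NNUNetDataset(
    working_directory="./working",  # assumed; read_converted_data uses it above
    nnunet_id=101,                  # assumed; appears in the script file names
    nnunet_name="heart",            # assumed; appears in the script file names
    nnunet_description="Cardiac substructures",
    image_modality="CT",
)
prep.split_dataset()  # must run first, or the checks raise SystemError
prep.check_duplicates_train_test()
prep.check_structure_names()
prep.check_overlapping_structures()
dataset_path = prep.prepare_dataset()
prep.generate_training_scripts(script_directory=".")
prep.train(script_directory=".", in_screen=True)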
4 changes: 2 additions & 2 deletions pydicer/dataset/preparation.py
@@ -1,7 +1,7 @@
import logging
import os
from pathlib import Path
from typing import Callable
from typing import Callable, Union

import pandas as pd

@@ -22,7 +22,7 @@ class PrepareDataset:
Defaults to ".".
"""

def __init__(self, working_directory="."):
def __init__(self, working_directory: Union[str, Path] = "."):
self.working_directory = Path(working_directory)

def add_object_to_dataset(self, dataset_name: str, data_object_row: pd.Series):
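A minimal sketch of the now-typed PrepareDataset (the prepare method name and its signature are assumptions; only __init__ and add_object_to_dataset appear in this hunk, though the Callable import above suggests a selection function is passed in):

from pydicer.dataset.preparation import PrepareDataset
from pydicer.dataset.functions import rt_latest_dose

prep = PrepareDataset(working_directory="./working")
# Assumed API: build a named subset using a selection callable such as
# rt_latest_dose from pydicer/dataset/functions.py above.
prep.prepare("latest_doses", rt_latest_dose)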