feat: add io/loaders module (#83)

Includes feature-, general-, and image-specific loading functions. **New Features:** Introduced a new module for loading various data types in the READII pipeline. Added functions for loading imaging feature sets, dataset configurations, and data files into DataFrames.
bhklab · Dec 13, 2024 · 81fbd78 · 81fbd78
1 parent f3cbd63
commit 81fbd78
Show file tree

Hide file tree

Showing 12 changed files with 409 additions and 45 deletions.
diff --git a/.gitignore b/.gitignore
@@ -146,6 +146,8 @@ dmypy.json
 
 # Test outputs
 tests/output/*
+tests/*/procdata/*
+tests/*/results/*
 
 # pixi environments
 .pixi

diff --git a/pixi.lock b/pixi.lock
diff --git a/src/readii/io/loaders/__init__.py b/src/readii/io/loaders/__init__.py
@@ -0,0 +1,12 @@
+"""Module for loading different data types for the READII pipeline."""
+
+from .features import loadFeatureFilesFromImageTypes
+from .general import loadFileToDataFrame, loadImageDatasetConfig
+from .images import getImageTypesFromDirectory
+
+__all__ = [
+    "loadFeatureFilesFromImageTypes",
+    "loadFileToDataFrame",
+    "loadImageDatasetConfig",
+    "getImageTypesFromDirectory"
+]
diff --git a/src/readii/io/loaders/features.py b/src/readii/io/loaders/features.py
@@ -0,0 +1,85 @@
+import os
+import pandas as pd 
+
+from typing import Optional, Dict
+
+from readii.io.loaders.general import loadFileToDataFrame
+
+from readii.utils import logger
+
+
+def loadFeatureFilesFromImageTypes(extracted_feature_dir:str,
+                                   image_types:list,
+                                   drop_labels:Optional[bool]=True,
+                                   labels_to_drop:Optional[list]=None)->Dict[str,pd.DataFrame]:
+    """Load the extracted imaging feature sets for the requested image types into a dictionary of dataframes.
+
+    For every image type, the first ``.csv`` file in ``extracted_feature_dir`` whose name contains the
+    image type string is loaded. A file is used for at most one image type. Image types with no
+    matching file, or whose file lacks the labels to drop, are skipped with a warning.
+
+    Parameters
+    ----------
+    extracted_feature_dir : str
+        Path to the directory containing the extracted feature csv files
+    image_types : list
+        List of image types to load in. Each entry is matched as a substring of the feature file names.
+    drop_labels : bool, optional
+        Whether to drop the labels from the dataframes. Use when loading labelled data from data_setup_for_modeling.ipynb. The default is True.
+    labels_to_drop : list, optional
+        List of labels to drop from the dataframes. The default is ["patient_ID","survival_time_in_years","survival_event_binary"] based on code
+        in data_setup_for_modeling.ipynb.
+
+    Returns
+    -------
+    feature_sets : dict
+        Dictionary mapping each successfully loaded image type to its dataframe of extracted features.
+
+    Raises
+    ------
+    FileNotFoundError
+        If ``extracted_feature_dir`` is not an existing directory.
+    ValueError
+        If no feature set could be loaded for any of the requested image types.
+    """
+    # Set default labels to drop if not specified
+    if labels_to_drop is None:
+        labels_to_drop = ["patient_ID","survival_time_in_years","survival_event_binary"]
+
+    # Initialize dictionary to store the feature sets
+    feature_sets = {}
+
+    # Check if the passed in extracted feature directory exists
+    if not os.path.isdir(extracted_feature_dir):
+        raise FileNotFoundError(f"Extracted feature directory {extracted_feature_dir} does not exist.")
+
+    feature_file_list = os.listdir(extracted_feature_dir)
+
+    # Loop through all the requested image types
+    for image_type in image_types:
+        # Find the csv feature files whose name contains this image type
+        matching_files = [file for file in feature_file_list if (image_type in file) and (file.endswith(".csv"))]
+
+        # Without this guard the file name from a previous iteration would be reused (or be
+        # undefined on the first iteration), loading the wrong data under this image type.
+        if not matching_files:
+            logger.warning(f"No {image_type} feature csv files found in {extracted_feature_dir}")
+            # Skip to the next image type
+            continue
+
+        image_type_feature_file = matching_files[0]
+        # Remove the image type file from the list of feature files so it cannot match another image type
+        feature_file_list.remove(image_type_feature_file)
+
+        # Get the full path to the feature file
+        feature_file_path = os.path.join(extracted_feature_dir, image_type_feature_file)
+
+        # Load the feature data into a pandas dataframe
+        raw_feature_data = loadFileToDataFrame(feature_file_path)
+
+        # Drop the labels from the dataframe if specified
+        if drop_labels:
+            try:
+                # Data is now only extracted features
+                raw_feature_data.drop(labels_to_drop, axis=1, inplace=True)
+            except KeyError:
+                logger.warning(f"{feature_file_path} does not have the labels {labels_to_drop} to drop.")
+                # Skip to the next image type
+                continue
+
+        # Save the dataframe to the feature_sets dictionary
+        feature_sets[image_type] = raw_feature_data
+
+    # After processing all image types, check if any feature sets were loaded
+    if not feature_sets:
+        raise ValueError(f"No valid feature sets were loaded from {extracted_feature_dir}")
+
+    return feature_sets
diff --git a/src/readii/io/loaders/general.py b/src/readii/io/loaders/general.py
@@ -0,0 +1,99 @@
+from pathlib import Path
+
+import pandas as pd
+import yaml
+
+
+class ConfigError(Exception):
+    """Error raised when a dataset configuration file is invalid or empty."""
+
+class DataFrameLoadError(Exception):
+    """Error raised when a data file cannot be loaded into a usable DataFrame."""
+
+def loadImageDatasetConfig(dataset_name: str, config_dir_path: str | Path) -> dict:
+    """Read the YAML configuration belonging to a dataset.
+
+    The file is looked up as ``<config_dir_path>/<dataset_name>.yaml``.
+
+    Parameters
+    ----------
+    dataset_name : str
+        Name of the dataset; also the stem of the configuration file name.
+    config_dir_path : str or pathlib.Path
+        Directory in which the configuration file is stored.
+
+    Returns
+    -------
+    dict
+        The parsed configuration settings for the dataset.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the configuration file does not exist.
+    ConfigError
+        If the file is not valid YAML, or parses to an empty value.
+
+    Examples
+    --------
+    >>> config = loadImageDatasetConfig("NSCLC_Radiogenomics", "config")
+    """
+    config_file_path = Path(config_dir_path) / f"{dataset_name}.yaml"
+
+    if not config_file_path.exists():
+        raise FileNotFoundError(f"Config file {config_file_path} does not exist.")
+
+    try:
+        with config_file_path.open("r") as yaml_file:
+            loaded_settings = yaml.safe_load(yaml_file)
+    except yaml.YAMLError as ye:
+        raise ConfigError("Invalid YAML in config file") from ye
+
+    # An empty file parses to None; treat any falsy result as an unusable config
+    if loaded_settings:
+        return loaded_settings
+
+    raise ConfigError("Config file is empty or invalid")
+
+
+
+def loadFileToDataFrame(file_path: str | Path) -> pd.DataFrame:
+    """Load data from a csv or xlsx file into a pandas dataframe.
+
+    Parameters
+    ----------
+    file_path : str or pathlib.Path
+        Path to the data file. Must end in ``.csv`` or ``.xlsx``.
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe containing the data from the file.
+
+    Raises
+    ------
+    ValueError
+        If ``file_path`` is empty (e.g. an empty string).
+    FileNotFoundError
+        If ``file_path`` does not exist.
+    DataFrameLoadError
+        If the file has no data, cannot be parsed, has an unsupported extension
+        (the originating ValueError is chained as the cause), or yields an empty dataframe.
+    """
+    # The emptiness check must look at the raw argument: a Path object is always truthy
+    # (Path("") normalizes to "."), so checking after conversion could never trigger.
+    if not file_path:
+        raise ValueError("File is empty")
+
+    file_path = Path(file_path)
+
+    if not file_path.exists():
+        msg = f"File {file_path} does not exist"
+        raise FileNotFoundError(msg)
+
+    # Get the file extension
+    file_extension = file_path.suffix
+
+    try:
+        if file_extension == '.xlsx':
+            df = pd.read_excel(file_path)
+        elif file_extension == '.csv':
+            df = pd.read_csv(file_path)
+        else:
+            # Caught by the ValueError handler below and re-raised as DataFrameLoadError
+            msg = f"Unsupported file format {file_extension}. Please provide a .csv or .xlsx file."
+            raise ValueError(msg)
+
+    except pd.errors.EmptyDataError as e:
+        raise DataFrameLoadError("File is empty") from e
+
+    except (pd.errors.ParserError, ValueError) as e:
+        raise DataFrameLoadError("Error parsing file") from e
+
+    # A file with only a header row parses successfully but holds no rows
+    if df.empty:
+        raise DataFrameLoadError("Dataframe is empty")
+    return df
diff --git a/src/readii/io/loaders/images.py b/src/readii/io/loaders/images.py
@@ -0,0 +1,52 @@
+from pathlib import Path
+from typing import Union
+
+def getImageTypesFromDirectory(raw_data_dir:Union[Path|str],
+                               feature_file_prefix:str = "",
+                               feature_file_suffix:str = ".csv") -> list[str]:
+    """Get a list of image types from a directory containing image feature files.
+
+    The image type of a file is its name with ``feature_file_prefix`` and
+    ``feature_file_suffix`` stripped off. Only files directly inside ``raw_data_dir``
+    that match ``<prefix>*<suffix>`` are considered.
+
+    Parameters
+    ----------
+    raw_data_dir : str or pathlib.Path
+        Path to the directory containing the image feature files.
+    feature_file_prefix : str, optional
+        Prefix to remove from the feature file name. The default is "".
+    feature_file_suffix : str, optional
+        Suffix to remove from the feature file name. The default is ".csv".
+
+    Returns
+    -------
+    list
+        List of image types from the image feature files, in the order the directory
+        listing yields them (not sorted).
+
+    Raises
+    ------
+    FileNotFoundError
+        If the directory does not exist, or contains no file with the given prefix and suffix.
+    NotADirectoryError
+        If ``raw_data_dir`` exists but is not a directory.
+    """
+    # Path() accepts both strings and Path objects, so no type check is needed
+    raw_data_dir = Path(raw_data_dir)
+
+    # Check if the directory exists
+    if not raw_data_dir.exists():
+        raise FileNotFoundError(f"Directory {raw_data_dir} does not exist.")
+
+    # Check if the path is a directory
+    if not raw_data_dir.is_dir():
+        raise NotADirectoryError(f"Path {raw_data_dir} is not a directory.")
+
+    # Scan the directory once, stripping the prefix and suffix from each matching file name
+    image_types = [
+        file.name.removeprefix(feature_file_prefix).removesuffix(feature_file_suffix)
+        for file in raw_data_dir.glob(f"{feature_file_prefix}*{feature_file_suffix}")
+    ]
+
+    # Check that the directory contained files with the specified prefix and suffix
+    if not image_types:
+        raise FileNotFoundError(f"No files with prefix {feature_file_prefix} and suffix {feature_file_suffix} found in directory {raw_data_dir}.")
+
+    return image_types
diff --git a/tests/4D-Lung/4D-Lung.yaml b/tests/4D-Lung/4D-Lung.yaml
@@ -0,0 +1,20 @@
+# Config file for 4D-Lung for READII
+dataset_name: 4D-Lung 
+
+### CLINICAL VARIABLE INFORMATION ###
+# Event values should be in the order [Alive_value, Dead_value]
+outcome_variables:
+    time_label: 
+    event_label: 
+    convert_to_years: False
+    event_value_mapping: 
+
+exclusion_variables: 
+
+train_test_split:
+    split: False
+    split_variable: 
+    impute: 
+
+
+image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]
diff --git a/tests/NSCLC_Radiogenomics/NSCLC_Radiogenomics.yaml b/tests/NSCLC_Radiogenomics/NSCLC_Radiogenomics.yaml
@@ -0,0 +1,20 @@
+# Config file for NSCLC_Radiogenomics for READII
+dataset_name: NSCLC_Radiogenomics 
+
+### CLINICAL VARIABLE INFORMATION ###
+# Event values should be in the order [Alive_value, Dead_value]
+outcome_variables:
+    time_label: ""
+    event_label: "Survival Status"
+    convert_to_years: False
+    event_value_mapping: {'Alive': 0, 'Dead': 1}
+
+exclusion_variables: 
+
+train_test_split:
+    split: False
+    split_variable: 
+    impute: 
+
+
+image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]
diff --git a/tests/io/loaders/test_general.py b/tests/io/loaders/test_general.py
@@ -0,0 +1,29 @@
+from readii.io.loaders.general import loadImageDatasetConfig
+import pytest
+
+@pytest.fixture
+def nsclcConfigDirPath():
+    """Return the directory that holds the NSCLC_Radiogenomics config file."""
+    return "tests/NSCLC_Radiogenomics"
+
+@pytest.fixture
+def lung4DConfigDirPath():
+    """Return the directory that holds the 4D-Lung config file."""
+    return "tests/4D-Lung"
+
+@pytest.fixture
+def expected_image_types():
+    """Return the image_types list that both test config files are expected to declare."""
+    return ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]
+
+
+def test_NSCLC_loadImageDatasetConfig(nsclcConfigDirPath, expected_image_types):
+    """Check the values parsed from the NSCLC_Radiogenomics config file."""
+    loaded = loadImageDatasetConfig("NSCLC_Radiogenomics", nsclcConfigDirPath)
+
+    # Top-level settings
+    assert loaded["dataset_name"] == "NSCLC_Radiogenomics"
+    assert loaded["image_types"] == expected_image_types
+
+    # Outcome settings are populated for this dataset
+    outcome_settings = loaded["outcome_variables"]
+    assert outcome_settings["event_label"] == "Survival Status"
+    assert outcome_settings["event_value_mapping"] == {'Alive': 0, 'Dead': 1}
+
+def test_lung4D_loadImageDatasetConfig(lung4DConfigDirPath, expected_image_types):
+    """Check the values parsed from the 4D-Lung config file."""
+    loaded = loadImageDatasetConfig("4D-Lung", lung4DConfigDirPath)
+
+    # Top-level settings
+    assert loaded["dataset_name"] == "4D-Lung"
+    assert loaded["image_types"] == expected_image_types
+
+    # Outcome settings are left blank in this config, so they load as None
+    outcome_settings = loaded["outcome_variables"]
+    assert outcome_settings["event_label"] is None
+    assert outcome_settings["event_value_mapping"] is None
diff --git a/tests/output/ct_to_seg_match_list_4D-Lung.csv b/tests/output/ct_to_seg_match_list_4D-Lung.csv
@@ -0,0 +1,2 @@
+patient_ID,study_CT,study_description_CT,series_CT,series_description_CT,subseries_CT,modality_CT,instances_CT,instance_uid_CT,reference_ct_CT,reference_rs_CT,reference_pl_CT,reference_frame_CT,folder_CT,orientation_CT,orientation_type_CT,MR_repetition_time_CT,MR_echo_time_CT,MR_scan_sequence_CT,MR_magnetic_field_strength_CT,MR_imaged_nucleus_CT,file_path_CT,series_seg,subseries_seg,modality_seg,instances_seg,instance_uid_seg,reference_ct_seg,reference_rs_seg,reference_pl_seg,reference_frame_seg,folder_seg,orientation_seg,orientation_type_seg,MR_repetition_time_seg,MR_echo_time_seg,MR_scan_sequence_seg,MR_magnetic_field_strength_seg,MR_imaged_nucleus_seg,file_path_seg,edge_type
+113_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.324605948863389564556891313296,p4,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,"P4^P113^S303^I10349, Gated, 40.0%B",default,CT,99,1.3.6.1.4.1.14519.5.2.1.6834.5010.249506064276270740866733345688,,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543,"[1, 0, 0, 0, 1, 0]",,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543/1-81.dcm,2.25.186899387610254289948150314209581209847.35,default,RTSTRUCT,1,1.3.6.1.4.1.14519.5.2.1.6834.5010.815153834456695039602326691312,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,,,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,2
diff --git a/tests/test_feature_extraction.py b/tests/test_feature_extraction.py
@@ -13,6 +13,8 @@
 import collections
 import pandas as pd
 import os 
+import shutil
+from pathlib import Path
 
 @pytest.fixture
 def nsclcCTImage():
@@ -44,7 +46,21 @@ def pyradiomicsParamFilePath():
 
 @pytest.fixture
 def nsclcMetadataPath():
-    return "tests/output/ct_to_seg_match_list_NSCLC_Radiogenomics.csv"
+    oldpath = Path("tests/output/ct_to_seg_match_list_NSCLC_Radiogenomics.csv")
+    newpath = Path("tests/NSCLC_Radiogenomics/procdata/ct_to_seg_match_list_NSCLC_Radiogenomics.csv")
+    newpath.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy(oldpath, newpath)
+    yield newpath.as_posix()
+    newpath.unlink()
+
+@pytest.fixture
+def lung4DMetadataPath():
+    """Stage the 4D-Lung ct_to_seg_match_list csv under tests/4D-Lung/procdata for a test.
+
+    Copies the file from tests/output, yields the path of the copy as a POSIX-style
+    string, and deletes the copy after the test that used the fixture has finished.
+    """
+    oldpath = Path("tests/output/ct_to_seg_match_list_4D-Lung.csv")
+    newpath = Path("tests/4D-Lung/procdata/ct_to_seg_match_list_4D-Lung.csv")
+    # Make sure the destination directory exists before copying into it
+    newpath.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy(oldpath, newpath)
+    yield newpath.as_posix()
+    # Teardown: remove the staged copy
+    newpath.unlink()
 
 
 def test_singleRadiomicFeatureExtraction_SEG(nsclcCTImage, nsclcSEGImage, pyradiomicsParamFilePath):
@@ -108,11 +124,21 @@ def test_radiomicFeatureExtraction(nsclcMetadataPath):
         "Volume feature is incorrect"
 
 
-def test_radiomicFeatureExtraction_output(nsclcMetadataPath):
-    """Test output creation from radiomic feature extraction"""
+def test_NSCLC_radiomicFeatureExtraction_output(nsclcMetadataPath):
+    """Test output creation from radiomic feature extraction for SEG dataset"""
     actual = radiomicFeatureExtraction(nsclcMetadataPath,
                                        imageDirPath = "tests/",
                                        roiNames = None,
-                                       outputDirPath = "tests/output/")
-    expected_path = "tests/output/features/radiomicfeatures_original_NSCLC_Radiogenomics.csv"
+                                       outputDirPath = "tests/NSCLC_Radiogenomics/results/")
+    expected_path = "tests/NSCLC_Radiogenomics/results/features/radiomicfeatures_original_NSCLC_Radiogenomics.csv"
+    assert os.path.exists(expected_path)
+
+
+def test_4DLung_radiomicFeatureExtraction_output(lung4DMetadataPath):
+    """Test output creation from radiomic feature extraction for RTSTRUCT dataset"""
+    actual = radiomicFeatureExtraction(lung4DMetadataPath,
+                                       imageDirPath = "tests/",
+                                       roiNames = "Tumor_c40",
+                                       outputDirPath = "tests/4D-Lung/results/")
+    expected_path = "tests/4D-Lung/results/features/radiomicfeatures_original_4D-Lung.csv"
     assert os.path.exists(expected_path)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		patient_ID,study_CT,study_description_CT,series_CT,series_description_CT,subseries_CT,modality_CT,instances_CT,instance_uid_CT,reference_ct_CT,reference_rs_CT,reference_pl_CT,reference_frame_CT,folder_CT,orientation_CT,orientation_type_CT,MR_repetition_time_CT,MR_echo_time_CT,MR_scan_sequence_CT,MR_magnetic_field_strength_CT,MR_imaged_nucleus_CT,file_path_CT,series_seg,subseries_seg,modality_seg,instances_seg,instance_uid_seg,reference_ct_seg,reference_rs_seg,reference_pl_seg,reference_frame_seg,folder_seg,orientation_seg,orientation_type_seg,MR_repetition_time_seg,MR_echo_time_seg,MR_scan_sequence_seg,MR_magnetic_field_strength_seg,MR_imaged_nucleus_seg,file_path_seg,edge_type
		113_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.324605948863389564556891313296,p4,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,"P4^P113^S303^I10349, Gated, 40.0%B",default,CT,99,1.3.6.1.4.1.14519.5.2.1.6834.5010.249506064276270740866733345688,,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543,"[1, 0, 0, 0, 1, 0]",,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543/1-81.dcm,2.25.186899387610254289948150314209581209847.35,default,RTSTRUCT,1,1.3.6.1.4.1.14519.5.2.1.6834.5010.815153834456695039602326691312,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,,,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,2