Skip to content

Commit

Permalink
feat: add io/loaders module (#83)
Browse files Browse the repository at this point in the history
Includes features, general, and images specific loading functions


- **New Features**
- Introduced a new module for loading various data types in the READII
pipeline.
- Added functions for loading imaging feature sets, dataset
configurations, and data files into DataFrames.
  • Loading branch information
strixy16 authored Dec 13, 2024
1 parent f3cbd63 commit 81fbd78
Show file tree
Hide file tree
Showing 12 changed files with 409 additions and 45 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ dmypy.json

# Test outputs
tests/output/*
tests/*/procdata/*
tests/*/results/*

# pixi environments
.pixi
Expand Down
56 changes: 28 additions & 28 deletions pixi.lock

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions src/readii/io/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Module for loading different data types for the READII pipeline."""

from .features import loadFeatureFilesFromImageTypes
from .general import loadFileToDataFrame, loadImageDatasetConfig
from .images import getImageTypesFromDirectory

__all__ = [
"loadFeatureFilesFromImageTypes",
"loadFileToDataFrame",
"loadImageDatasetConfig",
"getImageTypesFromDirectory"
]
85 changes: 85 additions & 0 deletions src/readii/io/loaders/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
import pandas as pd

from typing import Optional, Dict

from readii.io.loaders.general import loadFileToDataFrame

from readii.utils import logger


def loadFeatureFilesFromImageTypes(extracted_feature_dir:str,
image_types:list,
drop_labels:Optional[bool]=True,
labels_to_drop:Optional[list]=None)->Dict[str,pd.DataFrame]:
"""Function to load in all the extracted imaging feature sets from a directory and return them as a dictionary of dataframes.
Parameters
----------
extracted_feature_dir : str
Path to the directory containing the extracted feature csv files
image_types : list, optional
List of image types to load in. The default is ['original'].
drop_labels : bool, optional
Whether to drop the labels from the dataframes. Use when loading labelled data from data_setup_for_modeling.ipynb. The default is True.
labels_to_drop : list, optional
List of labels to drop from the dataframes. The default is ["patient_ID","survival_time_in_years","survival_event_binary"] based on code
in data_setup_for_modeling.ipynb.
Returns
-------
feature_sets : dict
Dictionary of dataframes containing the extracted radiomics features.
"""
# Set default labels to drop if not specified
if labels_to_drop is None:
labels_to_drop = ["patient_ID","survival_time_in_years","survival_event_binary"]

# Initialize dictionary to store the feature sets
feature_sets = {}

# Check if the passed in extracted feature directory exists
if not os.path.isdir(extracted_feature_dir):
raise FileNotFoundError(f"Extracted feature directory {extracted_feature_dir} does not exist.")

feature_file_list = os.listdir(extracted_feature_dir)

# Loop through all the files in the directory
for image_type in image_types:
try:
# Extract the image type feature csv file from the feature directory
matching_files = [file for file in feature_file_list if (image_type in file) and (file.endswith(".csv"))]
if matching_files:
image_type_feature_file = matching_files[0]
# Remove the image type file from the list of feature files
feature_file_list.remove(image_type_feature_file)
except IndexError as e:
logger.warning(f"No {image_type} feature csv files found in {extracted_feature_dir}")
# Skip to the next image type
continue


# Get the full path to the feature file
feature_file_path = os.path.join(extracted_feature_dir, image_type_feature_file)

# Load the feature data into a pandas dataframe
raw_feature_data = loadFileToDataFrame(feature_file_path)

try:
# Drop the labels from the dataframe if specified
if drop_labels:
# Data is now only extracted features
raw_feature_data.drop(labels_to_drop, axis=1, inplace=True)
except KeyError as e:
logger.warning(f"{feature_file_path} does not have the labels {labels_to_drop} to drop.")
# Skip to the next image type
continue

# Save the dataframe to the feature_sets dictionary
feature_sets[image_type] = raw_feature_data

# After processing all image types, check if any feature sets were loaded
if not feature_sets:
raise ValueError(f"No valid feature sets were loaded from {extracted_feature_dir}")

return feature_sets
99 changes: 99 additions & 0 deletions src/readii/io/loaders/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from pathlib import Path

import pandas as pd
import yaml


class ConfigError(Exception):
"""Base class for errors in the config module."""

pass

class DataFrameLoadError(Exception):
"""Custom exception for DataFrame loading errors."""

pass

def loadImageDatasetConfig(dataset_name: str, config_dir_path: str | Path) -> dict:
"""Load the configuration file for a given dataset.
Expects the configuration file to be named <dataset_name>.yaml.
Parameters
----------
dataset_name : str
Name of the dataset to load the configuration file for.
config_dir_path : str or pathlib.Path
Path to the directory containing the configuration files.
Returns
-------
dict
Dictionary containing the configuration settings for the dataset.
Examples
--------
>>> config = loadImageDatasetConfig("NSCLC_Radiogenomics", "config")
"""
config_dir_path = Path(config_dir_path)
config_file_path = config_dir_path / f"{dataset_name}.yaml"

if not config_file_path.exists():
msg = f"Config file {config_file_path} does not exist."
raise FileNotFoundError(msg)

try:
with config_file_path.open("r") as f:
config = yaml.safe_load(f)
except yaml.YAMLError as ye:
raise ConfigError("Invalid YAML in config file") from ye

if not config:
raise ConfigError("Config file is empty or invalid")

return config



def loadFileToDataFrame(file_path: str | Path) -> pd.DataFrame:
"""Load data from a csv or xlsx file into a pandas dataframe.
Parameters
----------
file_path : str or pathlib.Path
Path to the data file.
Returns
-------
pd.DataFrame
Dataframe containing the data from the file.
"""
file_path = Path(file_path)
if not file_path:
raise ValueError("File is empty")

if not file_path.exists():
msg = f"File {file_path} does not exist"
raise FileNotFoundError(msg)

# Get the file extension
file_extension = file_path.suffix

try:
if file_extension == '.xlsx':
df = pd.read_excel(file_path)
elif file_extension == '.csv':
df = pd.read_csv(file_path)
else:
msg = f"Unsupported file format {file_extension}. Please provide a .csv or .xlsx file."
raise ValueError(msg)

except pd.errors.EmptyDataError as e:
raise DataFrameLoadError("File is empty") from e

except (pd.errors.ParserError, ValueError) as e:
raise DataFrameLoadError("Error parsing file") from e

if df.empty:
raise DataFrameLoadError("Dataframe is empty")
return df
52 changes: 52 additions & 0 deletions src/readii/io/loaders/images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from pathlib import Path
from typing import Union

def getImageTypesFromDirectory(raw_data_dir:Union[Path|str],
feature_file_prefix:str = "",
feature_file_suffix:str = ".csv"):
""" Function to get a list of image types from a directory containing image feature files.
Parameters
----------
raw_data_dir : str
Path to the directory containing the image feature files.
feature_file_prefix : str, optional
Prefix to remove from the feature file name. The default is "".
feature_file_suffix : str, optional
Suffix to remove from the feature file name. The default is ".csv".
Returns
-------
list
List of image types from the image feature files.
"""
# Check if raw_data_dir is a string or a Path object, convert to Path object if it is a string
if isinstance(raw_data_dir, str):
raw_data_dir = Path(raw_data_dir)

# Check if the directory exists
if not raw_data_dir.exists():
raise FileNotFoundError(f"Directory {raw_data_dir} does not exist.")

# Check if the directory is a directory
if not raw_data_dir.is_dir():
raise NotADirectoryError(f"Path {raw_data_dir} is not a directory.")

# Check that directory contains files with the specified prefix and suffix
if not any(raw_data_dir.glob(f"{feature_file_prefix}*{feature_file_suffix}")):
raise FileNotFoundError(f"No files with prefix {feature_file_prefix} and suffix {feature_file_suffix} found in directory {raw_data_dir}.")

# Initialize an empty list to store the image types
image_types = []

# Get list of file banes with the specified prefix and suffix in the directory
for file in raw_data_dir.glob(f"{feature_file_prefix}*{feature_file_suffix}"):
file_name = file.name

# Remove the prefix and suffix from the file name
image_type = file_name.removeprefix(feature_file_prefix).removesuffix(feature_file_suffix)

# Add the image type to the list
image_types.append(image_type)

return image_types
20 changes: 20 additions & 0 deletions tests/4D-Lung/4D-Lung.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Config file for 4D-Lung for READII
dataset_name: 4D-Lung

### CLINICAL VARIABLE INFORMATION ###
# Event values should be in the order [Alive_value, Dead_value]
outcome_variables:
time_label:
event_label:
convert_to_years: False
event_value_mapping:

exclusion_variables:

train_test_split:
split: False
split_variable:
impute:


image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]
20 changes: 20 additions & 0 deletions tests/NSCLC_Radiogenomics/NSCLC_Radiogenomics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Config file for NSCLC_Radiogenomics for READII
dataset_name: NSCLC_Radiogenomics

### CLINICAL VARIABLE INFORMATION ###
# Event values should be in the order [Alive_value, Dead_value]
outcome_variables:
time_label: ""
event_label: "Survival Status"
convert_to_years: False
event_value_mapping: {'Alive': 0, 'Dead': 1}

exclusion_variables:

train_test_split:
split: False
split_variable:
impute:


image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]
29 changes: 29 additions & 0 deletions tests/io/loaders/test_general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from readii.io.loaders.general import loadImageDatasetConfig
import pytest

@pytest.fixture
def nsclcConfigDirPath():
return "tests/NSCLC_Radiogenomics"

@pytest.fixture
def lung4DConfigDirPath():
return "tests/4D-Lung"

@pytest.fixture
def expected_image_types():
return ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"]


def test_NSCLC_loadImageDatasetConfig(nsclcConfigDirPath, expected_image_types):
config = loadImageDatasetConfig("NSCLC_Radiogenomics", nsclcConfigDirPath)
assert config["dataset_name"] == "NSCLC_Radiogenomics"
assert config["image_types"] == expected_image_types
assert config["outcome_variables"]["event_label"] == "Survival Status"
assert config["outcome_variables"]["event_value_mapping"] == {'Alive': 0, 'Dead': 1}

def test_lung4D_loadImageDatasetConfig(lung4DConfigDirPath, expected_image_types):
config = loadImageDatasetConfig("4D-Lung", lung4DConfigDirPath)
assert config["dataset_name"] == "4D-Lung"
assert config["image_types"] == expected_image_types
assert config["outcome_variables"]["event_label"] is None
assert config["outcome_variables"]["event_value_mapping"] is None
2 changes: 2 additions & 0 deletions tests/output/ct_to_seg_match_list_4D-Lung.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
patient_ID,study_CT,study_description_CT,series_CT,series_description_CT,subseries_CT,modality_CT,instances_CT,instance_uid_CT,reference_ct_CT,reference_rs_CT,reference_pl_CT,reference_frame_CT,folder_CT,orientation_CT,orientation_type_CT,MR_repetition_time_CT,MR_echo_time_CT,MR_scan_sequence_CT,MR_magnetic_field_strength_CT,MR_imaged_nucleus_CT,file_path_CT,series_seg,subseries_seg,modality_seg,instances_seg,instance_uid_seg,reference_ct_seg,reference_rs_seg,reference_pl_seg,reference_frame_seg,folder_seg,orientation_seg,orientation_type_seg,MR_repetition_time_seg,MR_echo_time_seg,MR_scan_sequence_seg,MR_magnetic_field_strength_seg,MR_imaged_nucleus_seg,file_path_seg,edge_type
113_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.324605948863389564556891313296,p4,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,"P4^P113^S303^I10349, Gated, 40.0%B",default,CT,99,1.3.6.1.4.1.14519.5.2.1.6834.5010.249506064276270740866733345688,,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543,"[1, 0, 0, 0, 1, 0]",,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543/1-81.dcm,2.25.186899387610254289948150314209581209847.35,default,RTSTRUCT,1,1.3.6.1.4.1.14519.5.2.1.6834.5010.815153834456695039602326691312,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,,,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,2
36 changes: 31 additions & 5 deletions tests/test_feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import collections
import pandas as pd
import os
import shutil
from pathlib import Path

@pytest.fixture
def nsclcCTImage():
Expand Down Expand Up @@ -44,7 +46,21 @@ def pyradiomicsParamFilePath():

@pytest.fixture
def nsclcMetadataPath():
return "tests/output/ct_to_seg_match_list_NSCLC_Radiogenomics.csv"
oldpath = Path("tests/output/ct_to_seg_match_list_NSCLC_Radiogenomics.csv")
newpath = Path("tests/NSCLC_Radiogenomics/procdata/ct_to_seg_match_list_NSCLC_Radiogenomics.csv")
newpath.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(oldpath, newpath)
yield newpath.as_posix()
newpath.unlink()

@pytest.fixture
def lung4DMetadataPath():
oldpath = Path("tests/output/ct_to_seg_match_list_4D-Lung.csv")
newpath = Path("tests/4D-Lung/procdata/ct_to_seg_match_list_4D-Lung.csv")
newpath.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(oldpath, newpath)
yield newpath.as_posix()
newpath.unlink()


def test_singleRadiomicFeatureExtraction_SEG(nsclcCTImage, nsclcSEGImage, pyradiomicsParamFilePath):
Expand Down Expand Up @@ -108,11 +124,21 @@ def test_radiomicFeatureExtraction(nsclcMetadataPath):
"Volume feature is incorrect"


def test_radiomicFeatureExtraction_output(nsclcMetadataPath):
"""Test output creation from radiomic feature extraction"""
def test_NSCLC_radiomicFeatureExtraction_output(nsclcMetadataPath):
"""Test output creation from radiomic feature extraction for SEG dataset"""
actual = radiomicFeatureExtraction(nsclcMetadataPath,
imageDirPath = "tests/",
roiNames = None,
outputDirPath = "tests/output/")
expected_path = "tests/output/features/radiomicfeatures_original_NSCLC_Radiogenomics.csv"
outputDirPath = "tests/NSCLC_Radiogenomics/results/")
expected_path = "tests/NSCLC_Radiogenomics/results/features/radiomicfeatures_original_NSCLC_Radiogenomics.csv"
assert os.path.exists(expected_path)


def test_4DLung_radiomicFeatureExtraction_output(lung4DMetadataPath):
"""Test output creation from radiomic feature extraction for RTSTRUCT dataset"""
actual = radiomicFeatureExtraction(lung4DMetadataPath,
imageDirPath = "tests/",
roiNames = "Tumor_c40",
outputDirPath = "tests/4D-Lung/results/")
expected_path = "tests/4D-Lung/results/features/radiomicfeatures_original_4D-Lung.csv"
assert os.path.exists(expected_path)
Loading

0 comments on commit 81fbd78

Please sign in to comment.