-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Includes feature-, general-, and image-specific loading functions - **New Features** - Introduced a new module for loading various data types in the READII pipeline. - Added functions for loading imaging feature sets, dataset configurations, and data files into DataFrames.
- Loading branch information
Showing
12 changed files
with
409 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
"""Module for loading different data types for the READII pipeline.""" | ||
|
||
from .features import loadFeatureFilesFromImageTypes | ||
from .general import loadFileToDataFrame, loadImageDatasetConfig | ||
from .images import getImageTypesFromDirectory | ||
|
||
# Names re-exported as the public interface of this loaders package.
__all__ = [
    "loadFeatureFilesFromImageTypes",
    "loadFileToDataFrame",
    "loadImageDatasetConfig",
    "getImageTypesFromDirectory",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import os | ||
import pandas as pd | ||
|
||
from typing import Optional, Dict | ||
|
||
from readii.io.loaders.general import loadFileToDataFrame | ||
|
||
from readii.utils import logger | ||
|
||
|
||
def loadFeatureFilesFromImageTypes(extracted_feature_dir:str,
                                   image_types:list,
                                   drop_labels:Optional[bool]=True,
                                   labels_to_drop:Optional[list]=None)->Dict[str,pd.DataFrame]:
    """Load the extracted imaging feature sets for the requested image types.

    For every entry of ``image_types`` the first ``.csv`` file in
    ``extracted_feature_dir`` whose name contains that image type is loaded with
    ``loadFileToDataFrame`` and stored in the returned dictionary under the
    image type. A file is used for at most one image type.

    Parameters
    ----------
    extracted_feature_dir : str
        Path to the directory containing the extracted feature csv files.
    image_types : list
        List of image types to load in.
    drop_labels : bool, optional
        Whether to drop the label columns from the dataframes. Use when loading
        labelled data from data_setup_for_modeling.ipynb. The default is True.
    labels_to_drop : list, optional
        List of label columns to drop from the dataframes. The default is
        ["patient_ID","survival_time_in_years","survival_event_binary"] based on
        code in data_setup_for_modeling.ipynb.

    Returns
    -------
    feature_sets : dict
        Dictionary mapping each loaded image type to its feature dataframe.

    Raises
    ------
    FileNotFoundError
        If ``extracted_feature_dir`` is not an existing directory.
    ValueError
        If no feature set could be loaded for any of the image types.

    Notes
    -----
    Image types without a matching csv file, and files that lack one of the
    columns in ``labels_to_drop`` when ``drop_labels`` is set, are skipped with
    a warning. Exceptions raised by ``loadFileToDataFrame`` are not handled
    here and propagate to the caller.
    """
    # Set default labels to drop if not specified
    if labels_to_drop is None:
        labels_to_drop = ["patient_ID","survival_time_in_years","survival_event_binary"]

    # Initialize dictionary to store the feature sets
    feature_sets = {}

    # Check if the passed in extracted feature directory exists
    if not os.path.isdir(extracted_feature_dir):
        raise FileNotFoundError(f"Extracted feature directory {extracted_feature_dir} does not exist.")

    # Sorted so the file picked for an image type does not depend on the
    # arbitrary order in which the filesystem lists the directory.
    feature_file_list = sorted(os.listdir(extracted_feature_dir))

    for image_type in image_types:
        # Candidate csv files whose name contains the image type
        matching_files = [file for file in feature_file_list if (image_type in file) and (file.endswith(".csv"))]

        if not matching_files:
            # Without this guard the file selected for the previous image type
            # would be silently reused (or the name would be undefined).
            logger.warning(f"No {image_type} feature csv files found in {extracted_feature_dir}")
            continue

        image_type_feature_file = matching_files[0]
        # Remove the file from the pool so it cannot be matched by a later image type
        feature_file_list.remove(image_type_feature_file)

        # Get the full path to the feature file
        feature_file_path = os.path.join(extracted_feature_dir, image_type_feature_file)

        # Load the feature data into a pandas dataframe
        raw_feature_data = loadFileToDataFrame(feature_file_path)

        if drop_labels:
            try:
                # Data is now only extracted features
                raw_feature_data.drop(labels_to_drop, axis=1, inplace=True)
            except KeyError:
                logger.warning(f"{feature_file_path} does not have the labels {labels_to_drop} to drop.")
                # Skip to the next image type
                continue

        # Save the dataframe to the feature_sets dictionary
        feature_sets[image_type] = raw_feature_data

    # After processing all image types, check if any feature sets were loaded
    if not feature_sets:
        raise ValueError(f"No valid feature sets were loaded from {extracted_feature_dir}")

    return feature_sets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
import yaml | ||
|
||
|
||
class ConfigError(Exception):
    """Raised when a dataset configuration file holds invalid YAML or no content."""
|
||
class DataFrameLoadError(Exception):
    """Raised when a data file cannot be parsed into a non-empty DataFrame."""
|
||
def loadImageDatasetConfig(dataset_name: str, config_dir_path: str | Path) -> dict: | ||
"""Load the configuration file for a given dataset. | ||
Expects the configuration file to be named <dataset_name>.yaml. | ||
Parameters | ||
---------- | ||
dataset_name : str | ||
Name of the dataset to load the configuration file for. | ||
config_dir_path : str or pathlib.Path | ||
Path to the directory containing the configuration files. | ||
Returns | ||
------- | ||
dict | ||
Dictionary containing the configuration settings for the dataset. | ||
Examples | ||
-------- | ||
>>> config = loadImageDatasetConfig("NSCLC_Radiogenomics", "config") | ||
""" | ||
config_dir_path = Path(config_dir_path) | ||
config_file_path = config_dir_path / f"{dataset_name}.yaml" | ||
|
||
if not config_file_path.exists(): | ||
msg = f"Config file {config_file_path} does not exist." | ||
raise FileNotFoundError(msg) | ||
|
||
try: | ||
with config_file_path.open("r") as f: | ||
config = yaml.safe_load(f) | ||
except yaml.YAMLError as ye: | ||
raise ConfigError("Invalid YAML in config file") from ye | ||
|
||
if not config: | ||
raise ConfigError("Config file is empty or invalid") | ||
|
||
return config | ||
|
||
|
||
|
||
def loadFileToDataFrame(file_path: str | Path) -> pd.DataFrame: | ||
"""Load data from a csv or xlsx file into a pandas dataframe. | ||
Parameters | ||
---------- | ||
file_path : str or pathlib.Path | ||
Path to the data file. | ||
Returns | ||
------- | ||
pd.DataFrame | ||
Dataframe containing the data from the file. | ||
""" | ||
file_path = Path(file_path) | ||
if not file_path: | ||
raise ValueError("File is empty") | ||
|
||
if not file_path.exists(): | ||
msg = f"File {file_path} does not exist" | ||
raise FileNotFoundError(msg) | ||
|
||
# Get the file extension | ||
file_extension = file_path.suffix | ||
|
||
try: | ||
if file_extension == '.xlsx': | ||
df = pd.read_excel(file_path) | ||
elif file_extension == '.csv': | ||
df = pd.read_csv(file_path) | ||
else: | ||
msg = f"Unsupported file format {file_extension}. Please provide a .csv or .xlsx file." | ||
raise ValueError(msg) | ||
|
||
except pd.errors.EmptyDataError as e: | ||
raise DataFrameLoadError("File is empty") from e | ||
|
||
except (pd.errors.ParserError, ValueError) as e: | ||
raise DataFrameLoadError("Error parsing file") from e | ||
|
||
if df.empty: | ||
raise DataFrameLoadError("Dataframe is empty") | ||
return df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from pathlib import Path | ||
from typing import Union | ||
|
||
def getImageTypesFromDirectory(raw_data_dir:Union[Path|str], | ||
feature_file_prefix:str = "", | ||
feature_file_suffix:str = ".csv"): | ||
""" Function to get a list of image types from a directory containing image feature files. | ||
Parameters | ||
---------- | ||
raw_data_dir : str | ||
Path to the directory containing the image feature files. | ||
feature_file_prefix : str, optional | ||
Prefix to remove from the feature file name. The default is "". | ||
feature_file_suffix : str, optional | ||
Suffix to remove from the feature file name. The default is ".csv". | ||
Returns | ||
------- | ||
list | ||
List of image types from the image feature files. | ||
""" | ||
# Check if raw_data_dir is a string or a Path object, convert to Path object if it is a string | ||
if isinstance(raw_data_dir, str): | ||
raw_data_dir = Path(raw_data_dir) | ||
|
||
# Check if the directory exists | ||
if not raw_data_dir.exists(): | ||
raise FileNotFoundError(f"Directory {raw_data_dir} does not exist.") | ||
|
||
# Check if the directory is a directory | ||
if not raw_data_dir.is_dir(): | ||
raise NotADirectoryError(f"Path {raw_data_dir} is not a directory.") | ||
|
||
# Check that directory contains files with the specified prefix and suffix | ||
if not any(raw_data_dir.glob(f"{feature_file_prefix}*{feature_file_suffix}")): | ||
raise FileNotFoundError(f"No files with prefix {feature_file_prefix} and suffix {feature_file_suffix} found in directory {raw_data_dir}.") | ||
|
||
# Initialize an empty list to store the image types | ||
image_types = [] | ||
|
||
# Get list of file banes with the specified prefix and suffix in the directory | ||
for file in raw_data_dir.glob(f"{feature_file_prefix}*{feature_file_suffix}"): | ||
file_name = file.name | ||
|
||
# Remove the prefix and suffix from the file name | ||
image_type = file_name.removeprefix(feature_file_prefix).removesuffix(feature_file_suffix) | ||
|
||
# Add the image type to the list | ||
image_types.append(image_type) | ||
|
||
return image_types |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Config file for 4D-Lung for READII | ||
dataset_name: 4D-Lung | ||
|
||
### CLINICAL VARIABLE INFORMATION ### | ||
# Event values should be in the order [Alive_value, Dead_value] | ||
outcome_variables: | ||
time_label: | ||
event_label: | ||
convert_to_years: False | ||
event_value_mapping: | ||
|
||
exclusion_variables: | ||
|
||
train_test_split: | ||
split: False | ||
split_variable: | ||
impute: | ||
|
||
|
||
image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Config file for NSCLC_Radiogenomics for READII | ||
dataset_name: NSCLC_Radiogenomics | ||
|
||
### CLINICAL VARIABLE INFORMATION ### | ||
# Event values should be in the order [Alive_value, Dead_value] | ||
outcome_variables: | ||
time_label: "" | ||
event_label: "Survival Status" | ||
convert_to_years: False | ||
event_value_mapping: {'Alive': 0, 'Dead': 1} | ||
|
||
exclusion_variables: | ||
|
||
train_test_split: | ||
split: False | ||
split_variable: | ||
impute: | ||
|
||
|
||
image_types: ["original", "shuffled_full","shuffled_roi","shuffled_non_roi","randomized_sampled_full","randomized_sampled_roi","randomized_sampled_non_roi"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from readii.io.loaders.general import loadImageDatasetConfig | ||
import pytest | ||
|
||
@pytest.fixture
def nsclcConfigDirPath():
    """Directory holding the NSCLC_Radiogenomics test configuration."""
    config_dir = "tests/NSCLC_Radiogenomics"
    return config_dir
|
||
@pytest.fixture
def lung4DConfigDirPath():
    """Directory holding the 4D-Lung test configuration."""
    config_dir = "tests/4D-Lung"
    return config_dir
|
||
@pytest.fixture
def expected_image_types():
    """Image types that both dataset configuration files are expected to list."""
    return [
        "original",
        "shuffled_full",
        "shuffled_roi",
        "shuffled_non_roi",
        "randomized_sampled_full",
        "randomized_sampled_roi",
        "randomized_sampled_non_roi",
    ]
|
||
|
||
def test_NSCLC_loadImageDatasetConfig(nsclcConfigDirPath, expected_image_types):
    """The NSCLC_Radiogenomics config loads with its outcome settings filled in."""
    config = loadImageDatasetConfig("NSCLC_Radiogenomics", nsclcConfigDirPath)

    assert config["dataset_name"] == "NSCLC_Radiogenomics"
    assert config["image_types"] == expected_image_types

    outcome_variables = config["outcome_variables"]
    assert outcome_variables["event_label"] == "Survival Status"
    assert outcome_variables["event_value_mapping"] == {'Alive': 0, 'Dead': 1}
|
||
def test_lung4D_loadImageDatasetConfig(lung4DConfigDirPath, expected_image_types):
    """The 4D-Lung config loads; its blank outcome settings come back as None."""
    config = loadImageDatasetConfig("4D-Lung", lung4DConfigDirPath)

    assert config["dataset_name"] == "4D-Lung"
    assert config["image_types"] == expected_image_types

    outcome_variables = config["outcome_variables"]
    assert outcome_variables["event_label"] is None
    assert outcome_variables["event_value_mapping"] is None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
patient_ID,study_CT,study_description_CT,series_CT,series_description_CT,subseries_CT,modality_CT,instances_CT,instance_uid_CT,reference_ct_CT,reference_rs_CT,reference_pl_CT,reference_frame_CT,folder_CT,orientation_CT,orientation_type_CT,MR_repetition_time_CT,MR_echo_time_CT,MR_scan_sequence_CT,MR_magnetic_field_strength_CT,MR_imaged_nucleus_CT,file_path_CT,series_seg,subseries_seg,modality_seg,instances_seg,instance_uid_seg,reference_ct_seg,reference_rs_seg,reference_pl_seg,reference_frame_seg,folder_seg,orientation_seg,orientation_type_seg,MR_repetition_time_seg,MR_echo_time_seg,MR_scan_sequence_seg,MR_magnetic_field_strength_seg,MR_imaged_nucleus_seg,file_path_seg,edge_type | ||
113_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.324605948863389564556891313296,p4,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,"P4^P113^S303^I10349, Gated, 40.0%B",default,CT,99,1.3.6.1.4.1.14519.5.2.1.6834.5010.249506064276270740866733345688,,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543,"[1, 0, 0, 0, 1, 0]",,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-29543/1-81.dcm,2.25.186899387610254289948150314209581209847.35,default,RTSTRUCT,1,1.3.6.1.4.1.14519.5.2.1.6834.5010.815153834456695039602326691312,1.3.6.1.4.1.14519.5.2.1.6834.5010.339023390306606021995936229543,,,1.3.6.1.4.1.14519.5.2.1.6834.5010.107174034240688216982546597713,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,,,,,,,,4D-Lung/113_HM10395/11-26-1999-NA-p4-13296/1.000000-P4P113S303I10349 Gated 40.0B-47.35/1-1.dcm,2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.