diff --git a/Jenkinsfile b/Jenkinsfile index 64a16385b..0a73863e4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,7 +14,9 @@ pipeline { environment { PATH = "$HOME/miniconda/bin:$PATH" } - when { changeset "environment.yml" } + when { + changeset "requirements.txt" + } steps { echo 'Building Conda environment... ${BRANCH_NAME}' sh 'ls' @@ -26,7 +28,9 @@ pipeline { environment { PATH = "$HOME/miniconda3/bin:$PATH" } - when { changeset "environment.yml" } + when { + changeset "requirements.txt" + } steps { echo 'Building Conda environment...' + 'env.BRANCH_NAME' sh 'ls' @@ -114,6 +118,7 @@ pipeline { cd test ln -s /mnt/data/ci/data_ci_linux ./data taskset -c 0-21 pytest \ + --junitxml=./test-reports/instantation_linux.xml \ --verbose \ --working_directory=$WORK_DIR_LINUX \ --disable-warnings \ @@ -124,6 +129,11 @@ pipeline { conda deactivate ''' } + post { + always { + junit 'test/test-reports/*.xml' + } + } } stage('Instantiate Mac') { agent { label 'macos' } @@ -143,11 +153,20 @@ pipeline { module load clinica.all cd test ln -s /Volumes/data/data_ci ./data - pytest --verbose --disable-warnings -k 'test_instantiate' + pytest \ + --verbose \ + --junitxml=./test-reports/instantation_mac.xml \ + --disable-warnings \ + -k 'test_instantiate' module purge conda deactivate ''' } + post { + always { + junit 'test/test-reports/*.xml' + } + } } } } diff --git a/clinica/VERSION b/clinica/VERSION index 42045acae..c2c0004f0 100644 --- a/clinica/VERSION +++ b/clinica/VERSION @@ -1 +1 @@ -0.3.4 +0.3.5 diff --git a/clinica/cmdline.py b/clinica/cmdline.py index fc914cfd1..ea6562f84 100644 --- a/clinica/cmdline.py +++ b/clinica/cmdline.py @@ -200,6 +200,7 @@ def execute(): from clinica.pipelines.t1_volume_existing_template.t1_volume_existing_template_cli import T1VolumeExistingTemplateCLI from clinica.pipelines.t1_volume_parcellation.t1_volume_parcellation_cli import T1VolumeParcellationCLI from clinica.pipelines.t1_linear.t1_linear_cli import T1LinearCLI + from 
clinica.pipelines.deeplearning_prepare_data.deeplearning_prepare_data_cli import DeepLearningPrepareDataCLI from clinica.pipelines.dwi_preprocessing_using_phasediff_fieldmap.dwi_preprocessing_using_phasediff_fieldmap_cli import DwiPreprocessingUsingPhaseDiffFieldmapCli from clinica.pipelines.dwi_preprocessing_using_t1.dwi_preprocessing_using_t1_cli import DwiPreprocessingUsingT1Cli from clinica.pipelines.dwi_dti.dwi_dti_cli import DwiDtiCli @@ -230,6 +231,7 @@ def execute(): PETVolumeCLI(), PetSurfaceCLI(), # PetSurfaceLongitudinalCLI(), + DeepLearningPrepareDataCLI(), SpatialSVMCLI(), StatisticsSurfaceCLI(), StatisticsVolumeCLI(), diff --git a/clinica/iotools/converters/nifd_to_bids/nifd_to_bids.py b/clinica/iotools/converters/nifd_to_bids/nifd_to_bids.py index 779bfdda8..b42407be5 100644 --- a/clinica/iotools/converters/nifd_to_bids/nifd_to_bids.py +++ b/clinica/iotools/converters/nifd_to_bids/nifd_to_bids.py @@ -16,7 +16,6 @@ def convert_images(path_to_dataset, bids_dir, path_to_clinical): # Conversion of the entire dataset in BIDS - '''Scans available files in the path_to_dataset, identifies the patients that have images described by the json file, converts the image with the highest quality for each category''' diff --git a/clinica/iotools/utils/data_handling.py b/clinica/iotools/utils/data_handling.py index 549495a84..0fec26dd8 100644 --- a/clinica/iotools/utils/data_handling.py +++ b/clinica/iotools/utils/data_handling.py @@ -545,7 +545,6 @@ def create_subs_sess_list(input_dir, output_dir, def center_nifti_origin(input_image, output_image): - """ Put the origin of the coordinate system at the center of the image diff --git a/clinica/pipelines/deeplearning_prepare_data/__init__.py b/clinica/pipelines/deeplearning_prepare_data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_cli.py b/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_cli.py new file 
mode 100644 index 000000000..6bc85961b --- /dev/null +++ b/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_cli.py @@ -0,0 +1,117 @@ +# coding: utf8 + + +import clinica.engine as ce +from colorama import Fore + +class DeepLearningPrepareDataCLI(ce.CmdParser): + + def define_name(self): + """Define the sub-command name to run this pipeline.""" + self._name = 'deeplearning-prepare-data' + + def define_description(self): + """Define a description of this pipeline.""" + self._description = ('Prepare data generated Clinica for PyTorch with Tensor extraction:\n' + 'http://clinica.run/doc/Pipelines/DeepLearning_PrepareData/') + + def define_options(self): + """Define the sub-command arguments.""" + from clinica.engine.cmdparser import PIPELINE_CATEGORIES + + # Clinica compulsory arguments (e.g. BIDS, CAPS, group_id...) + # Most of the time, you will want to read your pipeline inputs into + # a BIDS and/or CAPS directory. If your pipeline does not require BIDS input, + # simply remove the two lines involving the BIDS directory. + clinica_comp = self._args.add_argument_group(PIPELINE_CATEGORIES['CLINICA_COMPULSORY']) + clinica_comp.add_argument("caps_directory", + help='Path to the CAPS directory.') + clinica_comp.add_argument("extract_method", + help='''Format of the extracted features. Three options: + 'image' to convert to PyTorch tensor the complete 3D image, + 'patch' to extract 3D volumetric patches and + 'slice' to extract 2D slices from the image. 
+ By default the features are extracted from the cropped image.''', + choices=['image', 'slice', 'patch'], default='image' + ) + + optional = self._args.add_argument_group(PIPELINE_CATEGORIES['OPTIONAL']) + optional.add_argument('-uui', '--use_uncropped_image', + help='''Use the uncropped image instead of the + cropped image generated by t1-linear.''', + default=False, action="store_true" + ) + + optional_patch = self._args.add_argument_group( + "%sPipeline options if you chose ‘patch’ extraction%s" % (Fore.BLUE, Fore.RESET) + ) + optional_patch.add_argument( + '-ps', '--patch_size', + help='''Patch size (default: --patch_size 50).''', + type=int, default=50 + ) + optional_patch.add_argument( + '-ss', '--stride_size', + help='''Stride size (default: --stride_size 50).''', + type=int, default=50 + ) + + optional_slice = self._args.add_argument_group( + "%sPipeline options if you chose ‘slice’ extraction%s" % (Fore.BLUE, Fore.RESET) + ) + optional_slice.add_argument( + '-sd', '--slice_direction', + help='''Slice direction. Three options: + '0' -> Sagittal plane, + '1' -> Coronal plane or + '2' -> Axial plane + (default: sagittal plane i.e. --slice_direction 0)''', + type=int, default=0 + ) + optional_slice.add_argument( + '-sm', '--slice_mode', + help='''Slice mode. Two options: 'rgb' to save the slice in + three identical channels, ‘single’ to save the slice in a + single channel (default: --slice_mode rgb).''', + choices=['rgb', 'single'], default='rgb' + ) + + # Clinica standard arguments (e.g. --n_procs) + self.add_clinica_standard_arguments() + + def run_command(self, args): + """Run the pipeline with defined args.""" + from networkx import Graph + from .deeplearning_prepare_data_pipeline import DeepLearningPrepareData + from clinica.utils.ux import print_end_pipeline, print_crash_files_and_exit + + parameters = { + # Add your own pipeline parameters here to use them inside your + # pipeline. 
See the file `deeplearning_prepare_data_pipeline.py` to + # see an example of use. + 'extract_method': args.extract_method, + 'patch_size': args.patch_size, + 'stride_size': args.stride_size, + 'slice_direction': args.slice_direction, + 'slice_mode': args.slice_mode, + 'use_uncropped_image': args.use_uncropped_image, + } + + pipeline = DeepLearningPrepareData( + caps_directory=self.absolute_path(args.caps_directory), + tsv_file=self.absolute_path(args.subjects_sessions_tsv), + base_dir=self.absolute_path(args.working_directory), + parameters=parameters, + name=self.name + ) + + if args.n_procs: + exec_pipeline = pipeline.run(plugin='MultiProc', + plugin_args={'n_procs': args.n_procs}) + else: + exec_pipeline = pipeline.run() + + if isinstance(exec_pipeline, Graph): + print_end_pipeline(self.name, pipeline.base_dir, pipeline.base_dir_was_specified) + else: + print_crash_files_and_exit(args.logname, pipeline.base_dir) diff --git a/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_pipeline.py b/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_pipeline.py new file mode 100644 index 000000000..883df6217 --- /dev/null +++ b/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_pipeline.py @@ -0,0 +1,254 @@ +# -*- coding: utf-8 -*- + + +import clinica.pipelines.engine as cpe + + +# Use hash instead of parameters for iterables folder names +# Otherwise path will be too long and generate OSError +from nipype import config +cfg = dict(execution={'parameterize_dirs': False}) +config.update_config(cfg) + + +class DeepLearningPrepareData(cpe.Pipeline): + """Deeplearning prepare data - MRI in nifti format are transformed into + PyTorch tensors. The transformation is applied to: the whole volume, a + selection of 3D patches, or slices extracted from the 3D volume. 
By default + it uses the cropped version of the MRI (see option "--use_uncropper_image") + + + Returns: + A clinica pipeline object containing the Deeplearning prepare data pipeline. + + Raises: + + """ + + def check_custom_dependencies(self): + """Check dependencies that can not be listed in the `info.json` file.""" + pass + + def get_input_fields(self): + """Specify the list of possible inputs of this pipeline. + + Returns: + A list of (string) input fields name. + """ + + return ['input_nifti'] + + def get_output_fields(self): + """Specify the list of possible outputs of this pipeline. + + Returns: + A list of (string) output fields name. + """ + + return ['image_id'] # Fill here the list + + def build_input_node(self): + """Build and connect an input node to the pipeline.""" + import nipype.interfaces.utility as nutil + import nipype.pipeline.engine as npe + from clinica.utils.exceptions import ClinicaBIDSError, ClinicaException + from clinica.utils.stream import cprint + from clinica.utils.inputs import clinica_file_reader + from clinica.utils.input_files import T1W_LINEAR + from clinica.utils.input_files import T1W_LINEAR_CROPPED + from clinica.utils.ux import print_images_to_process + + if self.parameters.get('use_uncropped_image'): + FILE_TYPE = T1W_LINEAR + else: + FILE_TYPE = T1W_LINEAR_CROPPED + + # T1w_Linear file: + try: + t1w_files = clinica_file_reader(self.subjects, + self.sessions, + self.caps_directory, + FILE_TYPE) + except ClinicaException as e: + err = 'Clinica faced error(s) while trying to read files in your CAPS directory.\n' + str(e) + raise ClinicaBIDSError(err) + + if len(self.subjects): + print_images_to_process(self.subjects, self.sessions) + cprint('The pipeline will last approximately 30 seconds per image.') # Replace by adequate computational time. 
+ + if self.parameters.get('extract_method') == 'slice': + self.slice_direction = self.parameters.get('slice_direction') + self.slice_mode = self.parameters.get('slice_mode') + else: + self.slice_direction = 'axial' + self.slice_mode = 'rgb' + + if self.parameters.get('extract_method') == 'patch': + self.patch_size = self.parameters.get('patch_size') + self.stride_size = self.parameters.get('stride_size') + else: + self.patch_size = 50 + self.stride_size = 50 + + # The reading node + # ------------------------- + read_node = npe.Node(name="ReadingFiles", + iterables=[ + ('input_nifti', t1w_files), + ], + synchronize=True, + interface=nutil.IdentityInterface( + fields=self.get_input_fields()) + ) + + self.connect([ + (read_node, self.input_node, [('input_nifti', 'input_nifti')]), + ]) + + def build_output_node(self): + """Build and connect an output node to the pipeline.""" + import nipype.interfaces.utility as nutil + from nipype.interfaces.io import DataSink + import nipype.pipeline.engine as npe + from clinica.utils.nipype import (fix_join, container_from_filename) + from clinica.utils.filemanip import get_subject_id + + # Write node + # ---------------------- + write_node = npe.Node( + name="WriteCaps", + interface=DataSink() + ) + write_node.inputs.base_directory = self.caps_directory + write_node.inputs.parameterization = False + + # Get subject ID node + # ---------------------- + image_id_node = npe.Node( + interface=nutil.Function( + input_names=['bids_or_caps_file'], + output_names=['image_id'], + function=get_subject_id), + name='ImageID' + ) + + # Find container path from t1w filename + # ---------------------- + container_path = npe.Node( + nutil.Function( + input_names=['bids_or_caps_filename'], + output_names=['container'], + function=container_from_filename), + name='ContainerPath') + + self.connect([ + (self.input_node, image_id_node, [('input_nifti', 'bids_or_caps_file')]), + (self.input_node, container_path, [('input_nifti', 
'bids_or_caps_filename')]), + # (image_id_node, write_node, [('image_id', '@image_id')]), + (image_id_node, write_node, [('image_id', '@image_id')]), + ]) + + subfolder = 'image_based' + if self.parameters.get('extract_method') == 'slice': + subfolder = 'slice_based' + self.connect([ + (self.output_node, write_node, [('slices_rgb_T1', '@slices_rgb_T1')]), + (self.output_node, write_node, [('slices_original_T1', '@slices_original_T1')]) + ]) + + elif self.parameters.get('extract_method') == 'patch': + subfolder = 'patch_based' + self.connect([ + (self.output_node, write_node, [('patches_T1', '@patches_T1')]) + ]) + else: + self.connect([ + (self.output_node, write_node, [('output_pt_file', '@output_pt_file')]) + ]) + + self.connect([ + (container_path, write_node, [( + ( + 'container', fix_join, + 'deeplearning_prepare_data', subfolder, 't1_linear' + ), + 'container')]), + ]) + + def build_core_nodes(self): + """Build and connect the core nodes of the pipeline.""" + + import nipype.interfaces.utility as nutil + import nipype.pipeline.engine as npe + from .deeplearning_prepare_data_utils import (extract_slices, + extract_patches, + save_as_pt) + # The processing nodes + + # Node to save MRI in nii.gz format into pytorch .pt format + # ---------------------- + save_as_pt = npe.MapNode( + name='save_as_pt', + iterfield=['input_img'], + interface=nutil.Function( + function=save_as_pt, + input_names=['input_img'], + output_names=['output_file'] + ) + ) + + # Extract slices node (options: 3 directions, mode) + # ---------------------- + extract_slices = npe.MapNode( + name='extract_slices', + iterfield=['input_tensor'], + interface=nutil.Function( + function=extract_slices, + input_names=[ + 'input_tensor', 'slice_direction', + 'slice_mode' + ], + output_names=['output_file_rgb', 'output_file_original'] + ) + ) + + extract_slices.inputs.slice_direction = self.slice_direction + extract_slices.inputs.slice_mode = self.slice_mode + + # Extract patches node (options, patch 
size and stride size) + # ---------------------- + extract_patches = npe.MapNode( + name='extract_patches', + iterfield=['input_tensor'], + interface=nutil.Function( + function=extract_patches, + input_names=['input_tensor', 'patch_size', 'stride_size'], + output_names=['output_patch'] + ) + ) + + extract_patches.inputs.patch_size = self.patch_size + extract_patches.inputs.stride_size = self.stride_size + + # Connections + # ---------------------- + self.connect([ + (self.input_node, save_as_pt, [('input_nifti', 'input_img')]), + ]) + + if self.parameters.get('extract_method') == 'slice': + self.connect([ + (save_as_pt, extract_slices, [('output_file', 'input_tensor')]), + (extract_slices, self.output_node, [('output_file_rgb', 'slices_rgb_T1')]), + (extract_slices, self.output_node, [('output_file_original', 'slices_original_T1')]) + ]) + elif self.parameters.get('extract_method') == 'patch': + self.connect([ + (save_as_pt, extract_patches, [('output_file', 'input_tensor')]), + (extract_patches, self.output_node, [('output_patch', 'patches_T1')]) + ]) + else: + self.connect([ + (save_as_pt, self.output_node, [('output_file', 'output_pt_file')]), + ]) diff --git a/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_utils.py b/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_utils.py new file mode 100644 index 000000000..04c679984 --- /dev/null +++ b/clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_utils.py @@ -0,0 +1,242 @@ +# coding: utf8 + +def extract_slices(input_tensor, slice_direction=0, slice_mode='single'): + """Extracts the slices from three directions + + This function extracts slices form the preprocesed nifti image. The + direction of extraction can be defined either on sagital direction (0), + cornal direction (1) or axial direction (other). The output slices can be + stores following two modes: single (1 channel) ou RGB (3 channels, all the + same). 
+ + + Args: + input_tensor: tensor version of the nifti MRI. + slice_direction: which axis direction that the slices were extracted + slice_mode: 'single' or 'RGB'. + + Returns: + file: multiple tensors saved on the disk, suffixes corresponds to + indexes of the slices. Same location than input file. + """ + import torch + import os + + image_tensor = torch.load(input_tensor) + # reshape the tensor, delete the first dimension for slice-level + image_tensor = image_tensor.view(image_tensor.shape[1], image_tensor.shape[2], image_tensor.shape[3]) + + # sagital + # M and N correspond to the first and last slices (if need to remove) + M = 0 + N = 0 + slice_list_sag = range(M, image_tensor.shape[0] - N) # delete the first M slices and last N slices + + basedir = os.getcwd() + input_tensor_filename = os.path.basename(input_tensor) + + txt_idx = input_tensor_filename.rfind("_") + it_filename_prefix = input_tensor_filename[0:txt_idx] + it_filename_suffix = input_tensor_filename[txt_idx:] + + output_file_original = [] + output_file_rgb = [] + if slice_direction == 0: + for index_slice, index_slice_list in zip(slice_list_sag, range(len(slice_list_sag))): + # for i in slice_list: + # sagital + slice_select_sag = image_tensor[index_slice, :, :] + + extracted_slice_original_sag = slice_select_sag.unsqueeze(0) # shape should be 1 * W * L + + # train for transfer learning, creating the fake RGB image. 
+ slice_select_sag = (slice_select_sag - slice_select_sag.min()) / (slice_select_sag.max() - slice_select_sag.min()) + extracted_slice_rgb_sag = torch.stack((slice_select_sag, slice_select_sag, slice_select_sag)) # shape should be 3 * W * L + + # save into .pt format + if slice_mode == 'single': + output_file_original.append( + os.path.join( + basedir, + it_filename_prefix + + '_axis-sag_channel-single_slice-' + + str(index_slice) + + it_filename_suffix + ) + ) + torch.save(extracted_slice_original_sag.clone(), output_file_original[index_slice_list]) + elif slice_mode == 'rgb': + output_file_rgb.append( + os.path.join( + basedir, + it_filename_prefix + + '_axis-sag_channel-rgb_slice-' + + str(index_slice) + + it_filename_suffix + ) + ) + torch.save(extracted_slice_rgb_sag.clone(), output_file_rgb[index_slice_list]) + + elif slice_direction == 1: + # cornal + slice_list_cor = range(M, image_tensor.shape[1] - N) # delete the first M slices and last N slices + for index_slice, index_slice_list in zip(slice_list_cor, range(len(slice_list_cor))): + # for i in slice_list: + # sagital + slice_select_cor = image_tensor[:, index_slice, :] + + extracted_slice_original_cor = slice_select_cor.unsqueeze(0) # shape should be 1 * W * L + + # train for transfer learning, creating the fake RGB image. 
+ slice_select_cor = (slice_select_cor - slice_select_cor.min()) / (slice_select_cor.max() - slice_select_cor.min()) + extracted_slice_rgb_cor = torch.stack((slice_select_cor, slice_select_cor, slice_select_cor)) # shape should be 3 * W * L + + # save into .pt format + if slice_mode == 'single': + output_file_original.append( + os.path.join( + basedir, + it_filename_prefix + + '_axis-cor_channel-single_slice-' + + str(index_slice) + + it_filename_suffix + ) + ) + torch.save(extracted_slice_original_cor.clone(), output_file_original[index_slice_list]) + elif slice_mode == 'rgb': + output_file_rgb.append( + os.path.join( + basedir, + it_filename_prefix + + '_axis-cor_channel-rgb_slice-' + + str(index_slice) + + it_filename_suffix + ) + ) + torch.save(extracted_slice_rgb_cor.clone(), output_file_rgb[index_slice_list]) + + else: + + # axial + slice_list_axi = range(M, image_tensor.shape[2] - N) # delete the first M slices and last N slices + for index_slice, index_slice_list in zip(slice_list_axi, range(len(slice_list_axi))): + # for i in slice_list: + # sagital + slice_select_axi = image_tensor[:, :, index_slice] + + extracted_slice_original_axi = slice_select_axi.unsqueeze(0) # shape should be 1 * W * L + + # train for transfer learning, creating the fake RGB image. 
+ slice_select_axi = (slice_select_axi - slice_select_axi.min()) / (slice_select_axi.max() - slice_select_axi.min()) + extracted_slice_rgb_axi = torch.stack((slice_select_axi, slice_select_axi, slice_select_axi)) # shape should be 3 * W * L + + # save into .pt format + if slice_mode == 'single': + output_file_original.append( + os.path.join( + basedir, + it_filename_prefix + + '_axis-axi_channel-single_slice-' + + str(index_slice) + + it_filename_suffix + ) + ) + torch.save(extracted_slice_original_axi.clone(), output_file_original[index_slice_list]) + elif slice_mode == 'rgb': + output_file_rgb.append( + os.path.join( + basedir, + it_filename_prefix + + '_axis-axi_channel-rgb_slice-' + + str(index_slice) + + it_filename_suffix + ) + ) + torch.save(extracted_slice_rgb_axi.clone(), output_file_rgb[index_slice_list]) + + return output_file_rgb, output_file_original + + +def extract_patches(input_tensor, patch_size, stride_size): + """Extracts the patches + + This function extracts patches form the preprocesed nifti image. Patch size + if provieded as input and also the stride size. If stride size is smaller + than the patch size an overlap exist between consecutive patches. If stride + size is equal to path size there is no overlap. Otherwise, unprocessed + zones can exits. + + Args: + input_tensor: tensor version of the nifti MRI. + patch_size: size of a single patch. + stride_size: size of the stride leading to next patch. + + Returns: + file: multiple tensors saved on the disk, suffixes corresponds to + indexes of the patches. Same location than input file. + """ + import torch + import os + + basedir = os.getcwd() + image_tensor = torch.load(input_tensor) + + # use classifiers tensor.upfold to crop the patch. 
+ patches_tensor = image_tensor.unfold(1, patch_size, stride_size).unfold(2, patch_size, stride_size).unfold(3, patch_size, stride_size).contiguous() + # the dimension of patch_tensor should be [1, patch_num1, patch_num2, patch_num3, patch_size1, patch_size2, patch_size3] + patches_tensor = patches_tensor.view(-1, patch_size, patch_size, patch_size) + + input_tensor_filename = os.path.basename(input_tensor) + txt_idx = input_tensor_filename.rfind("_") + it_filename_prefix = input_tensor_filename[0:txt_idx] + it_filename_suffix = input_tensor_filename[txt_idx:] + + output_patch = [] + for index_patch in range(patches_tensor.shape[0]): + extracted_patch = patches_tensor[index_patch, ...].unsqueeze_(0) # add one dimension + # save into .pt format + output_patch.append( + os.path.join( + basedir, + it_filename_prefix + + '_patchsize-' + + str(patch_size) + + '_stride-' + + str(stride_size) + + '_patch-' + + str(index_patch) + + it_filename_suffix + ) + ) + torch.save(extracted_patch.clone(), output_patch[index_patch]) + + return output_patch + + +def save_as_pt(input_img): + """Saves PyTorch tensor version of the nifti image + + This function convert nifti image to tensor (.pt) version of the image. + Tensor version is saved at the same location than input_img. + + Args: + input_tensor: tensor version of the nifti MRI. + + Returns: + filename (str): single tensor file saved on the disk. Same location than input file. 
+ + """ + + import torch + import os + import nibabel as nib + + basedir = os.getcwd() + image_array = nib.load(input_img).get_fdata() + image_tensor = torch.from_numpy(image_array).unsqueeze(0).float() + # make sure the tensor dtype is torch.float32 + output_file = os.path.join(basedir, os.path.basename(input_img).split('.nii.gz')[0] + '.pt') + # save + torch.save(image_tensor.clone(), output_file) + + return output_file diff --git a/clinica/pipelines/deeplearning_prepare_data/info.json b/clinica/pipelines/deeplearning_prepare_data/info.json new file mode 100644 index 000000000..b3e2ced24 --- /dev/null +++ b/clinica/pipelines/deeplearning_prepare_data/info.json @@ -0,0 +1,8 @@ +{ + "id": "aramislab/deeplearning-prepare-data", + "author": "Mauricio Diaz", + "version": "0.1.0", + "space_caps": "80M", + "space_wd": "110M", + "dependencies": [] +} diff --git a/clinica/pipelines/engine.py b/clinica/pipelines/engine.py index 850432ef4..777433f40 100644 --- a/clinica/pipelines/engine.py +++ b/clinica/pipelines/engine.py @@ -1,7 +1,9 @@ # coding: utf8 """ +This module contains the Pipeline abstract class needed for Clinica. +Subclasses are located in clinica/pipeline//_pipeline.py """ import abc @@ -509,7 +511,7 @@ def human2bytes(s): + 'Running anyway...' + Fore.RESET) def update_parallelize_info(self, plugin_args): - """ Peforms some checks of the number of threads given in parameters, + """ Performs some checks of the number of threads given in parameters, given the number of CPUs of the machine in which clinica is running. We force the use of plugin MultiProc @@ -557,7 +559,7 @@ def update_parallelize_info(self, plugin_args): cprint('How many threads do you want to use? If you do not ' + 'answer within ' + str(timeout) + ' sec, default value of ' + str(n_cpu - 1) - + ' will be taken.') + + ' will be taken. 
Use --n_procs argument if you want to disable this message next time.') stdin_answer, __, ___ = select.select([sys.stdin], [], [], timeout) if stdin_answer: answer = str(sys.stdin.readline().strip()) @@ -568,7 +570,7 @@ def update_parallelize_info(self, plugin_args): break cprint(Fore.RED + 'Your answer must be a positive integer.' + Fore.RESET) - # If plugin_args is None, create the dictionnary + # If plugin_args is None, create the dictionary # If it already a dict, just update (or create -it is the same # code-) the correct key / value if plugin_args is None: diff --git a/clinica/pipelines/machine_learning/algorithm.py b/clinica/pipelines/machine_learning/algorithm.py index a62390bb1..b3c8372d0 100644 --- a/clinica/pipelines/machine_learning/algorithm.py +++ b/clinica/pipelines/machine_learning/algorithm.py @@ -33,17 +33,9 @@ class DualSVMAlgorithm(base.MLAlgorithm): - def __init__(self, kernel, y, balanced=True, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - self._kernel = kernel - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads - def _launch_svc(self, kernel_train, x_test, y_train, y_test, c): - if self._balanced: + if self._algorithm_params['balanced']: svc = SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced') else: svc = SVC(C=c, kernel='precomputed', probability=True, tol=1e-6) @@ -87,15 +79,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} outer_kernel = self._kernel[train_index, :][:, train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = 
StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -105,7 +97,7 @@ def evaluate(self, train_index, test_index): x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index] y_train_inner, y_test_inner = y_train[inner_train_index], y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (inner_kernel, x_test_inner, y_train_inner, y_test_inner, c)) @@ -146,7 +138,7 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: + if self._algorithm_params['balanced']: svc = SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced') else: svc = SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6) @@ -177,29 +169,30 @@ def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + @staticmethod + def uses_kernel(): + return True -class LogisticReg(base.MLAlgorithm): + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': True, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} - def __init__(self, x, y, penalty='l2', balanced=False, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - """ penalty can either be 'l2' or 'l1'""" - self._penalty = penalty - self._x = x - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads + return parameters_dict - def _launch_logistic_reg(self, x_train, x_test, y_train, y_test, c, shared_x=None, train_indices=None, - test_indices=None): - # x_train_, mean_x, std_x = centered_normalised_data(x_train) - # x_test_ = (x_test - mean_x)/std_x 
+class LogisticReg(base.MLAlgorithm): - if self._balanced: - classifier = LogisticRegression(penalty=self._penalty, tol=1e-6, C=c, class_weight='balanced') + def _launch_logistic_reg(self, x_train, x_test, y_train, y_test, c): + + if self._algorithm_params['balanced']: + classifier = LogisticRegression(penalty=self._algorithm_params['penalty'], tol=1e-6, C=c, + class_weight='balanced') else: - classifier = LogisticRegression(penalty=self._penalty, tol=1e-6, C=c) + classifier = LogisticRegression(penalty=self._algorithm_params['penalty'], tol=1e-6, C=c) classifier.fit(x_train, y_train) y_hat_train = classifier.predict(x_train) @@ -240,15 +233,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} x_train = self._x[train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -259,7 +252,7 @@ def evaluate(self, train_index, test_index): y_train_inner = y_train[inner_train_index] y_test_inner = y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (x_train_inner, x_test_inner, y_train_inner, y_test_inner, c)) @@ -300,10 +293,11 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: - classifier = LogisticRegression(C=best_c, penalty=self._penalty, tol=1e-6, class_weight='balanced') + if self._algorithm_params['balanced']: + classifier = 
LogisticRegression(C=best_c, penalty=self._algorithm_params['penalty'], tol=1e-6, + class_weight='balanced') else: - classifier = LogisticRegression(C=best_c, penalty=self._penalty, tol=1e-6) + classifier = LogisticRegression(C=best_c, penalty=self._algorithm_params['penalty'], tol=1e-6) classifier.fit(self._x, self._y) @@ -314,7 +308,7 @@ def save_classifier(self, classifier, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.coef_.transpose()) np.savetxt(path.join(output_dir, 'intercept.txt'), classifier.intercept_) - def save_weights(self, classifier, output_dir): + def save_weights(self, classifier, x, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.coef_.transpose()) return classifier.coef_.transpose() @@ -332,35 +326,34 @@ def _centered_normalised_data(features): features_bis = (features - mean)/std return features_bis, mean, std + @staticmethod + def uses_kernel(): + return False + + @staticmethod + def get_default_parameters(): + parameters_dict = {'penalty': 'l2', + 'balanced': False, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} + + return parameters_dict + class RandomForest(base.MLAlgorithm): - def __init__(self, x, y, balanced=False, grid_search_folds=10, - n_estimators_range=(10, 25, 50, 100, 150, 200, 500), - max_depth_range=(None, 6, 8, 10, 12), - min_samples_split_range=(2, 4, 6, 8), - max_features_range=('auto', 0.1, 0.2, 0.3, 0.4, 0.5), - n_threads=15): - self._x = x - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - self._max_features_range = max_features_range - self._n_threads = n_threads - - def _launch_random_forest(self, x_train, x_test, y_train, y_test, n_estimators, max_depth, min_samples_split, max_features): - - if self._balanced: + def _launch_random_forest(self, 
x_train, x_test, y_train, y_test, n_estimators, max_depth, min_samples_split, + max_features): + + if self._algorithm_params['balanced']: classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, max_features=max_features, - class_weight='balanced', n_jobs=self._n_threads) + class_weight='balanced', n_jobs=self._algorithm_params['n_threads']) else: classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, max_features=max_features, - n_jobs=self._n_threads) + n_jobs=self._algorithm_params['n_threads']) classifier.fit(x_train, y_train) y_hat_train = classifier.predict(x_train) @@ -405,9 +398,6 @@ def _select_best_parameter(self, async_result): params_list.append(best_params) accuracies.append(best_acc) - # TODO For exploratory purpose only. Erase later - # pd.concat(all_params_acc).to_csv('all_params_acc_%s.tsv' % datetime.datetime.now(), sep='\t', index=False, encoding='utf-8') - best_acc = np.mean(accuracies) best_n_estimators = int(round(np.mean([x[0] for x in params_list]))) best_max_depth = int(round(np.mean([x[1] if x[1] is not None else 50 for x in params_list]))) @@ -435,21 +425,21 @@ def max_feature_to_float(m): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} x_train = self._x[train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) - parameters_combinations = list(itertools.product(self._n_estimators_range, - self._max_depth_range, - self._min_samples_split_range, - 
self._max_features_range)) + parameters_combinations = list(itertools.product(self._algorithm_params['n_estimators_range'], + self._algorithm_params['max_depth_range'], + self._algorithm_params['min_samples_split_range'], + self._algorithm_params['max_features_range'])) for i in range(len(inner_cv)): inner_train_index, inner_test_index = inner_cv[i] @@ -499,16 +489,16 @@ def evaluate_no_cv(self, train_index, test_index): y_test = self._y[test_index] best_parameter = dict() - best_parameter['n_estimators'] = self._n_estimators_range - best_parameter['max_depth'] = self._max_depth_range - best_parameter['min_samples_split'] = self._min_samples_split_range - best_parameter['max_features'] = self._max_features_range + best_parameter['n_estimators'] = self._algorithm_params['n_estimators_range'] + best_parameter['max_depth'] = self._algorithm_params['max_depth_range'] + best_parameter['min_samples_split'] = self._algorithm_params['min_samples_split_range'] + best_parameter['max_features'] = self._algorithm_params['max_features_range'] _, y_hat, auc, y_hat_train = self._launch_random_forest(x_train, x_test, y_train, y_test, - self._n_estimators_range, - self._max_depth_range, - self._min_samples_split_range, - self._max_features_range) + self._algorithm_params['n_estimators_range'], + self._algorithm_params['max_depth_range'], + self._algorithm_params['min_samples_split_range'], + self._algorithm_params['max_features_range']) result = dict() result['best_parameter'] = best_parameter result['evaluation'] = utils.evaluate_prediction(y_test, y_hat) @@ -528,8 +518,11 @@ def apply_best_parameters(self, results_list): mean_bal_acc = np.mean([result['best_parameter']['balanced_accuracy'] for result in results_list]) best_n_estimators = int(round(np.mean([result['best_parameter']['n_estimators'] for result in results_list]))) - best_max_depth = int(round(np.mean([result['best_parameter']['max_depth'] if result['best_parameter']['max_depth'] is not None else 50 for result in 
results_list]))) - best_min_samples_split = int(round(np.mean([result['best_parameter']['min_samples_split'] for result in results_list]))) + best_max_depth = int(round(np.mean([result['best_parameter']['max_depth'] + if result['best_parameter']['max_depth'] is not None + else 50 for result in results_list]))) + best_min_samples_split = int(round(np.mean([result['best_parameter']['min_samples_split'] + for result in results_list]))) max_feat = [] n_features = self._x.shape[1] @@ -552,14 +545,16 @@ def apply_best_parameters(self, results_list): max_feat.append(max_features) best_max_features = np.mean(max_feat) - if self._balanced: + if self._algorithm_params['balanced']: classifier = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, - min_samples_split=best_min_samples_split, max_features=best_max_features, - class_weight='balanced', n_jobs=self._n_threads) + min_samples_split=best_min_samples_split, + max_features=best_max_features, + class_weight='balanced', n_jobs=self._algorithm_params['n_threads']) else: classifier = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, - min_samples_split=best_min_samples_split, max_features=best_max_features, - n_jobs=self._n_threads) + min_samples_split=best_min_samples_split, + max_features=best_max_features, + n_jobs=self._algorithm_params['n_threads']) classifier.fit(self._x, self._y) @@ -574,7 +569,7 @@ def save_classifier(self, classifier, output_dir): # print classifier.estimators_ # np.savetxt(path.join(output_dir, 'estimators.txt'), str(classifier.estimators_)) - def save_weights(self, classifier, output_dir): + def save_weights(self, classifier, x, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.feature_importances_) return classifier.feature_importances_ @@ -584,41 +579,43 @@ def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + 
@staticmethod + def uses_kernel(): + return False + + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': False, + 'grid_search_folds': 10, + 'n_estimators_range': (10, 25, 50, 100, 150, 200, 500), + 'max_depth_range': (None, 6, 8, 10, 12), + 'min_samples_split_range': (2, 4, 6, 8), + 'max_features_range': ('auto', 0.1, 0.2, 0.3, 0.4, 0.5), + 'n_threads': 15} + + return parameters_dict + class XGBoost(base.MLAlgorithm): - def __init__(self, x, y, balanced=False, grid_search_folds=10, - max_depth_range=(0, 6), - learning_rate_range=(0.1, 0.3), - n_estimators_range=(100, 200), - colsample_bytree_range=(0.5, 1), - reg_alpha=0, - reg_lambda=1, - n_threads=15): - self._x = x - self._y = y - self._balanced = balanced - self._scale_pos_weight = float(len(self._y - sum(self._y)) / sum(self._y)) - self._grid_search_folds = grid_search_folds - self._max_depth_range = max_depth_range - self._learning_rate_range = learning_rate_range - self._n_estimators_range = n_estimators_range - self._colsample_bytree_range = colsample_bytree_range - self._reg_alpha = reg_alpha - self._reg_lambda = reg_lambda - self._n_threads = n_threads - - def _launch_xgboost(self, x_train, x_test, y_train, y_test, max_depth, learning_rate, n_estimators, colsample_bytree): - if self._balanced: + + def _launch_xgboost(self, x_train, x_test, y_train, y_test, max_depth, learning_rate, n_estimators, + colsample_bytree): + + if self._algorithm_params['balanced']: # set scale_pos_weight # http://xgboost.readthedocs.io/en/latest//how_to/param_tuning.html + scale_pos_weight = float(len(self._y - sum(self._y)) / sum(self._y)) classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, - n_jobs=self._n_threads, colsample_bytree=colsample_bytree, - reg_alpha=self._reg_alpha, reg_lambda=self._reg_lambda, - scale_pos_weight=self._scale_pos_weight) + n_jobs=self._algorithm_params['n_threads'], colsample_bytree=colsample_bytree, + 
reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda'], + scale_pos_weight=scale_pos_weight) else: classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, - n_jobs=self._n_threads, colsample_bytree=colsample_bytree, - reg_alpha=self._reg_alpha, reg_lambda=self._reg_lambda) + n_jobs=self._algorithm_params['n_threads'], colsample_bytree=colsample_bytree, + reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda']) classifier.fit(x_train, y_train) y_hat_train = classifier.predict(x_train) @@ -677,21 +674,21 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} x_train = self._x[train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) - parameters_combinations = list(itertools.product(self._max_depth_range, - self._learning_rate_range, - self._n_estimators_range, - self._colsample_bytree_range)) + parameters_combinations = list(itertools.product(self._algorithm_params['max_depth_range'], + self._algorithm_params['learning_rate_range'], + self._algorithm_params['n_estimators_range'], + self._algorithm_params['colsample_bytree_range'])) for i in range(len(inner_cv)): inner_train_index, inner_test_index = inner_cv[i] @@ -741,16 +738,16 @@ def evaluate_no_cv(self, train_index, test_index): y_test = self._y[test_index] best_parameter = dict() - best_parameter['max_depth'] = self._max_depth_range - best_parameter['learning_rate'] = 
self._learning_rate_range - best_parameter['n_estimators'] = self._n_estimators_range - best_parameter['colsample_bytree'] = self._colsample_bytree_range + best_parameter['max_depth'] = self._algorithm_params['max_depth_range'] + best_parameter['learning_rate'] = self._algorithm_params['learning_rate_range'] + best_parameter['n_estimators'] = self._algorithm_params['n_estimators_range'] + best_parameter['colsample_bytree'] = self._algorithm_params['colsample_bytree_range'] _, y_hat, auc, y_hat_train = self._launch_xgboost(x_train, x_test, y_train, y_test, - self._max_depth_range, - self._learning_rate_range, - self._n_estimators_range, - self._colsample_bytree_range) + self._algorithm_params['max_depth_range'], + self._algorithm_params['learning_rate_range'], + self._algorithm_params['n_estimators_range'], + self._algorithm_params['colsample_bytree_range']) result = dict() result['best_parameter'] = best_parameter result['evaluation'] = utils.evaluate_prediction(y_test, y_hat) @@ -775,16 +772,21 @@ def apply_best_parameters(self, results_list): best_n_estimators = int(round(np.mean([result['best_parameter']['n_estimators'] for result in results_list]))) best_colsample_bytree = np.mean([result['best_parameter']['colsample_bytree'] for result in results_list]) - if self._balanced: + if self._algorithm_params['balanced']: + scale_pos_weight = float(len(self._y - sum(self._y)) / sum(self._y)) + classifier = XGBClassifier(max_depth=best_max_depth, learning_rate=best_learning_rate, - n_estimators=best_n_estimators, n_jobs=self._n_threads, - colsample_bytree=best_colsample_bytree, reg_alpha=self._reg_alpha, - reg_lambda=self._reg_lambda, scale_pos_weight=self._scale_pos_weight) + n_estimators=best_n_estimators, n_jobs=self._algorithm_params['n_threads'], + colsample_bytree=best_colsample_bytree, + reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda'], + scale_pos_weight=scale_pos_weight) else: classifier = 
XGBClassifier(max_depth=best_max_depth, learning_rate=best_learning_rate, - n_estimators=best_n_estimators, n_jobs=self._n_threads, - colsample_bytree=best_colsample_bytree, reg_alpha=self._reg_alpha, - reg_lambda=self._reg_lambda) + n_estimators=best_n_estimators, n_jobs=self._algorithm_params['n_threads'], + colsample_bytree=best_colsample_bytree, + reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda']) classifier.fit(self._x, self._y) @@ -799,7 +801,7 @@ def save_classifier(self, classifier, output_dir): # print classifier.estimators_ # np.savetxt(path.join(output_dir, 'estimators.txt'), str(classifier.estimators_)) - def save_weights(self, classifier, output_dir): + def save_weights(self, classifier, x, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.feature_importances_) return classifier.feature_importances_ @@ -809,20 +811,32 @@ def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + @staticmethod + def uses_kernel(): + return False + + @staticmethod + def get_default_parameters(): + parameters_dict = {'balanced': False, + 'grid_search_folds': 10, + 'max_depth_range': (0, 6), + 'learning_rate_range': (0.1, 0.3), + 'n_estimators_range': (100, 200), + 'colsample_bytree_range': (0.5, 1), + 'reg_alpha': 0, + 'reg_lambda': 1, + 'n_threads': 15} + + return parameters_dict + class OneVsOneSVM(base.MLAlgorithm): - def __init__(self, kernel, y, balanced=True, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - self._kernel = kernel - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads def _launch_svc(self, kernel_train, x_test, y_train, y_test, c): - if self._balanced: - svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if 
self._algorithm_params['balanced']: + svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6)) @@ -865,15 +879,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} outer_kernel = self._kernel[train_index, :][:, train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -883,7 +897,7 @@ def evaluate(self, train_index, test_index): x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index] y_train_inner, y_test_inner = y_train[inner_train_index], y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (inner_kernel, x_test_inner, y_train_inner, y_test_inner, c)) @@ -923,8 +937,9 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: - svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if self._algorithm_params['balanced']: + svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6)) @@ -953,20 +968,28 @@ def save_parameters(self, parameters_dict, output_dir): with 
open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + @staticmethod + def uses_kernel(): + return True + + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': True, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} + + return parameters_dict + class OneVsRestSVM(base.MLAlgorithm): - def __init__(self, kernel, y, balanced=True, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - self._kernel = kernel - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads def _launch_svc(self, kernel_train, x_test, y_train, y_test, c): - if self._balanced: - svc = OneVsRestClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if self._algorithm_params['balanced']: + svc = OneVsRestClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsRestClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6)) @@ -1009,15 +1032,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} outer_kernel = self._kernel[train_index, :][:, train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -1027,7 +1050,7 @@ def evaluate(self, train_index, test_index): x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index] y_train_inner, y_test_inner = 
y_train[inner_train_index], y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (inner_kernel, x_test_inner, y_train_inner, y_test_inner, c)) @@ -1067,8 +1090,9 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: - svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if self._algorithm_params['balanced']: + svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6)) @@ -1096,3 +1120,17 @@ def save_weights(self, classifier, x, output_dir): def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + + @staticmethod + def uses_kernel(): + return True + + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': True, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} + + return parameters_dict diff --git a/clinica/pipelines/machine_learning/base.py b/clinica/pipelines/machine_learning/base.py index aff46e809..033492d43 100644 --- a/clinica/pipelines/machine_learning/base.py +++ b/clinica/pipelines/machine_learning/base.py @@ -1,7 +1,7 @@ # coding: utf8 -import abc +from abc import ABC, abstractmethod __author__ = "Jorge Samper-Gonzalez" __copyright__ = "Copyright 2016-2019 The Aramis Lab Team" @@ -13,114 +13,151 @@ __status__ = "Development" -class MLWorkflow: - __metaclass__ = abc.ABCMeta +class MLWorkflow(ABC): - # def __init__(self, ml_input, ml_validation, ml_algorithm, output_dir): - # self._ml_input = ml_input - # self._ml_validation = ml_validation - # self._ml_algorithm = ml_algorithm - # self._output_dir = 
output_dir + def __init__(self, input_class, validation_class, algorithm_class, all_params, output_dir): - @abc.abstractmethod - def run(self): - pass + self._input_class = input_class + self._validation_class = validation_class + self._algorithm_class = algorithm_class - def save_image(self): + self._input_params = self.create_parameters_dict(all_params, input_class) + self._validation_params = self.create_parameters_dict(all_params, validation_class) + self._algorithm_params = self.create_parameters_dict(all_params, algorithm_class) - import os - import pandas as pd + self._output_dir = output_dir - pd.io.parsers.read_csv(os.path.join(self._output_dir, 'results.tsv'), sep='\t') + self._input = None + self._validation = None + self._algorithm = None - @staticmethod - def metric_distribution(metric, labels, output_path, num_classes=2, metric_label='balanced accuracy'): - """ + def run(self): - Distribution plots of various metrics such as balanced accuracy! + from os import path, makedirs - metric is expected to be ndarray of size [num_repetitions, num_datasets] + # Instantiating input class + self._input = self._input_class(self._input_params) - """ - import numpy as np - import matplotlib.pyplot as plt - from matplotlib import cm - from matplotlib.backends.backend_pdf import PdfPages + # Computing input values + x = self._input.get_x() + y = self._input.get_y() - num_repetitions = metric.shape[0] - num_datasets = metric.shape[1] - assert len(labels) == num_datasets, "Differing number of features and labels!" 
- method_ticks = 1.0 + np.arange(num_datasets) + # Instantiating classification algorithm + if self._algorithm_class.uses_kernel(): + kernel = self._input.get_kernel() + self._algorithm = self._algorithm_class(kernel, y, self._algorithm_params) + else: + self._algorithm = self._algorithm_class(x, y, self._algorithm_params) - fig, ax = plt.subplots(figsize=[9, 9]) - line_coll = ax.violinplot(metric, widths=0.8, bw_method=0.2, - showmedians=True, showextrema=False, - positions=method_ticks) + # Instantiating cross-validation method and classification algorithm + self._validation = self._validation_class(self._algorithm, self._validation_params) - cmap = cm.get_cmap('Paired', num_datasets) - for cc, ln in enumerate(line_coll['bodies']): - ln.set_facecolor(cmap(cc)) - ln.set_label(labels[cc]) + # Launching classification with selected cross-validation + classifier, best_params, results = self._validation.validate(y) - plt.legend(loc=2, ncol=num_datasets) + # Creation of the directory to save results + classifier_dir = path.join(self._output_dir, 'classifier') + if not path.exists(classifier_dir): + makedirs(classifier_dir) - ax.tick_params(axis='both', which='major', labelsize=15) - ax.grid(axis='y', which='major') + # Saving algorithm trained classifier + self._algorithm.save_classifier(classifier, classifier_dir) + self._algorithm.save_weights(classifier, x, classifier_dir) + self._algorithm.save_parameters(best_params, classifier_dir) - lower_lim = np.round(np.min([np.float64(0.9 / num_classes), metric.min()]), 3) - upper_lim = np.round(np.max([1.01, metric.max()]), 3) - step_tick = 0.1 - ax.set_ylim(lower_lim, upper_lim) + # Saving validation trained classifier + self._validation.save_results(self._output_dir) - ax.set_xticks(method_ticks) - ax.set_xlim(np.min(method_ticks) - 1, np.max(method_ticks) + 1) - ax.set_xticklabels(labels, rotation=45) # 'vertical' + @staticmethod + def create_parameters_dict(locals_dictionary, component_class): - 
ax.set_yticks(np.arange(lower_lim, upper_lim, step_tick)) - ax.set_yticklabels(np.arange(lower_lim, upper_lim, step_tick)) - # plt.xlabel(xlabel, fontsize=16) - plt.ylabel(metric_label, fontsize=16) + default_parameters = component_class.get_default_parameters() + for key in locals_dictionary: + if key in default_parameters: + default_parameters[key] = locals_dictionary[key] + return default_parameters - fig.tight_layout() - pp1 = PdfPages(output_path + '.pdf') - pp1.savefig() - pp1.close() +class MLInput(ABC): - return + def __init__(self, input_params): + self._input_params = self.get_default_parameters() + self._input_params.update(input_params) -class MLInput: - __metaclass__ = abc.ABCMeta + self._x = None + self._y = None + self._kernel = None - @abc.abstractmethod + @abstractmethod def get_x(self): pass - @abc.abstractmethod + @abstractmethod def get_y(self): pass + @staticmethod + @abstractmethod + def get_default_parameters(): + pass -class MLValidation: - __metaclass__ = abc.ABCMeta - @abc.abstractmethod +class MLValidation(ABC): + + def __init__(self, ml_algorithm, validation_params): + + self._ml_algorithm = ml_algorithm + + self._validation_params = self.get_default_parameters() + self._validation_params.update(validation_params) + + self._validation_results = [] + self._classifier = None + self._best_params = None + + @abstractmethod def validate(self, y): pass + @staticmethod + @abstractmethod + def get_default_parameters(): + pass + + +class MLAlgorithm(ABC): + + def __init__(self, input_data, y, algorithm_params): + + self._algorithm_params = self.get_default_parameters() + self._algorithm_params.update(algorithm_params) + + if self.uses_kernel(): + self._kernel = input_data + else: + self._x = input_data -class MLAlgorithm: - __metaclass__ = abc.ABCMeta + self._y = y - @abc.abstractmethod + @staticmethod + @abstractmethod + def uses_kernel(): + pass + + @abstractmethod def evaluate(self, train_index, test_index): pass - @abc.abstractmethod + 
@abstractmethod def save_classifier(self, classifier, output_dir): pass - @abc.abstractmethod + @abstractmethod def save_parameters(self, parameters, output_dir): pass + + @staticmethod + @abstractmethod + def get_default_parameters(): + pass diff --git a/clinica/pipelines/machine_learning/input.py b/clinica/pipelines/machine_learning/input.py index 669026fb8..81c4ceb7f 100644 --- a/clinica/pipelines/machine_learning/input.py +++ b/clinica/pipelines/machine_learning/input.py @@ -7,6 +7,7 @@ import numpy as np from pandas.io import parsers +from clinica.utils.stream import cprint from clinica.pipelines.machine_learning import base import clinica.pipelines.machine_learning.voxel_based_io as vbio import clinica.pipelines.machine_learning.vertex_based_io as vtxbio @@ -15,6 +16,8 @@ import clinica.pipelines.machine_learning.ml_utils as utils + + __author__ = "Jorge Samper-Gonzalez" __copyright__ = "Copyright 2016-2019 The Aramis Lab Team" __credits__ = ["Jorge Samper-Gonzalez", "Simona Bottani"] @@ -27,50 +30,37 @@ class CAPSInput(base.MLInput): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, precomputed_kernel=None): - """ + def __init__(self, input_params): - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - precomputed_kernel: - """ + super().__init__(input_params) - self._caps_directory = caps_directory - self._group_id = group_id - self._image_type = image_type self._images = None - self._x = None - self._y = None - self._kernel = None - subjects_visits = parsers.read_csv(subjects_visits_tsv, sep='\t') + subjects_visits = parsers.read_csv(self._input_params['subjects_visits_tsv'], sep='\t') if list(subjects_visits.columns.values) != ['participant_id', 'session_id']: raise Exception('Subjects and visits file is not in the correct format.') self._subjects = list(subjects_visits.participant_id) self._sessions = list(subjects_visits.session_id) 
- diagnoses = parsers.read_csv(diagnoses_tsv, sep='\t') + diagnoses = parsers.read_csv(self._input_params['diagnoses_tsv'], sep='\t') if 'diagnosis' not in list(diagnoses.columns.values): raise Exception('Diagnoses file is not in the correct format.') self._diagnoses = list(diagnoses.diagnosis) - if image_type not in ['T1', 'fdg', 'av45', 'pib', 'flute', 'dwi']: - raise Exception("Incorrect image type. It must be one of the values 'T1', 'fdg', 'av45', 'pib', 'flute' or 'dwi'") + if self._input_params['image_type'] not in ['T1', 'fdg', 'av45', 'pib', 'flute', 'dwi']: + raise Exception("Incorrect image type. It must be one of the values 'T1', 'fdg', 'av45', " + "'pib', 'flute' or 'dwi'") - if precomputed_kernel is not None: - if type(precomputed_kernel) == np.ndarray: - if precomputed_kernel.shape == (len(self._subjects), len(self._subjects)): - self._kernel = precomputed_kernel + if self._input_params['precomputed_kernel'] is not None: + if type(self._input_params['precomputed_kernel']) == np.ndarray: + if self._input_params['precomputed_kernel'].shape == (len(self._subjects), len(self._subjects)): + self._kernel = self._input_params['precomputed_kernel'] else: raise Exception("""Precomputed kernel provided is not in the correct format. It must be a numpy.ndarray object with number of rows and columns equal to the number of subjects, or a filename to a numpy txt file containing an object with the described format.""") - elif type(precomputed_kernel == str): - self._kernel = np.loadtxt(precomputed_kernel) + elif type(self._input_params['precomputed_kernel'] == str): + self._kernel = np.loadtxt(self._input_params['precomputed_kernel']) else: raise Exception("""Precomputed kernel provided is not in the correct format. 
It must be a numpy.ndarray object with number of rows and columns equal to the number of subjects, @@ -119,9 +109,9 @@ def get_kernel(self, kernel_function=utils.gram_matrix_linear, recompute_if_exis if self._x is None: self.get_x() - print("Computing kernel ...") + cprint("Computing kernel ...") self._kernel = kernel_function(self._x) - print("Kernel computed") + cprint("Kernel computed") return self._kernel def save_kernel(self, output_dir): @@ -143,36 +133,29 @@ def save_kernel(self, output_dir): def save_weights_as_nifti(self, weights, output_dir): pass + @staticmethod + def get_default_parameters(): + + parameters_dict = {'caps_directory': None, + 'subjects_visits_tsv': None, + 'diagnoses_tsv': None, + 'group_id': None, + 'image_type': None, + 'precomputed_kernel': None} + + return parameters_dict + class CAPSVoxelBasedInput(CAPSInput): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, fwhm=0, - modulated="on", pvc=None, mask_zeros=True, precomputed_kernel=None): - """ + def __init__(self, input_params): + + super().__init__(input_params) - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - fwhm: - modulated: - mask_zeros: - precomputed_kernel: - """ - - super(CAPSVoxelBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel=precomputed_kernel) - - self._fwhm = fwhm - self._modulated = modulated - self._pvc = pvc - self._mask_zeros = mask_zeros self._orig_shape = None self._data_mask = None - if modulated not in ['on', 'off']: + if self._input_params['modulated'] not in ['on', 'off']: raise Exception("Incorrect modulation parameter. 
It must be one of the values 'on' or 'off'") def get_images(self): @@ -184,23 +167,24 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'T1': - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) + if self._input_params['image_type'] == 'T1': + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 't1/spm/dartel/group-' + self._group_id, + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 't1/spm/dartel/group-' + self._input_params['group_id'], '%s_%s_T1w_segm-graymatter_space-Ixi549Space_modulated-%s%s_probability.nii.gz' - % (self._subjects[i], self._sessions[i], self._modulated, fwhm)) + % (self._subjects[i], self._sessions[i], self._input_params['modulated'], fwhm)) for i in range(len(self._subjects))] else: - pvc = '' if self._pvc is None else '_pvc-%s' % self._pvc - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) - suvr = 'pons' if self._image_type == 'fdg' else 'cerebellumPons' + pvc = '' if self._input_params['pvc'] is None else '_pvc-%s' % self._input_params['pvc'] + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) + suvr = 'pons' if self._input_params['image_type'] == 'fdg' else 'cerebellumPons' - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 'pet/preprocessing/group-' + self._group_id, + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet/preprocessing/group-' + self._input_params['group_id'], '%s_%s_task-rest_acq-%s_pet_space-Ixi549Space%s_suvr-%s_mask-brain%s_pet.nii.gz' - % (self._subjects[i], self._sessions[i], self._image_type, pvc, suvr, fwhm)) + % (self._subjects[i], self._sessions[i], 
self._input_params['image_type'], pvc, + suvr, fwhm)) for i in range(len(self._subjects))] for image in self._images: @@ -218,9 +202,9 @@ def get_x(self): if self._x is not None: return self._x - print('Loading ' + str(len(self.get_images())) + ' subjects') - self._x, self._orig_shape, self._data_mask = vbio.load_data(self._images, mask=self._mask_zeros) - print('Subjects loaded') + cprint('Loading ' + str(len(self.get_images())) + ' subjects') + self._x, self._orig_shape, self._data_mask = vbio.load_data(self._images, mask=self._input_params['mask_zeros']) + cprint('Subjects loaded') return self._x @@ -233,33 +217,30 @@ def save_weights_as_nifti(self, weights, output_dir): data = vbio.revert_mask(weights, self._data_mask, self._orig_shape) vbio.weights_to_nifti(data, self._images[0], output_filename) + @staticmethod + def get_default_parameters(): -class CAPSRegionBasedInput(CAPSInput): + parameters_dict = super(CAPSVoxelBasedInput, CAPSVoxelBasedInput).get_default_parameters() - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, - pvc=None, precomputed_kernel=None): - """ + new_parameters = {'fwhm': 0, + 'modulated': "on", + 'pvc': None, + 'mask_zeros': True} - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - atlas: - precomputed_kernel: - """ + parameters_dict.update(new_parameters) - super(CAPSRegionBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel=precomputed_kernel) + return parameters_dict - self._atlas = atlas - self._pvc = pvc - self._orig_shape = None - self._data_mask = None - if atlas not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: - raise Exception("Incorrect atlas name. 
It must be one of the values 'AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers' ") +class CAPSRegionBasedInput(CAPSInput): + + def __init__(self, input_params): + + super().__init__(input_params) + + if self._input_params['atlas'] not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: + raise Exception("Incorrect atlas name. It must be one of the values 'AAL2', 'Neuromorphometrics', " + "'AICHA', 'LPBA40', 'Hammers' ") def get_images(self): """ @@ -270,20 +251,21 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'T1': - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 't1/spm/dartel/group-' + self._group_id, + if self._input_params['image_type'] == 'T1': + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 't1/spm/dartel/group-' + self._input_params['group_id'], 'atlas_statistics/', '%s_%s_T1w_space-%s_map-graymatter_statistics.tsv' - % (self._subjects[i], self._sessions[i], self._atlas)) + % (self._subjects[i], self._sessions[i], self._input_params['atlas'])) for i in range(len(self._subjects))] else: - pvc = '' if self._pvc is None else '_pvc-%s' % self._pvc - suvr = 'pons' if self._image_type == 'fdg' else 'cerebellumPons' - - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 'pet/preprocessing/group-' + self._group_id, 'atlas_statistics', - '%s_%s_task-rest_acq-%s_pet_space-%s%s_suvr-%s_statistics.tsv' - % (self._subjects[i], self._sessions[i], self._image_type, self._atlas, pvc, suvr)) + pvc = '' if self._input_params['pvc'] is None else '_pvc-%s' % self._input_params['pvc'] + suvr = 'pons' if self._input_params['image_type'] == 'fdg' else 'cerebellumPons' + + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet/preprocessing/group-' + 
self._input_params['group_id'], + 'atlas_statistics', '%s_%s_task-rest_acq-%s_pet_space-%s%s_suvr-%s_statistics.tsv' + % (self._subjects[i], self._sessions[i], self._input_params['image_type'], + self._input_params['atlas'], pvc, suvr)) for i in range(len(self._subjects))] for image in self._images: @@ -301,9 +283,9 @@ def get_x(self): if self._x is not None: return self._x - print('Loading ' + str(len(self.get_images())) + ' subjects') + cprint('Loading ' + str(len(self.get_images())) + ' subjects') self._x = rbio.load_data(self._images, self._subjects) - print('Subjects loaded') + cprint('Subjects loaded') return self._x @@ -319,17 +301,27 @@ def save_weights_as_nifti(self, weights, output_dir): """ output_filename = path.join(output_dir, 'weights.nii.gz') - rbio.weights_to_nifti(weights, self._atlas, output_filename) + rbio.weights_to_nifti(weights, self._input_params['atlas'], output_filename) + + @staticmethod + def get_default_parameters(): + + parameters_dict = super(CAPSRegionBasedInput, CAPSRegionBasedInput).get_default_parameters() + + new_parameters = {'atlas': None, + 'pvc': None, + 'mask_zeros': True} + + parameters_dict.update(new_parameters) + + return parameters_dict class CAPSVertexBasedInput(CAPSInput): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, fwhm, image_type, precomputed_kernel=None): - super(CAPSVertexBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel) - self._fwhm = fwhm - self._image_type = image_type - self._caps_directory = caps_directory + def __init__(self, input_params): + + super().__init__(input_params) def get_images(self): import os @@ -340,14 +332,15 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'fdg' and self._images is None: + if self._input_params['image_type'] == 'fdg' and self._images is None: self._images = [] hemi = ['lh', 'rh'] for i in 
range(len(self._subjects)): - self._images.append([os.path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], 'pet', - 'surface', self._subjects[i] + '_' + self._sessions[i] - + '_task-rest_acq-fdg_pet_space-fsaverage_suvr-pons_pvc-iy_hemi-' + h - + '_fwhm-' + str(self._fwhm) + '_projection.mgh') for h in hemi]) + self._images.append([os.path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet', 'surface', self._subjects[i] + '_' + + self._sessions[i] + '_task-rest_acq-fdg_pet_space-fsaverage_' + 'suvr-pons_pvc-iy_hemi-' + h + '_fwhm-' + + str(self._input_params['fwhm']) + '_projection.mgh') for h in hemi]) missing_files = [] missing_files_string_error = '' for img in self._images: @@ -361,7 +354,6 @@ def get_images(self): return self._images def get_x(self): - from clinica.utils.stream import cprint """ Returns numpy 2D array """ @@ -398,35 +390,27 @@ def save_weights_as_datasurface(self, weights, output_dir): def save_weights_as_nifti(self, weights, output_dir): pass + @staticmethod + def get_default_parameters(): -class CAPSTSVBasedInput(CAPSInput): + parameters_dict = super(CAPSVertexBasedInput, CAPSVertexBasedInput).get_default_parameters() - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset, - pvc=None, precomputed_kernel=None): - """ + new_parameters = {'fwhm': 0} - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - atlas: - precomputed_kernel: - """ + parameters_dict.update(new_parameters) - super(CAPSTSVBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel) + return parameters_dict - self._atlas = atlas - self._pvc = pvc - self._dataset = dataset - self._orig_shape = None - self._data_mask = None +class CAPSTSVBasedInput(CAPSInput): + + def __init__(self, input_params): + + 
super().__init__(input_params) - if atlas not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: - raise Exception("Incorrect atlas name. It must be one of the values 'AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers' ") + if self._input_params['atlas'] not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: + raise Exception("Incorrect atlas name. It must be one of the values 'AAL2', 'Neuromorphometrics', " + "'AICHA', 'LPBA40', 'Hammers' ") def get_images(self): """ @@ -447,12 +431,14 @@ def get_x(self): # if self._x is not None: # return self._x - print('Loading TSV subjects') - string = str('group-' + self._group_id + '_T1w_space-' + self._atlas + '_map-graymatter') + cprint('Loading TSV subjects') + string = str('group-' + self._input_params['group_id'] + '_T1w_space-' + self._input_params['atlas'] + + '_map-graymatter') - self._x = tbio.load_data(string, self._caps_directory, self._subjects, self._sessions, self._dataset) + self._x = tbio.load_data(string, self._input_params['caps_directory'], self._subjects, self._sessions, + self._input_params['dataset']) - print('Subjects loaded') + cprint('Subjects loaded') return self._x @@ -469,40 +455,24 @@ def save_weights_as_nifti(self, weights, output_dir): # output_filename = path.join(output_dir, 'weights.nii.gz') - # rbio.weights_to_nifti(weights, self._atlas, output_filename) + # rbio.weights_to_nifti(weights, self._input_params['atlas'], output_filename) pass + @staticmethod + def get_default_parameters(): -class CAPSVoxelBasedInputREGSVM(CAPSInput): + parameters_dict = super(CAPSTSVBasedInput, CAPSTSVBasedInput).get_default_parameters() - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, fwhm=0, - modulated="on", pvc=None, mask_zeros=True, precomputed_kernel=None): - """ + new_parameters = {'atlas': None, + 'pvc': None, + 'dataset': None} - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 
'T1', 'fdg', 'av45', 'pib' or 'flute' - fwhm: - modulated: - mask_zeros: - precomputed_kernel: - """ - - super(CAPSVoxelBasedInputREGSVM, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel=precomputed_kernel) - - self._fwhm = fwhm - self._modulated = modulated - self._pvc = pvc - self._mask_zeros = mask_zeros - self._orig_shape = None - self._data_mask = None + parameters_dict.update(new_parameters) - if modulated not in ['on', 'off']: - raise Exception("Incorrect modulation parameter. It must be one of the values 'on' or 'off'") + return parameters_dict + + +class CAPSVoxelBasedInputREGSVM(CAPSVoxelBasedInput): def get_images(self): """ @@ -513,21 +483,22 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'T1': - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) + if self._input_params['image_type'] == 'T1': + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) - self._images = [path.join(self._caps_directory, + self._images = [path.join(self._input_params['caps_directory'], 'regul_%s_%s_T1w_segm-graymatter_space-Ixi549Space_modulated-%s%s_probability.nii' - % (self._subjects[i], self._sessions[i], self._modulated, fwhm)) + % (self._subjects[i], self._sessions[i], self._input_params['modulated'], fwhm)) for i in range(len(self._subjects))] else: - pvc = '' if self._pvc is None else '_pvc-%s' % self._pvc - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) - suvr = 'pons' if self._image_type == 'fdg' else 'cerebellumPons' - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 'pet/preprocessing/group-' + self._group_id, + pvc = '' if self._input_params['pvc'] is None else '_pvc-%s' % self._input_params['pvc'] + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) + suvr = 'pons' if 
self._input_params['image_type'] == 'fdg' else 'cerebellumPons' + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet/preprocessing/group-' + self._input_params['group_id'], '%s_%s_task-rest_acq-%s_pet_space-Ixi549Space%s_suvr-%s_mask-brain%s_pet.nii.gz' - % (self._subjects[i], self._sessions[i], self._image_type, pvc, suvr, fwhm)) + % (self._subjects[i], self._sessions[i], self._input_params['image_type'], + pvc, suvr, fwhm)) for i in range(len(self._subjects))] for image in self._images: @@ -536,26 +507,49 @@ def get_images(self): return self._images + +class TsvInput(base.MLInput): + + def __init__(self, input_params): + + super().__init__(input_params) + + import pandas as pd + + self._dataframe = pd.io.parsers.read_csv(input_params['data_tsv'], sep='\t') + + if not input_params['columns']: + raise Exception("List of columns to use as input can not be empty.") + def get_x(self): - """ + self._x = self._dataframe.as_matrix(self._input_params['columns']) + return self._x - Returns: a numpy 2d-array. + def get_y(self): + unique = list(set(self._dataframe["diagnosis"])) + self._y = np.array([unique.index(x) for x in self._dataframe["diagnosis"]]) + return self._y + def get_kernel(self, kernel_function=utils.gram_matrix_linear, recompute_if_exists=False): + """ + Returns: a numpy 2d-array. 
""" - if self._x is not None: - return self._x - print('Loading ' + str(len(self.get_images())) + ' subjects') - self._x, self._orig_shape, self._data_mask = vbio.load_data(self._images, mask=self._mask_zeros) - print('Subjects loaded') + if self._kernel is not None and not recompute_if_exists: + return self._kernel - return self._x + if self._x is None: + self.get_x() - def save_weights_as_nifti(self, weights, output_dir): + cprint("Computing kernel ...") + self._kernel = kernel_function(self._x) + cprint("Kernel computed") + return self._kernel - if self._images is None: - self.get_images() + @staticmethod + def get_default_parameters(): - output_filename = path.join(output_dir, 'weights.nii.gz') - data = vbio.revert_mask(weights, self._data_mask, self._orig_shape) - vbio.weights_to_nifti(data, self._images[0], output_filename) + parameters_dict = {'data_tsv': None, + 'columns': None} + + return parameters_dict diff --git a/clinica/pipelines/machine_learning/ml_utils.py b/clinica/pipelines/machine_learning/ml_utils.py index ea818f766..0076ccdaa 100644 --- a/clinica/pipelines/machine_learning/ml_utils.py +++ b/clinica/pipelines/machine_learning/ml_utils.py @@ -82,3 +82,61 @@ def evaluate_prediction_multiclass(y, y_hat): 'balanced_accuracy': balanced_accuracy} return results + + +def metric_distribution(metric, labels, output_path, num_classes=2, metric_label='balanced accuracy'): + """ + + Distribution plots of various metrics such as balanced accuracy! + + metric is expected to be ndarray of size [num_repetitions, num_datasets] + + """ + # from __future__ import print_function, division + + import numpy as np + import matplotlib.pyplot as plt + from matplotlib import cm + from matplotlib.backends.backend_pdf import PdfPages + + num_repetitions = metric.shape[0] + num_datasets = metric.shape[1] + assert len(labels) == num_datasets, "Differing number of features and labels!" 
+ method_ticks = 1.0 + np.arange(num_datasets) + + fig, ax = plt.subplots(figsize=[9, 9]) + line_coll = ax.violinplot(metric, widths=0.8, bw_method=0.2, + showmedians=True, showextrema=False, + positions=method_ticks) + + cmap = cm.get_cmap('Paired', num_datasets) + for cc, ln in enumerate(line_coll['bodies']): + ln.set_facecolor(cmap(cc)) + ln.set_label(labels[cc]) + + plt.legend(loc=2, ncol=num_datasets) + + ax.tick_params(axis='both', which='major', labelsize=15) + ax.grid(axis='y', which='major') + + lower_lim = np.round(np.min([np.float64(0.9 / num_classes), metric.min()]), 3) + upper_lim = np.round(np.max([1.01, metric.max()]), 3) + step_tick = 0.1 + ax.set_ylim(lower_lim, upper_lim) + + ax.set_xticks(method_ticks) + ax.set_xlim(np.min(method_ticks) - 1, np.max(method_ticks) + 1) + ax.set_xticklabels(labels, rotation=45) # 'vertical' + + ax.set_yticks(np.arange(lower_lim, upper_lim, step_tick)) + ax.set_yticklabels(np.arange(lower_lim, upper_lim, step_tick)) + # plt.xlabel(xlabel, fontsize=16) + plt.ylabel(metric_label, fontsize=16) + + fig.tight_layout() + + pp1 = PdfPages(output_path + '.pdf') + pp1.savefig() + pp1.close() + + return diff --git a/clinica/pipelines/machine_learning/ml_workflows.py b/clinica/pipelines/machine_learning/ml_workflows.py index c54a5afcd..bb1e1ba7d 100644 --- a/clinica/pipelines/machine_learning/ml_workflows.py +++ b/clinica/pipelines/machine_learning/ml_workflows.py @@ -16,318 +16,87 @@ __email__ = "jorge.samper-gonzalez@inria.fr" __status__ = "Development" -# This code is an example of implementation of machine learning pipelines - -class VB_KFold_DualSVM(base.MLWorkflow): - - # First of all, input has to be chosen. 
According to it (CAPSVoxelBasedInput or CAPSRegionBasedInput), - # all the necessary inputs can be found in input.py +class VoxelBasedKFoldDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_folds=10, - grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - - # Here some parameters selected for this task - - self._output_dir = output_dir - self._n_threads = n_threads - self._n_folds = n_folds - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - # In this case we are running a voxel based input approach - # - - self._input = input.CAPSVoxelBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - - # Validation and algorithm will be selected in the next part of code - - self._validation = None - self._algorithm = None - - def run(self): - - # Call on parameters already computed - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - # Now algorithm has been selected, in this case Dual SVM algorithm. 
- # Look at algorithm.py to understand the input necessary for each method - # input parameters were chosen previously - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - # Here validation type is selected, it's the K fold cross-validation - - self._validation = validation.KFoldCV(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_folds=self._n_folds, n_threads=self._n_threads) - - # Creation of the path where all the results will be saved - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - # Here we have selected whant we wanted save - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) + grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - # self._input.save_weights_as_nifti(weights) + super(VoxelBasedKFoldDualSVM, self).__init__(input.CAPSVoxelBasedInput, + validation.KFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) -class VB_RepKFold_DualSVM(base.MLWorkflow): +class VoxelBasedRepKFoldDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, - modulated="on", precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, n_folds=10, - grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._n_folds = n_folds - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - self._input = input.CAPSVoxelBasedInput(caps_directory, 
subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) + modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, + n_folds=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._input.save_weights_as_nifti(weights, classifier_dir) + super(VoxelBasedRepKFoldDualSVM, self).__init__(input.CAPSVoxelBasedInput, + validation.RepeatedKFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) -class VB_RepHoldOut_DualSVM(base.MLWorkflow): +class VoxelBasedRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = 
n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSVoxelBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) + test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), + splits_indices=None): - self._validation.save_results(self._output_dir) + super().__init__(input.CAPSVoxelBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) -class VertexB_RepHoldOut_dualSVM(base.MLWorkflow): +class VertexBasedRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, output_dir, image_type='fdg', fwhm=20, precomputed_kernel=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, 
c_range=np.logspace(-10, 2, 1000), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSVertexBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, fwhm, - image_type, precomputed_kernel) - - self._validation = None - self._algorithm = None + super(VertexBasedRepHoldOutDualSVM, self).__init__(input.CAPSVertexBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._input.save_weights_as_datasurface(weights, classifier_dir) - self._validation.save_results(self._output_dir) - - -class RB_RepHoldOut_DualSVM(base.MLWorkflow): +class RegionBasedRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, 
c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._input.save_weights_as_nifti(weights, classifier_dir) + super(RegionBasedRepHoldOutDualSVM, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - self._validation.save_results(self._output_dir) - -class RB_RepHoldOut_LogisticRegression(base.MLWorkflow): +class RegionBasedRepHoldOutLogisticRegression(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, 
c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - - self._algorithm = algorithm.LogisticReg(x, y, balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) + super(RegionBasedRepHoldOutLogisticRegression, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedHoldOut, + algorithm.LogisticReg, + locals(), + output_dir) - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, classifier_dir) - self._classifier = classifier - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class RB_RepHoldOut_RandomForest(base.MLWorkflow): +class RegionBasedRepHoldOutRandomForest(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, @@ -335,518 +104,108 @@ def __init__(self, caps_directory, 
subjects_visits_tsv, diagnoses_tsv, group_id, max_depth_range=[None], min_samples_split_range=[2], max_features_range=('auto', 0.25, 0.5), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - self._max_features_range = max_features_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None + super(RegionBasedRepHoldOutRandomForest, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedHoldOut, + algorithm.RandomForest, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - - self._algorithm = algorithm.RandomForest(x, y, balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - n_estimators_range=self._n_estimators_range, - max_depth_range=self._max_depth_range, - min_samples_split_range=self._min_samples_split_range, - max_features_range=self._max_features_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, classifier_dir) - - self._input.save_weights_as_nifti(weights, 
classifier_dir) - - self._validation.save_results(self._output_dir) - - -class RB_LearningCurveRepHoldOut_DualSVM(base.MLWorkflow): +class RegionBasedLearningCurveRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, precomputed_kernel=None, n_threads=15, n_iterations=100, test_size=0.3, n_learning_points=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._n_learning_points = n_learning_points - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() + super(RegionBasedLearningCurveRepHoldOutDualSVM, self).__init__(input.CAPSRegionBasedInput, + validation.LearningCurveRepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - self._validation = validation.LearningCurveRepeatedHoldOut(self._algorithm, - n_iterations=self._n_iterations, - test_size=self._test_size, - n_learning_points=self._n_learning_points) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads) - - for learning_point in range(self._n_learning_points): - - learning_point_dir = path.join(self._output_dir, 'learning_split-' + str(learning_point)) - - classifier_dir = path.join(learning_point_dir, 'classifier') - if not path.exists(classifier_dir): - 
os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier[learning_point], classifier_dir) - self._algorithm.save_parameters(best_params[learning_point], classifier_dir) - weights = self._algorithm.save_weights(classifier[learning_point], x, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class VB_LearningCurveRepHoldOut_DualSVM(base.MLWorkflow): +class VoxelBasedLearningCurveRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - test_size=0.3, n_learning_points=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._n_learning_points = n_learning_points - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - self._input = input.CAPSVoxelBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) + test_size=0.3, n_learning_points=10, grid_search_folds=10, balanced=True, + c_range=np.logspace(-6, 2, 17)): - self._validation = None - self._algorithm = None + super(VoxelBasedLearningCurveRepHoldOutDualSVM, self).__init__(input.CAPSVoxelBasedInput, + validation.LearningCurveRepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = 
validation.LearningCurveRepeatedHoldOut(self._algorithm, - n_iterations=self._n_iterations, - test_size=self._test_size, - n_learning_points=self._n_learning_points) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads) - - for learning_point in range(self._n_learning_points): - - learning_point_dir = path.join(self._output_dir, 'learning_split-' + str(learning_point)) - - classifier_dir = path.join(learning_point_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier[learning_point], classifier_dir) - self._algorithm.save_parameters(best_params[learning_point], classifier_dir) - weights = self._algorithm.save_weights(classifier[learning_point], x, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class RB_RepKFold_DualSVM(base.MLWorkflow): +class RegionBasedRepKFoldDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, n_folds=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._n_folds = n_folds - self._splits_indices = splits_indices - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None + super(RegionBasedRepKFoldDualSVM, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedKFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - kernel 
= self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - -class TB_RepHoldOut_DualSVM(base.MLWorkflow): +class CAPSTsvRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSTSVBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, - atlas, dataset, pvc) - - self._validation = None - self._algorithm = None - def run(self): + super(CAPSTsvRepHoldOutDualSVM, self).__init__(input.CAPSTSVBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - self._algorithm = 
algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class TB_RepHoldOut_RandomForest(base.MLWorkflow): +class CAPSTsvRepHoldOutRandomForest(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400), max_depth_range=[None], min_samples_split_range=[2], max_features_range=('auto', 0.25, 0.5), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - self._max_features_range = max_features_range - self._splits_indices = splits_indices - - self._input = input.CAPSTSVBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, dataset, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - - 
self._algorithm = algorithm.RandomForest(x, y, balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - n_estimators_range=self._n_estimators_range, - max_depth_range=self._max_depth_range, - min_samples_split_range=self._min_samples_split_range, - max_features_range=self._max_features_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, classifier_dir) - - # self._input.save_weights_as_nifti(weights, classifier_dir) - - # self._validation.save_results(self._output_dir) - + super(CAPSTsvRepHoldOutRandomForest, self).__init__(input.CAPSTSVBasedInput, + validation.RepeatedHoldOut, + algorithm.RandomForest, + locals(), + output_dir) # SVM reg -class VBREG_RepKFold_DualSVM(base.MLWorkflow): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, - modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - n_folds=10, - test_size=0.1, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), - splits_indices=None): +class VoxelBasedREGRepKFoldDualSVM(base.MLWorkflow): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - self._n_folds = n_folds - self._input = 
input.CAPSVoxelBasedInputREGSVM(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV(self._algorithm) - print('K fold') - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - -class RB_RepHoldOut_RandomForest_Multiclass(base.MLWorkflow): - - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, - output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, - grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400), - max_depth_range=[None], min_samples_split_range=[2], - max_features_range=('auto', 0.25, 0.5), splits_indices=None): - - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - 
self._max_features_range = max_features_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - - self._algorithm = algorithm.RandomForest(x, y, balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - n_estimators_range=self._n_estimators_range, - max_depth_range=self._max_depth_range, - min_samples_split_range=self._min_samples_split_range, - max_features_range=self._max_features_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class VBREG_RepKfold_SVMOV0(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, n_folds=10, - test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), + test_size=0.1, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - 
self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - self._n_folds = n_folds - self._input = input.CAPSVoxelBasedInputREGSVM(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): + super(VoxelBasedREGRepKFoldDualSVM, self).__init__(input.CAPSTSVBasedInput, + validation.RepeatedKFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - self._algorithm = algorithm.OneVsOneSVM(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) +# TSV - self._validation = validation.RepeatedKFoldCV_Multiclass(self._algorithm) +class TsvRepHoldOutRandomForest(base.MLWorkflow): - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class VBREG_RepKfold_SVMOVR(base.MLWorkflow): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, - modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - n_folds=10, - test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), - splits_indices=None): + def __init__(self, data_tsv, columns, output_dir, n_threads=20, n_iterations=250, test_size=0.2, + grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400), max_depth_range=[None], + min_samples_split_range=[2], 
max_features_range=('auto', 0.25, 0.5), splits_indices=None, + inner_cv=False): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - self._n_folds = n_folds - self._input = input.CAPSVoxelBasedInputREGSVM(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.OneVsRestSVM(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV_Multiclass(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) + super(TsvRepHoldOutRandomForest, self).__init__(input.TsvInput, + validation.RepeatedHoldOut, + algorithm.RandomForest, + locals(), + output_dir) diff --git a/clinica/pipelines/machine_learning/validation.py b/clinica/pipelines/machine_learning/validation.py index d743600dc..56d32a65e 100644 --- a/clinica/pipelines/machine_learning/validation.py +++ b/clinica/pipelines/machine_learning/validation.py @@ -21,41 +21,32 @@ class KFoldCV(base.MLValidation): - def __init__(self, ml_algorithm): - self._ml_algorithm = ml_algorithm - self._fold_results = [] - self._classifier = None - self._best_params = None - self._cv = 
None + def validate(self, y): - def validate(self, y, n_folds=10, splits_indices=None, n_threads=15): + if self._validation_params['splits_indices'] is None: + skf = StratifiedKFold(n_splits=self._validation_params['n_folds'], shuffle=True) + self._validation_params['splits_indices'] = list(skf.split(np.zeros(len(y)), y)) - if splits_indices is None: - skf = StratifiedKFold(n_splits=n_folds, shuffle=True) - self._cv = list(skf.split(np.zeros(len(y)), y)) - else: - self._cv = splits_indices - - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - for i in range(n_folds): + for i in range(self._validation_params['n_folds']): - train_index, test_index = self._cv[i] + train_index, test_index = self._validation_params['splits_indices'][i] async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) async_pool.close() async_pool.join() - for i in range(n_folds): - self._fold_results.append(async_result[i].get()) + for i in range(self._validation_params['n_folds']): + self._validation_results.append(async_result[i].get()) - self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._fold_results) + self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._validation_results) - return self._classifier, self._best_params, self._fold_results + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._fold_results is None: + if self._validation_results is None: raise Exception("No results to save. 
Method validate() must be run before save_results().") subjects_folds = [] @@ -65,27 +56,27 @@ def save_results(self, output_dir): if not path.exists(container_dir): os.makedirs(container_dir) - for i in range(len(self._fold_results)): - subjects_df = pd.DataFrame({'y': self._fold_results[i]['y'], - 'y_hat': self._fold_results[i]['y_hat'], - 'y_index': self._fold_results[i]['y_index']}) + for i in range(len(self._validation_results)): + subjects_df = pd.DataFrame({'y': self._validation_results[i]['y'], + 'y_hat': self._validation_results[i]['y_hat'], + 'y_index': self._validation_results[i]['y_index']}) subjects_df.to_csv(path.join(container_dir, 'subjects_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') subjects_folds.append(subjects_df) - results_df = pd.DataFrame({'balanced_accuracy': self._fold_results[i]['evaluation']['balanced_accuracy'], - 'auc': self._fold_results[i]['auc'], - 'accuracy': self._fold_results[i]['evaluation']['accuracy'], - 'sensitivity': self._fold_results[i]['evaluation']['sensitivity'], - 'specificity': self._fold_results[i]['evaluation']['specificity'], - 'ppv': self._fold_results[i]['evaluation']['ppv'], - 'npv': self._fold_results[i]['evaluation']['npv'], - 'train_balanced_accuracy': self._fold_results[i]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._fold_results[i]['evaluation_train']['accuracy'], - 'train_sensitivity': self._fold_results[i]['evaluation_train']['sensitivity'], - 'train_specificity': self._fold_results[i]['evaluation_train']['specificity'], - 'train_ppv': self._fold_results[i]['evaluation_train']['ppv'], - 'train_npv': self._fold_results[i]['evaluation_train']['npv'] + results_df = pd.DataFrame({'balanced_accuracy': self._validation_results[i]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[i]['auc'], + 'accuracy': self._validation_results[i]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[i]['evaluation']['sensitivity'], + 
'specificity': self._validation_results[i]['evaluation']['specificity'], + 'ppv': self._validation_results[i]['evaluation']['ppv'], + 'npv': self._validation_results[i]['evaluation']['npv'], + 'train_balanced_accuracy': self._validation_results[i]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[i]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[i]['evaluation_train']['sensitivity'], + 'train_specificity': self._validation_results[i]['evaluation_train']['specificity'], + 'train_ppv': self._validation_results[i]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[i]['evaluation_train']['npv'] }, index=['i', ]) results_df.to_csv(path.join(container_dir, 'results_fold-' + str(i) + '.tsv'), @@ -110,53 +101,66 @@ def save_results(self, output_dir): print("sensitivity: %s" % (mean_results['sensitivity'].to_string(index=False))) print("auc: %s" % (mean_results['auc'].to_string(index=False))) + @staticmethod + def get_default_parameters(): + + parameters_dict = {'n_folds': 10, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} + + return parameters_dict + class RepeatedKFoldCV(base.MLValidation): - def __init__(self, ml_algorithm): - self._ml_algorithm = ml_algorithm - self._repeated_fold_results = [] - self._classifier = None - self._best_params = None - self._cv = None + def validate(self, y): - def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): + if self._validation_params['splits_indices'] is None: + self._validation_params['splits_indices'] = [] + + for i in range(self._validation_params['n_iterations']): + skf = StratifiedKFold(n_splits=self._validation_params['n_folds'], shuffle=True) + self._validation_params['splits_indices'].append(list(skf.split(np.zeros(len(y)), y))) - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - self._cv = [] - for r in range(n_iterations): - skf = 
StratifiedKFold(n_splits=n_folds, shuffle=True) - self._cv.append(list(skf.split(np.zeros(len(y)), y))) + for i in range(self._validation_params['n_iterations']): + + train_index, test_index = self._validation_params['splits_indices'][i] + async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) + + for r in range(self._validation_params['n_iterations']): + async_result[r] = {} - self._repeated_fold_results.append([]) + self._validation_results.append([]) - for i in range(n_folds): + for i in range(self._validation_params['n_folds']): - train_index, test_index = self._cv[r][i] + train_index, test_index = self._validation_params['splits_indices'][r][i] async_result[r][i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) async_pool.close() async_pool.join() - for r in range(n_iterations): - for i in range(n_folds): - self._repeated_fold_results[r].append(async_result[r][i].get()) + for r in range(self._validation_params['n_iterations']): + for i in range(self._validation_params['n_folds']): + self._validation_results[r].append(async_result[r][i].get()) # TODO Find a better way to estimate best parameter - flat_results = [result for fold in self._repeated_fold_results for result in fold] + flat_results = [result for fold in self._validation_results for result in fold] self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(flat_results) - return self._classifier, self._best_params, self._repeated_fold_results + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._repeated_fold_results is None: + if self._validation_results is None: raise Exception("No results to save. 
Method validate() must be run before save_results().") all_results_list = [] all_subjects_list = [] - for iteration in range(len(self._repeated_fold_results)): + for iteration in range(len(self._validation_results)): iteration_dir = path.join(output_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): @@ -169,28 +173,28 @@ def save_results(self, output_dir): if not path.exists(folds_dir): os.makedirs(folds_dir) - for i in range(len(self._repeated_fold_results[iteration])): - subjects_df = pd.DataFrame({'y': self._repeated_fold_results[iteration][i]['y'], - 'y_hat': self._repeated_fold_results[iteration][i]['y_hat'], - 'y_index': self._repeated_fold_results[iteration][i]['y_index']}) + for i in range(len(self._validation_results[iteration])): + subjects_df = pd.DataFrame({'y': self._validation_results[iteration][i]['y'], + 'y_hat': self._validation_results[iteration][i]['y_hat'], + 'y_index': self._validation_results[iteration][i]['y_index']}) subjects_df.to_csv(path.join(folds_dir, 'subjects_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') iteration_subjects_list.append(subjects_df) results_df = pd.DataFrame( - {'balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation']['balanced_accuracy'], - 'auc': self._repeated_fold_results[iteration][i]['auc'], - 'accuracy': self._repeated_fold_results[iteration][i]['evaluation']['accuracy'], - 'sensitivity': self._repeated_fold_results[iteration][i]['evaluation']['sensitivity'], - 'specificity': self._repeated_fold_results[iteration][i]['evaluation']['specificity'], - 'ppv': self._repeated_fold_results[iteration][i]['evaluation']['ppv'], - 'npv': self._repeated_fold_results[iteration][i]['evaluation']['npv'], - 'train_balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['accuracy'], - 'train_sensitivity': 
self._repeated_fold_results[iteration][i]['evaluation_train']['sensitivity'], - 'train_specificity': self._repeated_fold_results[iteration][i]['evaluation_train']['specificity'], - 'train_ppv': self._repeated_fold_results[iteration][i]['evaluation_train']['ppv'], - 'train_npv': self._repeated_fold_results[iteration][i]['evaluation_train']['npv'] + {'balanced_accuracy': self._validation_results[iteration][i]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[iteration][i]['auc'], + 'accuracy': self._validation_results[iteration][i]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[iteration][i]['evaluation']['sensitivity'], + 'specificity': self._validation_results[iteration][i]['evaluation']['specificity'], + 'ppv': self._validation_results[iteration][i]['evaluation']['ppv'], + 'npv': self._validation_results[iteration][i]['evaluation']['npv'], + 'train_balanced_accuracy': self._validation_results[iteration][i]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[iteration][i]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[iteration][i]['evaluation_train']['sensitivity'], + 'train_specificity': self._validation_results[iteration][i]['evaluation_train']['specificity'], + 'train_ppv': self._validation_results[iteration][i]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[iteration][i]['evaluation_train']['npv'] }, index=['i', ]) results_df.to_csv(path.join(folds_dir, 'results_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') @@ -230,36 +234,34 @@ def save_results(self, output_dir): print("sensitivity: %s" % (mean_results_df['sensitivity'].to_string(index=False))) print("auc: %s" % (mean_results_df['auc'].to_string(index=False))) + @staticmethod + def get_default_parameters(): + + parameters_dict = {'n_iterations': 100, + 'n_folds': 10, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} + + return parameters_dict 
+ class RepeatedHoldOut(base.MLValidation): - def __init__(self, ml_algorithm, n_iterations=100, test_size=0.3): - self._ml_algorithm = ml_algorithm - self._split_results = [] - self._classifier = None - self._best_params = None - self._cv = None - self._n_iterations = n_iterations - self._test_size = test_size - self._error_resampled_t = None - self._error_corrected_resampled_t = None - self._bal_accuracy_resampled_t = None - self._bal_accuracy_corrected_resampled_t = None - - def validate(self, y, n_threads=15, splits_indices=None, inner_cv=True): - - if splits_indices is None: - splits = StratifiedShuffleSplit(n_splits=self._n_iterations, test_size=self._test_size) - self._cv = list(splits.split(np.zeros(len(y)), y)) - else: - self._cv = splits_indices - async_pool = ThreadPool(n_threads) + def validate(self, y): + + if self._validation_params['splits_indices'] is None: + splits = StratifiedShuffleSplit(n_splits=self._validation_params['n_iterations'], + test_size=self._validation_params['test_size']) + self._validation_params['splits_indices'] = list(splits.split(np.zeros(len(y)), y)) + + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - for i in range(self._n_iterations): + for i in range(self._validation_params['n_iterations']): - train_index, test_index = self._cv[i] - if inner_cv: + train_index, test_index = self._validation_params['splits_indices'][i] + if self._validation_params['inner_cv']: async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) else: async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate_no_cv, (train_index, test_index)) @@ -267,55 +269,55 @@ def validate(self, y, n_threads=15, splits_indices=None, inner_cv=True): async_pool.close() async_pool.join() - for i in range(self._n_iterations): - self._split_results.append(async_result[i].get()) + for i in range(self._validation_params['n_iterations']): + 
self._validation_results.append(async_result[i].get()) - self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._split_results) - return self._classifier, self._best_params, self._split_results + self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._validation_results) + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._split_results is None: + if self._validation_results is None: raise Exception("No results to save. Method validate() must be run before save_results().") all_results_list = [] all_train_subjects_list = [] all_test_subjects_list = [] - for iteration in range(len(self._split_results)): + for iteration in range(len(self._validation_results)): iteration_dir = path.join(output_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): os.makedirs(iteration_dir) iteration_train_subjects_df = pd.DataFrame({'iteration': iteration, - 'y': self._split_results[iteration]['y_train'], - 'y_hat': self._split_results[iteration]['y_hat_train'], - 'subject_index': self._split_results[iteration]['x_index']}) + 'y': self._validation_results[iteration]['y_train'], + 'y_hat': self._validation_results[iteration]['y_hat_train'], + 'subject_index': self._validation_results[iteration]['x_index']}) iteration_train_subjects_df.to_csv(path.join(iteration_dir, 'train_subjects.tsv'), index=False, sep='\t', encoding='utf-8') all_train_subjects_list.append(iteration_train_subjects_df) iteration_test_subjects_df = pd.DataFrame({'iteration': iteration, - 'y': self._split_results[iteration]['y'], - 'y_hat': self._split_results[iteration]['y_hat'], - 'subject_index': self._split_results[iteration]['y_index']}) + 'y': self._validation_results[iteration]['y'], + 'y_hat': self._validation_results[iteration]['y_hat'], + 'subject_index': self._validation_results[iteration]['y_index']}) iteration_test_subjects_df.to_csv(path.join(iteration_dir, 
'test_subjects.tsv'), index=False, sep='\t', encoding='utf-8') all_test_subjects_list.append(iteration_test_subjects_df) iteration_results_df = pd.DataFrame( - {'balanced_accuracy': self._split_results[iteration]['evaluation']['balanced_accuracy'], - 'auc': self._split_results[iteration]['auc'], - 'accuracy': self._split_results[iteration]['evaluation']['accuracy'], - 'sensitivity': self._split_results[iteration]['evaluation']['sensitivity'], - 'specificity': self._split_results[iteration]['evaluation']['specificity'], - 'ppv': self._split_results[iteration]['evaluation']['ppv'], - 'npv': self._split_results[iteration]['evaluation']['npv'], - 'train_balanced_accuracy': self._split_results[iteration]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._split_results[iteration]['evaluation_train']['accuracy'], - 'train_sensitivity': self._split_results[iteration]['evaluation_train']['sensitivity'], - 'train_specificity': self._split_results[iteration]['evaluation_train']['specificity'], - 'train_ppv': self._split_results[iteration]['evaluation_train']['ppv'], - 'train_npv': self._split_results[iteration]['evaluation_train']['npv'] + {'balanced_accuracy': self._validation_results[iteration]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[iteration]['auc'], + 'accuracy': self._validation_results[iteration]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[iteration]['evaluation']['sensitivity'], + 'specificity': self._validation_results[iteration]['evaluation']['specificity'], + 'ppv': self._validation_results[iteration]['evaluation']['ppv'], + 'npv': self._validation_results[iteration]['evaluation']['npv'], + 'train_balanced_accuracy': self._validation_results[iteration]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[iteration]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[iteration]['evaluation_train']['sensitivity'], + 
'train_specificity': self._validation_results[iteration]['evaluation_train']['specificity'], + 'train_ppv': self._validation_results[iteration]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[iteration]['evaluation_train']['npv'] }, index=['i', ]) iteration_results_df.to_csv(path.join(iteration_dir, 'results.tsv'), index=False, sep='\t', encoding='utf-8') @@ -349,156 +351,99 @@ def save_results(self, output_dir): print("sensitivity: %s" % (mean_results_df['sensitivity'].to_string(index=False))) print("auc: %s" % (mean_results_df['auc'].to_string(index=False))) - self.compute_error_variance() - self.compute_accuracy_variance() + @staticmethod + def get_default_parameters(): - variance_df = pd.DataFrame({'bal_accuracy_resampled_t': self._bal_accuracy_resampled_t, - 'bal_accuracy_corrected_resampled_t': self._bal_accuracy_corrected_resampled_t, - 'error_resampled_t': self._error_resampled_t, - 'error_corrected_resampled_t': self._error_corrected_resampled_t}, index=[0, ]) + parameters_dict = {'n_iterations': 100, + 'test_size': 0.2, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} - variance_df.to_csv(path.join(output_dir, 'variance.tsv'), - index=False, sep='\t', encoding='utf-8') - - def _compute_variance(self, test_error_split): - - # compute average test error - num_split = len(self._split_results) # J in the paper - - # compute mu_{n_1}^{n_2} - average_test_error = np.mean(test_error_split) - - approx_variance = np.sum((test_error_split - average_test_error)**2)/(num_split - 1) - - # compute variance (point 2 and 6 of Nadeau's paper) - resampled_t = approx_variance / num_split - corrected_resampled_t = (1/num_split + self._test_size/(1 - self._test_size)) * approx_variance - - return resampled_t, corrected_resampled_t - - def compute_error_variance(self): - num_split = len(self._split_results) - test_error_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): 
- test_error_split[i] = self._compute_average_test_error(self._split_results[i]['y'], - self._split_results[i]['y_hat']) - - self._error_resampled_t, self._error_corrected_resampled_t = self._compute_variance(test_error_split) - - return self._error_resampled_t, self._error_corrected_resampled_t + return parameters_dict - def _compute_average_test_error(self, y_list, yhat_list): - # return the average test error (denoted mu_j hat) - return float(len(np.where(y_list != yhat_list)[0]))/float(len(y_list)) - def compute_accuracy_variance(self): - num_split = len(self._split_results) - test_accuracy_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): - test_accuracy_split[i] = self._compute_average_test_accuracy(self._split_results[i]['y'], - self._split_results[i]['y_hat']) - - self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t = self._compute_variance(test_accuracy_split) - - return self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t - - def _compute_average_test_accuracy(self, y_list, yhat_list): - - from clinica.pipelines.machine_learning.ml_utils import evaluate_prediction - - return evaluate_prediction(y_list, yhat_list)['balanced_accuracy'] +class LearningCurveRepeatedHoldOut(base.MLValidation): + def validate(self, y): -class LearningCurveRepeatedHoldOut(base.MLValidation): + if self._validation_params['splits_indices'] is None: + splits = StratifiedShuffleSplit(n_splits=self._validation_params['n_iterations'], + test_size=self._validation_params['test_size']) + self._validation_params['splits_indices'] = list(splits.split(np.zeros(len(y)), y)) - def __init__(self, ml_algorithm, n_iterations=100, test_size=0.3, n_learning_points=10): - self._ml_algorithm = ml_algorithm - self._split_results = [] - self._classifier = None - self._best_params = None - self._cv = None - self._n_iterations = n_iterations - self._test_size = test_size - 
self._n_learning_points = n_learning_points - self._error_resampled_t = None - self._error_corrected_resampled_t = None - self._bal_accuracy_resampled_t = None - self._bal_accuracy_corrected_resampled_t = None - - def validate(self, y, n_threads=15): - - splits = StratifiedShuffleSplit(n_splits=self._n_iterations, test_size=self._test_size) - self._cv = list(splits.split(np.zeros(len(y)), y)) - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - for i in range(self._n_iterations): - train_index, test_index = self._cv[i] + for i in range(self._validation_params['n_iterations']): + train_index, test_index = self._validation_params['splits_indices'][i] async_result[i] = {} - skf = StratifiedKFold(n_splits=self._n_learning_points, shuffle=False) - inner_cv = list(skf.split(np.zeros(len(y[train_index])), y[train_index])) + skf = StratifiedKFold(n_splits=self._validation_params['n_learning_points'], shuffle=False) + inner_cv_splits = list(skf.split(np.zeros(len(y[train_index])), y[train_index])) - for j in range(self._n_learning_points): - inner_train_index = np.concatenate([indexes[1] for indexes in inner_cv[:j + 1]]).ravel() - async_result[i][j] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index[inner_train_index], test_index)) + for j in range(self._validation_params['n_learning_points']): + inner_train_index = np.concatenate([indexes[1] for indexes in + inner_cv_splits[:j + 1]]).ravel() + async_result[i][j] = async_pool.apply_async(self._ml_algorithm.evaluate, + (train_index[inner_train_index], test_index)) async_pool.close() async_pool.join() - for j in range(self._n_learning_points): + for j in range(self._validation_params['n_learning_points']): learning_point_results = [] - for i in range(self._n_iterations): + for i in range(self._validation_params['n_iterations']): learning_point_results.append(async_result[i][j].get()) - self._split_results.append(learning_point_results) + 
self._validation_results.append(learning_point_results) self._classifier = [] self._best_params = [] - for j in range(self._n_learning_points): - classifier, best_params = self._ml_algorithm.apply_best_parameters(self._split_results[j]) + for j in range(self._validation_params['n_learning_points']): + classifier, best_params = self._ml_algorithm.apply_best_parameters(self._validation_results[j]) self._classifier.append(classifier) self._best_params.append(best_params) - return self._classifier, self._best_params, self._split_results + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._split_results is None: + if self._validation_results is None: raise Exception("No results to save. Method validate() must be run before save_results().") - for learning_point in range(self._n_learning_points): + for learning_point in range(self._validation_params['n_learning_points']): all_results_list = [] all_subjects_list = [] learning_point_dir = path.join(output_dir, 'learning_split-' + str(learning_point)) - for iteration in range(self._n_iterations): + for iteration in range(self._validation_params['n_iterations']): iteration_dir = path.join(learning_point_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): os.makedirs(iteration_dir) - iteration_subjects_df = pd.DataFrame({'y': self._split_results[learning_point][iteration]['y'], - 'y_hat': self._split_results[learning_point][iteration]['y_hat'], - 'y_index': self._split_results[learning_point][iteration]['y_index']}) + iteration_subjects_df = pd.DataFrame({'y': self._validation_results[learning_point][iteration]['y'], + 'y_hat': self._validation_results[learning_point][iteration]['y_hat'], + 'y_index': self._validation_results[learning_point][iteration]['y_index']}) iteration_subjects_df.to_csv(path.join(iteration_dir, 'subjects.tsv'), index=False, sep='\t', encoding='utf-8') all_subjects_list.append(iteration_subjects_df) iteration_results_df 
= pd.DataFrame( - {'balanced_accuracy': self._split_results[learning_point][iteration]['evaluation']['balanced_accuracy'], - 'auc': self._split_results[learning_point][iteration]['auc'], - 'accuracy': self._split_results[learning_point][iteration]['evaluation']['accuracy'], - 'sensitivity': self._split_results[learning_point][iteration]['evaluation']['sensitivity'], - 'specificity': self._split_results[learning_point][iteration]['evaluation']['specificity'], - 'ppv': self._split_results[learning_point][iteration]['evaluation']['ppv'], - 'npv': self._split_results[learning_point][iteration]['evaluation']['npv'], - 'train_balanced_accuracy': self._split_results[learning_point][iteration]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._split_results[learning_point][iteration]['evaluation_train']['accuracy'], - 'train_sensitivity': self._split_results[learning_point][iteration]['evaluation_train']['sensitivity'], - 'train_specificity': self._split_results[learning_point][iteration]['evaluation_train']['specificity'], - 'train_ppv': self._split_results[learning_point][iteration]['evaluation_train']['ppv'], - 'train_npv': self._split_results[learning_point][iteration]['evaluation_train']['npv']}, index=['i', ]) + {'balanced_accuracy': self._validation_results[learning_point][iteration]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[learning_point][iteration]['auc'], + 'accuracy': self._validation_results[learning_point][iteration]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[learning_point][iteration]['evaluation']['sensitivity'], + 'specificity': self._validation_results[learning_point][iteration]['evaluation']['specificity'], + 'ppv': self._validation_results[learning_point][iteration]['evaluation']['ppv'], + 'npv': self._validation_results[learning_point][iteration]['evaluation']['npv'], + 'train_balanced_accuracy': 
self._validation_results[learning_point][iteration]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[learning_point][iteration]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[learning_point][iteration]['evaluation_train']['sensitivity'], + 'train_specificity': self._validation_results[learning_point][iteration]['evaluation_train']['specificity'], + 'train_ppv': self._validation_results[learning_point][iteration]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[learning_point][iteration]['evaluation_train']['npv']}, index=['i', ]) iteration_results_df.to_csv(path.join(iteration_dir, 'results.tsv'), index=False, sep='\t', encoding='utf-8') @@ -522,78 +467,31 @@ def save_results(self, output_dir): mean_results_df.to_csv(path.join(learning_point_dir, 'mean_results.tsv'), index=False, sep='\t', encoding='utf-8') - self.compute_error_variance(learning_point) - self.compute_accuracy_variance(learning_point) - - variance_df = pd.DataFrame({'bal_accuracy_resampled_t': self._bal_accuracy_resampled_t, - 'bal_accuracy_corrected_resampled_t': self._bal_accuracy_corrected_resampled_t, - 'error_resampled_t': self._error_resampled_t, - 'error_corrected_resampled_t': self._error_corrected_resampled_t}, index=[0, ]) - - variance_df.to_csv(path.join(learning_point_dir, 'variance.tsv'), - index=False, sep='\t', encoding='utf-8') - - def _compute_variance(self, test_error_split): - - # compute average test error - num_split = self._n_iterations # J in the paper - - # compute mu_{n_1}^{n_2} - average_test_error = np.mean(test_error_split) - - approx_variance = np.sum((test_error_split - average_test_error)**2)/(num_split - 1) - - # compute variance (point 2 and 6 of Nadeau's paper) - resampled_t = approx_variance / num_split - corrected_resampled_t = (1/num_split + self._test_size/(1 - self._test_size)) * approx_variance - - return resampled_t, corrected_resampled_t - - def 
compute_error_variance(self, learning_point): - num_split = self._n_iterations - test_error_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): - test_error_split[i] = self._compute_average_test_error(self._split_results[learning_point][i]['y'], - self._split_results[learning_point][i]['y_hat']) - - self._error_resampled_t, self._error_corrected_resampled_t = self._compute_variance(test_error_split) - - return self._error_resampled_t, self._error_corrected_resampled_t - - def _compute_average_test_error(self, y_list, yhat_list): - # return the average test error (denoted mu_j hat) - return float(len(np.where(y_list != yhat_list)[0]))/float(len(y_list)) - - def compute_accuracy_variance(self, learning_point): - num_split = self._n_iterations - test_accuracy_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): - test_accuracy_split[i] = self._compute_average_test_accuracy(self._split_results[learning_point][i]['y'], - self._split_results[learning_point][i]['y_hat']) - - self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t = self._compute_variance(test_accuracy_split) - - return self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t - - def _compute_average_test_accuracy(self, y_list, yhat_list): + @staticmethod + def get_default_parameters(): - from clinica.pipelines.machine_learning.ml_utils import evaluate_prediction + parameters_dict = {'n_iterations': 100, + 'test_size': 0.2, + 'n_learning_points': 10, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} - return evaluate_prediction(y_list, yhat_list)['balanced_accuracy'] + return parameters_dict class RepeatedKFoldCV_Multiclass(base.MLValidation): def __init__(self, ml_algorithm): self._ml_algorithm = ml_algorithm - self._repeated_fold_results = [] + self._repeated_validation_results = [] self._classifier = None 
 self._best_params = None self._cv = None def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(n_threads) async_result = {} self._cv = [] @@ -601,7 +499,7 @@ def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): skf = StratifiedKFold(n_splits=n_folds, shuffle=True) self._cv.append(list(skf.split(np.zeros(len(y)), y))) async_result[r] = {} - self._repeated_fold_results.append([]) + self._repeated_validation_results.append([]) for i in range(n_folds): @@ -612,22 +510,22 @@ def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): async_pool.close() async_pool.join() for r in range(n_iterations): for i in range(n_folds): - self._repeated_fold_results[r].append(async_result[r][i].get()) + self._repeated_validation_results[r].append(async_result[r][i].get()) # TODO Find a better way to estimate best parameter - flat_results = [result for fold in self._repeated_fold_results for result in fold] + flat_results = [result for fold in self._repeated_validation_results for result in fold] self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(flat_results) - return self._classifier, self._best_params, self._repeated_fold_results + return self._classifier, self._best_params, self._repeated_validation_results def save_results(self, output_dir): - if self._repeated_fold_results is None: + if self._repeated_validation_results is None: raise Exception("No results to save. 
Method validate() must be run before save_results().") all_results_list = [] all_subjects_list = [] - for iteration in range(len(self._repeated_fold_results)): + for iteration in range(len(self._repeated_validation_results)): iteration_dir = path.join(output_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): @@ -640,19 +538,19 @@ def save_results(self, output_dir): if not path.exists(folds_dir): os.makedirs(folds_dir) - for i in range(len(self._repeated_fold_results[iteration])): - subjects_df = pd.DataFrame({'y': self._repeated_fold_results[iteration][i]['y'], - 'y_hat': self._repeated_fold_results[iteration][i]['y_hat'], - 'y_index': self._repeated_fold_results[iteration][i]['y_index']}) + for i in range(len(self._repeated_validation_results[iteration])): + subjects_df = pd.DataFrame({'y': self._repeated_validation_results[iteration][i]['y'], + 'y_hat': self._repeated_validation_results[iteration][i]['y_hat'], + 'y_index': self._repeated_validation_results[iteration][i]['y_index']}) subjects_df.to_csv(path.join(folds_dir, 'subjects_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') iteration_subjects_list.append(subjects_df) results_df = pd.DataFrame( - {'balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation']['balanced_accuracy'], - 'accuracy': self._repeated_fold_results[iteration][i]['evaluation']['accuracy'], - 'train_balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['accuracy'] + {'balanced_accuracy': self._repeated_validation_results[iteration][i]['evaluation']['balanced_accuracy'], + 'accuracy': self._repeated_validation_results[iteration][i]['evaluation']['accuracy'], + 'train_balanced_accuracy': self._repeated_validation_results[iteration][i]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': 
self._repeated_validation_results[iteration][i]['evaluation_train']['accuracy'] }, index=['i', ]) results_df.to_csv(path.join(folds_dir, 'results_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') diff --git a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py index 3193f3dfb..5ca904836 100644 --- a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py +++ b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py @@ -16,21 +16,31 @@ def define_description(self): def define_options(self): """Define the sub-command arguments.""" + from colorama import Fore from clinica.engine.cmdparser import PIPELINE_CATEGORIES # Clinica compulsory arguments (e.g. BIDS, CAPS, group_id) clinica_comp = self._args.add_argument_group(PIPELINE_CATEGORIES['CLINICA_COMPULSORY']) clinica_comp.add_argument("caps_directory", help='Path to the CAPS directory.') - clinica_comp.add_argument("group_id", + clinica_comp.add_argument("group_label", help='User-defined identifier for the provided group of subjects.') + clinica_comp.add_argument("orig_input_data", + help='''Origin of input data. Type + 't1-volume' to use gray matter maps or + 'pet-volume' to use SUVr maps.''', + choices=['t1-volume', 'pet-volume'], + ) # Optional arguments - optional = self._args.add_argument_group(PIPELINE_CATEGORIES['OPTIONAL']) - optional.add_argument("-it", "--image_type", - default='t1', - help='Imaging modality. Can be t1 or pet (default: --image_type %(default)s)') - optional.add_argument("-pt", "--pet_tracer", - default='fdg', - help='PET tracer. Can be fdg or av45 (default: --pet_tracer %(default)s)') + optional_pet = self._args.add_argument_group( + '%sPipeline options if you use inputs from pet-volume pipeline%s' % + (Fore.BLUE, Fore.RESET) + ) + optional_pet.add_argument("-pt", "--pet_tracer", + default='fdg', + help='PET tracer. 
Can be fdg or av45 (default: --pet_tracer %(default)s)') + optional_pet.add_argument("-no_pvc", "--no_pvc", + action='store_true', default=False, + help="Force the use of non PVC PET data (by default, PVC PET data are used)") # Clinica standard arguments (e.g. --n_procs) self.add_clinica_standard_arguments() # Advanced arguments (i.e. tricky parameters) @@ -40,9 +50,6 @@ def define_options(self): help='Amount of regularization (in mm). In practice, we found the default value ' '(--full_width_half_maximum %(default)s) to be optimal. We therefore ' 'do not recommend to change it unless you have a specific reason to do so.') - advanced.add_argument("-no_pvc", "--no_pvc", - action='store_true', default=False, - help="Force the use of non PVC PET data (by default, PVC PET data are used)") def run_command(self, args): """Run the pipeline with defined args.""" @@ -51,11 +58,11 @@ def run_command(self, args): from clinica.utils.ux import print_end_pipeline, print_crash_files_and_exit parameters = { - 'group_id': args.group_id, - 'fwhm': args.fwhm, - 'image_type': args.image_type, + 'group_label': args.group_label, + 'orig_input_data': args.orig_input_data, 'pet_tracer': args.pet_tracer, - 'no_pvc': args.no_pvc + 'no_pvc': args.no_pvc, + 'fwhm': args.fwhm, } pipeline = SpatialSVM( caps_directory=self.absolute_path(args.caps_directory), diff --git a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py index 1fc757887..e7f12d49b 100644 --- a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py +++ b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py @@ -6,11 +6,6 @@ class SpatialSVM(cpe.Pipeline): """SpatialSVM - Prepare input data for SVM with spatial and anatomical regularization. - Args: - input_dir: A BIDS directory. - output_dir: An empty output directory where CAPS structured data will be written. 
- subjects_sessions_list: The Subjects-Sessions list file (in .tsv format). - Returns: A clinica pipeline object containing the SpatialSVM pipeline. @@ -21,18 +16,18 @@ def check_pipeline_parameters(self): """Check pipeline parameters.""" from clinica.utils.group import check_group_label - if 'group_id' not in self.parameters.keys(): - raise KeyError('Missing compulsory group_id key in pipeline parameter.') + if 'group_label' not in self.parameters.keys(): + raise KeyError('Missing compulsory group_label key in pipeline parameter.') + if 'orig_input_data' not in self.parameters.keys(): + raise KeyError('Missing compulsory orig_input_data key in pipeline parameter.') if 'fwhm' not in self.parameters.keys(): self.parameters['fwhm'] = 4 - if 'image_type' not in self.parameters.keys(): - self.parameters['image_type'] = 't1' if 'pet_tracer' not in self.parameters.keys(): self.parameters['pet_tracer'] = 'fdg' if 'no_pvc' not in self.parameters.keys(): self.parameters['no_pvc'] = False - check_group_label(self.parameters['group_id']) + check_group_label(self.parameters['group_label']) def check_custom_dependencies(self): """Check dependencies that can not be listed in the `info.json` file. @@ -70,11 +65,11 @@ def build_input_node(self): from clinica.utils.ux import print_groups_in_caps_directory # Check that group already exists - if not os.path.exists(os.path.join(self.caps_directory, 'groups', 'group-' + self.parameters['group_id'])): + if not os.path.exists(os.path.join(self.caps_directory, 'groups', 'group-' + self.parameters['group_label'])): print_groups_in_caps_directory(self.caps_directory) raise ClinicaException( '%sGroup %s does not exist. 
 Did you run pet-volume, t1-volume or t1-volume-create-dartel pipeline?%s' % - (Fore.RED, self.parameters['group_id'], Fore.RESET) + (Fore.RED, self.parameters['group_label'], Fore.RESET) ) read_parameters_node = npe.Node(name="LoadingCLIArguments", @@ -82,30 +77,30 @@ def build_input_node(self): mandatory_inputs=True)) all_errors = [] - if self.parameters['image_type'] == 't1': + if self.parameters['orig_input_data'] == 't1-volume': caps_files_information = { - 'pattern': os.path.join('t1', 'spm', 'dartel', 'group-' + self.parameters['group_id'], + 'pattern': os.path.join('t1', 'spm', 'dartel', 'group-' + self.parameters['group_label'], '*_T1w_segm-graymatter_space-Ixi549Space_modulated-on_probability.nii.gz'), 'description': 'graymatter tissue segmented in T1w MRI in Ixi549 space', 'needed_pipeline': 't1-volume-tissue-segmentation' } - elif self.parameters['image_type'] is 'pet': + elif self.parameters['orig_input_data'] == 'pet-volume': if self.parameters['no_pvc']: caps_files_information = { - 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_id'], + 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_label'], '*_pet_space-Ixi549Space_suvr-pons_pet.nii.gz'), 'description': self.parameters['pet_tracer'] + ' PET in Ixi549 space', 'needed_pipeline': 'pet-volume' } else: caps_files_information = { - 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_id'], + 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_label'], '*_pet_space-Ixi549Space_pvc-rbv_suvr-pons_pet.nii.gz'), 'description': self.parameters['pet_tracer'] + ' PET partial volume corrected (RBV) in Ixi549 space', 'needed_pipeline': 'pet-volume with PVC' } else: - raise ValueError('Image type ' + self.parameters['image_type'] + ' unknown.') + raise ValueError('Image type ' + self.parameters['orig_input_data'] + ' unknown.') try: input_image = clinica_file_reader(self.subjects, @@ -117,7 
+112,7 @@ def build_input_node(self): try: dartel_input = clinica_group_reader(self.caps_directory, - t1_volume_final_group_template(self.parameters['group_id'])) + t1_volume_final_group_template(self.parameters['group_label'])) except ClinicaException as e: all_errors.append(e) @@ -175,33 +170,33 @@ def build_core_nodes(self): name='sinker') datasink.inputs.base_directory = self.caps_directory datasink.inputs.parameterization = True - if self.parameters['image_type'] == 't1': + if self.parameters['orig_input_data'] == 't1-volume': datasink.inputs.regexp_substitutions = [ (r'(.*)/regularized_image/.*/(.*(sub-(.*)_ses-(.*))_T1w(.*)_probability(.*))$', r'\1/subjects/sub-\4/ses-\5/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'/\3_T1w\6_spatialregularization\7'), + 'group_label'] + r'/\3_T1w\6_spatialregularization\7'), (r'(.*)json_file/(output_data.json)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'_space-Ixi549Space_parameters.json'), + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ + 'group_label'] + r'_space-Ixi549Space_parameters.json'), (r'(.*)fisher_tensor_path/(output_fisher_tensor.npy)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'_space-Ixi549Space_gram.npy') + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ + 'group_label'] + r'_space-Ixi549Space_gram.npy') ] - elif self.parameters['image_type'] == 'pet': + elif self.parameters['orig_input_data'] == 'pet-volume': datasink.inputs.regexp_substitutions = [ (r'(.*)/regularized_image/.*/(.*(sub-(.*)_ses-(.*))_(task.*)_pet(.*))$', r'\1/subjects/sub-\4/ses-\5/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + 
r'/\3_\6_spatialregularization\7'), + 'group_label'] + r'/\3_\6_spatialregularization\7'), (r'(.*)json_file/(output_data.json)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + - self.parameters['group_id'] + r'_space-Ixi549Space_parameters.json'), + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + + self.parameters['group_label'] + r'_space-Ixi549Space_parameters.json'), (r'(.*)fisher_tensor_path/(output_fisher_tensor.npy)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'_space-Ixi549Space_gram.npy') + 'group_label'] + r'_space-Ixi549Space_gram.npy') ] # Connection # ========== diff --git a/clinica/pipelines/t1_linear/t1_linear_cli.py b/clinica/pipelines/t1_linear/t1_linear_cli.py index cda8cb289..804901f66 100644 --- a/clinica/pipelines/t1_linear/t1_linear_cli.py +++ b/clinica/pipelines/t1_linear/t1_linear_cli.py @@ -32,8 +32,10 @@ def define_options(self): # Clinica optional arguments optional = self._args.add_argument_group(PIPELINE_CATEGORIES['OPTIONAL']) - optional.add_argument("-cp", "--crop_image", - help='Crop the image using a template (suggested for using with DL models)', + optional.add_argument("-ui", "--uncropped_image", + help='''Do not crop the image with template + (cropped image are suggested for using with DL + models)''', action='store_true', default=False) @@ -47,7 +49,7 @@ def run_command(self, args): from clinica.utils.ux import print_end_pipeline, print_crash_files_and_exit parameters = { - 'crop_image': args.crop_image + 'uncropped_image': args.uncropped_image } # Most of the time, you will want to instantiate your pipeline with a diff --git a/clinica/pipelines/t1_linear/t1_linear_pipeline.py 
b/clinica/pipelines/t1_linear/t1_linear_pipeline.py index 63c51e8fe..5cc18e290 100644 --- a/clinica/pipelines/t1_linear/t1_linear_pipeline.py +++ b/clinica/pipelines/t1_linear/t1_linear_pipeline.py @@ -125,8 +125,8 @@ def build_output_node(self): import nipype.interfaces.utility as nutil from nipype.interfaces.io import DataSink import nipype.pipeline.engine as npe - from clinica.utils.nipype import fix_join - from .t1_linear_utils import (container_from_filename, get_substitutions_datasink) + from clinica.utils.nipype import (fix_join, container_from_filename) + from .t1_linear_utils import get_substitutions_datasink # Writing node write_node = npe.Node( @@ -162,7 +162,7 @@ def build_output_node(self): (self.output_node, write_node, [('affine_mat', '@affine_mat')]), ]) - if (self.parameters.get('crop_image')): + if not (self.parameters.get('uncropped_image')): self.connect([ (self.output_node, write_node, [('outfile_crop', '@outfile_crop')]), ]) @@ -224,7 +224,7 @@ def build_core_nodes(self): (self.input_node, image_id_node, [('t1w', 'filename')]), (self.input_node, n4biascorrection, [('t1w', 'input_image')]), (n4biascorrection, ants_registration_node, [('output_image', 'moving_image')]), - (image_id_node , ants_registration_node, [('image_id', 'output_prefix')]), + (image_id_node, ants_registration_node, [('image_id', 'output_prefix')]), # Connect to DataSink (image_id_node, self.output_node, [('image_id', 'image_id')]), @@ -232,7 +232,7 @@ def build_core_nodes(self): (n4biascorrection, self.output_node, [('output_image', 'outfile_corr')]), (ants_registration_node, self.output_node, [('warped_image', 'outfile_reg')]), ]) - if (self.parameters.get('crop_image')): + if not (self.parameters.get('uncropped_image')): self.connect([ (ants_registration_node, cropnifti, [('warped_image', 'input_img')]), (cropnifti, self.output_node, [('output_img', 'outfile_crop')]), diff --git a/clinica/pipelines/t1_linear/t1_linear_utils.py 
b/clinica/pipelines/t1_linear/t1_linear_utils.py index 8ba191d87..15abc203d 100644 --- a/clinica/pipelines/t1_linear/t1_linear_utils.py +++ b/clinica/pipelines/t1_linear/t1_linear_utils.py @@ -7,35 +7,6 @@ """ -# Get containers to produce the CAPS structure -# Warning!!! This file should be in the future in the utils package of Clinica -def container_from_filename(bids_or_caps_filename): - """Extract container from BIDS or CAPS file. - Args: - bids_or_caps_filename (str): full path to BIDS or CAPS filename. - Returns: - Container path of the form "subjects//" - Examples: - >>> from clinica.utils.nipype import container_from_filename - >>> container_from_filename('/path/to/bids/sub-CLNC01/ses-M00/anat/sub-CLNC01_ses-M00_T1w.nii.gz') - 'subjects/sub-CLNC01/ses-M00' - >>> container_from_filename('caps/subjects/sub-CLNC01/ses-M00/dwi/preprocessing/sub-CLNC01_ses-M00_preproc.nii') - 'subjects/sub-CLNC01/ses-M00' - """ - - import os - import re - m = re.search(r'(sub-[a-zA-Z0-9]+)/(ses-[a-zA-Z0-9]+)', bids_or_caps_filename) - if m is None: - raise ValueError( - 'Input filename is not in a BIDS or CAPS compliant format.' - 'It does not contain the participant and session ID.' - ) - subject = m.group(1) - session = m.group(2) - return os.path.join('subjects', subject, session) - - def get_substitutions_datasink(bids_file): substitutions_ls = [ # registration diff --git a/clinica/utils/atlas.py b/clinica/utils/atlas.py index 852b9c068..117d1a379 100644 --- a/clinica/utils/atlas.py +++ b/clinica/utils/atlas.py @@ -1,5 +1,18 @@ # coding: utf8 +""" +This module contains utilities to handle atlases in Clinica. + +An atlas is currently defined by its name, a set of labels in a template space and +the map of this template space (e.g. T1w, FA map derived from DWI). + +This current implementation has some drawbacks: +- Atlas is misleading: it is only a set of labels in a template space +- This implementation can not handle case where there are several maps (e.g. 
both T1w and T2w) in template space + +Either a refactoring of this module or the use of an external API +(e.g. TemplateFlow - https://www.templateflow.org/) needs to be considered. +""" import abc diff --git a/clinica/utils/check_dependency.py b/clinica/utils/check_dependency.py index cf01271d2..6ec942c8a 100644 --- a/clinica/utils/check_dependency.py +++ b/clinica/utils/check_dependency.py @@ -1,9 +1,11 @@ # coding: utf8 -"""This module contains utilities to check dependencies of the different -neuroimaging tools.""" +""" +This module contains utilities to check dependencies before running Clinica. +These functions can check binaries, software (e.g. FreeSurfer) or toolboxes (e.g. SPM). +""" def is_binary_present(binary): """ diff --git a/clinica/utils/exceptions.py b/clinica/utils/exceptions.py index c554388bd..45fb92da7 100644 --- a/clinica/utils/exceptions.py +++ b/clinica/utils/exceptions.py @@ -1,7 +1,7 @@ # coding: utf8 """ -Clinica exceptions +This module handles Clinica exceptions. """ diff --git a/clinica/utils/freesurfer.py b/clinica/utils/freesurfer.py index 537db3715..3c112623f 100644 --- a/clinica/utils/freesurfer.py +++ b/clinica/utils/freesurfer.py @@ -1,7 +1,7 @@ # coding: utf8 - - -"""This module contains FreeSurfer utilities.""" +""" +This module contains FreeSurfer utilities. +""" def extract_image_id_from_longitudinal_segmentation(freesurfer_id): diff --git a/clinica/utils/group.py b/clinica/utils/group.py index 6e57c7253..410a6158e 100644 --- a/clinica/utils/group.py +++ b/clinica/utils/group.py @@ -1,6 +1,13 @@ # coding: utf8 +""" +This module contains utilities to handle groups in Clinica. + +See CAPS specifications for details about groups. 
+""" + + def check_group_label(group_label): """Check that `group_label` is compliant with specifications.""" if not group_label.isalnum(): diff --git a/clinica/utils/input_files.py b/clinica/utils/input_files.py index 394dd03f1..d6f3f0719 100644 --- a/clinica/utils/input_files.py +++ b/clinica/utils/input_files.py @@ -1,8 +1,9 @@ # coding: utf8 """ -Describe files to grab, to use with inputs.clinica_file_reader() and inputs.clinica_group_reader() +This module contains dictionaries used in inputs.py::clinica_{file|group}_reader(). +These dictionaries describe files to grab. """ """ T1w """ @@ -93,8 +94,18 @@ 'description': 'right hemisphere surface-based Desikan parcellation (label/rh.aparc.annot) generated with t1-freesurfer-longitudinal.', 'needed_pipeline': 't1-freesurfer and t1-freesurfer-longitudinal'} +T1W_LINEAR = {'pattern': '*space-MNI152NLin2009cSym_res-1x1x1_T1w.nii.gz', + 'description': 'T1w image registered in MNI152NLin2009cSym space using t1-linear pipeline', + 'needed_pipeline': 't1-linear'} + +T1W_LINEAR_CROPPED = {'pattern': '*space-MNI152NLin2009cSym_desc-Crop_res-1x1x1_T1w.nii.gz', + 'description': 'T1W Image registered using t1-linear and cropped ' + '(matrix size 169×208×179, 1 mm isotropic voxels)', + 'needed_pipeline': 't1-linear'} # T1-Volume + + def t1_volume_native_tpm(tissue_number): from .spm import INDEX_TISSUE_MAP import os diff --git a/clinica/utils/inputs.py b/clinica/utils/inputs.py index 0cd81dc3e..b230241c6 100644 --- a/clinica/utils/inputs.py +++ b/clinica/utils/inputs.py @@ -1,5 +1,9 @@ # coding: utf8 +""" +This module contains utilities to grab or download files for Clinica. 
+""" + import hashlib from collections import namedtuple diff --git a/clinica/utils/nipype.py b/clinica/utils/nipype.py index 09dd03af1..bb1683fec 100644 --- a/clinica/utils/nipype.py +++ b/clinica/utils/nipype.py @@ -1,5 +1,34 @@ # coding: utf8 +# Get containers to produce the CAPS structure + + +def container_from_filename(bids_or_caps_filename): + """Extract container from BIDS or CAPS file. + Args: + bids_or_caps_filename (str): full path to BIDS or CAPS filename. + Returns: + Container path of the form "subjects//" + Examples: + >>> from clinica.utils.nipype import container_from_filename + >>> container_from_filename('/path/to/bids/sub-CLNC01/ses-M00/anat/sub-CLNC01_ses-M00_T1w.nii.gz') + 'subjects/sub-CLNC01/ses-M00' + >>> container_from_filename('caps/subjects/sub-CLNC01/ses-M00/dwi/preprocessing/sub-CLNC01_ses-M00_preproc.nii') + 'subjects/sub-CLNC01/ses-M00' + """ + + import os + import re + m = re.search(r'(sub-[a-zA-Z0-9]+)/(ses-[a-zA-Z0-9]+)', bids_or_caps_filename) + if m is None: + raise ValueError( + 'Input filename is not in a BIDS or CAPS compliant format.' + 'It does not contain the participant and session ID.' + ) + subject = m.group(1) + session = m.group(2) + return os.path.join('subjects', subject, session) + def fix_join(path, *paths): # This workaround is used in pipelines like DWIPreprocessingUsingT1 diff --git a/clinica/utils/participant.py b/clinica/utils/participant.py index cd9a51fb0..4490dc98a 100644 --- a/clinica/utils/participant.py +++ b/clinica/utils/participant.py @@ -1,4 +1,11 @@ # coding: utf8 + +""" +This module contains utilities for longitudinal pipelines. + +See CAPS specifications for details about long ID. +""" + from clinica.utils.filemanip import read_participant_tsv diff --git a/clinica/utils/spm.py b/clinica/utils/spm.py index 8c6a989f9..d9518991d 100644 --- a/clinica/utils/spm.py +++ b/clinica/utils/spm.py @@ -1,5 +1,9 @@ # coding: utf8 +""" +This module contains SPM utilities. 
+""" + INDEX_TISSUE_MAP = { 1: 'graymatter', 2: 'whitematter', diff --git a/clinica/utils/statistics.py b/clinica/utils/statistics.py index a23ef89d8..cb11b1e81 100644 --- a/clinica/utils/statistics.py +++ b/clinica/utils/statistics.py @@ -1,5 +1,10 @@ # coding: utf8 +""" +This module contains utilities for statistics. + +Currently, it contains one function to generate TSV file containing mean map based on a parcellation. +""" def statistics_on_atlas(in_normalized_map, in_atlas, out_file=None): """ @@ -43,9 +48,9 @@ def statistics_on_atlas(in_normalized_map, in_atlas, out_file=None): img = nib.load(in_normalized_map) img_data = img.get_data() - atlas_correspondance = pandas.io.parsers.read_csv(in_atlas.get_tsv_roi(), sep='\t') - label_name = list(atlas_correspondance.roi_name) - label_value = list(atlas_correspondance.roi_value) # TODO create roi_value column in lut_*.txt and remove irrelevant RGB information + atlas_correspondence = pandas.io.parsers.read_csv(in_atlas.get_tsv_roi(), sep='\t') + label_name = list(atlas_correspondence.roi_name) + label_value = list(atlas_correspondence.roi_value) # TODO create roi_value column in lut_*.txt and remove irrelevant RGB information mean_signal_value = [] for label in label_value: diff --git a/clinica/utils/stream.py b/clinica/utils/stream.py index 383d4bf49..9d6167074 100644 --- a/clinica/utils/stream.py +++ b/clinica/utils/stream.py @@ -1,8 +1,9 @@ # coding: utf8 """ -Redirect stream and log +This module handles stream and log redirection. """ + import sys clinica_verbose = False diff --git a/clinica/utils/ux.py b/clinica/utils/ux.py index e670558e7..11cc0dfd8 100644 --- a/clinica/utils/ux.py +++ b/clinica/utils/ux.py @@ -1,5 +1,10 @@ # coding: utf8 +""" +This module gathers formatted messages that are displayed when running Clinica. + +These functions are mainly called by the pipelines. 
+""" LINES_TO_DISPLAY = 25 diff --git a/environment.yml b/environment.yml index 4a1c45043..b90313735 100644 --- a/environment.yml +++ b/environment.yml @@ -5,25 +5,4 @@ dependencies: - python=3.6 - pip - pip: - - nibabel>=2.3.3 - - nipype>=1.4.0 - - pybids==0.5.1 - - argcomplete>=1.9.4 - - pandas>=0.24.2 - - jinja2>=2.10.1 - - xvfbwrapper==0.2.9 - - numpy==1.17 - - scikit-learn>=0.20.0 - - nipy>=0.4.2 - - nilearn>=0.6.0 - - colorama>=0.4.1 - - xgboost==0.80 - - xlrd>=1.2.0 - - scipy==1.2.3 - - matplotlib - - niflow-nipype1-workflows - - scikit-image - - pytest # Dev only - - pytest-timeout # Dev only - - pytest-xdist # Dev only - - pycodestyle # Dev only + - -r requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt index 7308e1a1a..4700a0f21 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,23 +1,8 @@ -################ CLINICA requirements for installation ###################### -##### Requirements with Version Specifiers ###### +# *************** Clinica requirements for installation *************** +# ***** Requirements with Version Specifiers ***** # See https://www.python.org/dev/peps/pep-0440/#version-specifiers -nibabel >= 2.3.3 -nipype >= 1.4.0 -pybids == 0.5.1 -argcomplete >= 1.9.4 -pandas >= 0.24.2 -jinja2 >= 2.10.1 -xvfbwrapper == 0.2.9 -numpy == 1.17 -scikit-learn >= 0.20.0 -nipy >= 0.4.2 -nilearn >= 0.6.0 -colorama >= 0.4.1 -xgboost == 0.80 -xlrd >= 1.2.0 -scipy == 1.2.3 -matplotlib -niflow-nipype1-workflows -scikit-image +-r requirements.txt pytest +pytest-timeout pytest-xdist +pycodestyle diff --git a/requirements.txt b/requirements.txt index 217520d26..bcd288bef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -################ CLINICA requirements for installation ###################### -##### Requirements with Version Specifiers ###### +# *************** Clinica requirements for installation *************** +# ***** Requirements with Version Specifiers ***** # See 
https://www.python.org/dev/peps/pep-0440/#version-specifiers nibabel >= 2.3.3 nipype >= 1.4.0 @@ -18,4 +18,6 @@ xlrd >= 1.2.0 scipy == 1.2.3 matplotlib niflow-nipype1-workflows -scikit-image +scikit-image == 0.16.2 +torch > 1.0 +torchvision diff --git a/test/instantiation/test_instantiate_all_pipelines.py b/test/instantiation/test_instantiate_all_pipelines.py index 99a878dbb..50ceb3418 100644 --- a/test/instantiation/test_instantiate_all_pipelines.py +++ b/test/instantiation/test_instantiate_all_pipelines.py @@ -295,12 +295,29 @@ def test_instantiate_InputsML(): atlases = ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers'] possible_psf = [0, 5, 10, 15, 20, 25] - voxel_input = [CAPSVoxelBasedInput(caps_dir, tsv, diagnoses_tsv, group_id, im, fwhm=8) + voxel_input = [CAPSVoxelBasedInput({'caps_directory': caps_dir, + 'subjects_visits_tsv': tsv, + 'diagnoses_tsv': diagnoses_tsv, + 'group_id': group_id, + 'image_type': im, + 'fwhm': 8}) for im in image_type] - region_input = [CAPSRegionBasedInput(caps_dir, tsv, diagnoses_tsv, group_id, im, at) + + region_input = [CAPSRegionBasedInput({'caps_directory': caps_dir, + 'subjects_visits_tsv': tsv, + 'diagnoses_tsv': diagnoses_tsv, + 'group_id': group_id, + 'image_type': im, + 'atlas': at}) for im in image_type for at in atlases] - vertex_input = [CAPSVertexBasedInput(caps_dir, tsv, diagnoses_tsv, group_id, fwhm, 'fdg') + + vertex_input = [CAPSVertexBasedInput({'caps_directory': caps_dir, + 'subjects_visits_tsv': tsv, + 'diagnoses_tsv': diagnoses_tsv, + 'group_id': group_id, + 'image_type': 'fdg', + 'fwhm': fwhm}) for fwhm in possible_psf] # Check that each file exists @@ -323,7 +340,8 @@ def test_instantiate_SpatialSVM(): root = join(root, 'data', 'SpatialSVM') parameters = { - 'group_id': 'ADNIbl' + 'group_label': 'ADNIbl', + 'orig_input_data': 't1-volume' } pipeline = SpatialSVM( caps_directory=join(root, 'in', 'caps'), @@ -368,6 +386,10 @@ def test_instantiate_T1Linear(): root = 
dirname(abspath(join(abspath(__file__), pardir))) root = join(root, 'data', 'T1Linear') + parameters = { + 'uncropped_image': False + } + pipeline = T1Linear( bids_directory=join(root, 'in', 'bids'), caps_directory=join(root, 'in', 'caps'), @@ -376,6 +398,24 @@ def test_instantiate_T1Linear(): pipeline.build() +def test_instantiate_DLPrepareData(): + from os.path import dirname, join, abspath + from clinica.pipelines.deeplearning_prepare_data.deeplearning_prepare_data_pipeline import DeepLearningPrepareData + + root = dirname(abspath(join(abspath(__file__), pardir))) + root = join(root, 'data', 'DeepLearningPrepareData') + + parameters = { + 'extract_method': 'whole' + } + pipeline = DeepLearningPrepareData( + caps_directory=join(root, 'in', 'caps'), + tsv_file=join(root, 'in', 'subjects.tsv'), + parameters=parameters + ) + pipeline.build() + + def test_instantiate_StatisticsVolume(): from os.path import dirname, join, abspath from clinica.pipelines.statistics_volume.statistics_volume_pipeline import StatisticsVolume diff --git a/test/nonregression/test_run_pipelines.py b/test/nonregression/test_run_pipelines.py index d21956f9f..b5c1e1298 100644 --- a/test/nonregression/test_run_pipelines.py +++ b/test/nonregression/test_run_pipelines.py @@ -685,10 +685,10 @@ def test_run_PETSurfaceCrossSectional(cmdopt): def test_run_WorkflowsML(cmdopt): - from clinica.pipelines.machine_learning.ml_workflows import (RB_RepHoldOut_LogisticRegression, - VertexB_RepHoldOut_dualSVM, - RB_RepHoldOut_RandomForest, - VB_KFold_DualSVM) + from clinica.pipelines.machine_learning.ml_workflows import (RegionBasedRepHoldOutLogisticRegression, + VertexBasedRepHoldOutDualSVM, + RegionBasedRepHoldOutRandomForest, + VoxelBasedKFoldDualSVM) from os.path import dirname, join, abspath import shutil import warnings @@ -706,31 +706,31 @@ def test_run_WorkflowsML(cmdopt): diagnoses_tsv = join(root_input, 'in', 'diagnosis.tsv') group_id = 'allADNIdartel' - output_dir1 = join(root, 'out', 
'VertexB_RepHoldOut_dualSVM') + output_dir1 = join(root, 'out', 'VertexBasedRepHoldOutDualSVM') clean_folder(output_dir1, recreate=True) - wf1 = VertexB_RepHoldOut_dualSVM(caps_dir, tsv, diagnoses_tsv, group_id, output_dir1, image_type='fdg', fwhm=20, - n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) + wf1 = VertexBasedRepHoldOutDualSVM(caps_dir, tsv, diagnoses_tsv, group_id, output_dir1, image_type='fdg', fwhm=20, + n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) wf1.run() shutil.rmtree(output_dir1) - output_dir2 = join(root, 'out', 'RB_RepHoldOut_LogisticRegression') + output_dir2 = join(root, 'out', 'RegionBasedRepHoldOutLogisticRegression') clean_folder(output_dir2, recreate=True) - wf2 = RB_RepHoldOut_LogisticRegression(caps_dir, tsv, diagnoses_tsv, group_id, 'fdg', 'AICHA', output_dir2, - n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) + wf2 = RegionBasedRepHoldOutLogisticRegression(caps_dir, tsv, diagnoses_tsv, group_id, 'fdg', 'AICHA', output_dir2, + n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) wf2.run() shutil.rmtree(output_dir2) - output_dir3 = join(root, 'out', 'RB_RepHoldOut_RandomForest') + output_dir3 = join(root, 'out', 'RegionBasedRepHoldOutRandomForest') clean_folder(output_dir3, recreate=True) - wf3 = RB_RepHoldOut_RandomForest(caps_dir, tsv, diagnoses_tsv, group_id, 'T1', 'AAL2', output_dir3, n_threads=8, - n_iterations=10, grid_search_folds=3, test_size=0.3) + wf3 = RegionBasedRepHoldOutRandomForest(caps_dir, tsv, diagnoses_tsv, group_id, 'T1', 'AAL2', output_dir3, + n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) wf3.run() shutil.rmtree(output_dir3) - output_dir4 = join(root, 'out', 'VB_KFold_DualSVM') + output_dir4 = join(root, 'out', 'VoxelBasedKFoldDualSVM') clean_folder(output_dir4, recreate=True) - wf4 = VB_KFold_DualSVM(caps_dir, tsv, diagnoses_tsv, group_id, 'fdg', output_dir4, fwhm=8, n_threads=8, n_folds=5, - grid_search_folds=3) + wf4 = 
VoxelBasedKFoldDualSVM(caps_dir, tsv, diagnoses_tsv, group_id, 'fdg', output_dir4, fwhm=8, n_threads=8, + n_folds=5, grid_search_folds=3) wf4.run() shutil.rmtree(output_dir4) @@ -754,7 +754,8 @@ def test_run_SpatialSVM(cmdopt): shutil.copytree(join(root, 'in', 'caps'), join(root, 'out', 'caps')) parameters = { - 'group_id': 'ADNIbl' + 'group_label': 'ADNIbl', + 'orig_input_data': 't1-volume' } # Instantiate pipeline and run() pipeline = SpatialSVM( @@ -801,7 +802,7 @@ def test_run_T1Linear(cmdopt): shutil.copytree(join(root, 'in', 'caps'), join(root, 'out', 'caps')) parameters = { - 'crop_image': True + 'uncropped_image': False } # Instantiate pipeline pipeline = T1Linear( @@ -814,16 +815,88 @@ def test_run_T1Linear(cmdopt): pipeline.run(plugin='MultiProc', plugin_args={'n_procs': 4}, bypass_check=True) # Check output vs ref - + out_folder = join(root, 'out') - ref_folder = join(root, 'out') - + ref_folder = join(root, 'out') + compare_folders(out_folder, ref_folder, shared_folder_name='caps') clean_folder(join(root, 'out', 'caps'), recreate=False) clean_folder(join(working_dir, 'T1Linear'), recreate=False) +def test_run_DLPrepareData(cmdopt): + from os.path import dirname, join, abspath + import shutil + from clinica.pipelines.deeplearning_prepare_data.deeplearning_prepare_data_pipeline import DeepLearningPrepareData + import nibabel as nib + import numpy as np + + working_dir = cmdopt + root = dirname(abspath(join(abspath(__file__), pardir))) + root = join(root, 'data', 'DeepLearningPrepareData') + + # Remove potential residual of previous UT + clean_folder(join(working_dir, 'DeepLearningPrepareData')) + clean_folder(join(root, 'out', 'caps'), recreate=False) + + # Copy necessary data from in to out + shutil.copytree(join(root, 'in', 'caps'), join(root, 'out', 'caps')) + + # Test the transformation of the complete T1 MRI + parameters = { + 'extract_method': 'image' + } + # Instantiate pipeline + pipeline = DeepLearningPrepareData( + caps_directory=join(root, 
'out', 'caps'), + tsv_file=join(root, 'in', 'subjects.tsv'), + base_dir=join(working_dir, 'DeepLearningPrepareData'), + parameters=parameters + ) + pipeline.run(plugin='MultiProc', plugin_args={'n_procs': 4}, bypass_check=True) + + # Test the patch extraction + parameters = { + 'extract_method': 'patch', + 'patch_size': 50, + 'stride_size': 50 + } + # Instantiate pipeline + pipeline = DeepLearningPrepareData( + caps_directory=join(root, 'out', 'caps'), + tsv_file=join(root, 'in', 'subjects.tsv'), + base_dir=join(working_dir, 'DeepLearningPrepareData'), + parameters=parameters + ) + pipeline.run(plugin='MultiProc', plugin_args={'n_procs': 4}, bypass_check=True) + + # Test the slice extraction + parameters = { + 'extract_method': 'slice', + 'slice_mode': 'rgb', + 'slice_direction': 0 + } + # Instantiate pipeline + pipeline = DeepLearningPrepareData( + caps_directory=join(root, 'out', 'caps'), + tsv_file=join(root, 'in', 'subjects.tsv'), + base_dir=join(working_dir, 'DeepLearningPrepareData'), + parameters=parameters + ) + pipeline.run(plugin='MultiProc', plugin_args={'n_procs': 4}, bypass_check=True) + # Check output vs ref + + out_folder = join(root, 'out') + ref_folder = join(root, 'out') + + compare_folders(out_folder, ref_folder, shared_folder_name='caps') + + clean_folder(join(root, 'out', 'caps'), recreate=False) + clean_folder(join(working_dir, 'DeepLearningPrepareData'), recreate=False) + + + def test_run_StatisticsVolume(cmdopt): from os.path import dirname, join, abspath import shutil @@ -910,7 +983,7 @@ def test_run_StatisticsVolumeCorrection(cmdopt): pipeline.build() pipeline.run(plugin='MultiProc', plugin_args={'n_procs': 4}, bypass_check=True) compare_folders(join(root, 'out'), join(root, 'ref'), 'caps') - + # Remove data in out folder clean_folder(join(root, 'out', 'caps'), recreate=True) clean_folder(join(working_dir, 'StatisticsVolumeCorrection'), recreate=False)