From 7ede3f49d227af248912bcdc6947f0a6226ed5d3 Mon Sep 17 00:00:00 2001
From: Alexandre Routier
Date: Tue, 12 May 2020 14:07:34 +0200
Subject: [PATCH 01/69] Clarify how to deactivate How many threads.. message

---
 clinica/pipelines/engine.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/clinica/pipelines/engine.py b/clinica/pipelines/engine.py
index 850432ef4..777433f40 100644
--- a/clinica/pipelines/engine.py
+++ b/clinica/pipelines/engine.py
@@ -1,7 +1,9 @@
 # coding: utf8

 """
+This module contains the Pipeline abstract class needed for Clinica.
+Subclasses are located in clinica/pipeline//_pipeline.py
 """

 import abc
@@ -509,7 +511,7 @@ def human2bytes(s):
                    + 'Running anyway...' + Fore.RESET)

     def update_parallelize_info(self, plugin_args):
-        """ Peforms some checks of the number of threads given in parameters,
+        """ Performs some checks of the number of threads given in parameters,
         given the number of CPUs of the machine in which clinica is running.
         We force the use of plugin MultiProc
@@ -557,7 +559,7 @@ def update_parallelize_info(self, plugin_args):
             cprint('How many threads do you want to use? If you do not '
                    + 'answer within ' + str(timeout) + ' sec, default value of ' + str(n_cpu - 1)
-                   + ' will be taken.')
+                   + ' will be taken. Use --n_procs argument if you want to disable this message next time.')
             stdin_answer, __, ___ = select.select([sys.stdin], [], [], timeout)
             if stdin_answer:
                 answer = str(sys.stdin.readline().strip())
@@ -568,7 +570,7 @@ def update_parallelize_info(self, plugin_args):
                 break
             cprint(Fore.RED + 'Your answer must be a positive integer.' + Fore.RESET)
-        # If plugin_args is None, create the dictionnary
+        # If plugin_args is None, create the dictionary
         # If it already a dict, just update (or create -it is the same
         # code-) the correct key / value
         if plugin_args is None:

From b4ea6a2d4e160fa9d5a6f4e81ada7d821a81ca49 Mon Sep 17 00:00:00 2001
From: Alexandre Routier
Date: Wed, 13 May 2020 11:00:51 +0200
Subject: [PATCH 02/69] Add modules description in utils/ folder

---
 clinica/utils/atlas.py            | 13 +++++++++++++
 clinica/utils/check_dependency.py |  6 ++++--
 clinica/utils/exceptions.py       |  2 +-
 clinica/utils/freesurfer.py       |  6 +++---
 clinica/utils/group.py            |  7 +++++++
 clinica/utils/input_files.py      |  3 ++-
 clinica/utils/inputs.py           |  4 ++++
 clinica/utils/participant.py      |  7 +++++++
 clinica/utils/spm.py              |  4 ++++
 clinica/utils/statistics.py       | 11 ++++++++---
 clinica/utils/stream.py           |  3 ++-
 clinica/utils/ux.py               |  5 +++++
 12 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/clinica/utils/atlas.py b/clinica/utils/atlas.py
index 852b9c068..117d1a379 100644
--- a/clinica/utils/atlas.py
+++ b/clinica/utils/atlas.py
@@ -1,5 +1,18 @@
 # coding: utf8

+"""
+This module contains utilities to handle atlases in Clinica.
+
+An atlas is currently defined by its name, a set of labels in a template space and
+the map of this template space (e.g. T1w, FA map derived from DWI).
+
+This current implementation has some drawbacks:
+- Atlas is misleading: it is only a set of labels in a template space
+- This implementation cannot handle cases where there are several maps (e.g. both T1w and T2w) in template space
+
+Either a refactoring of this module or the use of an external API
+(e.g. TemplateFlow - https://www.templateflow.org/) needs to be considered.
+""" import abc diff --git a/clinica/utils/check_dependency.py b/clinica/utils/check_dependency.py index cf01271d2..6ec942c8a 100644 --- a/clinica/utils/check_dependency.py +++ b/clinica/utils/check_dependency.py @@ -1,9 +1,11 @@ # coding: utf8 -"""This module contains utilities to check dependencies of the different -neuroimaging tools.""" +""" +This module contains utilities to check dependencies before running Clinica. +These functions can check binaries, software (e.g. FreeSurfer) or toolboxes (e.g. SPM). +""" def is_binary_present(binary): """ diff --git a/clinica/utils/exceptions.py b/clinica/utils/exceptions.py index c554388bd..45fb92da7 100644 --- a/clinica/utils/exceptions.py +++ b/clinica/utils/exceptions.py @@ -1,7 +1,7 @@ # coding: utf8 """ -Clinica exceptions +This module handles Clinica exceptions. """ diff --git a/clinica/utils/freesurfer.py b/clinica/utils/freesurfer.py index 537db3715..3c112623f 100644 --- a/clinica/utils/freesurfer.py +++ b/clinica/utils/freesurfer.py @@ -1,7 +1,7 @@ # coding: utf8 - - -"""This module contains FreeSurfer utilities.""" +""" +This module contains FreeSurfer utilities. +""" def extract_image_id_from_longitudinal_segmentation(freesurfer_id): diff --git a/clinica/utils/group.py b/clinica/utils/group.py index 6e57c7253..410a6158e 100644 --- a/clinica/utils/group.py +++ b/clinica/utils/group.py @@ -1,6 +1,13 @@ # coding: utf8 +""" +This module contains utilities to handle groups in Clinica. + +See CAPS specifications for details about groups. +""" + + def check_group_label(group_label): """Check that `group_label` is compliant with specifications.""" if not group_label.isalnum(): diff --git a/clinica/utils/input_files.py b/clinica/utils/input_files.py index 394dd03f1..5f7fc3f7c 100644 --- a/clinica/utils/input_files.py +++ b/clinica/utils/input_files.py @@ -1,8 +1,9 @@ # coding: utf8 """ -Describe files to grab, to use with inputs.clinica_file_reader() and inputs.clinica_group_reader() +This module contains dictionaries used in inputs.py::clinica_{file|group}_reader(). +These dictionaries describe files to grab. """ """ T1w """ diff --git a/clinica/utils/inputs.py b/clinica/utils/inputs.py index 0cd81dc3e..b230241c6 100644 --- a/clinica/utils/inputs.py +++ b/clinica/utils/inputs.py @@ -1,5 +1,9 @@ # coding: utf8 +""" +This module contains utilities to grab or download files for Clinica. +""" + import hashlib from collections import namedtuple diff --git a/clinica/utils/participant.py b/clinica/utils/participant.py index cd9a51fb0..4490dc98a 100644 --- a/clinica/utils/participant.py +++ b/clinica/utils/participant.py @@ -1,4 +1,11 @@ # coding: utf8 + +""" +This module contains utilities for longitudinal pipelines. + +See CAPS specifications for details about long ID. +""" + from clinica.utils.filemanip import read_participant_tsv diff --git a/clinica/utils/spm.py b/clinica/utils/spm.py index 8c6a989f9..d9518991d 100644 --- a/clinica/utils/spm.py +++ b/clinica/utils/spm.py @@ -1,5 +1,9 @@ # coding: utf8 +""" +This module contains SPM utilities. +""" + INDEX_TISSUE_MAP = { 1: 'graymatter', 2: 'whitematter', diff --git a/clinica/utils/statistics.py b/clinica/utils/statistics.py index a23ef89d8..cb11b1e81 100644 --- a/clinica/utils/statistics.py +++ b/clinica/utils/statistics.py @@ -1,5 +1,10 @@ # coding: utf8 +""" +This module contains utilities for statistics. + +Currently, it contains one function to generate TSV file containing mean map based on a parcellation. 
+""" def statistics_on_atlas(in_normalized_map, in_atlas, out_file=None): """ @@ -43,9 +48,9 @@ def statistics_on_atlas(in_normalized_map, in_atlas, out_file=None): img = nib.load(in_normalized_map) img_data = img.get_data() - atlas_correspondance = pandas.io.parsers.read_csv(in_atlas.get_tsv_roi(), sep='\t') - label_name = list(atlas_correspondance.roi_name) - label_value = list(atlas_correspondance.roi_value) # TODO create roi_value column in lut_*.txt and remove irrelevant RGB information + atlas_correspondence = pandas.io.parsers.read_csv(in_atlas.get_tsv_roi(), sep='\t') + label_name = list(atlas_correspondence.roi_name) + label_value = list(atlas_correspondence.roi_value) # TODO create roi_value column in lut_*.txt and remove irrelevant RGB information mean_signal_value = [] for label in label_value: diff --git a/clinica/utils/stream.py b/clinica/utils/stream.py index 383d4bf49..9d6167074 100644 --- a/clinica/utils/stream.py +++ b/clinica/utils/stream.py @@ -1,8 +1,9 @@ # coding: utf8 """ -Redirect stream and log +This module handles stream and log redirection. """ + import sys clinica_verbose = False diff --git a/clinica/utils/ux.py b/clinica/utils/ux.py index e670558e7..11cc0dfd8 100644 --- a/clinica/utils/ux.py +++ b/clinica/utils/ux.py @@ -1,5 +1,10 @@ # coding: utf8 +""" +This module gathers formatted messages that are displayed when running Clinica. + +These functions are mainly called by the pipelines. +""" LINES_TO_DISPLAY = 25 From 541df283f49a61ab76c94a96f4218dd6feddb015 Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Thu, 14 May 2020 10:09:01 +0200 Subject: [PATCH 03/69] Fix. Freeze the scikit-image version (installation issue in the latest release, 17.X) --- environment.yml | 23 +---------------------- requirements-dev.txt | 25 +++++-------------------- requirements.txt | 6 +++--- 3 files changed, 9 insertions(+), 45 deletions(-) diff --git a/environment.yml b/environment.yml index 4a1c45043..b90313735 100644 --- a/environment.yml +++ b/environment.yml @@ -5,25 +5,4 @@ dependencies: - python=3.6 - pip - pip: - - nibabel>=2.3.3 - - nipype>=1.4.0 - - pybids==0.5.1 - - argcomplete>=1.9.4 - - pandas>=0.24.2 - - jinja2>=2.10.1 - - xvfbwrapper==0.2.9 - - numpy==1.17 - - scikit-learn>=0.20.0 - - nipy>=0.4.2 - - nilearn>=0.6.0 - - colorama>=0.4.1 - - xgboost==0.80 - - xlrd>=1.2.0 - - scipy==1.2.3 - - matplotlib - - niflow-nipype1-workflows - - scikit-image - - pytest # Dev only - - pytest-timeout # Dev only - - pytest-xdist # Dev only - - pycodestyle # Dev only + - -r requirements-dev.txt diff --git a/requirements-dev.txt b/requirements-dev.txt index 7308e1a1a..4700a0f21 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,23 +1,8 @@ -################ CLINICA requirements for installation ###################### -##### Requirements with Version Specifiers ###### +# *************** Clinica requirements for installation *************** +# ***** Requirements with Version Specifiers ***** # See https://www.python.org/dev/peps/pep-0440/#version-specifiers -nibabel >= 2.3.3 -nipype >= 1.4.0 -pybids == 0.5.1 -argcomplete >= 1.9.4 -pandas >= 0.24.2 -jinja2 >= 2.10.1 -xvfbwrapper == 0.2.9 -numpy == 1.17 -scikit-learn >= 0.20.0 -nipy >= 0.4.2 -nilearn >= 0.6.0 -colorama >= 0.4.1 -xgboost == 0.80 -xlrd >= 1.2.0 -scipy == 1.2.3 -matplotlib -niflow-nipype1-workflows -scikit-image +-r requirements.txt pytest +pytest-timeout pytest-xdist +pycodestyle diff --git a/requirements.txt b/requirements.txt index 217520d26..16d6b65fa 100644 --- a/requirements.txt +++ b/requirements.txt 
@@ -1,5 +1,5 @@ -################ CLINICA requirements for installation ###################### -##### Requirements with Version Specifiers ###### +# *************** Clinica requirements for installation *************** +# ***** Requirements with Version Specifiers ***** # See https://www.python.org/dev/peps/pep-0440/#version-specifiers nibabel >= 2.3.3 nipype >= 1.4.0 @@ -18,4 +18,4 @@ xlrd >= 1.2.0 scipy == 1.2.3 matplotlib niflow-nipype1-workflows -scikit-image +scikit-image == 0.16.2 From 91de78799d936f43995c46ad58f92774760426c9 Mon Sep 17 00:00:00 2001 From: mdiazmel Date: Tue, 26 May 2020 09:06:20 +0200 Subject: [PATCH 04/69] Improve CI (#94) * Add test reports to testing section in CI web interface * Add condition to recreate the python environment * Fix groovy syntax for Jenkinsfile * Fix space at the end of the line * Fix REGEXP for commit changeset when running the CI * Fix folder name with reports * Add folder name forgotten in Jenkinsfile (similar to last commit) --- Jenkinsfile | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 64a16385b..0a73863e4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -14,7 +14,9 @@ pipeline { environment { PATH = "$HOME/miniconda/bin:$PATH" } - when { changeset "environment.yml" } + when { + changeset "requirements.txt" + } steps { echo 'Building Conda environment... ${BRANCH_NAME}' sh 'ls' @@ -26,7 +28,9 @@ pipeline { environment { PATH = "$HOME/miniconda3/bin:$PATH" } - when { changeset "environment.yml" } + when { + changeset "requirements.txt" + } steps { echo 'Building Conda environment...' + 'env.BRANCH_NAME' sh 'ls' @@ -114,6 +118,7 @@ pipeline { cd test ln -s /mnt/data/ci/data_ci_linux ./data taskset -c 0-21 pytest \ + --junitxml=./test-reports/instantation_linux.xml \ --verbose \ --working_directory=$WORK_DIR_LINUX \ --disable-warnings \ @@ -124,6 +129,11 @@ pipeline { conda deactivate ''' } + post { + always { + junit 'test/test-reports/*.xml' + } + } } stage('Instantiate Mac') { agent { label 'macos' } @@ -143,11 +153,20 @@ pipeline { module load clinica.all cd test ln -s /Volumes/data/data_ci ./data - pytest --verbose --disable-warnings -k 'test_instantiate' + pytest \ + --verbose \ + --junitxml=./test-reports/instantation_mac.xml \ + --disable-warnings \ + -k 'test_instantiate' module purge conda deactivate ''' } + post { + always { + junit 'test/test-reports/*.xml' + } + } } } } From f90de25923d3461a08700ff7ac0906f9dbcf9f75 Mon Sep 17 00:00:00 2001 From: Alexandre Routier Date: Tue, 26 May 2020 15:22:18 +0200 Subject: [PATCH 05/69] Change how input data are chosen in ML-Prepare-Data --- .../spatial_svm_cli.py | 37 +++++++----- .../spatial_svm_pipeline.py | 57 +++++++++---------- .../test_instantiate_all_pipelines.py | 3 +- test/nonregression/test_run_pipelines.py | 3 +- 4 files changed, 52 insertions(+), 48 deletions(-) diff --git a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py index 3193f3dfb..5ca904836 100644 --- a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py +++ b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_cli.py @@ -16,21 +16,31 @@ def define_description(self): def define_options(self): """Define the sub-command arguments.""" + from colorama import Fore from clinica.engine.cmdparser import PIPELINE_CATEGORIES # Clinica compulsory arguments (e.g. 
BIDS, CAPS, group_id) clinica_comp = self._args.add_argument_group(PIPELINE_CATEGORIES['CLINICA_COMPULSORY']) clinica_comp.add_argument("caps_directory", help='Path to the CAPS directory.') - clinica_comp.add_argument("group_id", + clinica_comp.add_argument("group_label", help='User-defined identifier for the provided group of subjects.') + clinica_comp.add_argument("orig_input_data", + help='''Origin of input data. Type + 't1-volume' to use gray matter maps or + 'pet-volume' to use SUVr maps.''', + choices=['t1-volume', 'pet-volume'], + ) # Optional arguments - optional = self._args.add_argument_group(PIPELINE_CATEGORIES['OPTIONAL']) - optional.add_argument("-it", "--image_type", - default='t1', - help='Imaging modality. Can be t1 or pet (default: --image_type %(default)s)') - optional.add_argument("-pt", "--pet_tracer", - default='fdg', - help='PET tracer. Can be fdg or av45 (default: --pet_tracer %(default)s)') + optional_pet = self._args.add_argument_group( + '%sPipeline options if you use inputs from pet-volume pipeline%s' % + (Fore.BLUE, Fore.RESET) + ) + optional_pet.add_argument("-pt", "--pet_tracer", + default='fdg', + help='PET tracer. Can be fdg or av45 (default: --pet_tracer %(default)s)') + optional_pet.add_argument("-no_pvc", "--no_pvc", + action='store_true', default=False, + help="Force the use of non PVC PET data (by default, PVC PET data are used)") # Clinica standard arguments (e.g. --n_procs) self.add_clinica_standard_arguments() # Advanced arguments (i.e. tricky parameters) @@ -40,9 +50,6 @@ def define_options(self): help='Amount of regularization (in mm). In practice, we found the default value ' '(--full_width_half_maximum %(default)s) to be optimal. We therefore ' 'do not recommend to change it unless you have a specific reason to do so.') - advanced.add_argument("-no_pvc", "--no_pvc", - action='store_true', default=False, - help="Force the use of non PVC PET data (by default, PVC PET data are used)") def run_command(self, args): """Run the pipeline with defined args.""" @@ -51,11 +58,11 @@ def run_command(self, args): from clinica.utils.ux import print_end_pipeline, print_crash_files_and_exit parameters = { - 'group_id': args.group_id, - 'fwhm': args.fwhm, - 'image_type': args.image_type, + 'group_label': args.group_label, + 'orig_input_data': args.orig_input_data, 'pet_tracer': args.pet_tracer, - 'no_pvc': args.no_pvc + 'no_pvc': args.no_pvc, + 'fwhm': args.fwhm, } pipeline = SpatialSVM( caps_directory=self.absolute_path(args.caps_directory), diff --git a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py index 1fc757887..e7f12d49b 100644 --- a/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py +++ b/clinica/pipelines/machine_learning_spatial_svm/spatial_svm_pipeline.py @@ -6,11 +6,6 @@ class SpatialSVM(cpe.Pipeline): """SpatialSVM - Prepare input data for SVM with spatial and anatomical regularization. - Args: - input_dir: A BIDS directory. - output_dir: An empty output directory where CAPS structured data will be written. - subjects_sessions_list: The Subjects-Sessions list file (in .tsv format). - Returns: A clinica pipeline object containing the SpatialSVM pipeline. 
@@ -21,18 +16,18 @@ def check_pipeline_parameters(self): """Check pipeline parameters.""" from clinica.utils.group import check_group_label - if 'group_id' not in self.parameters.keys(): - raise KeyError('Missing compulsory group_id key in pipeline parameter.') + if 'group_label' not in self.parameters.keys(): + raise KeyError('Missing compulsory group_label key in pipeline parameter.') + if 'orig_input_data' not in self.parameters.keys(): + raise KeyError('Missing compulsory orig_input_data key in pipeline parameter.') if 'fwhm' not in self.parameters.keys(): self.parameters['fwhm'] = 4 - if 'image_type' not in self.parameters.keys(): - self.parameters['image_type'] = 't1' if 'pet_tracer' not in self.parameters.keys(): self.parameters['pet_tracer'] = 'fdg' if 'no_pvc' not in self.parameters.keys(): self.parameters['no_pvc'] = False - check_group_label(self.parameters['group_id']) + check_group_label(self.parameters['group_label']) def check_custom_dependencies(self): """Check dependencies that can not be listed in the `info.json` file. @@ -70,11 +65,11 @@ def build_input_node(self): from clinica.utils.ux import print_groups_in_caps_directory # Check that group already exists - if not os.path.exists(os.path.join(self.caps_directory, 'groups', 'group-' + self.parameters['group_id'])): + if not os.path.exists(os.path.join(self.caps_directory, 'groups', 'group-' + self.parameters['group_label'])): print_groups_in_caps_directory(self.caps_directory) raise ClinicaException( '%sGroup %s does not exist. Did you run pet-volume, t1-volume or t1-volume-create-dartel pipeline?%s' % - (Fore.RED, self.parameters['group_id'], Fore.RESET) + (Fore.RED, self.parameters['group_label'], Fore.RESET) ) read_parameters_node = npe.Node(name="LoadingCLIArguments", @@ -82,30 +77,30 @@ def build_input_node(self): mandatory_inputs=True)) all_errors = [] - if self.parameters['image_type'] == 't1': + if self.parameters['orig_input_data'] == 't1-volume': caps_files_information = { - 'pattern': os.path.join('t1', 'spm', 'dartel', 'group-' + self.parameters['group_id'], + 'pattern': os.path.join('t1', 'spm', 'dartel', 'group-' + self.parameters['group_label'], '*_T1w_segm-graymatter_space-Ixi549Space_modulated-on_probability.nii.gz'), 'description': 'graymatter tissue segmented in T1w MRI in Ixi549 space', 'needed_pipeline': 't1-volume-tissue-segmentation' } - elif self.parameters['image_type'] is 'pet': + elif self.parameters['orig_input_data'] is 'pet-volume': if self.parameters['no_pvc']: caps_files_information = { - 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_id'], + 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_label'], '*_pet_space-Ixi549Space_suvr-pons_pet.nii.gz'), 'description': self.parameters['pet_tracer'] + ' PET in Ixi549 space', 'needed_pipeline': 'pet-volume' } else: caps_files_information = { - 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_id'], + 'pattern': os.path.join('pet', 'preprocessing', 'group-' + self.parameters['group_label'], '*_pet_space-Ixi549Space_pvc-rbv_suvr-pons_pet.nii.gz'), 'description': self.parameters['pet_tracer'] + ' PET partial volume corrected (RBV) in Ixi549 space', 'needed_pipeline': 'pet-volume with PVC' } else: - raise ValueError('Image type ' + self.parameters['image_type'] + ' unknown.') + raise ValueError('Image type ' + self.parameters['orig_input_data'] + ' unknown.') try: input_image = clinica_file_reader(self.subjects, @@ -117,7 +112,7 @@ def 
build_input_node(self): try: dartel_input = clinica_group_reader(self.caps_directory, - t1_volume_final_group_template(self.parameters['group_id'])) + t1_volume_final_group_template(self.parameters['group_label'])) except ClinicaException as e: all_errors.append(e) @@ -175,33 +170,33 @@ def build_core_nodes(self): name='sinker') datasink.inputs.base_directory = self.caps_directory datasink.inputs.parameterization = True - if self.parameters['image_type'] == 't1': + if self.parameters['orig_input_data'] == 't1-volume': datasink.inputs.regexp_substitutions = [ (r'(.*)/regularized_image/.*/(.*(sub-(.*)_ses-(.*))_T1w(.*)_probability(.*))$', r'\1/subjects/sub-\4/ses-\5/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'/\3_T1w\6_spatialregularization\7'), + 'group_label'] + r'/\3_T1w\6_spatialregularization\7'), (r'(.*)json_file/(output_data.json)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'_space-Ixi549Space_parameters.json'), + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ + 'group_label'] + r'_space-Ixi549Space_parameters.json'), (r'(.*)fisher_tensor_path/(output_fisher_tensor.npy)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'_space-Ixi549Space_gram.npy') + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ + 'group_label'] + r'_space-Ixi549Space_gram.npy') ] - elif self.parameters['image_type'] == 'pet': + elif self.parameters['orig_input_data'] == 'pet-volume': datasink.inputs.regexp_substitutions = [ (r'(.*)/regularized_image/.*/(.*(sub-(.*)_ses-(.*))_(task.*)_pet(.*))$', r'\1/subjects/sub-\4/ses-\5/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'/\3_\6_spatialregularization\7'), + 'group_label'] + r'/\3_\6_spatialregularization\7'), (r'(.*)json_file/(output_data.json)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + - self.parameters['group_id'] + r'_space-Ixi549Space_parameters.json'), + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + + self.parameters['group_label'] + r'_space-Ixi549Space_parameters.json'), (r'(.*)fisher_tensor_path/(output_fisher_tensor.npy)$', - r'\1/groups/group-' + self.parameters['group_id'] + r'/machine_learning/input_spatial_svm/group-' + + r'\1/groups/group-' + self.parameters['group_label'] + r'/machine_learning/input_spatial_svm/group-' + self.parameters[ - 'group_id'] + r'_space-Ixi549Space_gram.npy') + 'group_label'] + r'_space-Ixi549Space_gram.npy') ] # Connection # ========== diff --git a/test/instantiation/test_instantiate_all_pipelines.py b/test/instantiation/test_instantiate_all_pipelines.py index 99a878dbb..563531ec9 100644 --- a/test/instantiation/test_instantiate_all_pipelines.py +++ b/test/instantiation/test_instantiate_all_pipelines.py @@ -323,7 +323,8 @@ def test_instantiate_SpatialSVM(): root = join(root, 'data', 'SpatialSVM') parameters = { - 'group_id': 'ADNIbl' + 'group_label': 'ADNIbl', + 'orig_input_data': 't1-volume' } pipeline = SpatialSVM( caps_directory=join(root, 'in', 'caps'), diff --git a/test/nonregression/test_run_pipelines.py b/test/nonregression/test_run_pipelines.py index d21956f9f..45aa1cc78 100644 --- 
a/test/nonregression/test_run_pipelines.py +++ b/test/nonregression/test_run_pipelines.py @@ -754,7 +754,8 @@ def test_run_SpatialSVM(cmdopt): shutil.copytree(join(root, 'in', 'caps'), join(root, 'out', 'caps')) parameters = { - 'group_id': 'ADNIbl' + 'group_label': 'ADNIbl', + 'orig_input_data': 't1-volume' } # Instantiate pipeline and run() pipeline = SpatialSVM( From 00331bb9b3a02b9c19be199ca7c54869b5fa9649 Mon Sep 17 00:00:00 2001 From: Jorge Samper-Gonzalez Date: Wed, 3 Jun 2020 09:40:01 +0200 Subject: [PATCH 06/69] Refactor machine learning workflows (#57) * Update machine learning pipeline * Update machine learning pipeline tests * Fix bugs in machine learning module input parameters --- .../pipelines/machine_learning/algorithm.py | 372 ++++---- clinica/pipelines/machine_learning/base.py | 173 ++-- clinica/pipelines/machine_learning/input.py | 374 ++++---- .../pipelines/machine_learning/ml_utils.py | 58 ++ .../machine_learning/ml_workflows.py | 835 ++---------------- .../pipelines/machine_learning/validation.py | 506 +++++------ .../test_instantiate_all_pipelines.py | 23 +- test/nonregression/test_run_pipelines.py | 32 +- 8 files changed, 887 insertions(+), 1486 deletions(-) diff --git a/clinica/pipelines/machine_learning/algorithm.py b/clinica/pipelines/machine_learning/algorithm.py index a62390bb1..b3c8372d0 100644 --- a/clinica/pipelines/machine_learning/algorithm.py +++ b/clinica/pipelines/machine_learning/algorithm.py @@ -33,17 +33,9 @@ class DualSVMAlgorithm(base.MLAlgorithm): - def __init__(self, kernel, y, balanced=True, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - self._kernel = kernel - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads - def _launch_svc(self, kernel_train, x_test, y_train, y_test, c): - if self._balanced: + if self._algorithm_params['balanced']: svc = SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced') else: svc = SVC(C=c, kernel='precomputed', probability=True, tol=1e-6) @@ -87,15 +79,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} outer_kernel = self._kernel[train_index, :][:, train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -105,7 +97,7 @@ def evaluate(self, train_index, test_index): x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index] y_train_inner, y_test_inner = y_train[inner_train_index], y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (inner_kernel, x_test_inner, y_train_inner, y_test_inner, c)) @@ -146,7 +138,7 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: + if self._algorithm_params['balanced']: svc = SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced') else: svc = SVC(C=best_c, kernel='precomputed', 
probability=True, tol=1e-6) @@ -177,29 +169,30 @@ def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + @staticmethod + def uses_kernel(): + return True -class LogisticReg(base.MLAlgorithm): + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': True, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} - def __init__(self, x, y, penalty='l2', balanced=False, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - """ penalty can either be 'l2' or 'l1'""" - self._penalty = penalty - self._x = x - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads + return parameters_dict - def _launch_logistic_reg(self, x_train, x_test, y_train, y_test, c, shared_x=None, train_indices=None, - test_indices=None): - # x_train_, mean_x, std_x = centered_normalised_data(x_train) - # x_test_ = (x_test - mean_x)/std_x +class LogisticReg(base.MLAlgorithm): - if self._balanced: - classifier = LogisticRegression(penalty=self._penalty, tol=1e-6, C=c, class_weight='balanced') + def _launch_logistic_reg(self, x_train, x_test, y_train, y_test, c): + + if self._algorithm_params['balanced']: + classifier = LogisticRegression(penalty=self._algorithm_params['penalty'], tol=1e-6, C=c, + class_weight='balanced') else: - classifier = LogisticRegression(penalty=self._penalty, tol=1e-6, C=c) + classifier = LogisticRegression(penalty=self._algorithm_params['penalty'], tol=1e-6, C=c) classifier.fit(x_train, y_train) y_hat_train = classifier.predict(x_train) @@ -240,15 +233,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} x_train = self._x[train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -259,7 +252,7 @@ def evaluate(self, train_index, test_index): y_train_inner = y_train[inner_train_index] y_test_inner = y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (x_train_inner, x_test_inner, y_train_inner, y_test_inner, c)) @@ -300,10 +293,11 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: - classifier = LogisticRegression(C=best_c, penalty=self._penalty, tol=1e-6, class_weight='balanced') + if self._algorithm_params['balanced']: + classifier = LogisticRegression(C=best_c, penalty=self._algorithm_params['penalty'], tol=1e-6, + class_weight='balanced') else: - classifier = LogisticRegression(C=best_c, penalty=self._penalty, tol=1e-6) + classifier = LogisticRegression(C=best_c, penalty=self._algorithm_params['penalty'], tol=1e-6) classifier.fit(self._x, self._y) @@ -314,7 +308,7 @@ def save_classifier(self, classifier, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.coef_.transpose()) np.savetxt(path.join(output_dir, 
'intercept.txt'), classifier.intercept_) - def save_weights(self, classifier, output_dir): + def save_weights(self, classifier, x, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.coef_.transpose()) return classifier.coef_.transpose() @@ -332,35 +326,34 @@ def _centered_normalised_data(features): features_bis = (features - mean)/std return features_bis, mean, std + @staticmethod + def uses_kernel(): + return False + + @staticmethod + def get_default_parameters(): + parameters_dict = {'penalty': 'l2', + 'balanced': False, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} + + return parameters_dict + class RandomForest(base.MLAlgorithm): - def __init__(self, x, y, balanced=False, grid_search_folds=10, - n_estimators_range=(10, 25, 50, 100, 150, 200, 500), - max_depth_range=(None, 6, 8, 10, 12), - min_samples_split_range=(2, 4, 6, 8), - max_features_range=('auto', 0.1, 0.2, 0.3, 0.4, 0.5), - n_threads=15): - self._x = x - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - self._max_features_range = max_features_range - self._n_threads = n_threads - - def _launch_random_forest(self, x_train, x_test, y_train, y_test, n_estimators, max_depth, min_samples_split, max_features): - - if self._balanced: + def _launch_random_forest(self, x_train, x_test, y_train, y_test, n_estimators, max_depth, min_samples_split, + max_features): + + if self._algorithm_params['balanced']: classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, max_features=max_features, - class_weight='balanced', n_jobs=self._n_threads) + class_weight='balanced', n_jobs=self._algorithm_params['n_threads']) else: classifier = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, max_features=max_features, - n_jobs=self._n_threads) + n_jobs=self._algorithm_params['n_threads']) classifier.fit(x_train, y_train) y_hat_train = classifier.predict(x_train) @@ -405,9 +398,6 @@ def _select_best_parameter(self, async_result): params_list.append(best_params) accuracies.append(best_acc) - # TODO For exploratory purpose only. 
Erase later - # pd.concat(all_params_acc).to_csv('all_params_acc_%s.tsv' % datetime.datetime.now(), sep='\t', index=False, encoding='utf-8') - best_acc = np.mean(accuracies) best_n_estimators = int(round(np.mean([x[0] for x in params_list]))) best_max_depth = int(round(np.mean([x[1] if x[1] is not None else 50 for x in params_list]))) @@ -435,21 +425,21 @@ def max_feature_to_float(m): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} x_train = self._x[train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) - parameters_combinations = list(itertools.product(self._n_estimators_range, - self._max_depth_range, - self._min_samples_split_range, - self._max_features_range)) + parameters_combinations = list(itertools.product(self._algorithm_params['n_estimators_range'], + self._algorithm_params['max_depth_range'], + self._algorithm_params['min_samples_split_range'], + self._algorithm_params['max_features_range'])) for i in range(len(inner_cv)): inner_train_index, inner_test_index = inner_cv[i] @@ -499,16 +489,16 @@ def evaluate_no_cv(self, train_index, test_index): y_test = self._y[test_index] best_parameter = dict() - best_parameter['n_estimators'] = self._n_estimators_range - best_parameter['max_depth'] = self._max_depth_range - best_parameter['min_samples_split'] = self._min_samples_split_range - best_parameter['max_features'] = self._max_features_range + best_parameter['n_estimators'] = self._algorithm_params['n_estimators_range'] + best_parameter['max_depth'] = self._algorithm_params['max_depth_range'] + best_parameter['min_samples_split'] = self._algorithm_params['min_samples_split_range'] + best_parameter['max_features'] = self._algorithm_params['max_features_range'] _, y_hat, auc, y_hat_train = self._launch_random_forest(x_train, x_test, y_train, y_test, - self._n_estimators_range, - self._max_depth_range, - self._min_samples_split_range, - self._max_features_range) + self._algorithm_params['n_estimators_range'], + self._algorithm_params['max_depth_range'], + self._algorithm_params['min_samples_split_range'], + self._algorithm_params['max_features_range']) result = dict() result['best_parameter'] = best_parameter result['evaluation'] = utils.evaluate_prediction(y_test, y_hat) @@ -528,8 +518,11 @@ def apply_best_parameters(self, results_list): mean_bal_acc = np.mean([result['best_parameter']['balanced_accuracy'] for result in results_list]) best_n_estimators = int(round(np.mean([result['best_parameter']['n_estimators'] for result in results_list]))) - best_max_depth = int(round(np.mean([result['best_parameter']['max_depth'] if result['best_parameter']['max_depth'] is not None else 50 for result in results_list]))) - best_min_samples_split = int(round(np.mean([result['best_parameter']['min_samples_split'] for result in results_list]))) + best_max_depth = int(round(np.mean([result['best_parameter']['max_depth'] + if result['best_parameter']['max_depth'] is not None + else 50 for result in results_list]))) + best_min_samples_split = int(round(np.mean([result['best_parameter']['min_samples_split'] + for result in results_list]))) max_feat = [] 
n_features = self._x.shape[1] @@ -552,14 +545,16 @@ def apply_best_parameters(self, results_list): max_feat.append(max_features) best_max_features = np.mean(max_feat) - if self._balanced: + if self._algorithm_params['balanced']: classifier = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, - min_samples_split=best_min_samples_split, max_features=best_max_features, - class_weight='balanced', n_jobs=self._n_threads) + min_samples_split=best_min_samples_split, + max_features=best_max_features, + class_weight='balanced', n_jobs=self._algorithm_params['n_threads']) else: classifier = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, - min_samples_split=best_min_samples_split, max_features=best_max_features, - n_jobs=self._n_threads) + min_samples_split=best_min_samples_split, + max_features=best_max_features, + n_jobs=self._algorithm_params['n_threads']) classifier.fit(self._x, self._y) @@ -574,7 +569,7 @@ def save_classifier(self, classifier, output_dir): # print classifier.estimators_ # np.savetxt(path.join(output_dir, 'estimators.txt'), str(classifier.estimators_)) - def save_weights(self, classifier, output_dir): + def save_weights(self, classifier, x, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.feature_importances_) return classifier.feature_importances_ @@ -584,41 +579,43 @@ def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + @staticmethod + def uses_kernel(): + return False + + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': False, + 'grid_search_folds': 10, + 'n_estimators_range': (10, 25, 50, 100, 150, 200, 500), + 'max_depth_range': (None, 6, 8, 10, 12), + 'min_samples_split_range': (2, 4, 6, 8), + 'max_features_range': ('auto', 0.1, 0.2, 0.3, 0.4, 0.5), + 'n_threads': 15} + + return parameters_dict + class XGBoost(base.MLAlgorithm): - def __init__(self, x, y, balanced=False, grid_search_folds=10, - max_depth_range=(0, 6), - learning_rate_range=(0.1, 0.3), - n_estimators_range=(100, 200), - colsample_bytree_range=(0.5, 1), - reg_alpha=0, - reg_lambda=1, - n_threads=15): - self._x = x - self._y = y - self._balanced = balanced - self._scale_pos_weight = float(len(self._y - sum(self._y)) / sum(self._y)) - self._grid_search_folds = grid_search_folds - self._max_depth_range = max_depth_range - self._learning_rate_range = learning_rate_range - self._n_estimators_range = n_estimators_range - self._colsample_bytree_range = colsample_bytree_range - self._reg_alpha = reg_alpha - self._reg_lambda = reg_lambda - self._n_threads = n_threads - - def _launch_xgboost(self, x_train, x_test, y_train, y_test, max_depth, learning_rate, n_estimators, colsample_bytree): - if self._balanced: + + def _launch_xgboost(self, x_train, x_test, y_train, y_test, max_depth, learning_rate, n_estimators, + colsample_bytree): + + if self._algorithm_params['balanced']: # set scale_pos_weight # http://xgboost.readthedocs.io/en/latest//how_to/param_tuning.html + scale_pos_weight = float(len(self._y - sum(self._y)) / sum(self._y)) classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, - n_jobs=self._n_threads, colsample_bytree=colsample_bytree, - reg_alpha=self._reg_alpha, reg_lambda=self._reg_lambda, - scale_pos_weight=self._scale_pos_weight) + n_jobs=self._algorithm_params['n_threads'], colsample_bytree=colsample_bytree, + 
reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda'], + scale_pos_weight=scale_pos_weight) else: classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, - n_jobs=self._n_threads, colsample_bytree=colsample_bytree, - reg_alpha=self._reg_alpha, reg_lambda=self._reg_lambda) + n_jobs=self._algorithm_params['n_threads'], colsample_bytree=colsample_bytree, + reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda']) classifier.fit(x_train, y_train) y_hat_train = classifier.predict(x_train) @@ -677,21 +674,21 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} x_train = self._x[train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) - parameters_combinations = list(itertools.product(self._max_depth_range, - self._learning_rate_range, - self._n_estimators_range, - self._colsample_bytree_range)) + parameters_combinations = list(itertools.product(self._algorithm_params['max_depth_range'], + self._algorithm_params['learning_rate_range'], + self._algorithm_params['n_estimators_range'], + self._algorithm_params['colsample_bytree_range'])) for i in range(len(inner_cv)): inner_train_index, inner_test_index = inner_cv[i] @@ -741,16 +738,16 @@ def evaluate_no_cv(self, train_index, test_index): y_test = self._y[test_index] best_parameter = dict() - best_parameter['max_depth'] = self._max_depth_range - best_parameter['learning_rate'] = self._learning_rate_range - best_parameter['n_estimators'] = self._n_estimators_range - best_parameter['colsample_bytree'] = self._colsample_bytree_range + best_parameter['max_depth'] = self._algorithm_params['max_depth_range'] + best_parameter['learning_rate'] = self._algorithm_params['learning_rate_range'] + best_parameter['n_estimators'] = self._algorithm_params['n_estimators_range'] + best_parameter['colsample_bytree'] = self._algorithm_params['colsample_bytree_range'] _, y_hat, auc, y_hat_train = self._launch_xgboost(x_train, x_test, y_train, y_test, - self._max_depth_range, - self._learning_rate_range, - self._n_estimators_range, - self._colsample_bytree_range) + self._algorithm_params['max_depth_range'], + self._algorithm_params['learning_rate_range'], + self._algorithm_params['n_estimators_range'], + self._algorithm_params['colsample_bytree_range']) result = dict() result['best_parameter'] = best_parameter result['evaluation'] = utils.evaluate_prediction(y_test, y_hat) @@ -775,16 +772,21 @@ def apply_best_parameters(self, results_list): best_n_estimators = int(round(np.mean([result['best_parameter']['n_estimators'] for result in results_list]))) best_colsample_bytree = np.mean([result['best_parameter']['colsample_bytree'] for result in results_list]) - if self._balanced: + if self._algorithm_params['balanced']: + scale_pos_weight = float(len(self._y - sum(self._y)) / sum(self._y)) + classifier = XGBClassifier(max_depth=best_max_depth, learning_rate=best_learning_rate, - n_estimators=best_n_estimators, n_jobs=self._n_threads, - 
colsample_bytree=best_colsample_bytree, reg_alpha=self._reg_alpha, - reg_lambda=self._reg_lambda, scale_pos_weight=self._scale_pos_weight) + n_estimators=best_n_estimators, n_jobs=self._algorithm_params['n_threads'], + colsample_bytree=best_colsample_bytree, + reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda'], + scale_pos_weight=scale_pos_weight) else: classifier = XGBClassifier(max_depth=best_max_depth, learning_rate=best_learning_rate, - n_estimators=best_n_estimators, n_jobs=self._n_threads, - colsample_bytree=best_colsample_bytree, reg_alpha=self._reg_alpha, - reg_lambda=self._reg_lambda) + n_estimators=best_n_estimators, n_jobs=self._algorithm_params['n_threads'], + colsample_bytree=best_colsample_bytree, + reg_alpha=self._algorithm_params['reg_alpha'], + reg_lambda=self._algorithm_params['reg_lambda']) classifier.fit(self._x, self._y) @@ -799,7 +801,7 @@ def save_classifier(self, classifier, output_dir): # print classifier.estimators_ # np.savetxt(path.join(output_dir, 'estimators.txt'), str(classifier.estimators_)) - def save_weights(self, classifier, output_dir): + def save_weights(self, classifier, x, output_dir): np.savetxt(path.join(output_dir, 'weights.txt'), classifier.feature_importances_) return classifier.feature_importances_ @@ -809,20 +811,32 @@ def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + @staticmethod + def uses_kernel(): + return False + + @staticmethod + def get_default_parameters(): + parameters_dict = {'balanced': False, + 'grid_search_folds': 10, + 'max_depth_range': (0, 6), + 'learning_rate_range': (0.1, 0.3), + 'n_estimators_range': (100, 200), + 'colsample_bytree_range': (0.5, 1), + 'reg_alpha': 0, + 'reg_lambda': 1, + 'n_threads': 15} + + return parameters_dict + class OneVsOneSVM(base.MLAlgorithm): - def __init__(self, kernel, y, balanced=True, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - self._kernel = kernel - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads def _launch_svc(self, kernel_train, x_test, y_train, y_test, c): - if self._balanced: - svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if self._algorithm_params['balanced']: + svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsOneClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6)) @@ -865,15 +879,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} outer_kernel = self._kernel[train_index, :][:, train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -883,7 +897,7 @@ def evaluate(self, train_index, test_index): x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index] y_train_inner, y_test_inner = y_train[inner_train_index], 
y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (inner_kernel, x_test_inner, y_train_inner, y_test_inner, c)) @@ -923,8 +937,9 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: - svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if self._algorithm_params['balanced']: + svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6)) @@ -953,20 +968,28 @@ def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + @staticmethod + def uses_kernel(): + return True + + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': True, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} + + return parameters_dict + class OneVsRestSVM(base.MLAlgorithm): - def __init__(self, kernel, y, balanced=True, grid_search_folds=10, c_range=np.logspace(-6, 2, 17), n_threads=15): - self._kernel = kernel - self._y = y - self._balanced = balanced - self._grid_search_folds = grid_search_folds - self._c_range = c_range - self._n_threads = n_threads def _launch_svc(self, kernel_train, x_test, y_train, y_test, c): - if self._balanced: - svc = OneVsRestClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if self._algorithm_params['balanced']: + svc = OneVsRestClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsRestClassifier(SVC(C=c, kernel='precomputed', probability=True, tol=1e-6)) @@ -1009,15 +1032,15 @@ def _select_best_parameter(self, async_result): def evaluate(self, train_index, test_index): - inner_pool = ThreadPool(self._n_threads) + inner_pool = ThreadPool(self._algorithm_params['n_threads']) async_result = {} - for i in range(self._grid_search_folds): + for i in range(self._algorithm_params['grid_search_folds']): async_result[i] = {} outer_kernel = self._kernel[train_index, :][:, train_index] y_train = self._y[train_index] - skf = StratifiedKFold(n_splits=self._grid_search_folds, shuffle=True) + skf = StratifiedKFold(n_splits=self._algorithm_params['grid_search_folds'], shuffle=True) inner_cv = list(skf.split(np.zeros(len(y_train)), y_train)) for i in range(len(inner_cv)): @@ -1027,7 +1050,7 @@ def evaluate(self, train_index, test_index): x_test_inner = outer_kernel[inner_test_index, :][:, inner_train_index] y_train_inner, y_test_inner = y_train[inner_train_index], y_train[inner_test_index] - for c in self._c_range: + for c in self._algorithm_params['c_range']: async_result[i][c] = inner_pool.apply_async(self._grid_search, (inner_kernel, x_test_inner, y_train_inner, y_test_inner, c)) @@ -1067,8 +1090,9 @@ def apply_best_parameters(self, results_list): # Mean balanced accuracy mean_bal_acc = np.mean(bal_acc_list) - if self._balanced: - svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, class_weight='balanced')) + if self._algorithm_params['balanced']: + svc = OneVsOneClassifier(SVC(C=best_c, kernel='precomputed', probability=True, tol=1e-6, + class_weight='balanced')) else: svc = OneVsOneClassifier(SVC(C=best_c, 
kernel='precomputed', probability=True, tol=1e-6)) @@ -1096,3 +1120,17 @@ def save_weights(self, classifier, x, output_dir): def save_parameters(self, parameters_dict, output_dir): with open(path.join(output_dir, 'best_parameters.json'), 'w') as f: json.dump(parameters_dict, f) + + @staticmethod + def uses_kernel(): + return True + + @staticmethod + def get_default_parameters(): + + parameters_dict = {'balanced': True, + 'grid_search_folds': 10, + 'c_range': np.logspace(-6, 2, 17), + 'n_threads': 15} + + return parameters_dict diff --git a/clinica/pipelines/machine_learning/base.py b/clinica/pipelines/machine_learning/base.py index aff46e809..033492d43 100644 --- a/clinica/pipelines/machine_learning/base.py +++ b/clinica/pipelines/machine_learning/base.py @@ -1,7 +1,7 @@ # coding: utf8 -import abc +from abc import ABC, abstractmethod __author__ = "Jorge Samper-Gonzalez" __copyright__ = "Copyright 2016-2019 The Aramis Lab Team" @@ -13,114 +13,151 @@ __status__ = "Development" -class MLWorkflow: - __metaclass__ = abc.ABCMeta +class MLWorkflow(ABC): - # def __init__(self, ml_input, ml_validation, ml_algorithm, output_dir): - # self._ml_input = ml_input - # self._ml_validation = ml_validation - # self._ml_algorithm = ml_algorithm - # self._output_dir = output_dir + def __init__(self, input_class, validation_class, algorithm_class, all_params, output_dir): - @abc.abstractmethod - def run(self): - pass + self._input_class = input_class + self._validation_class = validation_class + self._algorithm_class = algorithm_class - def save_image(self): + self._input_params = self.create_parameters_dict(all_params, input_class) + self._validation_params = self.create_parameters_dict(all_params, validation_class) + self._algorithm_params = self.create_parameters_dict(all_params, algorithm_class) - import os - import pandas as pd + self._output_dir = output_dir - pd.io.parsers.read_csv(os.path.join(self._output_dir, 'results.tsv'), sep='\t') + self._input = None + self._validation = None + self._algorithm = None - @staticmethod - def metric_distribution(metric, labels, output_path, num_classes=2, metric_label='balanced accuracy'): - """ + def run(self): - Distribution plots of various metrics such as balanced accuracy! + from os import path, makedirs - metric is expected to be ndarray of size [num_repetitions, num_datasets] + # Instantiating input class + self._input = self._input_class(self._input_params) - """ - import numpy as np - import matplotlib.pyplot as plt - from matplotlib import cm - from matplotlib.backends.backend_pdf import PdfPages + # Computing input values + x = self._input.get_x() + y = self._input.get_y() - num_repetitions = metric.shape[0] - num_datasets = metric.shape[1] - assert len(labels) == num_datasets, "Differing number of features and labels!" 
- method_ticks = 1.0 + np.arange(num_datasets) + # Instantiating classification algorithm + if self._algorithm_class.uses_kernel(): + kernel = self._input.get_kernel() + self._algorithm = self._algorithm_class(kernel, y, self._algorithm_params) + else: + self._algorithm = self._algorithm_class(x, y, self._algorithm_params) - fig, ax = plt.subplots(figsize=[9, 9]) - line_coll = ax.violinplot(metric, widths=0.8, bw_method=0.2, - showmedians=True, showextrema=False, - positions=method_ticks) + # Instantiating cross-validation method and classification algorithm + self._validation = self._validation_class(self._algorithm, self._validation_params) - cmap = cm.get_cmap('Paired', num_datasets) - for cc, ln in enumerate(line_coll['bodies']): - ln.set_facecolor(cmap(cc)) - ln.set_label(labels[cc]) + # Launching classification with selected cross-validation + classifier, best_params, results = self._validation.validate(y) - plt.legend(loc=2, ncol=num_datasets) + # Creation of the directory to save results + classifier_dir = path.join(self._output_dir, 'classifier') + if not path.exists(classifier_dir): + makedirs(classifier_dir) - ax.tick_params(axis='both', which='major', labelsize=15) - ax.grid(axis='y', which='major') + # Saving algorithm trained classifier + self._algorithm.save_classifier(classifier, classifier_dir) + self._algorithm.save_weights(classifier, x, classifier_dir) + self._algorithm.save_parameters(best_params, classifier_dir) - lower_lim = np.round(np.min([np.float64(0.9 / num_classes), metric.min()]), 3) - upper_lim = np.round(np.max([1.01, metric.max()]), 3) - step_tick = 0.1 - ax.set_ylim(lower_lim, upper_lim) + # Saving validation trained classifier + self._validation.save_results(self._output_dir) - ax.set_xticks(method_ticks) - ax.set_xlim(np.min(method_ticks) - 1, np.max(method_ticks) + 1) - ax.set_xticklabels(labels, rotation=45) # 'vertical' + @staticmethod + def create_parameters_dict(locals_dictionary, component_class): - ax.set_yticks(np.arange(lower_lim, upper_lim, step_tick)) - ax.set_yticklabels(np.arange(lower_lim, upper_lim, step_tick)) - # plt.xlabel(xlabel, fontsize=16) - plt.ylabel(metric_label, fontsize=16) + default_parameters = component_class.get_default_parameters() + for key in locals_dictionary: + if key in default_parameters: + default_parameters[key] = locals_dictionary[key] + return default_parameters - fig.tight_layout() - pp1 = PdfPages(output_path + '.pdf') - pp1.savefig() - pp1.close() +class MLInput(ABC): - return + def __init__(self, input_params): + self._input_params = self.get_default_parameters() + self._input_params.update(input_params) -class MLInput: - __metaclass__ = abc.ABCMeta + self._x = None + self._y = None + self._kernel = None - @abc.abstractmethod + @abstractmethod def get_x(self): pass - @abc.abstractmethod + @abstractmethod def get_y(self): pass + @staticmethod + @abstractmethod + def get_default_parameters(): + pass -class MLValidation: - __metaclass__ = abc.ABCMeta - @abc.abstractmethod +class MLValidation(ABC): + + def __init__(self, ml_algorithm, validation_params): + + self._ml_algorithm = ml_algorithm + + self._validation_params = self.get_default_parameters() + self._validation_params.update(validation_params) + + self._validation_results = [] + self._classifier = None + self._best_params = None + + @abstractmethod def validate(self, y): pass + @staticmethod + @abstractmethod + def get_default_parameters(): + pass + + +class MLAlgorithm(ABC): + + def __init__(self, input_data, y, algorithm_params): + + 
self._algorithm_params = self.get_default_parameters() + self._algorithm_params.update(algorithm_params) + + if self.uses_kernel(): + self._kernel = input_data + else: + self._x = input_data -class MLAlgorithm: - __metaclass__ = abc.ABCMeta + self._y = y - @abc.abstractmethod + @staticmethod + @abstractmethod + def uses_kernel(): + pass + + @abstractmethod def evaluate(self, train_index, test_index): pass - @abc.abstractmethod + @abstractmethod def save_classifier(self, classifier, output_dir): pass - @abc.abstractmethod + @abstractmethod def save_parameters(self, parameters, output_dir): pass + + @staticmethod + @abstractmethod + def get_default_parameters(): + pass diff --git a/clinica/pipelines/machine_learning/input.py b/clinica/pipelines/machine_learning/input.py index 669026fb8..81c4ceb7f 100644 --- a/clinica/pipelines/machine_learning/input.py +++ b/clinica/pipelines/machine_learning/input.py @@ -7,6 +7,7 @@ import numpy as np from pandas.io import parsers +from clinica.utils.stream import cprint from clinica.pipelines.machine_learning import base import clinica.pipelines.machine_learning.voxel_based_io as vbio import clinica.pipelines.machine_learning.vertex_based_io as vtxbio @@ -15,6 +16,8 @@ import clinica.pipelines.machine_learning.ml_utils as utils + + __author__ = "Jorge Samper-Gonzalez" __copyright__ = "Copyright 2016-2019 The Aramis Lab Team" __credits__ = ["Jorge Samper-Gonzalez", "Simona Bottani"] @@ -27,50 +30,37 @@ class CAPSInput(base.MLInput): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, precomputed_kernel=None): - """ + def __init__(self, input_params): - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - precomputed_kernel: - """ + super().__init__(input_params) - self._caps_directory = caps_directory - self._group_id = group_id - self._image_type = image_type self._images = None - self._x = None - self._y = None - self._kernel = None - subjects_visits = parsers.read_csv(subjects_visits_tsv, sep='\t') + subjects_visits = parsers.read_csv(self._input_params['subjects_visits_tsv'], sep='\t') if list(subjects_visits.columns.values) != ['participant_id', 'session_id']: raise Exception('Subjects and visits file is not in the correct format.') self._subjects = list(subjects_visits.participant_id) self._sessions = list(subjects_visits.session_id) - diagnoses = parsers.read_csv(diagnoses_tsv, sep='\t') + diagnoses = parsers.read_csv(self._input_params['diagnoses_tsv'], sep='\t') if 'diagnosis' not in list(diagnoses.columns.values): raise Exception('Diagnoses file is not in the correct format.') self._diagnoses = list(diagnoses.diagnosis) - if image_type not in ['T1', 'fdg', 'av45', 'pib', 'flute', 'dwi']: - raise Exception("Incorrect image type. It must be one of the values 'T1', 'fdg', 'av45', 'pib', 'flute' or 'dwi'") + if self._input_params['image_type'] not in ['T1', 'fdg', 'av45', 'pib', 'flute', 'dwi']: + raise Exception("Incorrect image type. 
It must be one of the values 'T1', 'fdg', 'av45', " + "'pib', 'flute' or 'dwi'") - if precomputed_kernel is not None: - if type(precomputed_kernel) == np.ndarray: - if precomputed_kernel.shape == (len(self._subjects), len(self._subjects)): - self._kernel = precomputed_kernel + if self._input_params['precomputed_kernel'] is not None: + if type(self._input_params['precomputed_kernel']) == np.ndarray: + if self._input_params['precomputed_kernel'].shape == (len(self._subjects), len(self._subjects)): + self._kernel = self._input_params['precomputed_kernel'] else: raise Exception("""Precomputed kernel provided is not in the correct format. It must be a numpy.ndarray object with number of rows and columns equal to the number of subjects, or a filename to a numpy txt file containing an object with the described format.""") - elif type(precomputed_kernel == str): - self._kernel = np.loadtxt(precomputed_kernel) + elif type(self._input_params['precomputed_kernel'] == str): + self._kernel = np.loadtxt(self._input_params['precomputed_kernel']) else: raise Exception("""Precomputed kernel provided is not in the correct format. It must be a numpy.ndarray object with number of rows and columns equal to the number of subjects, @@ -119,9 +109,9 @@ def get_kernel(self, kernel_function=utils.gram_matrix_linear, recompute_if_exis if self._x is None: self.get_x() - print("Computing kernel ...") + cprint("Computing kernel ...") self._kernel = kernel_function(self._x) - print("Kernel computed") + cprint("Kernel computed") return self._kernel def save_kernel(self, output_dir): @@ -143,36 +133,29 @@ def save_kernel(self, output_dir): def save_weights_as_nifti(self, weights, output_dir): pass + @staticmethod + def get_default_parameters(): + + parameters_dict = {'caps_directory': None, + 'subjects_visits_tsv': None, + 'diagnoses_tsv': None, + 'group_id': None, + 'image_type': None, + 'precomputed_kernel': None} + + return parameters_dict + class CAPSVoxelBasedInput(CAPSInput): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, fwhm=0, - modulated="on", pvc=None, mask_zeros=True, precomputed_kernel=None): - """ + def __init__(self, input_params): + + super().__init__(input_params) - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - fwhm: - modulated: - mask_zeros: - precomputed_kernel: - """ - - super(CAPSVoxelBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel=precomputed_kernel) - - self._fwhm = fwhm - self._modulated = modulated - self._pvc = pvc - self._mask_zeros = mask_zeros self._orig_shape = None self._data_mask = None - if modulated not in ['on', 'off']: + if self._input_params['modulated'] not in ['on', 'off']: raise Exception("Incorrect modulation parameter. 
It must be one of the values 'on' or 'off'") def get_images(self): @@ -184,23 +167,24 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'T1': - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) + if self._input_params['image_type'] == 'T1': + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 't1/spm/dartel/group-' + self._group_id, + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 't1/spm/dartel/group-' + self._input_params['group_id'], '%s_%s_T1w_segm-graymatter_space-Ixi549Space_modulated-%s%s_probability.nii.gz' - % (self._subjects[i], self._sessions[i], self._modulated, fwhm)) + % (self._subjects[i], self._sessions[i], self._input_params['modulated'], fwhm)) for i in range(len(self._subjects))] else: - pvc = '' if self._pvc is None else '_pvc-%s' % self._pvc - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) - suvr = 'pons' if self._image_type == 'fdg' else 'cerebellumPons' + pvc = '' if self._input_params['pvc'] is None else '_pvc-%s' % self._input_params['pvc'] + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) + suvr = 'pons' if self._input_params['image_type'] == 'fdg' else 'cerebellumPons' - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 'pet/preprocessing/group-' + self._group_id, + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet/preprocessing/group-' + self._input_params['group_id'], '%s_%s_task-rest_acq-%s_pet_space-Ixi549Space%s_suvr-%s_mask-brain%s_pet.nii.gz' - % (self._subjects[i], self._sessions[i], self._image_type, pvc, suvr, fwhm)) + % (self._subjects[i], self._sessions[i], self._input_params['image_type'], pvc, + suvr, fwhm)) for i in range(len(self._subjects))] for image in self._images: @@ -218,9 +202,9 @@ def get_x(self): if self._x is not None: return self._x - print('Loading ' + str(len(self.get_images())) + ' subjects') - self._x, self._orig_shape, self._data_mask = vbio.load_data(self._images, mask=self._mask_zeros) - print('Subjects loaded') + cprint('Loading ' + str(len(self.get_images())) + ' subjects') + self._x, self._orig_shape, self._data_mask = vbio.load_data(self._images, mask=self._input_params['mask_zeros']) + cprint('Subjects loaded') return self._x @@ -233,33 +217,30 @@ def save_weights_as_nifti(self, weights, output_dir): data = vbio.revert_mask(weights, self._data_mask, self._orig_shape) vbio.weights_to_nifti(data, self._images[0], output_filename) + @staticmethod + def get_default_parameters(): -class CAPSRegionBasedInput(CAPSInput): + parameters_dict = super(CAPSVoxelBasedInput, CAPSVoxelBasedInput).get_default_parameters() - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, - pvc=None, precomputed_kernel=None): - """ + new_parameters = {'fwhm': 0, + 'modulated': "on", + 'pvc': None, + 'mask_zeros': True} - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - atlas: - precomputed_kernel: - """ + parameters_dict.update(new_parameters) - super(CAPSRegionBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, 
precomputed_kernel=precomputed_kernel) + return parameters_dict - self._atlas = atlas - self._pvc = pvc - self._orig_shape = None - self._data_mask = None - if atlas not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: - raise Exception("Incorrect atlas name. It must be one of the values 'AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers' ") +class CAPSRegionBasedInput(CAPSInput): + + def __init__(self, input_params): + + super().__init__(input_params) + + if self._input_params['atlas'] not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: + raise Exception("Incorrect atlas name. It must be one of the values 'AAL2', 'Neuromorphometrics', " + "'AICHA', 'LPBA40', 'Hammers' ") def get_images(self): """ @@ -270,20 +251,21 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'T1': - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 't1/spm/dartel/group-' + self._group_id, + if self._input_params['image_type'] == 'T1': + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 't1/spm/dartel/group-' + self._input_params['group_id'], 'atlas_statistics/', '%s_%s_T1w_space-%s_map-graymatter_statistics.tsv' - % (self._subjects[i], self._sessions[i], self._atlas)) + % (self._subjects[i], self._sessions[i], self._input_params['atlas'])) for i in range(len(self._subjects))] else: - pvc = '' if self._pvc is None else '_pvc-%s' % self._pvc - suvr = 'pons' if self._image_type == 'fdg' else 'cerebellumPons' - - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 'pet/preprocessing/group-' + self._group_id, 'atlas_statistics', - '%s_%s_task-rest_acq-%s_pet_space-%s%s_suvr-%s_statistics.tsv' - % (self._subjects[i], self._sessions[i], self._image_type, self._atlas, pvc, suvr)) + pvc = '' if self._input_params['pvc'] is None else '_pvc-%s' % self._input_params['pvc'] + suvr = 'pons' if self._input_params['image_type'] == 'fdg' else 'cerebellumPons' + + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet/preprocessing/group-' + self._input_params['group_id'], + 'atlas_statistics', '%s_%s_task-rest_acq-%s_pet_space-%s%s_suvr-%s_statistics.tsv' + % (self._subjects[i], self._sessions[i], self._input_params['image_type'], + self._input_params['atlas'], pvc, suvr)) for i in range(len(self._subjects))] for image in self._images: @@ -301,9 +283,9 @@ def get_x(self): if self._x is not None: return self._x - print('Loading ' + str(len(self.get_images())) + ' subjects') + cprint('Loading ' + str(len(self.get_images())) + ' subjects') self._x = rbio.load_data(self._images, self._subjects) - print('Subjects loaded') + cprint('Subjects loaded') return self._x @@ -319,17 +301,27 @@ def save_weights_as_nifti(self, weights, output_dir): """ output_filename = path.join(output_dir, 'weights.nii.gz') - rbio.weights_to_nifti(weights, self._atlas, output_filename) + rbio.weights_to_nifti(weights, self._input_params['atlas'], output_filename) + + @staticmethod + def get_default_parameters(): + + parameters_dict = super(CAPSRegionBasedInput, CAPSRegionBasedInput).get_default_parameters() + + new_parameters = {'atlas': None, + 'pvc': None, + 'mask_zeros': True} + + parameters_dict.update(new_parameters) + + return parameters_dict class CAPSVertexBasedInput(CAPSInput): - def __init__(self, caps_directory, subjects_visits_tsv, 
diagnoses_tsv, group_id, fwhm, image_type, precomputed_kernel=None): - super(CAPSVertexBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel) - self._fwhm = fwhm - self._image_type = image_type - self._caps_directory = caps_directory + def __init__(self, input_params): + + super().__init__(input_params) def get_images(self): import os @@ -340,14 +332,15 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'fdg' and self._images is None: + if self._input_params['image_type'] == 'fdg' and self._images is None: self._images = [] hemi = ['lh', 'rh'] for i in range(len(self._subjects)): - self._images.append([os.path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], 'pet', - 'surface', self._subjects[i] + '_' + self._sessions[i] - + '_task-rest_acq-fdg_pet_space-fsaverage_suvr-pons_pvc-iy_hemi-' + h - + '_fwhm-' + str(self._fwhm) + '_projection.mgh') for h in hemi]) + self._images.append([os.path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet', 'surface', self._subjects[i] + '_' + + self._sessions[i] + '_task-rest_acq-fdg_pet_space-fsaverage_' + 'suvr-pons_pvc-iy_hemi-' + h + '_fwhm-' + + str(self._input_params['fwhm']) + '_projection.mgh') for h in hemi]) missing_files = [] missing_files_string_error = '' for img in self._images: @@ -361,7 +354,6 @@ def get_images(self): return self._images def get_x(self): - from clinica.utils.stream import cprint """ Returns numpy 2D array """ @@ -398,35 +390,27 @@ def save_weights_as_datasurface(self, weights, output_dir): def save_weights_as_nifti(self, weights, output_dir): pass + @staticmethod + def get_default_parameters(): -class CAPSTSVBasedInput(CAPSInput): + parameters_dict = super(CAPSVertexBasedInput, CAPSVertexBasedInput).get_default_parameters() - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset, - pvc=None, precomputed_kernel=None): - """ + new_parameters = {'fwhm': 0} - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - atlas: - precomputed_kernel: - """ + parameters_dict.update(new_parameters) - super(CAPSTSVBasedInput, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel) + return parameters_dict - self._atlas = atlas - self._pvc = pvc - self._dataset = dataset - self._orig_shape = None - self._data_mask = None +class CAPSTSVBasedInput(CAPSInput): + + def __init__(self, input_params): + + super().__init__(input_params) - if atlas not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: - raise Exception("Incorrect atlas name. It must be one of the values 'AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers' ") + if self._input_params['atlas'] not in ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers']: + raise Exception("Incorrect atlas name. 
It must be one of the values 'AAL2', 'Neuromorphometrics', " + "'AICHA', 'LPBA40', 'Hammers' ") def get_images(self): """ @@ -447,12 +431,14 @@ def get_x(self): # if self._x is not None: # return self._x - print('Loading TSV subjects') - string = str('group-' + self._group_id + '_T1w_space-' + self._atlas + '_map-graymatter') + cprint('Loading TSV subjects') + string = str('group-' + self._input_params['group_id'] + '_T1w_space-' + self._input_params['atlas'] + + '_map-graymatter') - self._x = tbio.load_data(string, self._caps_directory, self._subjects, self._sessions, self._dataset) + self._x = tbio.load_data(string, self._input_params['caps_directory'], self._subjects, self._sessions, + self._input_params['dataset']) - print('Subjects loaded') + cprint('Subjects loaded') return self._x @@ -469,40 +455,24 @@ def save_weights_as_nifti(self, weights, output_dir): # output_filename = path.join(output_dir, 'weights.nii.gz') - # rbio.weights_to_nifti(weights, self._atlas, output_filename) + # rbio.weights_to_nifti(weights, self._input_params['atlas'], output_filename) pass + @staticmethod + def get_default_parameters(): -class CAPSVoxelBasedInputREGSVM(CAPSInput): + parameters_dict = super(CAPSTSVBasedInput, CAPSTSVBasedInput).get_default_parameters() - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, fwhm=0, - modulated="on", pvc=None, mask_zeros=True, precomputed_kernel=None): - """ + new_parameters = {'atlas': None, + 'pvc': None, + 'dataset': None} - Args: - caps_directory: - subjects_visits_tsv: - diagnoses_tsv: - group_id: - image_type: 'T1', 'fdg', 'av45', 'pib' or 'flute' - fwhm: - modulated: - mask_zeros: - precomputed_kernel: - """ - - super(CAPSVoxelBasedInputREGSVM, self).__init__(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, precomputed_kernel=precomputed_kernel) - - self._fwhm = fwhm - self._modulated = modulated - self._pvc = pvc - self._mask_zeros = mask_zeros - self._orig_shape = None - self._data_mask = None + parameters_dict.update(new_parameters) - if modulated not in ['on', 'off']: - raise Exception("Incorrect modulation parameter. 
It must be one of the values 'on' or 'off'") + return parameters_dict + + +class CAPSVoxelBasedInputREGSVM(CAPSVoxelBasedInput): def get_images(self): """ @@ -513,21 +483,22 @@ def get_images(self): if self._images is not None: return self._images - if self._image_type == 'T1': - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) + if self._input_params['image_type'] == 'T1': + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) - self._images = [path.join(self._caps_directory, + self._images = [path.join(self._input_params['caps_directory'], 'regul_%s_%s_T1w_segm-graymatter_space-Ixi549Space_modulated-%s%s_probability.nii' - % (self._subjects[i], self._sessions[i], self._modulated, fwhm)) + % (self._subjects[i], self._sessions[i], self._input_params['modulated'], fwhm)) for i in range(len(self._subjects))] else: - pvc = '' if self._pvc is None else '_pvc-%s' % self._pvc - fwhm = '' if self._fwhm == 0 else '_fwhm-%dmm' % int(self._fwhm) - suvr = 'pons' if self._image_type == 'fdg' else 'cerebellumPons' - self._images = [path.join(self._caps_directory, 'subjects', self._subjects[i], self._sessions[i], - 'pet/preprocessing/group-' + self._group_id, + pvc = '' if self._input_params['pvc'] is None else '_pvc-%s' % self._input_params['pvc'] + fwhm = '' if self._input_params['fwhm'] == 0 else '_fwhm-%dmm' % int(self._input_params['fwhm']) + suvr = 'pons' if self._input_params['image_type'] == 'fdg' else 'cerebellumPons' + self._images = [path.join(self._input_params['caps_directory'], 'subjects', self._subjects[i], + self._sessions[i], 'pet/preprocessing/group-' + self._input_params['group_id'], '%s_%s_task-rest_acq-%s_pet_space-Ixi549Space%s_suvr-%s_mask-brain%s_pet.nii.gz' - % (self._subjects[i], self._sessions[i], self._image_type, pvc, suvr, fwhm)) + % (self._subjects[i], self._sessions[i], self._input_params['image_type'], + pvc, suvr, fwhm)) for i in range(len(self._subjects))] for image in self._images: @@ -536,26 +507,49 @@ def get_images(self): return self._images + +class TsvInput(base.MLInput): + + def __init__(self, input_params): + + super().__init__(input_params) + + import pandas as pd + + self._dataframe = pd.io.parsers.read_csv(input_params['data_tsv'], sep='\t') + + if not input_params['columns']: + raise Exception("List of columns to use as input can not be empty.") + def get_x(self): - """ + self._x = self._dataframe.as_matrix(self._input_params['columns']) + return self._x - Returns: a numpy 2d-array. + def get_y(self): + unique = list(set(self._dataframe["diagnosis"])) + self._y = np.array([unique.index(x) for x in self._dataframe["diagnosis"]]) + return self._y + def get_kernel(self, kernel_function=utils.gram_matrix_linear, recompute_if_exists=False): + """ + Returns: a numpy 2d-array. 
""" - if self._x is not None: - return self._x - print('Loading ' + str(len(self.get_images())) + ' subjects') - self._x, self._orig_shape, self._data_mask = vbio.load_data(self._images, mask=self._mask_zeros) - print('Subjects loaded') + if self._kernel is not None and not recompute_if_exists: + return self._kernel - return self._x + if self._x is None: + self.get_x() - def save_weights_as_nifti(self, weights, output_dir): + cprint("Computing kernel ...") + self._kernel = kernel_function(self._x) + cprint("Kernel computed") + return self._kernel - if self._images is None: - self.get_images() + @staticmethod + def get_default_parameters(): - output_filename = path.join(output_dir, 'weights.nii.gz') - data = vbio.revert_mask(weights, self._data_mask, self._orig_shape) - vbio.weights_to_nifti(data, self._images[0], output_filename) + parameters_dict = {'data_tsv': None, + 'columns': None} + + return parameters_dict diff --git a/clinica/pipelines/machine_learning/ml_utils.py b/clinica/pipelines/machine_learning/ml_utils.py index ea818f766..0076ccdaa 100644 --- a/clinica/pipelines/machine_learning/ml_utils.py +++ b/clinica/pipelines/machine_learning/ml_utils.py @@ -82,3 +82,61 @@ def evaluate_prediction_multiclass(y, y_hat): 'balanced_accuracy': balanced_accuracy} return results + + +def metric_distribution(metric, labels, output_path, num_classes=2, metric_label='balanced accuracy'): + """ + + Distribution plots of various metrics such as balanced accuracy! + + metric is expected to be ndarray of size [num_repetitions, num_datasets] + + """ + # from __future__ import print_function, division + + import numpy as np + import matplotlib.pyplot as plt + from matplotlib import cm + from matplotlib.backends.backend_pdf import PdfPages + + num_repetitions = metric.shape[0] + num_datasets = metric.shape[1] + assert len(labels) == num_datasets, "Differing number of features and labels!" 
+ method_ticks = 1.0 + np.arange(num_datasets) + + fig, ax = plt.subplots(figsize=[9, 9]) + line_coll = ax.violinplot(metric, widths=0.8, bw_method=0.2, + showmedians=True, showextrema=False, + positions=method_ticks) + + cmap = cm.get_cmap('Paired', num_datasets) + for cc, ln in enumerate(line_coll['bodies']): + ln.set_facecolor(cmap(cc)) + ln.set_label(labels[cc]) + + plt.legend(loc=2, ncol=num_datasets) + + ax.tick_params(axis='both', which='major', labelsize=15) + ax.grid(axis='y', which='major') + + lower_lim = np.round(np.min([np.float64(0.9 / num_classes), metric.min()]), 3) + upper_lim = np.round(np.max([1.01, metric.max()]), 3) + step_tick = 0.1 + ax.set_ylim(lower_lim, upper_lim) + + ax.set_xticks(method_ticks) + ax.set_xlim(np.min(method_ticks) - 1, np.max(method_ticks) + 1) + ax.set_xticklabels(labels, rotation=45) # 'vertical' + + ax.set_yticks(np.arange(lower_lim, upper_lim, step_tick)) + ax.set_yticklabels(np.arange(lower_lim, upper_lim, step_tick)) + # plt.xlabel(xlabel, fontsize=16) + plt.ylabel(metric_label, fontsize=16) + + fig.tight_layout() + + pp1 = PdfPages(output_path + '.pdf') + pp1.savefig() + pp1.close() + + return diff --git a/clinica/pipelines/machine_learning/ml_workflows.py b/clinica/pipelines/machine_learning/ml_workflows.py index c54a5afcd..bb1e1ba7d 100644 --- a/clinica/pipelines/machine_learning/ml_workflows.py +++ b/clinica/pipelines/machine_learning/ml_workflows.py @@ -16,318 +16,87 @@ __email__ = "jorge.samper-gonzalez@inria.fr" __status__ = "Development" -# This code is an example of implementation of machine learning pipelines - -class VB_KFold_DualSVM(base.MLWorkflow): - - # First of all, input has to be chosen. According to it (CAPSVoxelBasedInput or CAPSRegionBasedInput), - # all the necessary inputs can be found in input.py +class VoxelBasedKFoldDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_folds=10, - grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - - # Here some parameters selected for this task - - self._output_dir = output_dir - self._n_threads = n_threads - self._n_folds = n_folds - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - # In this case we are running a voxel based input approach - # - - self._input = input.CAPSVoxelBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - - # Validation and algorithm will be selected in the next part of code - - self._validation = None - self._algorithm = None - - def run(self): - - # Call on parameters already computed - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - # Now algorithm has been selected, in this case Dual SVM algorithm. 
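# --- Editor's illustrative sketch (not part of this patch) ---------------
# With the refactoring above, a concrete workflow such as
# VoxelBasedKFoldDualSVM is reduced to a thin constructor: it forwards its own
# locals() plus the input / validation / algorithm classes to base.MLWorkflow,
# and the generic run() defined there grabs the data, fits the classifier,
# cross-validates and saves the results. The paths and group label below are
# hypothetical.

wf = VoxelBasedKFoldDualSVM(caps_directory='/path/to/caps',
                            subjects_visits_tsv='subjects_sessions.tsv',
                            diagnoses_tsv='diagnoses.tsv',
                            group_id='ADCN',
                            image_type='T1',
                            output_dir='/path/to/output',
                            fwhm=8,
                            n_folds=5)
wf.run()  # inherited from base.MLWorkflow
# -------------------------------------------------------------------------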
- # Look at algorithm.py to understand the input necessary for each method - # input parameters were chosen previously - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - # Here validation type is selected, it's the K fold cross-validation - - self._validation = validation.KFoldCV(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_folds=self._n_folds, n_threads=self._n_threads) - - # Creation of the path where all the results will be saved - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - # Here we have selected whant we wanted save - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) + grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - # self._input.save_weights_as_nifti(weights) + super(VoxelBasedKFoldDualSVM, self).__init__(input.CAPSVoxelBasedInput, + validation.KFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) -class VB_RepKFold_DualSVM(base.MLWorkflow): +class VoxelBasedRepKFoldDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, - modulated="on", precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, n_folds=10, - grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._n_folds = n_folds - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - self._input = input.CAPSVoxelBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) + modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, + n_folds=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._input.save_weights_as_nifti(weights, classifier_dir) + super(VoxelBasedRepKFoldDualSVM, self).__init__(input.CAPSVoxelBasedInput, + validation.RepeatedKFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) -class VB_RepHoldOut_DualSVM(base.MLWorkflow): +class 
VoxelBasedRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSVoxelBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) + test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), + splits_indices=None): - self._validation.save_results(self._output_dir) + super().__init__(input.CAPSVoxelBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) -class VertexB_RepHoldOut_dualSVM(base.MLWorkflow): +class VertexBasedRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, output_dir, image_type='fdg', fwhm=20, precomputed_kernel=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-10, 2, 1000), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSVertexBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, fwhm, - image_type, precomputed_kernel) - - self._validation = None - self._algorithm = None + super(VertexBasedRepHoldOutDualSVM, self).__init__(input.CAPSVertexBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - 
classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._input.save_weights_as_datasurface(weights, classifier_dir) - self._validation.save_results(self._output_dir) - - -class RB_RepHoldOut_DualSVM(base.MLWorkflow): +class RegionBasedRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._input.save_weights_as_nifti(weights, classifier_dir) + super(RegionBasedRepHoldOutDualSVM, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - self._validation.save_results(self._output_dir) - -class RB_RepHoldOut_LogisticRegression(base.MLWorkflow): +class RegionBasedRepHoldOutLogisticRegression(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - - self._algorithm = algorithm.LogisticReg(x, y, balanced=self._balanced, - 
grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) + super(RegionBasedRepHoldOutLogisticRegression, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedHoldOut, + algorithm.LogisticReg, + locals(), + output_dir) - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, classifier_dir) - self._classifier = classifier - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class RB_RepHoldOut_RandomForest(base.MLWorkflow): +class RegionBasedRepHoldOutRandomForest(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, @@ -335,518 +104,108 @@ def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, max_depth_range=[None], min_samples_split_range=[2], max_features_range=('auto', 0.25, 0.5), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - self._max_features_range = max_features_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None + super(RegionBasedRepHoldOutRandomForest, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedHoldOut, + algorithm.RandomForest, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - - self._algorithm = algorithm.RandomForest(x, y, balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - n_estimators_range=self._n_estimators_range, - max_depth_range=self._max_depth_range, - min_samples_split_range=self._min_samples_split_range, - max_features_range=self._max_features_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class RB_LearningCurveRepHoldOut_DualSVM(base.MLWorkflow): +class RegionBasedLearningCurveRepHoldOutDualSVM(base.MLWorkflow): def 
__init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, precomputed_kernel=None, n_threads=15, n_iterations=100, test_size=0.3, n_learning_points=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._n_learning_points = n_learning_points - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() + super(RegionBasedLearningCurveRepHoldOutDualSVM, self).__init__(input.CAPSRegionBasedInput, + validation.LearningCurveRepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - self._validation = validation.LearningCurveRepeatedHoldOut(self._algorithm, - n_iterations=self._n_iterations, - test_size=self._test_size, - n_learning_points=self._n_learning_points) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads) - - for learning_point in range(self._n_learning_points): - - learning_point_dir = path.join(self._output_dir, 'learning_split-' + str(learning_point)) - - classifier_dir = path.join(learning_point_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier[learning_point], classifier_dir) - self._algorithm.save_parameters(best_params[learning_point], classifier_dir) - weights = self._algorithm.save_weights(classifier[learning_point], x, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class VB_LearningCurveRepHoldOut_DualSVM(base.MLWorkflow): +class VoxelBasedLearningCurveRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - test_size=0.3, n_learning_points=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17)): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._n_learning_points = n_learning_points - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - - self._input = input.CAPSVoxelBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) + test_size=0.3, n_learning_points=10, grid_search_folds=10, balanced=True, + c_range=np.logspace(-6, 2, 17)): - self._validation = None - self._algorithm = None + super(VoxelBasedLearningCurveRepHoldOutDualSVM, self).__init__(input.CAPSVoxelBasedInput, + validation.LearningCurveRepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm 
= algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.LearningCurveRepeatedHoldOut(self._algorithm, - n_iterations=self._n_iterations, - test_size=self._test_size, - n_learning_points=self._n_learning_points) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads) - - for learning_point in range(self._n_learning_points): - - learning_point_dir = path.join(self._output_dir, 'learning_split-' + str(learning_point)) - - classifier_dir = path.join(learning_point_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier[learning_point], classifier_dir) - self._algorithm.save_parameters(best_params[learning_point], classifier_dir) - weights = self._algorithm.save_weights(classifier[learning_point], x, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class RB_RepKFold_DualSVM(base.MLWorkflow): +class RegionBasedRepKFoldDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, n_folds=10, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._n_folds = n_folds - self._splits_indices = splits_indices - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None + super(RegionBasedRepKFoldDualSVM, self).__init__(input.CAPSRegionBasedInput, + validation.RepeatedKFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - def run(self): - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - -class TB_RepHoldOut_DualSVM(base.MLWorkflow): +class CAPSTsvRepHoldOutDualSVM(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - 
self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - - self._input = input.CAPSTSVBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, - atlas, dataset, pvc) - - self._validation = None - self._algorithm = None - def run(self): + super(CAPSTsvRepHoldOutDualSVM, self).__init__(input.CAPSTSVBasedInput, + validation.RepeatedHoldOut, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class TB_RepHoldOut_RandomForest(base.MLWorkflow): +class CAPSTsvRepHoldOutRandomForest(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, dataset, output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400), max_depth_range=[None], min_samples_split_range=[2], max_features_range=('auto', 0.25, 0.5), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - self._max_features_range = max_features_range - self._splits_indices = splits_indices - - self._input = input.CAPSTSVBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, dataset, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - - self._algorithm = algorithm.RandomForest(x, y, balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - n_estimators_range=self._n_estimators_range, - max_depth_range=self._max_depth_range, - min_samples_split_range=self._min_samples_split_range, - max_features_range=self._max_features_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, 
classifier_dir) - - # self._input.save_weights_as_nifti(weights, classifier_dir) - - # self._validation.save_results(self._output_dir) - + super(CAPSTsvRepHoldOutRandomForest, self).__init__(input.CAPSTSVBasedInput, + validation.RepeatedHoldOut, + algorithm.RandomForest, + locals(), + output_dir) # SVM reg -class VBREG_RepKFold_DualSVM(base.MLWorkflow): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, - modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - n_folds=10, - test_size=0.1, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), - splits_indices=None): +class VoxelBasedREGRepKFoldDualSVM(base.MLWorkflow): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - self._n_folds = n_folds - self._input = input.CAPSVoxelBasedInputREGSVM(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.DualSVMAlgorithm(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV(self._algorithm) - print('K fold') - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - weights = self._algorithm.save_weights(classifier, x, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - -class RB_RepHoldOut_RandomForest_Multiclass(base.MLWorkflow): - - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, atlas, - output_dir, pvc=None, n_threads=15, n_iterations=100, test_size=0.3, - grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400), - max_depth_range=[None], min_samples_split_range=[2], - max_features_range=('auto', 0.25, 0.5), splits_indices=None): - - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._n_estimators_range = n_estimators_range - self._max_depth_range = max_depth_range - self._min_samples_split_range = min_samples_split_range - self._max_features_range = max_features_range - self._splits_indices = splits_indices - - self._input = input.CAPSRegionBasedInput(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, atlas, pvc) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - - self._algorithm = algorithm.RandomForest(x, y, balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - n_estimators_range=self._n_estimators_range, 
- max_depth_range=self._max_depth_range, - min_samples_split_range=self._min_samples_split_range, - max_features_range=self._max_features_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedHoldOut(self._algorithm, n_iterations=self._n_iterations, test_size=self._test_size) - classifier, best_params, results = self._validation.validate(y, n_threads=self._n_threads, splits_indices=self._splits_indices) - - classifier_dir = os.path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_classifier(classifier, classifier_dir) - self._algorithm.save_parameters(best_params, classifier_dir) - weights = self._algorithm.save_weights(classifier, classifier_dir) - - self._input.save_weights_as_nifti(weights, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class VBREG_RepKfold_SVMOV0(base.MLWorkflow): def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, n_folds=10, - test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), + test_size=0.1, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), splits_indices=None): - self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - self._n_folds = n_folds - self._input = input.CAPSVoxelBasedInputREGSVM(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): + super(VoxelBasedREGRepKFoldDualSVM, self).__init__(input.CAPSTSVBasedInput, + validation.RepeatedKFoldCV, + algorithm.DualSVMAlgorithm, + locals(), + output_dir) - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - self._algorithm = algorithm.OneVsOneSVM(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) +# TSV - self._validation = validation.RepeatedKFoldCV_Multiclass(self._algorithm) +class TsvRepHoldOutRandomForest(base.MLWorkflow): - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) - - -class VBREG_RepKfold_SVMOVR(base.MLWorkflow): - def __init__(self, caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, image_type, output_dir, fwhm=0, - modulated="on", pvc=None, precomputed_kernel=None, mask_zeros=True, n_threads=15, n_iterations=100, - n_folds=10, - test_size=0.3, grid_search_folds=10, balanced=True, c_range=np.logspace(-6, 2, 17), - splits_indices=None): + def __init__(self, data_tsv, columns, output_dir, n_threads=20, n_iterations=250, test_size=0.2, + grid_search_folds=10, balanced=True, n_estimators_range=(100, 200, 400), max_depth_range=[None], + min_samples_split_range=[2], max_features_range=('auto', 0.25, 0.5), splits_indices=None, + inner_cv=False): 
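# --- Editor's illustrative sketch (not part of this patch) ---------------
# Example use of the new TSV-driven workflow declared just above: features are
# read straight from a tab-separated file (no CAPS directory needed) and fed
# to a random forest with repeated hold-out validation. File and column names
# are hypothetical; the TSV is assumed to contain a 'diagnosis' column, as
# required by input.TsvInput. Note that TsvInput.get_x() relies on
# DataFrame.as_matrix(), which was removed in pandas 1.0, so running against a
# recent pandas would require switching to df[columns].to_numpy().

wf = TsvRepHoldOutRandomForest(data_tsv='adni_features.tsv',
                               columns=['age', 'mmse', 'hippocampus_volume'],
                               output_dir='/path/to/output',
                               n_iterations=50,
                               test_size=0.2)
wf.run()
# -------------------------------------------------------------------------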
- self._output_dir = output_dir - self._n_threads = n_threads - self._n_iterations = n_iterations - self._test_size = test_size - self._grid_search_folds = grid_search_folds - self._balanced = balanced - self._c_range = c_range - self._splits_indices = splits_indices - self._n_folds = n_folds - self._input = input.CAPSVoxelBasedInputREGSVM(caps_directory, subjects_visits_tsv, diagnoses_tsv, group_id, - image_type, fwhm, modulated, pvc, mask_zeros, precomputed_kernel) - self._validation = None - self._algorithm = None - - def run(self): - - x = self._input.get_x() - y = self._input.get_y() - kernel = self._input.get_kernel() - - self._algorithm = algorithm.OneVsRestSVM(kernel, - y, - balanced=self._balanced, - grid_search_folds=self._grid_search_folds, - c_range=self._c_range, - n_threads=self._n_threads) - - self._validation = validation.RepeatedKFoldCV_Multiclass(self._algorithm) - - classifier, best_params, results = self._validation.validate(y, n_iterations=self._n_iterations, - n_folds=self._n_folds, n_threads=self._n_threads) - - classifier_dir = path.join(self._output_dir, 'classifier') - if not path.exists(classifier_dir): - os.makedirs(classifier_dir) - - self._algorithm.save_parameters(best_params, classifier_dir) - - self._validation.save_results(self._output_dir) + super(TsvRepHoldOutRandomForest, self).__init__(input.TsvInput, + validation.RepeatedHoldOut, + algorithm.RandomForest, + locals(), + output_dir) diff --git a/clinica/pipelines/machine_learning/validation.py b/clinica/pipelines/machine_learning/validation.py index d743600dc..56d32a65e 100644 --- a/clinica/pipelines/machine_learning/validation.py +++ b/clinica/pipelines/machine_learning/validation.py @@ -21,41 +21,32 @@ class KFoldCV(base.MLValidation): - def __init__(self, ml_algorithm): - self._ml_algorithm = ml_algorithm - self._fold_results = [] - self._classifier = None - self._best_params = None - self._cv = None + def validate(self, y): - def validate(self, y, n_folds=10, splits_indices=None, n_threads=15): + if self._validation_params['splits_indices'] is None: + skf = StratifiedKFold(n_splits=self._validation_params['n_folds'], shuffle=True) + self._validation_params['splits_indices'] = list(skf.split(np.zeros(len(y)), y)) - if splits_indices is None: - skf = StratifiedKFold(n_splits=n_folds, shuffle=True) - self._cv = list(skf.split(np.zeros(len(y)), y)) - else: - self._cv = splits_indices - - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - for i in range(n_folds): + for i in range(self._validation_params['n_folds']): - train_index, test_index = self._cv[i] + train_index, test_index = self._validation_params['splits_indices'][i] async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) async_pool.close() async_pool.join() - for i in range(n_folds): - self._fold_results.append(async_result[i].get()) + for i in range(self._validation_params['n_folds']): + self._validation_results.append(async_result[i].get()) - self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._fold_results) + self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._validation_results) - return self._classifier, self._best_params, self._fold_results + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._fold_results is None: + if self._validation_results is None: raise Exception("No results to save. 
Method validate() must be run before save_results().") subjects_folds = [] @@ -65,27 +56,27 @@ def save_results(self, output_dir): if not path.exists(container_dir): os.makedirs(container_dir) - for i in range(len(self._fold_results)): - subjects_df = pd.DataFrame({'y': self._fold_results[i]['y'], - 'y_hat': self._fold_results[i]['y_hat'], - 'y_index': self._fold_results[i]['y_index']}) + for i in range(len(self._validation_results)): + subjects_df = pd.DataFrame({'y': self._validation_results[i]['y'], + 'y_hat': self._validation_results[i]['y_hat'], + 'y_index': self._validation_results[i]['y_index']}) subjects_df.to_csv(path.join(container_dir, 'subjects_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') subjects_folds.append(subjects_df) - results_df = pd.DataFrame({'balanced_accuracy': self._fold_results[i]['evaluation']['balanced_accuracy'], - 'auc': self._fold_results[i]['auc'], - 'accuracy': self._fold_results[i]['evaluation']['accuracy'], - 'sensitivity': self._fold_results[i]['evaluation']['sensitivity'], - 'specificity': self._fold_results[i]['evaluation']['specificity'], - 'ppv': self._fold_results[i]['evaluation']['ppv'], - 'npv': self._fold_results[i]['evaluation']['npv'], - 'train_balanced_accuracy': self._fold_results[i]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._fold_results[i]['evaluation_train']['accuracy'], - 'train_sensitivity': self._fold_results[i]['evaluation_train']['sensitivity'], - 'train_specificity': self._fold_results[i]['evaluation_train']['specificity'], - 'train_ppv': self._fold_results[i]['evaluation_train']['ppv'], - 'train_npv': self._fold_results[i]['evaluation_train']['npv'] + results_df = pd.DataFrame({'balanced_accuracy': self._validation_results[i]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[i]['auc'], + 'accuracy': self._validation_results[i]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[i]['evaluation']['sensitivity'], + 'specificity': self._validation_results[i]['evaluation']['specificity'], + 'ppv': self._validation_results[i]['evaluation']['ppv'], + 'npv': self._validation_results[i]['evaluation']['npv'], + 'train_balanced_accuracy': self._validation_results[i]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[i]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[i]['evaluation_train']['sensitivity'], + 'train_specificity': self._validation_results[i]['evaluation_train']['specificity'], + 'train_ppv': self._validation_results[i]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[i]['evaluation_train']['npv'] }, index=['i', ]) results_df.to_csv(path.join(container_dir, 'results_fold-' + str(i) + '.tsv'), @@ -110,53 +101,66 @@ def save_results(self, output_dir): print("sensitivity: %s" % (mean_results['sensitivity'].to_string(index=False))) print("auc: %s" % (mean_results['auc'].to_string(index=False))) + @staticmethod + def get_default_parameters(): + + parameters_dict = {'n_folds': 10, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} + + return parameters_dict + class RepeatedKFoldCV(base.MLValidation): - def __init__(self, ml_algorithm): - self._ml_algorithm = ml_algorithm - self._repeated_fold_results = [] - self._classifier = None - self._best_params = None - self._cv = None + def validate(self, y): - def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): + if self._validation_params['splits_indices'] is None: + 
self._validation_params['splits_indices'] = [] + + for i in range(self._validation_params['n_iterations']): + skf = StratifiedKFold(n_splits=self._validation_params['n_folds'], shuffle=True) + self._validation_params['splits_indices'].append(list(skf.split(np.zeros(len(y)), y))) - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - self._cv = [] - for r in range(n_iterations): - skf = StratifiedKFold(n_splits=n_folds, shuffle=True) - self._cv.append(list(skf.split(np.zeros(len(y)), y))) + for i in range(self._validation_params['n_iterations']): + + train_index, test_index = self._validation_params['splits_indices'][i] + async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) + + for r in range(self._validation_params['n_iterations']): + async_result[r] = {} - self._repeated_fold_results.append([]) + self._validation_results.append([]) - for i in range(n_folds): + for i in range(self._validation_params['n_folds']): - train_index, test_index = self._cv[r][i] + train_index, test_index = self._validation_params['splits_indices'][r][i] async_result[r][i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) async_pool.close() async_pool.join() - for r in range(n_iterations): - for i in range(n_folds): - self._repeated_fold_results[r].append(async_result[r][i].get()) + for r in range(self._validation_params['n_iterations']): + for i in range(self._validation_params['n_folds']): + self._validation_results[r].append(async_result[r][i].get()) # TODO Find a better way to estimate best parameter - flat_results = [result for fold in self._repeated_fold_results for result in fold] + flat_results = [result for fold in self._validation_results for result in fold] self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(flat_results) - return self._classifier, self._best_params, self._repeated_fold_results + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._repeated_fold_results is None: + if self._validation_results is None: raise Exception("No results to save. 
Method validate() must be run before save_results().") all_results_list = [] all_subjects_list = [] - for iteration in range(len(self._repeated_fold_results)): + for iteration in range(len(self._validation_results)): iteration_dir = path.join(output_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): @@ -169,28 +173,28 @@ def save_results(self, output_dir): if not path.exists(folds_dir): os.makedirs(folds_dir) - for i in range(len(self._repeated_fold_results[iteration])): - subjects_df = pd.DataFrame({'y': self._repeated_fold_results[iteration][i]['y'], - 'y_hat': self._repeated_fold_results[iteration][i]['y_hat'], - 'y_index': self._repeated_fold_results[iteration][i]['y_index']}) + for i in range(len(self._validation_results[iteration])): + subjects_df = pd.DataFrame({'y': self._validation_results[iteration][i]['y'], + 'y_hat': self._validation_results[iteration][i]['y_hat'], + 'y_index': self._validation_results[iteration][i]['y_index']}) subjects_df.to_csv(path.join(folds_dir, 'subjects_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') iteration_subjects_list.append(subjects_df) results_df = pd.DataFrame( - {'balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation']['balanced_accuracy'], - 'auc': self._repeated_fold_results[iteration][i]['auc'], - 'accuracy': self._repeated_fold_results[iteration][i]['evaluation']['accuracy'], - 'sensitivity': self._repeated_fold_results[iteration][i]['evaluation']['sensitivity'], - 'specificity': self._repeated_fold_results[iteration][i]['evaluation']['specificity'], - 'ppv': self._repeated_fold_results[iteration][i]['evaluation']['ppv'], - 'npv': self._repeated_fold_results[iteration][i]['evaluation']['npv'], - 'train_balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['accuracy'], - 'train_sensitivity': self._repeated_fold_results[iteration][i]['evaluation_train']['sensitivity'], - 'train_specificity': self._repeated_fold_results[iteration][i]['evaluation_train']['specificity'], - 'train_ppv': self._repeated_fold_results[iteration][i]['evaluation_train']['ppv'], - 'train_npv': self._repeated_fold_results[iteration][i]['evaluation_train']['npv'] + {'balanced_accuracy': self._validation_results[iteration][i]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[iteration][i]['auc'], + 'accuracy': self._validation_results[iteration][i]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[iteration][i]['evaluation']['sensitivity'], + 'specificity': self._validation_results[iteration][i]['evaluation']['specificity'], + 'ppv': self._validation_results[iteration][i]['evaluation']['ppv'], + 'npv': self._validation_results[iteration][i]['evaluation']['npv'], + 'train_balanced_accuracy': self._validation_results[iteration][i]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[iteration][i]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[iteration][i]['evaluation_train']['sensitivity'], + 'train_specificity': self._validation_results[iteration][i]['evaluation_train']['specificity'], + 'train_ppv': self._validation_results[iteration][i]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[iteration][i]['evaluation_train']['npv'] }, index=['i', ]) results_df.to_csv(path.join(folds_dir, 'results_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') 
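Note on the pattern introduced by the hunks above: every validation setting is moved out of the `validate()` signature and into `self._validation_params`, with `get_default_parameters()` supplying the defaults. The sketch below illustrates that pattern in isolation. The class name `MLValidationSketch`, its constructor signature and the way overrides are merged are assumptions made for illustration only, not the actual `base.MLValidation` implementation (which is not part of this patch); only the dictionary keys are taken from the diff above.

```
# Minimal, self-contained sketch of the parameter handling used by the
# refactored validation classes. Everything except the keys returned by
# get_default_parameters() is a hypothetical placeholder.
class MLValidationSketch:

    def __init__(self, ml_algorithm, validation_params=None):
        self._ml_algorithm = ml_algorithm
        self._validation_results = []
        self._classifier = None
        self._best_params = None
        # Start from the class defaults, then let caller-supplied values
        # override them.
        self._validation_params = self.get_default_parameters()
        self._validation_params.update(validation_params or {})

    @staticmethod
    def get_default_parameters():
        # Same keys as KFoldCV.get_default_parameters() in the hunk above.
        return {'n_folds': 10,
                'n_threads': 15,
                'splits_indices': None,
                'inner_cv': True}
```

With a layout like this, a caller could do something like `KFoldCV(algorithm, {'n_folds': 5}).validate(y)` to run five folds while keeping every other default. The workflows earlier in this patch pass `locals()` to their base constructor, which presumably forwards the matching keys to the validation class; the exact forwarding logic is not visible in this excerpt.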
@@ -230,36 +234,34 @@ def save_results(self, output_dir): print("sensitivity: %s" % (mean_results_df['sensitivity'].to_string(index=False))) print("auc: %s" % (mean_results_df['auc'].to_string(index=False))) + @staticmethod + def get_default_parameters(): + + parameters_dict = {'n_iterations': 100, + 'n_folds': 10, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} + + return parameters_dict + class RepeatedHoldOut(base.MLValidation): - def __init__(self, ml_algorithm, n_iterations=100, test_size=0.3): - self._ml_algorithm = ml_algorithm - self._split_results = [] - self._classifier = None - self._best_params = None - self._cv = None - self._n_iterations = n_iterations - self._test_size = test_size - self._error_resampled_t = None - self._error_corrected_resampled_t = None - self._bal_accuracy_resampled_t = None - self._bal_accuracy_corrected_resampled_t = None - - def validate(self, y, n_threads=15, splits_indices=None, inner_cv=True): - - if splits_indices is None: - splits = StratifiedShuffleSplit(n_splits=self._n_iterations, test_size=self._test_size) - self._cv = list(splits.split(np.zeros(len(y)), y)) - else: - self._cv = splits_indices - async_pool = ThreadPool(n_threads) + def validate(self, y): + + if self._validation_params['splits_indices'] is None: + splits = StratifiedShuffleSplit(n_splits=self._validation_params['n_iterations'], + test_size=self._validation_params['test_size']) + self._validation_params['splits_indices'] = list(splits.split(np.zeros(len(y)), y)) + + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - for i in range(self._n_iterations): + for i in range(self._validation_params['n_iterations']): - train_index, test_index = self._cv[i] - if inner_cv: + train_index, test_index = self._validation_params['splits_indices'][i] + if self._validation_params['inner_cv']: async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index, test_index)) else: async_result[i] = async_pool.apply_async(self._ml_algorithm.evaluate_no_cv, (train_index, test_index)) @@ -267,55 +269,55 @@ def validate(self, y, n_threads=15, splits_indices=None, inner_cv=True): async_pool.close() async_pool.join() - for i in range(self._n_iterations): - self._split_results.append(async_result[i].get()) + for i in range(self._validation_params['n_iterations']): + self._validation_results.append(async_result[i].get()) - self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._split_results) - return self._classifier, self._best_params, self._split_results + self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(self._validation_results) + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._split_results is None: + if self._validation_results is None: raise Exception("No results to save. 
Method validate() must be run before save_results().") all_results_list = [] all_train_subjects_list = [] all_test_subjects_list = [] - for iteration in range(len(self._split_results)): + for iteration in range(len(self._validation_results)): iteration_dir = path.join(output_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): os.makedirs(iteration_dir) iteration_train_subjects_df = pd.DataFrame({'iteration': iteration, - 'y': self._split_results[iteration]['y_train'], - 'y_hat': self._split_results[iteration]['y_hat_train'], - 'subject_index': self._split_results[iteration]['x_index']}) + 'y': self._validation_results[iteration]['y_train'], + 'y_hat': self._validation_results[iteration]['y_hat_train'], + 'subject_index': self._validation_results[iteration]['x_index']}) iteration_train_subjects_df.to_csv(path.join(iteration_dir, 'train_subjects.tsv'), index=False, sep='\t', encoding='utf-8') all_train_subjects_list.append(iteration_train_subjects_df) iteration_test_subjects_df = pd.DataFrame({'iteration': iteration, - 'y': self._split_results[iteration]['y'], - 'y_hat': self._split_results[iteration]['y_hat'], - 'subject_index': self._split_results[iteration]['y_index']}) + 'y': self._validation_results[iteration]['y'], + 'y_hat': self._validation_results[iteration]['y_hat'], + 'subject_index': self._validation_results[iteration]['y_index']}) iteration_test_subjects_df.to_csv(path.join(iteration_dir, 'test_subjects.tsv'), index=False, sep='\t', encoding='utf-8') all_test_subjects_list.append(iteration_test_subjects_df) iteration_results_df = pd.DataFrame( - {'balanced_accuracy': self._split_results[iteration]['evaluation']['balanced_accuracy'], - 'auc': self._split_results[iteration]['auc'], - 'accuracy': self._split_results[iteration]['evaluation']['accuracy'], - 'sensitivity': self._split_results[iteration]['evaluation']['sensitivity'], - 'specificity': self._split_results[iteration]['evaluation']['specificity'], - 'ppv': self._split_results[iteration]['evaluation']['ppv'], - 'npv': self._split_results[iteration]['evaluation']['npv'], - 'train_balanced_accuracy': self._split_results[iteration]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._split_results[iteration]['evaluation_train']['accuracy'], - 'train_sensitivity': self._split_results[iteration]['evaluation_train']['sensitivity'], - 'train_specificity': self._split_results[iteration]['evaluation_train']['specificity'], - 'train_ppv': self._split_results[iteration]['evaluation_train']['ppv'], - 'train_npv': self._split_results[iteration]['evaluation_train']['npv'] + {'balanced_accuracy': self._validation_results[iteration]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[iteration]['auc'], + 'accuracy': self._validation_results[iteration]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[iteration]['evaluation']['sensitivity'], + 'specificity': self._validation_results[iteration]['evaluation']['specificity'], + 'ppv': self._validation_results[iteration]['evaluation']['ppv'], + 'npv': self._validation_results[iteration]['evaluation']['npv'], + 'train_balanced_accuracy': self._validation_results[iteration]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[iteration]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[iteration]['evaluation_train']['sensitivity'], + 'train_specificity': self._validation_results[iteration]['evaluation_train']['specificity'], + 'train_ppv': 
self._validation_results[iteration]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[iteration]['evaluation_train']['npv'] }, index=['i', ]) iteration_results_df.to_csv(path.join(iteration_dir, 'results.tsv'), index=False, sep='\t', encoding='utf-8') @@ -349,156 +351,99 @@ def save_results(self, output_dir): print("sensitivity: %s" % (mean_results_df['sensitivity'].to_string(index=False))) print("auc: %s" % (mean_results_df['auc'].to_string(index=False))) - self.compute_error_variance() - self.compute_accuracy_variance() + @staticmethod + def get_default_parameters(): - variance_df = pd.DataFrame({'bal_accuracy_resampled_t': self._bal_accuracy_resampled_t, - 'bal_accuracy_corrected_resampled_t': self._bal_accuracy_corrected_resampled_t, - 'error_resampled_t': self._error_resampled_t, - 'error_corrected_resampled_t': self._error_corrected_resampled_t}, index=[0, ]) + parameters_dict = {'n_iterations': 100, + 'test_size': 0.2, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} - variance_df.to_csv(path.join(output_dir, 'variance.tsv'), - index=False, sep='\t', encoding='utf-8') - - def _compute_variance(self, test_error_split): - - # compute average test error - num_split = len(self._split_results) # J in the paper - - # compute mu_{n_1}^{n_2} - average_test_error = np.mean(test_error_split) - - approx_variance = np.sum((test_error_split - average_test_error)**2)/(num_split - 1) - - # compute variance (point 2 and 6 of Nadeau's paper) - resampled_t = approx_variance / num_split - corrected_resampled_t = (1/num_split + self._test_size/(1 - self._test_size)) * approx_variance - - return resampled_t, corrected_resampled_t - - def compute_error_variance(self): - num_split = len(self._split_results) - test_error_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): - test_error_split[i] = self._compute_average_test_error(self._split_results[i]['y'], - self._split_results[i]['y_hat']) - - self._error_resampled_t, self._error_corrected_resampled_t = self._compute_variance(test_error_split) - - return self._error_resampled_t, self._error_corrected_resampled_t + return parameters_dict - def _compute_average_test_error(self, y_list, yhat_list): - # return the average test error (denoted mu_j hat) - return float(len(np.where(y_list != yhat_list)[0]))/float(len(y_list)) - def compute_accuracy_variance(self): - num_split = len(self._split_results) - test_accuracy_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): - test_accuracy_split[i] = self._compute_average_test_accuracy(self._split_results[i]['y'], - self._split_results[i]['y_hat']) - - self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t = self._compute_variance(test_accuracy_split) - - return self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t - - def _compute_average_test_accuracy(self, y_list, yhat_list): - - from clinica.pipelines.machine_learning.ml_utils import evaluate_prediction - - return evaluate_prediction(y_list, yhat_list)['balanced_accuracy'] +class LearningCurveRepeatedHoldOut(base.MLValidation): + def validate(self, y): -class LearningCurveRepeatedHoldOut(base.MLValidation): + if self._validation_params['splits_indices'] is None: + splits = StratifiedShuffleSplit(n_splits=self._validation_params['n_iterations'], + test_size=self._validation_params['test_size']) + self._validation_params['splits_indices'] = 
list(splits.split(np.zeros(len(y)), y)) - def __init__(self, ml_algorithm, n_iterations=100, test_size=0.3, n_learning_points=10): - self._ml_algorithm = ml_algorithm - self._split_results = [] - self._classifier = None - self._best_params = None - self._cv = None - self._n_iterations = n_iterations - self._test_size = test_size - self._n_learning_points = n_learning_points - self._error_resampled_t = None - self._error_corrected_resampled_t = None - self._bal_accuracy_resampled_t = None - self._bal_accuracy_corrected_resampled_t = None - - def validate(self, y, n_threads=15): - - splits = StratifiedShuffleSplit(n_splits=self._n_iterations, test_size=self._test_size) - self._cv = list(splits.split(np.zeros(len(y)), y)) - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} - for i in range(self._n_iterations): - train_index, test_index = self._cv[i] + for i in range(self._validation_params['n_iterations']): + train_index, test_index = self._validation_params['splits_indices'][i] async_result[i] = {} - skf = StratifiedKFold(n_splits=self._n_learning_points, shuffle=False) - inner_cv = list(skf.split(np.zeros(len(y[train_index])), y[train_index])) + skf = StratifiedKFold(n_splits=self._validation_params['n_learning_points'], shuffle=False) + inner_cv_splits = list(skf.split(np.zeros(len(y[train_index])), y[train_index])) - for j in range(self._n_learning_points): - inner_train_index = np.concatenate([indexes[1] for indexes in inner_cv[:j + 1]]).ravel() - async_result[i][j] = async_pool.apply_async(self._ml_algorithm.evaluate, (train_index[inner_train_index], test_index)) + for j in range(self._validation_params['n_learning_points']): + inner_train_index = np.concatenate([indexes[1] for indexes in + inner_cv_splits[:j + 1]]).ravel() + async_result[i][j] = async_pool.apply_async(self._ml_algorithm.evaluate, + (train_index[inner_train_index], test_index)) async_pool.close() async_pool.join() - for j in range(self._n_learning_points): + for j in range(self._validation_params['n_learning_points']): learning_point_results = [] - for i in range(self._n_iterations): + for i in range(self._validation_params['n_iterations']): learning_point_results.append(async_result[i][j].get()) - self._split_results.append(learning_point_results) + self._validation_results.append(learning_point_results) self._classifier = [] self._best_params = [] - for j in range(self._n_learning_points): - classifier, best_params = self._ml_algorithm.apply_best_parameters(self._split_results[j]) + for j in range(self._validation_params['n_learning_points']): + classifier, best_params = self._ml_algorithm.apply_best_parameters(self._validation_results[j]) self._classifier.append(classifier) self._best_params.append(best_params) - return self._classifier, self._best_params, self._split_results + return self._classifier, self._best_params, self._validation_results def save_results(self, output_dir): - if self._split_results is None: + if self._validation_results is None: raise Exception("No results to save. 
Method validate() must be run before save_results().") - for learning_point in range(self._n_learning_points): + for learning_point in range(self._validation_params['n_learning_points']): all_results_list = [] all_subjects_list = [] learning_point_dir = path.join(output_dir, 'learning_split-' + str(learning_point)) - for iteration in range(self._n_iterations): + for iteration in range(self._validation_params['n_iterations']): iteration_dir = path.join(learning_point_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): os.makedirs(iteration_dir) - iteration_subjects_df = pd.DataFrame({'y': self._split_results[learning_point][iteration]['y'], - 'y_hat': self._split_results[learning_point][iteration]['y_hat'], - 'y_index': self._split_results[learning_point][iteration]['y_index']}) + iteration_subjects_df = pd.DataFrame({'y': self._validation_results[learning_point][iteration]['y'], + 'y_hat': self._validation_results[learning_point][iteration]['y_hat'], + 'y_index': self._validation_results[learning_point][iteration]['y_index']}) iteration_subjects_df.to_csv(path.join(iteration_dir, 'subjects.tsv'), index=False, sep='\t', encoding='utf-8') all_subjects_list.append(iteration_subjects_df) iteration_results_df = pd.DataFrame( - {'balanced_accuracy': self._split_results[learning_point][iteration]['evaluation']['balanced_accuracy'], - 'auc': self._split_results[learning_point][iteration]['auc'], - 'accuracy': self._split_results[learning_point][iteration]['evaluation']['accuracy'], - 'sensitivity': self._split_results[learning_point][iteration]['evaluation']['sensitivity'], - 'specificity': self._split_results[learning_point][iteration]['evaluation']['specificity'], - 'ppv': self._split_results[learning_point][iteration]['evaluation']['ppv'], - 'npv': self._split_results[learning_point][iteration]['evaluation']['npv'], - 'train_balanced_accuracy': self._split_results[learning_point][iteration]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._split_results[learning_point][iteration]['evaluation_train']['accuracy'], - 'train_sensitivity': self._split_results[learning_point][iteration]['evaluation_train']['sensitivity'], - 'train_specificity': self._split_results[learning_point][iteration]['evaluation_train']['specificity'], - 'train_ppv': self._split_results[learning_point][iteration]['evaluation_train']['ppv'], - 'train_npv': self._split_results[learning_point][iteration]['evaluation_train']['npv']}, index=['i', ]) + {'balanced_accuracy': self._validation_results[learning_point][iteration]['evaluation']['balanced_accuracy'], + 'auc': self._validation_results[learning_point][iteration]['auc'], + 'accuracy': self._validation_results[learning_point][iteration]['evaluation']['accuracy'], + 'sensitivity': self._validation_results[learning_point][iteration]['evaluation']['sensitivity'], + 'specificity': self._validation_results[learning_point][iteration]['evaluation']['specificity'], + 'ppv': self._validation_results[learning_point][iteration]['evaluation']['ppv'], + 'npv': self._validation_results[learning_point][iteration]['evaluation']['npv'], + 'train_balanced_accuracy': self._validation_results[learning_point][iteration]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': self._validation_results[learning_point][iteration]['evaluation_train']['accuracy'], + 'train_sensitivity': self._validation_results[learning_point][iteration]['evaluation_train']['sensitivity'], + 'train_specificity': 
self._validation_results[learning_point][iteration]['evaluation_train']['specificity'], + 'train_ppv': self._validation_results[learning_point][iteration]['evaluation_train']['ppv'], + 'train_npv': self._validation_results[learning_point][iteration]['evaluation_train']['npv']}, index=['i', ]) iteration_results_df.to_csv(path.join(iteration_dir, 'results.tsv'), index=False, sep='\t', encoding='utf-8') @@ -522,78 +467,31 @@ def save_results(self, output_dir): mean_results_df.to_csv(path.join(learning_point_dir, 'mean_results.tsv'), index=False, sep='\t', encoding='utf-8') - self.compute_error_variance(learning_point) - self.compute_accuracy_variance(learning_point) - - variance_df = pd.DataFrame({'bal_accuracy_resampled_t': self._bal_accuracy_resampled_t, - 'bal_accuracy_corrected_resampled_t': self._bal_accuracy_corrected_resampled_t, - 'error_resampled_t': self._error_resampled_t, - 'error_corrected_resampled_t': self._error_corrected_resampled_t}, index=[0, ]) - - variance_df.to_csv(path.join(learning_point_dir, 'variance.tsv'), - index=False, sep='\t', encoding='utf-8') - - def _compute_variance(self, test_error_split): - - # compute average test error - num_split = self._n_iterations # J in the paper - - # compute mu_{n_1}^{n_2} - average_test_error = np.mean(test_error_split) - - approx_variance = np.sum((test_error_split - average_test_error)**2)/(num_split - 1) - - # compute variance (point 2 and 6 of Nadeau's paper) - resampled_t = approx_variance / num_split - corrected_resampled_t = (1/num_split + self._test_size/(1 - self._test_size)) * approx_variance - - return resampled_t, corrected_resampled_t - - def compute_error_variance(self, learning_point): - num_split = self._n_iterations - test_error_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): - test_error_split[i] = self._compute_average_test_error(self._split_results[learning_point][i]['y'], - self._split_results[learning_point][i]['y_hat']) - - self._error_resampled_t, self._error_corrected_resampled_t = self._compute_variance(test_error_split) - - return self._error_resampled_t, self._error_corrected_resampled_t - - def _compute_average_test_error(self, y_list, yhat_list): - # return the average test error (denoted mu_j hat) - return float(len(np.where(y_list != yhat_list)[0]))/float(len(y_list)) - - def compute_accuracy_variance(self, learning_point): - num_split = self._n_iterations - test_accuracy_split = np.zeros((num_split, 1)) # this list will contain the list of mu_j hat for j = 1 to J - for i in range(num_split): - test_accuracy_split[i] = self._compute_average_test_accuracy(self._split_results[learning_point][i]['y'], - self._split_results[learning_point][i]['y_hat']) - - self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t = self._compute_variance(test_accuracy_split) - - return self._bal_accuracy_resampled_t, self._bal_accuracy_corrected_resampled_t - - def _compute_average_test_accuracy(self, y_list, yhat_list): + @staticmethod + def get_default_parameters(): - from clinica.pipelines.machine_learning.ml_utils import evaluate_prediction + parameters_dict = {'n_iterations': 100, + 'test_size': 0.2, + 'n_learning_points': 10, + 'n_threads': 15, + 'splits_indices': None, + 'inner_cv': True} - return evaluate_prediction(y_list, yhat_list)['balanced_accuracy'] + return parameters_dict class RepeatedKFoldCV_Multiclass(base.MLValidation): def __init__(self, ml_algorithm): self._ml_algorithm = ml_algorithm - 
self._repeated_fold_results = [] + self._repeated_validation_results = [] self._classifier = None self._best_params = None self._cv = None def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): - async_pool = ThreadPool(n_threads) + async_pool = ThreadPool(self._validation_params['n_threads']) async_result = {} self._cv = [] @@ -601,7 +499,7 @@ def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): skf = StratifiedKFold(n_splits=n_folds, shuffle=True) self._cv.append(list(skf.split(np.zeros(len(y)), y))) async_result[r] = {} - self._repeated_fold_results.append([]) + self._repeated_validation_results.append([]) for i in range(n_folds): @@ -612,22 +510,22 @@ def validate(self, y, n_iterations=100, n_folds=10, n_threads=15): async_pool.join() for r in range(n_iterations): for i in range(n_folds): - self._repeated_fold_results[r].append(async_result[r][i].get()) + self._repeated_validation_results[r].append(async_result[r][i].get()) # TODO Find a better way to estimate best parameter - flat_results = [result for fold in self._repeated_fold_results for result in fold] + flat_results = [result for fold in self._repeated_validation_results for result in fold] self._classifier, self._best_params = self._ml_algorithm.apply_best_parameters(flat_results) - return self._classifier, self._best_params, self._repeated_fold_results + return self._classifier, self._best_params, self._repeated_validation_results def save_results(self, output_dir): - if self._repeated_fold_results is None: + if self._repeated_validation_results is None: raise Exception("No results to save. Method validate() must be run before save_results().") all_results_list = [] all_subjects_list = [] - for iteration in range(len(self._repeated_fold_results)): + for iteration in range(len(self._repeated_validation_results)): iteration_dir = path.join(output_dir, 'iteration-' + str(iteration)) if not path.exists(iteration_dir): @@ -640,19 +538,19 @@ def save_results(self, output_dir): if not path.exists(folds_dir): os.makedirs(folds_dir) - for i in range(len(self._repeated_fold_results[iteration])): - subjects_df = pd.DataFrame({'y': self._repeated_fold_results[iteration][i]['y'], - 'y_hat': self._repeated_fold_results[iteration][i]['y_hat'], - 'y_index': self._repeated_fold_results[iteration][i]['y_index']}) + for i in range(len(self._repeated_validation_results[iteration])): + subjects_df = pd.DataFrame({'y': self._repeated_validation_results[iteration][i]['y'], + 'y_hat': self._repeated_validation_results[iteration][i]['y_hat'], + 'y_index': self._repeated_validation_results[iteration][i]['y_index']}) subjects_df.to_csv(path.join(folds_dir, 'subjects_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') iteration_subjects_list.append(subjects_df) results_df = pd.DataFrame( - {'balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation']['balanced_accuracy'], - 'accuracy': self._repeated_fold_results[iteration][i]['evaluation']['accuracy'], - 'train_balanced_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['balanced_accuracy'], - 'train_accuracy': self._repeated_fold_results[iteration][i]['evaluation_train']['accuracy'] + {'balanced_accuracy': self._repeated_validation_results[iteration][i]['evaluation']['balanced_accuracy'], + 'accuracy': self._repeated_validation_results[iteration][i]['evaluation']['accuracy'], + 'train_balanced_accuracy': self._repeated_validation_results[iteration][i]['evaluation_train']['balanced_accuracy'], + 'train_accuracy': 
self._repeated_validation_results[iteration][i]['evaluation_train']['accuracy'] }, index=['i', ]) results_df.to_csv(path.join(folds_dir, 'results_fold-' + str(i) + '.tsv'), index=False, sep='\t', encoding='utf-8') diff --git a/test/instantiation/test_instantiate_all_pipelines.py b/test/instantiation/test_instantiate_all_pipelines.py index 563531ec9..1ac4bd953 100644 --- a/test/instantiation/test_instantiate_all_pipelines.py +++ b/test/instantiation/test_instantiate_all_pipelines.py @@ -295,12 +295,29 @@ def test_instantiate_InputsML(): atlases = ['AAL2', 'Neuromorphometrics', 'AICHA', 'LPBA40', 'Hammers'] possible_psf = [0, 5, 10, 15, 20, 25] - voxel_input = [CAPSVoxelBasedInput(caps_dir, tsv, diagnoses_tsv, group_id, im, fwhm=8) + voxel_input = [CAPSVoxelBasedInput({'caps_directory': caps_dir, + 'subjects_visits_tsv': tsv, + 'diagnoses_tsv': diagnoses_tsv, + 'group_id': group_id, + 'image_type': im, + 'fwhm': 8}) for im in image_type] - region_input = [CAPSRegionBasedInput(caps_dir, tsv, diagnoses_tsv, group_id, im, at) + + region_input = [CAPSRegionBasedInput({'caps_directory': caps_dir, + 'subjects_visits_tsv': tsv, + 'diagnoses_tsv': diagnoses_tsv, + 'group_id': group_id, + 'image_type': im, + 'atlas': at}) for im in image_type for at in atlases] - vertex_input = [CAPSVertexBasedInput(caps_dir, tsv, diagnoses_tsv, group_id, fwhm, 'fdg') + + vertex_input = [CAPSVertexBasedInput({'caps_directory': caps_dir, + 'subjects_visits_tsv': tsv, + 'diagnoses_tsv': diagnoses_tsv, + 'group_id': group_id, + 'image_type': 'fdg', + 'fwhm': fwhm}) for fwhm in possible_psf] # Check that each file exists diff --git a/test/nonregression/test_run_pipelines.py b/test/nonregression/test_run_pipelines.py index 45aa1cc78..e1539e28e 100644 --- a/test/nonregression/test_run_pipelines.py +++ b/test/nonregression/test_run_pipelines.py @@ -685,10 +685,10 @@ def test_run_PETSurfaceCrossSectional(cmdopt): def test_run_WorkflowsML(cmdopt): - from clinica.pipelines.machine_learning.ml_workflows import (RB_RepHoldOut_LogisticRegression, - VertexB_RepHoldOut_dualSVM, - RB_RepHoldOut_RandomForest, - VB_KFold_DualSVM) + from clinica.pipelines.machine_learning.ml_workflows import (RegionBasedRepHoldOutLogisticRegression, + VertexBasedRepHoldOutDualSVM, + RegionBasedRepHoldOutRandomForest, + VoxelBasedKFoldDualSVM) from os.path import dirname, join, abspath import shutil import warnings @@ -706,31 +706,31 @@ def test_run_WorkflowsML(cmdopt): diagnoses_tsv = join(root_input, 'in', 'diagnosis.tsv') group_id = 'allADNIdartel' - output_dir1 = join(root, 'out', 'VertexB_RepHoldOut_dualSVM') + output_dir1 = join(root, 'out', 'VertexBasedRepHoldOutDualSVM') clean_folder(output_dir1, recreate=True) - wf1 = VertexB_RepHoldOut_dualSVM(caps_dir, tsv, diagnoses_tsv, group_id, output_dir1, image_type='fdg', fwhm=20, - n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) + wf1 = VertexBasedRepHoldOutDualSVM(caps_dir, tsv, diagnoses_tsv, group_id, output_dir1, image_type='fdg', fwhm=20, + n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) wf1.run() shutil.rmtree(output_dir1) - output_dir2 = join(root, 'out', 'RB_RepHoldOut_LogisticRegression') + output_dir2 = join(root, 'out', 'RegionBasedRepHoldOutLogisticRegression') clean_folder(output_dir2, recreate=True) - wf2 = RB_RepHoldOut_LogisticRegression(caps_dir, tsv, diagnoses_tsv, group_id, 'fdg', 'AICHA', output_dir2, - n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) + wf2 = RegionBasedRepHoldOutLogisticRegression(caps_dir, tsv, diagnoses_tsv, 
group_id, 'fdg', 'AICHA', output_dir2, + n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) wf2.run() shutil.rmtree(output_dir2) - output_dir3 = join(root, 'out', 'RB_RepHoldOut_RandomForest') + output_dir3 = join(root, 'out', 'RegionBasedRepHoldOutRandomForest') clean_folder(output_dir3, recreate=True) - wf3 = RB_RepHoldOut_RandomForest(caps_dir, tsv, diagnoses_tsv, group_id, 'T1', 'AAL2', output_dir3, n_threads=8, - n_iterations=10, grid_search_folds=3, test_size=0.3) + wf3 = RegionBasedRepHoldOutRandomForest(caps_dir, tsv, diagnoses_tsv, group_id, 'T1', 'AAL2', output_dir3, + n_threads=8, n_iterations=10, grid_search_folds=3, test_size=0.3) wf3.run() shutil.rmtree(output_dir3) - output_dir4 = join(root, 'out', 'VB_KFold_DualSVM') + output_dir4 = join(root, 'out', 'VoxelBasedKFoldDualSVM') clean_folder(output_dir4, recreate=True) - wf4 = VB_KFold_DualSVM(caps_dir, tsv, diagnoses_tsv, group_id, 'fdg', output_dir4, fwhm=8, n_threads=8, n_folds=5, - grid_search_folds=3) + wf4 = VoxelBasedKFoldDualSVM(caps_dir, tsv, diagnoses_tsv, group_id, 'fdg', output_dir4, fwhm=8, n_threads=8, + n_folds=5, grid_search_folds=3) wf4.run() shutil.rmtree(output_dir4) From 529a4bd9e2432d52d732420dbdb6abf48a1dc64a Mon Sep 17 00:00:00 2001 From: Mauricio DIAZ Date: Thu, 7 May 2020 09:12:18 +0200 Subject: [PATCH 07/69] Add skeleton for deeplearning pipeline --- ....deeplearning_prepare_data_pipeline.py.swp | Bin 0 -> 20480 bytes .../deeplearning_prepare_data/README.md | 73 +++++++++ .../deeplearning_prepare_data/__init__.py | 0 .../deeplearning_prepare_data_cli.py | 96 +++++++++++ .../deeplearning_prepare_data_pipeline.py | 154 ++++++++++++++++++ .../deeplearning_prepare_data_utils.py | 23 +++ .../deeplearning_prepare_data_visualizer.py | 7 + .../deeplearning_prepare_data/info.json | 14 ++ 8 files changed, 367 insertions(+) create mode 100644 clinica/pipelines/deeplearning_prepare_data/.deeplearning_prepare_data_pipeline.py.swp create mode 100644 clinica/pipelines/deeplearning_prepare_data/README.md create mode 100644 clinica/pipelines/deeplearning_prepare_data/__init__.py create mode 100644 clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_cli.py create mode 100644 clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_pipeline.py create mode 100644 clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_utils.py create mode 100644 clinica/pipelines/deeplearning_prepare_data/deeplearning_prepare_data_visualizer.py create mode 100644 clinica/pipelines/deeplearning_prepare_data/info.json diff --git a/clinica/pipelines/deeplearning_prepare_data/.deeplearning_prepare_data_pipeline.py.swp b/clinica/pipelines/deeplearning_prepare_data/.deeplearning_prepare_data_pipeline.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..e200ab66f0b9e11a138d9512306d8c27c18617bb GIT binary patch literal 20480 zcmeHOU5p$@5uQtc0FFt32p$}w%I(AMhP%Dnvq6r<37oSnZHUk2?wm*%XFNC4yW7s} z%w&4jd+R8emtk9EQ%M3yhSfMZ-E3!6zVy2~QL4Q=ci)RG`Zx{x;QFC(l_9HuI>eJlk()gY9 z?spw|!*svsXu&|iK*2!4K*2!4K*2!4K*2!4fMGhn#>@AoF3 zhgLnmG5NhK`E0Fvp8PC63I+-W3I+-W3I+-W3I+-W3I+-W3I+-W3I+-W{tFCPj$wQs z&i%HxpmMzbPyPQ_ZZV9%0KWj91)c#u3oHS9fSZ7q-(eVk23`Vw1e^fw0G{7&7{3E9 z0Z#+p11Fs=YE0WSiVfs23-ECUvB9QX+E&uxbB58!3shroA$ zZv$t5$AC`(Ch$Jsc3>Ou+}jP~m%t_9>%iB5uL5U)IpCeZpWbE|PXG>Z^{wa+a2EI| za4+!kO@?tE_%Lu4Ts;ds1vtPRz|8=2c<#-H@i@=}J_L*b7s2mYU>W#30wuo&eg!-Q z_`s_OzFYx*44eaOU=FwqxCBUG2k;_-GMCf9%$Da0jk;9dv)iHX$x~u>X_O!hNrqi9 zVT=?%w#joYTq#HOUU`D>*KA5xsOlz#WF$gs8G^G%v)V&pIi`oa5}GA15Vd)Ht$0>P 
z%#JVM5wb@PuQs3%>QTuVh{1wrVq$c_~Ww@HPu}{A?!#^Q2Y}Qphs0R4YEnD@P>t}dxkp~T2cv`ixX__Y3y6vABKKd z5n(vc!zADaSSu%j)I?7lz6n`XrCuAvQ55#UAw@nh)1=3Vq(^rQ(Y+Ig#0cnxUifr+Vk(1AV&)uet?yr_%xjgdE zX>Cu-j!^^Q5e`PL&6S0acs#HRkv4UBJ|R z#7HYBVY&(rYtQpA8POU{*a&O;xL!um!-~{a;`m@KmT0Lh6h~M$d;ea#>-2ZSVYUgpQHlB#V$EcT)OYm9DhRtmU=o9&2lf7OQR$ z_-G3(TsLWw>!aa%Rb#qNp=)z{4=-Z5qZSo2toxc-OsNKzXJ`e)Grzk6V;T5igx0~_ zt~GiYk?RzW2Q4u;u@)Zu`9@>YQYx)BOEQEh!_0z>4%aU{a>d4#+ihV#WS;chZbz7$ z^2n>fl%Co1L)S5F-w(0fUsp6@(({h8`U~h7A{+xl3)>8@)nZ*q(u0 zs+E#!FY$9;9To5m<5Z<&!8E0C9rOdogu&`q(AB8iUJ8Yafv{z>pVh|NuxOOTF(lEZ z_}B zBLd+$!m}j~RZ-iD$i|dmW4A2yu)##)l%Y+y^qPL{gz~-P`uI^x$C(L^7p2`*k>4@% zN{5N&BDxU^d^F0B@!oXP&g>=m`Uu8TIkJ$W%%>mh2K#ae$!u&nAvw0~*|d&pSTl_# z=_nney&m~q%h&3B$7f^T8B9LNtU!i4=<3~JNMZ>E@{OK^WG~K*j#WGm|43!_$z~I4 z#KYv*d$uBES(C7^1&Ng?v}`tA)DcnJ$C3r{ARXOp%2pRUd8~3tc*+kIRcb+0x<{P2!Acd5UWwPc~s zW2>PXL5KS|U!w7GD{6*~L%JFokz_!4udUdya)*;da&1WBuGP8$9t#+1?r6nswyFmH z;27aZJF18kVZ*NZ-s~eds31pO!1UoLVB&nMaOgQ;xk@}*HS|su+tSn)#~O_i9l}Hj zdwlpt_uH{Q0cvp=Bo~9PEo4i0@M7R69Ey+SD0Tz|+Lporz{6PzJ1`rWv`L0{#Xc{%62B;ETX# zfG#i(>;RsFzkUHY3G4)BfIER(fy?mIp9UTV*l)iD_$mDLp8($i&H;}BE>Hz-1Ac>k zao>K<{VT45fr5d8fr5d8fr5emCkFU%$=9(#ypey#;n9IlIl(tc`=T@Y`_VM^jJw+L zs;<&dqQ#~ChiS&(a_KcfZGA(vgU}VLN%Yo@RHu$l;Q+~DtI8m(U833S%4AD47YpK) zI&gd0!=9TVb>w#b9;pGFCD6{NA>UI-^s zd-cOnGK66)9S5#aBO=xIyNLC{tBoCI1S@>c?dSA0;;@avY9{Fji&0P&%XdX4(tG~z^DH!@LPcW{r3Rdf!E;6KM#BpI1Stj+y!twuW?<) zRWML6P%uz1P%uz1P%uz1P%uz1a9tT#<8N%?#{YhrXVTdcO|E+x+4BW{)5^|gdW*6z zEVHEy(fTyvY>46^siepdMY=8LZs-hAoj>-vWrz-v1Fylw=0Q%b*m<`C#V`BgWY&Ra zp9+oo)C$t!hQ_G36}n*GOwY-t8be7O9FzCgfDO{Jktdyy{D)qeP83 Rb5G|iSjbJAO40|6e*u@q(`Ntx literal 0 HcmV?d00001 diff --git a/clinica/pipelines/deeplearning_prepare_data/README.md b/clinica/pipelines/deeplearning_prepare_data/README.md new file mode 100644 index 000000000..ef67aa014 --- /dev/null +++ b/clinica/pipelines/deeplearning_prepare_data/README.md @@ -0,0 +1,73 @@ +# `deeplearning-prepare-data`` - + + + + +## Dependencies + + +If you only installed the core of Clinica, this pipeline needs the installation +of **** on your computer. You can find how to install this +software on the [third-party](../../Third-party) page. + + +## Running the pipeline +The pipeline can be run with the following command line: + +``` +clinica run deeplearning-prepare-data bids_directory caps_directory +``` +where: + + - `bids_directory` is the input folder containing the dataset in a [BIDS](../BIDS) hierarchy. + - `caps_directory` is the output folder containing the results in a [CAPS](../CAPS) hierarchy. + - `` + - `` + +If you want to run the pipeline on a subset of your BIDS dataset, you can use +the `-tsv` flag to specify in a TSV file the participants belonging to your subset. + +!!! note "Particular note regarding your pipeline" + If you have any note regarding your pipeline + + +## Outputs + +Results are stored in the following folder of the [CAPS hierarchy](../../CAPS/Specifications/#): +`subjects/sub-/ses-/`. + +The main output files are: + + - `_labelname-