From 695668a9d72e782b7062f67b3efaa1406e9acf0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Cl=C3=A9net?= <117362283+bclenet@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:45:37 +0200 Subject: [PATCH] Adding participants exclusions in narps_open_runner (#194) * Adding a command line tool showing the correlation results of a pipeline execution * [DOC] install doc about correlation command line tool [skip ci] * Modifications on runner * Correlation main + exclusions in runner --- INSTALL.md | 6 +++ narps_open/runner.py | 16 +++++- .../__init__.py} | 0 narps_open/utils/correlation/__main__.py | 53 +++++++++++++++++++ setup.py | 1 + tests/conftest.py | 18 +++---- 6 files changed, 84 insertions(+), 10 deletions(-) rename narps_open/utils/{correlation.py => correlation/__init__.py} (100%) create mode 100644 narps_open/utils/correlation/__main__.py diff --git a/INSTALL.md b/INSTALL.md index e9f124ba..28936287 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -95,6 +95,7 @@ Finally, you are able to use the scripts of the project : * `narps_open_runner`: run pipelines * `narps_open_tester`: run a pipeline and test its results against original ones from the team +* `narps_open_correlations`: compute and display correlation between results and original ones from the team * `narps_description`: get the textual description made by a team * `narps_results`: download the original results from teams * `narps_open_status`: get status information about the development process of the pipelines @@ -107,6 +108,10 @@ narps_open_runner -t 2T6S -n 40 # and produces a report with correlation values. 
narps_open_tester -t 08MQ +# Compute the correlation values between results of 2T6S reproduction on 60 subjects with original ones +# WARNING : 2T6S must have been previously computed with a group of 60 subjects +narps_open_correlations -t 2T6S -n 60 + # Get the description of team C88N in markdown formatting narps_description -t C88N --md @@ -121,6 +126,7 @@ narps_open_status --json > For further information about these command line tools, read the corresponding documentation pages. > * `narps_open_runner` : [docs/running.md](docs/running.md) > * `narps_open_tester` : [docs/testing.md](docs/testing.md#command-line-tool) +> * `narps_open_correlations` : [docs/correlation.md](docs/correlation.md#command-line-tool) > * `narps_description` : [docs/description.md](docs/description.md) > * `narps_results` : [docs/data.md](docs/data.md#results-from-narps-teams) > * `narps_open_status` : [docs/status.md](docs/status.md) diff --git a/narps_open/runner.py b/narps_open/runner.py index bf557ba0..597d1144 100644 --- a/narps_open/runner.py +++ b/narps_open/runner.py @@ -178,8 +178,15 @@ def main(): help='run the first levels only (preprocessing + subjects + runs)') parser.add_argument('-c', '--check', action='store_true', required=False, help='check pipeline outputs (runner is not launched)') + parser.add_argument('-e', '--exclusions', action='store_true', required=False, + help='run the analyses without the excluded subjects') arguments = parser.parse_args() + # Check arguments + if arguments.exclusions and not arguments.nsubjects: + print('Argument -e/--exclusions only works with -n/--nsubjects') + return + # Initialize a PipelineRunner runner = PipelineRunner(team_id = arguments.team) runner.pipeline.directories.dataset_dir = Configuration()['directories']['dataset'] @@ -193,7 +200,14 @@ def main(): elif arguments.rsubjects is not None: runner.random_nb_subjects = int(arguments.rsubjects) else: - runner.nb_subjects = int(arguments.nsubjects) + if arguments.exclusions: + # 
Intersection between the requested subset and the list of not excluded subjects + runner.subjects = list( + set(get_participants_subset(int(arguments.nsubjects))) + & set(get_participants(arguments.team)) + ) + else: + runner.nb_subjects = int(arguments.nsubjects) # Check data if arguments.check: diff --git a/narps_open/utils/correlation.py b/narps_open/utils/correlation/__init__.py similarity index 100% rename from narps_open/utils/correlation.py rename to narps_open/utils/correlation/__init__.py diff --git a/narps_open/utils/correlation/__main__.py b/narps_open/utils/correlation/__main__.py new file mode 100644 index 00000000..d086499b --- /dev/null +++ b/narps_open/utils/correlation/__main__.py @@ -0,0 +1,53 @@ +#!/usr/bin/python +# coding: utf-8 + +""" A command line tool for the narps_open.utils.correlation module """ + +from os.path import join +from argparse import ArgumentParser + +from narps_open.data.results import ResultsCollection +from narps_open.utils.configuration import Configuration +from narps_open.utils.correlation import get_correlation_coefficient +from narps_open.pipelines import get_implemented_pipelines +from narps_open.runner import PipelineRunner + +def main(): + """ Entry-point for the command line tool narps_open_correlations """ + + # Parse arguments + parser = ArgumentParser(description = 'Compare reproduced files to original results.') + parser.add_argument('-t', '--team', type = str, required = True, + help = 'the team ID', choices = get_implemented_pipelines()) + parser.add_argument('-n', '--nsubjects', type=str, required = True, + help='the number of subjects to be selected') + arguments = parser.parse_args() + + # Initialize pipeline + runner = PipelineRunner(arguments.team) + runner.pipeline.directories.dataset_dir = Configuration()['directories']['dataset'] + runner.pipeline.directories.results_dir = Configuration()['directories']['reproduced_results'] + runner.pipeline.directories.set_output_dir_with_team_id(arguments.team) + 
runner.pipeline.directories.set_working_dir_with_team_id(arguments.team) + runner.nb_subjects = arguments.nsubjects + + # Indices and keys to the unthresholded maps + indices = list(range(1, 18, 2)) + + # Retrieve the paths to the reproduced files + reproduced_files = runner.pipeline.get_hypotheses_outputs() + reproduced_files = [reproduced_files[i] for i in indices] + + # Retrieve the paths to the results files + collection = ResultsCollection(arguments.team) + file_keys = [f'hypo{h}_unthresh.nii.gz' for h in range(1,10)] + results_files = [join(collection.directory, k) for k in file_keys] + + # Compute the correlation coefficients + print([ + get_correlation_coefficient(reproduced_file, results_file) + for reproduced_file, results_file in zip(reproduced_files, results_files) + ]) + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index b17409b6..e3c65bb0 100644 --- a/setup.py +++ b/setup.py @@ -71,6 +71,7 @@ 'narps_open_runner = narps_open.runner:main', 'narps_open_tester = narps_open.tester:main', 'narps_open_status = narps_open.utils.status:main', + 'narps_open_correlations = narps_open.utils.correlation.__main__:main', 'narps_description = narps_open.data.description.__main__:main', 'narps_results = narps_open.data.results.__main__:main' ] diff --git a/tests/conftest.py b/tests/conftest.py index f12f77a0..3e5570ff 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,6 +22,7 @@ from narps_open.utils.correlation import get_correlation_coefficient from narps_open.utils.configuration import Configuration from narps_open.data.results import ResultsCollection +from narps_open.data.participants import get_participants_subset # Init configuration, to ensure it is in testing mode Configuration(config_type='testing') @@ -88,13 +89,12 @@ def test_pipeline_execution( TODO : how to keep intermediate files of the low level for the next numbers of subjects ? 
- keep intermediate levels : boolean in PipelineRunner """ - # A list of number of subject to iterate over - nb_subjects_list = list(range( - Configuration()['testing']['pipelines']['nb_subjects_per_group'], - nb_subjects, - Configuration()['testing']['pipelines']['nb_subjects_per_group']) - ) - nb_subjects_list.append(nb_subjects) + # Create subdivisions of the requested subject list + nb_subjects_per_group = Configuration()['testing']['pipelines']['nb_subjects_per_group'] + all_subjects = get_participants_subset(nb_subjects) + subjects_lists = [] + for index in range(0, len(all_subjects), nb_subjects_per_group): + subjects_lists.append(all_subjects[index:index+nb_subjects_per_group]) # Initialize the pipeline runner = PipelineRunner(team_id) @@ -104,8 +104,8 @@ def test_pipeline_execution( runner.pipeline.directories.set_working_dir_with_team_id(team_id) # Run first level by (small) sub-groups of subjects - for subjects in nb_subjects_list: - runner.nb_subjects = subjects + for subjects_list in subjects_lists: + runner.subjects = subjects_list # Run as long as there are missing files after first level (with a max number of trials) # TODO : this is a workaround