[ENH] Proposition of a function to translate study ids to bids ids (a…

…ramis-lab#1220) * Proposition for study_to_bids_id * Factoring * add test * Add to converters * Rename function * Apply to all datasets * Add nifd * Add ADNI * Changes upon suggestions * Use for adni-json * Add other uses * todo * changes upon suggestion
AliceJoubert · Jul 10, 2024 · 047065c · 047065c
1 parent 3a34e68
commit 047065c
Show file tree

Hide file tree

Showing 14 changed files with 345 additions and 60 deletions.
diff --git a/clinica/iotools/bids_utils.py b/clinica/iotools/bids_utils.py
@@ -2,9 +2,12 @@
 
 import json
 import os
+import re
+from abc import ABC, abstractmethod
+from collections import UserString
 from enum import Enum
 from pathlib import Path
-from typing import BinaryIO, List, Optional, Union
+from typing import BinaryIO, List, Optional, Type, Union
 
 import pandas as pd
 
@@ -49,6 +52,246 @@ class StudyName(str, Enum):
 }
 
 
+class BIDSSubjectID(ABC, UserString):
+    """Interface that every study-specific BIDS subject ID has to implement.
+
+    An instance wraps the BIDS subject ID string (it is a ``UserString``).
+    The string is passed through ``validate`` when the instance is built, so
+    the validation rule of the concrete subclass decides whether the instance
+    can be created at all.
+    """
+
+    def __init__(self, value: str):
+        # ``__init__`` must not return a value: validate, then store the string.
+        super().__init__(self.validate(value))
+
+    @abstractmethod
+    def validate(self, value: str) -> str:
+        """Return ``value`` when it is an acceptable BIDS subject ID.
+
+        The returned string is the one stored by ``__init__``.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    @abstractmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Build the BIDS subject ID matching the study's own subject ID."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def to_original_study_id(self) -> str:
+        """Return the study's own subject ID matching this BIDS subject ID."""
+        raise NotImplementedError
+
+
+def bids_id_factory(study: StudyName) -> Type[BIDSSubjectID]:
+    """Return the BIDS subject ID class handling the given study.
+
+    Parameters
+    ----------
+    study : StudyName
+        The study for which a subject ID class is requested.
+
+    Returns
+    -------
+    Type[BIDSSubjectID]
+        The class (not an instance) implementing the ID conversions for ``study``.
+
+    Raises
+    ------
+    NotImplementedError
+        If no subject ID class exists for ``study``. Without this, the function
+        would return ``None`` and the caller would fail later with an
+        unrelated error.
+    """
+    # Built at call time because the classes are defined further down the module.
+    id_classes = (
+        (StudyName.ADNI, ADNIBIDSSubjectID),
+        (StudyName.NIFD, NIFDBIDSSubjectID),
+        (StudyName.AIBL, AIBLBIDSSubjectID),
+        (StudyName.UKB, UKBBIDSSubjectID),
+        (StudyName.GENFI, GENFIBIDSSubjectID),
+        (StudyName.OASIS, OASISBIDSSubjectID),
+        (StudyName.OASIS3, OASIS3BIDSSubjectID),
+        (StudyName.HABS, HABSBIDSSubjectID),
+    )
+    for known_study, id_class in id_classes:
+        # Compare with '==' (not a dict lookup) to keep the original matching rule.
+        if study == known_study:
+            return id_class
+    raise NotImplementedError(
+        f"No BIDS subject ID class is implemented for study {study}."
+    )
+
+
+class ADNIBIDSSubjectID(BIDSSubjectID):
+    """ADNI implementation of the BIDSSubjectID interface.
+
+    Converts between the source ID 'XXX_S_XXXX' and the BIDS ID
+    'sub-ADNIXXXSXXXX' (X being a digit).
+    """
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-ADNIXXXSXXXX' format, raise ValueError otherwise."""
+        if re.fullmatch(r"sub-ADNI\d{3}S\d{4}", value) is None:
+            raise ValueError(
+                f"BIDS ADNI subject ID {value} is not properly formatted. "
+                "Expecting a 'sub-ADNIXXXSXXXX' format."
+            )
+        return value
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw 'XXX_S_XXXX' ID into 'sub-ADNIXXXSXXXX', raise ValueError if malformed."""
+        if re.fullmatch(r"\d{3}_S_\d{4}", study_id) is None:
+            raise ValueError(
+                f"Raw ADNI subject ID {study_id} is not properly formatted. "
+                "Expecting a 'XXX_S_XXXX' format."
+            )
+        return f"sub-ADNI{study_id.replace('_', '')}"
+
+    def to_original_study_id(self) -> str:
+        """Return the raw 'XXX_S_XXXX' ID encoded in this BIDS ID."""
+        # Text following the 'ADNI' marker, e.g. '001S0001'.
+        site_and_number = self.data.split("ADNI")[1]
+        return "_S_".join(site_and_number.split("S"))
+
+
+class NIFDBIDSSubjectID(BIDSSubjectID):
+    """NIFD implementation of the BIDSSubjectID interface.
+
+    Converts between the source ID 'X_S_XXXX' and the BIDS ID
+    'sub-NIFDXSXXXX' (X being a digit).
+    """
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-NIFDXSXXXX' format, raise ValueError otherwise."""
+        if re.fullmatch(r"sub-NIFD\dS\d{4}", value) is None:
+            raise ValueError(
+                f"BIDS NIFD subject ID {value} is not properly formatted. "
+                "Expecting a 'sub-NIFDXSXXXX' format."
+            )
+        return value
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw 'X_S_XXXX' ID into 'sub-NIFDXSXXXX', raise ValueError if malformed."""
+        if re.fullmatch(r"\d_S_\d{4}", study_id) is None:
+            raise ValueError(
+                f"Raw NIFD subject ID {study_id} is not properly formatted. "
+                "Expecting a 'X_S_XXXX' format."
+            )
+        return f"sub-NIFD{study_id.replace('_', '')}"
+
+    def to_original_study_id(self) -> str:
+        """Return the raw 'X_S_XXXX' ID encoded in this BIDS ID."""
+        # Text following the 'NIFD' marker, e.g. '1S0001'.
+        site_and_number = self.data.split("NIFD")[1]
+        return "_S_".join(site_and_number.split("S"))
+
+
+class AIBLBIDSSubjectID(BIDSSubjectID):
+    """Implementation for AIBL of the BIDSSubjectIDClass, allowing to go from the source id Y
+    to a bids id sub-AIBLY and reciprocally (Y being a non-empty combination of digits)."""
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-AIBLY' format, raise ValueError otherwise."""
+        # '\d+' rather than '\d*': the bare prefix 'sub-AIBL' is not a subject ID.
+        if re.fullmatch(r"sub-AIBL\d+", value):
+            return value
+        raise ValueError(
+            f"BIDS AIBL subject ID {value} is not properly formatted. "
+            "Expecting a 'sub-AIBLY' format."
+        )
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw all-digit ID 'Y' into 'sub-AIBLY', raise ValueError if malformed."""
+        # An empty raw ID is rejected: it would produce the bare prefix 'sub-AIBL'.
+        if re.fullmatch(r"\d+", study_id):
+            return "sub-AIBL" + study_id
+        raise ValueError(
+            f"Raw AIBL subject ID {study_id} is not properly formatted. "
+            "Expecting a 'Y' format where Y is a combination of digits."
+        )
+
+    def to_original_study_id(self) -> str:
+        """Return the raw digits encoded in this BIDS ID."""
+        return self.split("AIBL")[1]
+
+
+class UKBBIDSSubjectID(BIDSSubjectID):
+    """Implementation for UKB of the BIDSSubjectIDClass, allowing to go from the source id Y
+    to a bids id sub-UKBY and reciprocally (Y being a non-empty combination of digits)."""
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-UKBY' format, raise ValueError otherwise."""
+        # '\d+' rather than '\d*': the bare prefix 'sub-UKB' is not a subject ID.
+        if re.fullmatch(r"sub-UKB\d+", value):
+            return value
+        raise ValueError(
+            f"BIDS UKB subject ID {value} is not properly formatted. "
+            "Expecting a 'sub-UKBY' format."
+        )
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw all-digit ID 'Y' into 'sub-UKBY', raise ValueError if malformed."""
+        # An empty raw ID is rejected: it would produce the bare prefix 'sub-UKB'.
+        if re.fullmatch(r"\d+", study_id):
+            return "sub-UKB" + study_id
+        raise ValueError(
+            f"Raw UKB subject ID {study_id} is not properly formatted. "
+            "Expecting a 'Y' format where Y is a combination of digits."
+        )
+
+    def to_original_study_id(self) -> str:
+        """Return the raw digits encoded in this BIDS ID."""
+        return self.split("UKB")[1]
+
+
+class GENFIBIDSSubjectID(BIDSSubjectID):
+    """Implementation for GENFI of the BIDSSubjectIDClass, allowing to go from the source id Y
+    to a bids id sub-Y and reciprocally (Y being a non-empty run of word characters)."""
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-Y' format, raise ValueError otherwise."""
+        # '\w+' rather than '\w*': the bare prefix 'sub-' is not a subject ID.
+        if re.fullmatch(r"sub-\w+", value):
+            return value
+        raise ValueError(
+            f"BIDS GENFI subject ID {value} is not properly formatted. "
+            "Expecting a 'sub-Y' format."
+        )
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw ID 'Y' into 'sub-Y', raise ValueError if malformed."""
+        # An empty raw ID is rejected: it would produce the bare prefix 'sub-'.
+        if re.fullmatch(r"\w+", study_id):
+            return "sub-" + study_id
+        raise ValueError(
+            f"Raw GENFI subject ID {study_id} is not properly formatted. "
+            "Expecting a 'Y' format where Y is a combination of letters and digits."
+        )
+
+    def to_original_study_id(self) -> str:
+        """Return the raw ID, i.e. the text following 'sub-'."""
+        return self.split("-")[1]
+
+
+class OASISBIDSSubjectID(BIDSSubjectID):
+    """Implementation for OASIS1 of the BIDSSubjectIDClass, allowing to go from the source id OAS1_XXXX_MR1/2
+    to a bids id sub-OASIS1XXXX and reciprocally.
+
+    The 'MR1/2' suffix of the source id is not kept in the bids id, so the
+    reverse conversion always yields the 'MR1' form.
+    """
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-OASIS1XXXX' format, raise ValueError otherwise."""
+        if re.fullmatch(r"sub-OASIS1\d{4}", value):
+            return value
+        raise ValueError(
+            f"BIDS OASIS1 subject ID {value} is not properly formatted. "
+            "Expecting a 'sub-OASIS1XXXX' format."
+        )
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw 'OAS1_XXXX_MRn' ID into 'sub-OASIS1XXXX', raise ValueError if malformed."""
+        if re.fullmatch(r"OAS1_\d{4}_MR\d", study_id):
+            # Only the four-digit subject number (middle field) is kept.
+            return "sub-OASIS1" + study_id.split("_")[1]
+        raise ValueError(
+            f"Raw OASIS1 subject ID {study_id} is not properly formatted. "
+            "Expecting a 'OAS1_XXXX_MR1/2' format."
+        )
+
+    def to_original_study_id(self) -> str:
+        """Return the raw 'OAS1_XXXX_MR1' ID matching this BIDS ID."""
+        # The underscores are required: the raw format accepted by
+        # from_original_study_id is 'OAS1_XXXX_MRn', not 'OAS1XXXXMRn'.
+        return f"OAS1_{self.split('OASIS1')[1]}_MR1"
+
+
+class OASIS3BIDSSubjectID(BIDSSubjectID):
+    """OASIS3 implementation of the BIDSSubjectID interface.
+
+    Converts between the source ID 'OAS3XXXX' and the BIDS ID 'sub-OAS3XXXX'
+    (X being a digit).
+    """
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-OAS3XXXX' format, raise ValueError otherwise."""
+        if re.fullmatch(r"sub-OAS3\d{4}", value) is None:
+            raise ValueError(
+                f"BIDS OASIS3 subject ID {value} is not properly formatted. "
+                "Expecting a 'sub-OAS3XXXX' format."
+            )
+        return value
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw 'OAS3XXXX' ID into 'sub-OAS3XXXX', raise ValueError if malformed."""
+        if re.fullmatch(r"OAS3\d{4}", study_id) is None:
+            raise ValueError(
+                f"Raw OASIS3 subject ID {study_id} is not properly formatted. "
+                "Expecting a 'OAS3XXXX' format."
+            )
+        return f"sub-{study_id}"
+
+    def to_original_study_id(self) -> str:
+        """Return the raw ID, i.e. the text following 'sub-'."""
+        return self.data.split("-")[1]
+
+
+class HABSBIDSSubjectID(BIDSSubjectID):
+    """Implementation for HABS of the BIDSSubjectIDClass, allowing to go from the source id P_Y
+    to a bids id sub-HABSY and reciprocally (Y being a non-empty run of word characters)."""
+
+    def validate(self, value: str) -> str:
+        """Return ``value`` if it has the 'sub-HABSY' format, raise ValueError otherwise."""
+        # '\w+' rather than '\w*': the bare prefix 'sub-HABS' is not a subject ID.
+        if re.fullmatch(r"sub-HABS\w+", value):
+            return value
+        raise ValueError(
+            f"BIDS HABS subject ID {value} is not properly formatted. "
+            "Expecting a 'sub-HABSY' format."
+        )
+
+    @classmethod
+    def from_original_study_id(cls, study_id: str) -> str:
+        """Turn a raw 'P_Y' ID into 'sub-HABSY', raise ValueError if malformed."""
+        if re.fullmatch(r"P_\w+", study_id):
+            # Swap the leading 'P_' only: 'Y' may itself contain 'P_', and
+            # str.replace would rewrite every occurrence.
+            return "sub-HABS" + study_id[len("P_") :]
+        raise ValueError(
+            f"Raw HABS subject ID {study_id} is not properly formatted. "
+            "Expecting a 'P_Y' format."
+        )
+
+    def to_original_study_id(self) -> str:
+        """Return the raw 'P_Y' ID matching this BIDS ID."""
+        # Work on the underlying str: UserString.replace builds a new instance
+        # of this class, which would run validate() on the 'P_...' result and
+        # raise ValueError.
+        return "P_" + self.data[len("sub-HABS") :]
+
+
 # -- Methods for the clinical data --
 def create_participants_df(
     study_name: StudyName,
@@ -166,15 +409,10 @@ def create_participants_df(
 
     # Adding participant_id column with BIDS ids
     for i in range(0, len(participant_df)):
-        if study_name == StudyName.OASIS:
-            value = (participant_df["alternative_id_1"][i].split("_"))[1]
-        elif study_name == StudyName.OASIS3:
-            value = participant_df["alternative_id_1"][i].replace("OAS3", "")
-        else:
-            value = remove_space_and_symbols(participant_df["alternative_id_1"][i])
-
+        value = bids_id_factory(study_name).from_original_study_id(
+            participant_df["alternative_id_1"][i]
+        )
         bids_id = [s for s in bids_ids if value in s]
-
         if len(bids_id) == 0:
             index_to_drop.append(i)
             subjects_to_drop.append(value)
@@ -289,11 +527,7 @@ def create_sessions_dict_oasis(
                     if subj_id.dtype == np.int64:
                         subj_id = str(subj_id)
                 # Removes all the - from
-                subj_id_alpha = remove_space_and_symbols(subj_id)
-                if study_name == StudyName.OASIS:
-                    subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])
-                if study_name == StudyName.OASIS3:
-                    subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3:])
+                subj_id_alpha = str(subj_id[0:3] + "IS" + subj_id[3] + subj_id[5:9])
 
                 # Extract the corresponding BIDS id and create the output file if doesn't exist
                 subj_bids = [s for s in bids_ids if subj_id_alpha in s]

diff --git a/clinica/iotools/converters/adni_to_bids/adni_json.py b/clinica/iotools/converters/adni_to_bids/adni_json.py
@@ -31,12 +31,12 @@ def _bids_id_to_loni(bids_id: str) -> Optional[str]:
     """Convert a subject id of the form sub-ADNI000S0000
     back to original format 000_S_0000
     """
-    import re
+    from clinica.iotools.bids_utils import StudyName, bids_id_factory
 
-    ids = re.findall(r"\d+", bids_id)
-    if len(ids) == 2:
-        return ids[0] + "_S_" + ids[1]
-    return None
+    try:
+        return bids_id_factory(StudyName.ADNI)(bids_id).to_original_study_id()
+    except ValueError:
+        return None
 
 
 def _read_xml_files(

diff --git a/clinica/iotools/converters/adni_to_bids/adni_to_bids.py b/clinica/iotools/converters/adni_to_bids/adni_to_bids.py
@@ -235,6 +235,8 @@ def _get_bids_subjects_info(
     out_path: Path,
     subjects: Optional[Path] = None,
 ) -> tuple[list[str], list[Path]]:
+    from clinica.iotools.bids_utils import StudyName, bids_id_factory
+
     from .adni_utils import load_clinical_csv
 
     # Read optional list of participants.
@@ -246,7 +248,9 @@ def _get_bids_subjects_info(
     # Filter participants if requested.
     participants = sorted(participants & subjects if subjects else participants)
     # Compute their corresponding BIDS IDs and paths.
-    bids_ids = [f"sub-ADNI{p.replace('_', '')}" for p in participants]
+    bids_ids = [
+        bids_id_factory(StudyName.ADNI).from_original_study_id(p) for p in participants
+    ]
     bids_paths = [out_path / bids_id for bids_id in bids_ids]
 
     return bids_ids, bids_paths
diff --git a/clinica/iotools/converters/adni_to_bids/adni_utils.py b/clinica/iotools/converters/adni_to_bids/adni_utils.py
@@ -183,8 +183,6 @@ def _write_adni_sessions_tsv(
         df_subj_sessions: global dataframe containing clinical sessions data for all subjects
         bids_subjs_paths: a list with the path to all bids subjects
     """
-    import os
-    from os import path
 
     df_subj_sessions["adas_memory"] = (
         df_subj_sessions["adas_Q1"]
@@ -268,6 +266,7 @@ def _filter_subj_bids(
 
     # Depending on the file that needs to be open, identify and
     # preprocess the column that contains the subjects ids.
+    # todo : use id class here ?
     bids_ids = [x[8:] for x in bids_ids if "sub-ADNI" in x]
     if location == "ADNIMERGE.csv":
         df_files["RID"] = df_files["PTID"].apply(
@@ -529,6 +528,7 @@ def create_adni_scans_files(conversion_path: Path, bids_subjs_paths: list[Path])
     """
     from os import path
 
+    from clinica.iotools.bids_utils import StudyName, bids_id_factory
     from clinica.utils.stream import cprint
 
     scans_fields_bids = ["filename", "scan_id", "mri_field"]
@@ -552,7 +552,7 @@ def create_adni_scans_files(conversion_path: Path, bids_subjs_paths: list[Path])
     for bids_subject_path in bids_subjs_paths:
         # Create the file
         bids_id = bids_subject_path.resolve().name
-        subject_id = "_S_".join(bids_id[8::].split("S"))
+        subject_id = bids_id_factory(StudyName.ADNI)(bids_id).to_original_study_id()
         for session_path in bids_subject_path.glob("ses-*"):
             viscode = _session_label_to_viscode(session_path.name[4::])
             tsv_name = f"{bids_id}_{session_path.name}_scans.tsv"
@@ -768,7 +768,7 @@ def _create_file(
     import numpy as np
 
     from clinica.cmdline import setup_clinica_logging
-    from clinica.iotools.bids_utils import run_dcm2niix
+    from clinica.iotools.bids_utils import StudyName, bids_id_factory, run_dcm2niix
     from clinica.iotools.converter_utils import viscode_to_session
     from clinica.iotools.utils.data_handling import center_nifti_origin
     from clinica.utils.stream import cprint
@@ -805,12 +805,10 @@ def _create_file(
     # If the original image is a DICOM, check if contains two DICOM inside the same folder
     if image.Is_Dicom:
         image_path = _check_two_dcm_folder(image_path, bids_dir, image_id)
-    bids_subj = subject.replace("_", "")
-    output_path = (
-        bids_dir / f"sub-ADNI{bids_subj}" / session / _get_output_path(modality)
-    )
+    bids_id = bids_id_factory(StudyName.ADNI).from_original_study_id(subject)
+    output_path = bids_dir / bids_id / session / _get_output_path(modality)
     output_filename = (
-        f"sub-ADNI{bids_subj}_{session}{_get_output_filename(modality, image_tracer)}"
+        f"{bids_id}_{session}{_get_output_filename(modality, image_tracer)}"
     )
     output_path.mkdir(parents=True, exist_ok=True)