Combined with QC - Cimaq #23

Open · wants to merge 10 commits into base: main

Changes from all commits
224 changes: 152 additions & 72 deletions code/cimaq.py
@@ -1,6 +1,6 @@
"""Load CIMA-Q data and extract demographic information.

Author: Natasha Clarke; last edit 2024-03-05
Author: Natasha Clarke; last edit 2024-04-16

All input stored in `data/cimaq` folder. The content of `data` is not
included in the repository.
@@ -22,7 +22,7 @@
"description": "Unique identifier for each participant",
},
"age": {
"original_field_name": "age",
"original_field_name": "age_du_participant",
"description": "Age of the participant in years",
},
"sex": {
@@ -61,69 +61,27 @@
"original_field_name": "84756_nombre_annee_education",
"description": "Years in education",
},
"ses": {
"original_field_name": "no_visite",
"description": "Session label, in this dataset it is the visit label indicating months since baseline",
},
}


def find_closest_diagnosis(scan_row, diagnosis_df):
pscid = scan_row["pscid"]
scan_date = scan_row["date"]

# Filter to evaluations for the same participant and where the evaluation date is not NULL
participant_df = diagnosis_df[
(diagnosis_df["pscid"] == pscid) & diagnosis_df["date_de_l_évaluation"].notna()
].copy()

# Find the closest date
if not participant_df.empty:
# Compute the absolute difference in days between the scan date and diagnosis evaluation dates
participant_df["date_diff"] = (
participant_df["date_de_l_évaluation"].sub(scan_date).dt.days.abs()
)

# Find the diagnosis with the smallest date difference
closest_date = participant_df.loc[participant_df["date_diff"].idxmin()]

        # Return the diagnosis, date and number of days. Also return the sex info
return (
closest_date["22501_diagnostic_clinique"],
closest_date["date_de_l_évaluation"],
closest_date["date_diff"],
closest_date["sexe"],
)
else:
        # Return None for each expected value to maintain consistency
        return None, None, None, None
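

# Illustrative sketch, not part of this diff: how find_closest_diagnosis
# behaves on toy data. The column names mirror the ones used above; the
# participant, dates and diagnosis values are invented for demonstration.
import pandas as pd

_scan_row = pd.Series({"pscid": 101, "date": pd.Timestamp("2020-06-01")})
_diag_df = pd.DataFrame(
    {
        "pscid": [101, 101],
        "date_de_l_évaluation": pd.to_datetime(["2020-05-01", "2021-01-01"]),
        "22501_diagnostic_clinique": [
            "cognitivement_sain_(cs)",
            "démence_de_type_alzheimer-légère",
        ],
        "sexe": ["femme", "femme"],
    }
)
# Expected: the 2020-05-01 evaluation wins (31 days away vs 214).
print(find_closest_diagnosis(_scan_row, _diag_df))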


def process_data(root_p, output_p, metadata):
# Paths to different data files
scan_file_p = root_p / "sommaire_des_scans.tsv"
diagnosis_file_p = root_p / "22501_diagnostic_clinique.tsv"
socio_file_p = (
root_p / "55398_informations_socio_demographiques_participant_initial.tsv"
)
edu_file_p = root_p / "84756_variables_reserve_cognitive_bartres_initial.tsv"

# Load the CSVs
df = pd.read_csv(scan_file_p, sep="\t", parse_dates=["date"])
diagnosis_df = pd.read_csv(
diagnosis_file_p, sep="\t", parse_dates=["date_de_l_évaluation"]
)
socio_df = pd.read_csv(socio_file_p, sep="\t")
edu_df = pd.read_csv(edu_file_p, sep="\t", encoding="ISO-8859-1")

# Select only resting state data
df = df[df["nii_protocole"] == "task-rest"] # Run again for task-memory
def merge_pheno(scan_df, diagnosis_df, socio_df, cog_df):
df = diagnosis_df.copy()

# Apply function to match diagnoses according to closest scan date, and split the results into new columns
result = df.apply(
lambda row: pd.Series(find_closest_diagnosis(row, diagnosis_df)), axis=1
)
df[["diagnosis", "matched_evaluation_date", "date_diff", "sex"]] = (
result # Returning these for now because at a later date we may want to drop e.g participants with diagnoses outside a certain window
# Match for site
scan_df_first = scan_df.drop_duplicates(subset="pscid", keep="first")
df = pd.merge(
df,
scan_df_first[["pscid", "site_scanner"]],
on="pscid",
how="left",
)

# Match with df for handedness
# Match for handedness
socio_df = socio_df.drop_duplicates(subset="PSCID", keep="first")
df = pd.merge(
df,
socio_df[["PSCID", "55398_lateralite"]],
@@ -132,21 +90,29 @@ def process_data(root_p, output_p, metadata):
how="left",
)

# Match with df for education
# Match for education
cog_df = cog_df.drop_duplicates(subset="PSCID", keep="first")
df = pd.merge(
df,
edu_df[["PSCID", "84756_nombre_annee_education"]],
cog_df[["PSCID", "84756_nombre_annee_education"]],
left_on="pscid",
right_on="PSCID",
how="left",
)
return df
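

# Illustrative sketch, not part of this diff: the keep-first-then-left-merge
# pattern used in merge_pheno, on toy frames. The pscid/PSCID casing mirrors
# the real files; the handedness values are invented.
import pandas as pd

_left = pd.DataFrame({"pscid": [1, 2]})
_right = pd.DataFrame(
    {"PSCID": [1, 1, 2], "55398_lateralite": ["droitier", "gaucher", "droitier"]}
)
_right = _right.drop_duplicates(subset="PSCID", keep="first")  # one row per participant
print(pd.merge(_left, _right, left_on="pscid", right_on="PSCID", how="left"))
# Without drop_duplicates, participant 1 would appear on two rows after the merge.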


def process_pheno(df):
# Process the data
df["participant_id"] = df["pscid"].astype(str)
df["age"] = df["age"].astype(float)
df["sex"] = df["sex"].map({"femme": "female", "homme": "male"})
df["site"] = df["centre"]
df["diagnosis"] = df["diagnosis"].map(
df["age"] = df["âge_du_participant"].astype(float)
df["sex"] = df["sexe"].map({"femme": "female", "homme": "male"})
df["site"] = df["site_scanner"].replace(
{
"Hopital Général Juif": "JGH",
}
)
df["diagnosis"] = df["22501_diagnostic_clinique"].map(
{
"démence_de_type_alzheimer-légère": "ADD(M)",
"cognitivement_sain_(cs)": "CON",
@@ -162,14 +128,130 @@ def process_data(root_p, output_p, metadata):
df["education"] = pd.to_numeric(
df["84756_nombre_annee_education"], errors="coerce"
) # This will replace the "donnée_non_disponible" entries with NaN
df["ses"] = df["no_visite"]

# Select columns
df = df[
["participant_id", "age", "sex", "site", "diagnosis", "handedness", "education"]
[
"participant_id",
"age",
"sex",
"site",
"diagnosis",
"handedness",
"education",
"ses",
]
]
return df.copy()
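

# Illustrative sketch, not part of this diff: errors="coerce" is what turns
# the "donnée_non_disponible" placeholder into NaN in the education column.
import pandas as pd

_edu = pd.Series(["12", "16", "donnée_non_disponible"])
print(pd.to_numeric(_edu, errors="coerce"))  # 12.0, 16.0, NaN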


def merge_qc_pheno(qc_df_filtered, pheno_df):
# Create a numeric version of the session
pheno_df["ses_numeric"] = pheno_df["ses"].str.replace("V", "").astype(int)
qc_df_filtered["ses_numeric"] = (
qc_df_filtered["ses"].str.replace("V", "").astype(int)
)

pheno_df["participant_id"] = pheno_df["participant_id"].astype(int)
qc_df_filtered["participant_id"] = qc_df_filtered["participant_id"].astype(int)

pheno_df = pheno_df.sort_values(by="ses_numeric")
qc_df_filtered = qc_df_filtered.sort_values(by="ses_numeric")

# Merge pheno and QC on nearest. Note that since the longest difference between scanning and pheno collection is 3 months, we don't need to set a threshold for the diagnoses
merged_df = pd.merge_asof(
qc_df_filtered,
pheno_df,
by="participant_id", # Match participants
on="ses_numeric", # Find the nearest match based on session date
direction="nearest",
)

# Handle site columns
merged_df.drop(columns=["site_x"], inplace=True)
merged_df.rename(columns={"site_y": "site"}, inplace=True)

# Handle session columns
merged_df.drop(columns=["ses_y"], inplace=True)
merged_df.rename(columns={"ses_x": "ses"}, inplace=True)
merged_df.drop(columns=["ses_numeric"], inplace=True)
return merged_df
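

# Illustrative sketch, not part of this diff: pd.merge_asof with by= and
# direction="nearest" on toy session numbers. Both frames must be sorted on
# the "on" key, which is why merge_qc_pheno sorts by ses_numeric first.
# The participant and diagnosis values are invented.
import pandas as pd

_qc = pd.DataFrame({"participant_id": [1, 1], "ses_numeric": [0, 12]})
_pheno = pd.DataFrame(
    {"participant_id": [1, 1], "ses_numeric": [3, 9], "diagnosis": ["CON", "ADD(M)"]}
)
print(
    pd.merge_asof(
        _qc, _pheno, by="participant_id", on="ses_numeric", direction="nearest"
    )
)
# Session 0 picks the session-3 row (CON); session 12 picks the session-9 row (ADD(M)).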


def merge_scanner(qc_pheno_df, scan_df):
# Create scanner column
scan_df["scanner"] = (
scan_df["fabriquant"].str.replace(" ", "_")
+ "_"
+ scan_df["modele_scanner"].str.replace(" ", "_")
).str.lower()

    # Drop multiple entries per session for scanning data
scan_df.drop_duplicates(subset=["pscid", "no_visite"], keep="first", inplace=True)

qc_pheno_df["participant_id"] = qc_pheno_df["participant_id"].astype(int)
scan_df["pscid"] = scan_df["pscid"].astype(int)

merged_df = pd.merge(
qc_pheno_df,
scan_df[["pscid", "no_visite", "scanner"]],
left_on=["participant_id", "ses"],
right_on=["pscid", "no_visite"],
how="left",
)

merged_df.drop(columns=["pscid", "no_visite"], inplace=True)

return merged_df
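

# Illustrative sketch, not part of this diff: how the scanner label is
# assembled in merge_scanner. The manufacturer/model values are invented.
import pandas as pd

_scan = pd.DataFrame({"fabriquant": ["Siemens"], "modele_scanner": ["Prisma Fit"]})
_label = (
    _scan["fabriquant"].str.replace(" ", "_")
    + "_"
    + _scan["modele_scanner"].str.replace(" ", "_")
).str.lower()
print(_label.iloc[0])  # siemens_prisma_fit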


def process_data(root_p, metadata):
# Paths to data
diagnosis_file_p = (
root_p / "wrangling-phenotype/data/cimaq/22501_diagnostic_clinique.tsv"
)
scan_file_p = (
root_p
/ "wrangling-phenotype/data/cimaq/dr15_20240301_sommaire_des_scans-nii.tsv"
) # Using the dr15 spreadsheet since there is more data available
socio_file_p = (
root_p
/ "wrangling-phenotype/data/cimaq/55398_informations_socio_demographiques_participant_initial.tsv"
)
cog_file_p = (
root_p
/ "wrangling-phenotype/data/cimaq/84756_variables_reserve_cognitive_bartres_initial.tsv"
)
qc_file_p = root_p / "qc_output/rest_df.tsv"
output_p = root_p / "wrangling-phenotype/outputs"

# Load the CSVs
diagnosis_df = pd.read_csv(
diagnosis_file_p, sep="\t", parse_dates=["date_de_l_évaluation"]
)
scan_df = pd.read_csv(scan_file_p, sep="\t")
socio_df = pd.read_csv(socio_file_p, sep="\t")
cog_df = pd.read_csv(cog_file_p, sep="\t", encoding="ISO-8859-1")
qc_df = pd.read_csv(qc_file_p, sep="\t", low_memory=False)

# Merge different phenotypic fields
df = merge_pheno(scan_df, diagnosis_df, socio_df, cog_df)

# Process pheno data
pheno_df = process_pheno(df)

# Filter qc df for dataset
qc_df_filtered = qc_df.loc[qc_df["dataset"] == "cimaq"].copy()

# Merge pheno with qc
qc_pheno_df = merge_qc_pheno(qc_df_filtered, pheno_df)

# Merge with scan info
qc_scan_df = merge_scanner(qc_pheno_df, scan_df)

    # Output tsv files
    pheno_df.to_csv(output_p / "cimaq_pheno.tsv", sep="\t", index=False)
    qc_scan_df.to_csv(output_p / "cimaq_qc_pheno.tsv", sep="\t", index=False)

# Output metadata to json
with open(output_p / "cimaq_pheno.json", "w") as f:
@@ -180,11 +262,9 @@ def process_data(root_p, output_p, metadata):

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process CIMA-Q phenotype data and output to TSV and JSON"
description="Process CIMS-Q phenotype data, merge with QC and output to to TSV and JSON"
)
parser.add_argument("rootpath", type=Path, help="Root path to the data files")
parser.add_argument("output", type=Path, help="Path to the output directory")

parser.add_argument("rootpath", type=Path, help="Root path to files")
args = parser.parse_args()

process_data(args.rootpath, args.output, metadata)
process_data(args.rootpath, metadata)
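

# Illustrative usage, not part of this diff. Given the directory layout
# hard-coded in process_data (wrangling-phenotype/data/cimaq, qc_output and
# wrangling-phenotype/outputs under the root), the script would be run as:
#
#   python code/cimaq.py /path/to/root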