Combined with QC - Cimaq #23

Open · wants to merge 10 commits into base: main

Changes from all commits
224 changes: 152 additions & 72 deletions code/cimaq.py
@@ -1,6 +1,6 @@
"""Load CIMA-Q data and extract demographic information.

Author: Natasha Clarke; last edit 2024-03-05
Author: Natasha Clarke; last edit 2024-04-16

All input stored in `data/cimaq` folder. The content of `data` is not
included in the repository.
@@ -22,7 +22,7 @@
"description": "Unique identifier for each participant",
},
"age": {
"original_field_name": "age",
"original_field_name": "age_du_participant",
"description": "Age of the participant in years",
},
"sex": {
@@ -61,69 +61,27 @@
"original_field_name": "84756_nombre_annee_education",
"description": "Years in education",
},
"ses": {
"original_field_name": "no_visite",
"description": "Session label, in this dataset it is the visit label indicating months since baseline",
},
}


def find_closest_diagnosis(scan_row, diagnosis_df):
pscid = scan_row["pscid"]
scan_date = scan_row["date"]

# Filter to evaluations for the same participant and where the evaluation date is not NULL
participant_df = diagnosis_df[
(diagnosis_df["pscid"] == pscid) & diagnosis_df["date_de_l_évaluation"].notna()
].copy()

# Find the closest date
if not participant_df.empty:
# Compute the absolute difference in days between the scan date and diagnosis evaluation dates
participant_df["date_diff"] = (
participant_df["date_de_l_évaluation"].sub(scan_date).dt.days.abs()
)

# Find the diagnosis with the smallest date difference
closest_date = participant_df.loc[participant_df["date_diff"].idxmin()]

        # Return the diagnosis, date and number of days. Also return the sex info
return (
closest_date["22501_diagnostic_clinique"],
closest_date["date_de_l_évaluation"],
closest_date["date_diff"],
closest_date["sexe"],
)
else:
        # Return None for each expected value to maintain consistency
        return None, None, None, None
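

# Illustrative sketch, not part of this diff: how find_closest_diagnosis
# behaves on toy data. The column names mirror the ones used above; the
# participant, dates and diagnosis values are invented for demonstration.
import pandas as pd

_scan_row = pd.Series({"pscid": 101, "date": pd.Timestamp("2020-06-01")})
_diag_df = pd.DataFrame(
    {
        "pscid": [101, 101],
        "date_de_l_évaluation": pd.to_datetime(["2020-05-01", "2021-01-01"]),
        "22501_diagnostic_clinique": [
            "cognitivement_sain_(cs)",
            "démence_de_type_alzheimer-légère",
        ],
        "sexe": ["femme", "femme"],
    }
)
# Expected: the 2020-05-01 evaluation wins (31 days away vs 214).
print(find_closest_diagnosis(_scan_row, _diag_df))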


def process_data(root_p, output_p, metadata):
# Paths to different data files
scan_file_p = root_p / "sommaire_des_scans.tsv"
diagnosis_file_p = root_p / "22501_diagnostic_clinique.tsv"
socio_file_p = (
root_p / "55398_informations_socio_demographiques_participant_initial.tsv"
)
edu_file_p = root_p / "84756_variables_reserve_cognitive_bartres_initial.tsv"

# Load the CSVs
df = pd.read_csv(scan_file_p, sep="\t", parse_dates=["date"])
diagnosis_df = pd.read_csv(
diagnosis_file_p, sep="\t", parse_dates=["date_de_l_évaluation"]
)
socio_df = pd.read_csv(socio_file_p, sep="\t")
edu_df = pd.read_csv(edu_file_p, sep="\t", encoding="ISO-8859-1")

# Select only resting state data
df = df[df["nii_protocole"] == "task-rest"] # Run again for task-memory
def merge_pheno(scan_df, diagnosis_df, socio_df, cog_df):
df = diagnosis_df.copy()

# Apply function to match diagnoses according to closest scan date, and split the results into new columns
result = df.apply(
lambda row: pd.Series(find_closest_diagnosis(row, diagnosis_df)), axis=1
)
df[["diagnosis", "matched_evaluation_date", "date_diff", "sex"]] = (
result # Returning these for now because at a later date we may want to drop e.g participants with diagnoses outside a certain window
# Match for site
scan_df_first = scan_df.drop_duplicates(subset="pscid", keep="first")
df = pd.merge(
df,
scan_df_first[["pscid", "site_scanner"]],
on="pscid",
how="left",
)

# Match with df for handedness
# Match for handedness
socio_df = socio_df.drop_duplicates(subset="PSCID", keep="first")
df = pd.merge(
df,
socio_df[["PSCID", "55398_lateralite"]],
@@ -132,21 +90,29 @@ def process_data(root_p, output_p, metadata):
how="left",
)

# Match with df for education
# Match for education
cog_df = cog_df.drop_duplicates(subset="PSCID", keep="first")
df = pd.merge(
df,
edu_df[["PSCID", "84756_nombre_annee_education"]],
cog_df[["PSCID", "84756_nombre_annee_education"]],
left_on="pscid",
right_on="PSCID",
how="left",
)
return df
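

# Illustrative sketch, not part of this diff: the keep-first-then-left-merge
# pattern used in merge_pheno, on toy frames. The pscid/PSCID casing mirrors
# the real files; the handedness values are invented.
import pandas as pd

_left = pd.DataFrame({"pscid": [1, 2]})
_right = pd.DataFrame(
    {"PSCID": [1, 1, 2], "55398_lateralite": ["droitier", "gaucher", "droitier"]}
)
_right = _right.drop_duplicates(subset="PSCID", keep="first")  # one row per participant
print(pd.merge(_left, _right, left_on="pscid", right_on="PSCID", how="left"))
# Without drop_duplicates, participant 1 would appear on two rows after the merge.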


def process_pheno(df):
# Process the data
df["participant_id"] = df["pscid"].astype(str)
df["age"] = df["age"].astype(float)
df["sex"] = df["sex"].map({"femme": "female", "homme": "male"})
df["site"] = df["centre"]
df["diagnosis"] = df["diagnosis"].map(
df["age"] = df["âge_du_participant"].astype(float)
df["sex"] = df["sexe"].map({"femme": "female", "homme": "male"})
df["site"] = df["site_scanner"].replace(
{
"Hopital Général Juif": "JGH",
}
)
df["diagnosis"] = df["22501_diagnostic_clinique"].map(
{
"démence_de_type_alzheimer-légère": "ADD(M)",
"cognitivement_sain_(cs)": "CON",
@@ -162,14 +128,130 @@ def process_data(root_p, output_p, metadata):
df["education"] = pd.to_numeric(
df["84756_nombre_annee_education"], errors="coerce"
) # This will replace the "donnée_non_disponible" entries with NaN
df["ses"] = df["no_visite"]

# Select columns
df = df[
["participant_id", "age", "sex", "site", "diagnosis", "handedness", "education"]
[
"participant_id",
"age",
"sex",
"site",
"diagnosis",
"handedness",
"education",
"ses",
]
]
return df.copy()
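

# Illustrative sketch, not part of this diff: errors="coerce" is what turns
# the "donnée_non_disponible" placeholder into NaN in the education column.
import pandas as pd

_edu = pd.Series(["12", "16", "donnée_non_disponible"])
print(pd.to_numeric(_edu, errors="coerce"))  # 12.0, 16.0, NaN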


def merge_qc_pheno(qc_df_filtered, pheno_df):
# Create a numeric version of the session
pheno_df["ses_numeric"] = pheno_df["ses"].str.replace("V", "").astype(int)
qc_df_filtered["ses_numeric"] = (
qc_df_filtered["ses"].str.replace("V", "").astype(int)
)

pheno_df["participant_id"] = pheno_df["participant_id"].astype(int)
qc_df_filtered["participant_id"] = qc_df_filtered["participant_id"].astype(int)

pheno_df = pheno_df.sort_values(by="ses_numeric")
qc_df_filtered = qc_df_filtered.sort_values(by="ses_numeric")

# Merge pheno and QC on nearest. Note that since the longest difference between scanning and pheno collection is 3 months, we don't need to set a threshold for the diagnoses
merged_df = pd.merge_asof(
qc_df_filtered,
pheno_df,
by="participant_id", # Match participants
on="ses_numeric", # Find the nearest match based on session date
direction="nearest",
)

# Handle site columns
merged_df.drop(columns=["site_x"], inplace=True)
merged_df.rename(columns={"site_y": "site"}, inplace=True)

# Handle session columns
merged_df.drop(columns=["ses_y"], inplace=True)
merged_df.rename(columns={"ses_x": "ses"}, inplace=True)
merged_df.drop(columns=["ses_numeric"], inplace=True)
return merged_df
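

# Illustrative sketch, not part of this diff: pd.merge_asof with by= and
# direction="nearest" on toy session numbers. Both frames must be sorted on
# the "on" key, which is why merge_qc_pheno sorts by ses_numeric first.
# The participant and diagnosis values are invented.
import pandas as pd

_qc = pd.DataFrame({"participant_id": [1, 1], "ses_numeric": [0, 12]})
_pheno = pd.DataFrame(
    {"participant_id": [1, 1], "ses_numeric": [3, 9], "diagnosis": ["CON", "ADD(M)"]}
)
print(
    pd.merge_asof(
        _qc, _pheno, by="participant_id", on="ses_numeric", direction="nearest"
    )
)
# Session 0 picks the session-3 row (CON); session 12 picks the session-9 row (ADD(M)).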


def merge_scanner(qc_pheno_df, scan_df):
# Create scanner column
scan_df["scanner"] = (
scan_df["fabriquant"].str.replace(" ", "_")
+ "_"
+ scan_df["modele_scanner"].str.replace(" ", "_")
).str.lower()

    # Drop multiple entries per session for scanning data
scan_df.drop_duplicates(subset=["pscid", "no_visite"], keep="first", inplace=True)

qc_pheno_df["participant_id"] = qc_pheno_df["participant_id"].astype(int)
scan_df["pscid"] = scan_df["pscid"].astype(int)

merged_df = pd.merge(
qc_pheno_df,
scan_df[["pscid", "no_visite", "scanner"]],
left_on=["participant_id", "ses"],
right_on=["pscid", "no_visite"],
how="left",
)

merged_df.drop(columns=["pscid", "no_visite"], inplace=True)

return merged_df
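

# Illustrative sketch, not part of this diff: how the scanner label is
# assembled in merge_scanner. The manufacturer/model values are invented.
import pandas as pd

_scan = pd.DataFrame({"fabriquant": ["Siemens"], "modele_scanner": ["Prisma Fit"]})
_label = (
    _scan["fabriquant"].str.replace(" ", "_")
    + "_"
    + _scan["modele_scanner"].str.replace(" ", "_")
).str.lower()
print(_label.iloc[0])  # siemens_prisma_fit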


def process_data(root_p, metadata):
# Paths to data
diagnosis_file_p = (
root_p / "wrangling-phenotype/data/cimaq/22501_diagnostic_clinique.tsv"
)
scan_file_p = (
root_p
/ "wrangling-phenotype/data/cimaq/dr15_20240301_sommaire_des_scans-nii.tsv"
) # Using the dr15 spreadsheet since there is more data available
socio_file_p = (
root_p
/ "wrangling-phenotype/data/cimaq/55398_informations_socio_demographiques_participant_initial.tsv"
)
cog_file_p = (
root_p
/ "wrangling-phenotype/data/cimaq/84756_variables_reserve_cognitive_bartres_initial.tsv"
)
qc_file_p = root_p / "qc_output/rest_df.tsv"
output_p = root_p / "wrangling-phenotype/outputs"

# Load the CSVs
diagnosis_df = pd.read_csv(
diagnosis_file_p, sep="\t", parse_dates=["date_de_l_évaluation"]
)
scan_df = pd.read_csv(scan_file_p, sep="\t")
socio_df = pd.read_csv(socio_file_p, sep="\t")
cog_df = pd.read_csv(cog_file_p, sep="\t", encoding="ISO-8859-1")
qc_df = pd.read_csv(qc_file_p, sep="\t", low_memory=False)

# Merge different phenotypic fields
df = merge_pheno(scan_df, diagnosis_df, socio_df, cog_df)

# Process pheno data
pheno_df = process_pheno(df)

# Filter qc df for dataset
qc_df_filtered = qc_df.loc[qc_df["dataset"] == "cimaq"].copy()

# Merge pheno with qc
qc_pheno_df = merge_qc_pheno(qc_df_filtered, pheno_df)

# Merge with scan info
qc_scan_df = merge_scanner(qc_pheno_df, scan_df)

    # Output tsv files
    pheno_df.to_csv(output_p / "cimaq_pheno.tsv", sep="\t", index=False)
    qc_scan_df.to_csv(output_p / "cimaq_qc_pheno.tsv", sep="\t", index=False)

# Output metadata to json
with open(output_p / "cimaq_pheno.json", "w") as f:
@@ -180,11 +262,9 @@ def process_data(root_p, output_p, metadata):

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process CIMA-Q phenotype data and output to TSV and JSON"
description="Process CIMS-Q phenotype data, merge with QC and output to to TSV and JSON"
)
parser.add_argument("rootpath", type=Path, help="Root path to the data files")
parser.add_argument("output", type=Path, help="Path to the output directory")

parser.add_argument("rootpath", type=Path, help="Root path to files")
args = parser.parse_args()

process_data(args.rootpath, args.output, metadata)
process_data(args.rootpath, metadata)
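

# Illustrative usage, not part of this diff. Given the directory layout
# hard-coded in process_data (wrangling-phenotype/data/cimaq, qc_output and
# wrangling-phenotype/outputs under the root), the script would be run as:
#
#   python code/cimaq.py /path/to/root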