Merge pull request #105 from AMP-SCZ/kcho/pronet
metadata initiation using REDCap is updated to be more effective
kcho authored Jun 27, 2022
2 parents 13a7da5 + ed34f68 commit b8426ef
Showing 4 changed files with 218 additions and 96 deletions.
8 changes: 6 additions & 2 deletions lochness/email/__init__.py
@@ -75,7 +75,12 @@ def send_detail(Lochness,
 
     server_name = Lochness['project_name'] \
         if 'project_name' in Lochness else 'Data aggregation server'
-    title = f'{server_name}: {title} {datetime.now(tz).date()}'
+
+    if Lochness.get('production', False):
+        title = f'{server_name} Production: {title} {datetime.now(tz).date()}'
+    else:
+        title = f'{server_name}: {title} {datetime.now(tz).date()}'
+
 
     html_str = template.render(title=title,
                                subtitle=subtitle,
@@ -88,7 +93,6 @@ def send_detail(Lochness,
                                username=getpass.getuser())
 
     msg = MIMEText(html_str, 'html')
-    print(title)
     msg['Subject'] = title
     msg['From'] = sender
     msg['To'] = recipients[0]
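The new branch keys off a top-level `production` flag in the Lochness configuration. A minimal sketch of the resulting behavior, assuming the config loads into a plain dict (all values here are made up):

    from datetime import datetime

    # Hypothetical config; 'production' is the flag the new branch checks.
    Lochness = {'project_name': 'ProNET', 'production': True}

    server_name = Lochness['project_name'] \
        if 'project_name' in Lochness else 'Data aggregation server'
    title = 'Daily updates'
    if Lochness.get('production', False):
        title = f'{server_name} Production: {title} {datetime.now().date()}'
    else:
        title = f'{server_name}: {title} {datetime.now().date()}'
    print(title)  # e.g. "ProNET Production: Daily updates 2022-06-27"

With the flag set, emails from a production server are immediately distinguishable from development-server emails in the subject line.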
182 changes: 103 additions & 79 deletions lochness/redcap/__init__.py
@@ -51,6 +51,14 @@ def get_field_names_from_redcap(api_url: str,
     return field_names
 
 
+def remove_file_that_may_exist(file_path: Path) -> None:
+    '''Remove a file that may exist'''
+    try:
+        os.remove(file_path)
+    except:
+        pass
+
+
 def initialize_metadata(Lochness: 'Lochness object',
                         study_name: str,
                         redcap_id_colname: str,
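A side note on the new helper: the bare `except` swallows every error (permissions problems, a directory at that path, ...), not only a missing file. A narrower variant, shown for comparison only and not part of this commit:

    import os
    from pathlib import Path

    def remove_file_if_exists(file_path: Path) -> None:
        '''Remove a file, ignoring only the file-not-found case.'''
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass
        # On Python 3.8+, Path(file_path).unlink(missing_ok=True) is equivalent.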
@@ -72,117 +80,133 @@ def initialize_metadata(Lochness: 'Lochness object',
     site_code_study = study_name[-2:]  # 'LA'
     project_name = study_name.split(site_code_study)[0]  # 'Pronet'
 
+    # metadata study location
+    general_path = Path(Lochness['phoenix_root']) / 'GENERAL'
+    metadata_study = general_path / study_name / f"{study_name}_metadata.csv"
+
     # use redcap_project function to load the redcap keyrings for the project
     _, api_url, api_key = next(redcap_projects(
         Lochness, study_name, f'redcap.{project_name}'))
 
     # sources to add to the metadata, apart from REDCap, XNAT, and Box
     source_source_name_dict = {'mindlamp': ['Mindlamp', 'chrdbb_lamp_id']}
 
-    record_query = {'token': api_key,
-                    'content': 'record',
-                    'format': 'json',
-                    'fields[0]': redcap_id_colname,
-                    'fields[1]': redcap_consent_colname,
-                    }
+    # to extract ID and consent form for all the records that
+    # belong to the site from screening & baseline arms
+    record_query = {
+        'token': api_key,
+        'content': 'record',
+        'format': 'json',
+        'fields[0]': redcap_id_colname,
+        'fields[1]': redcap_consent_colname,
+        'events[0]': 'screening_arm_1',
+        'events[1]': 'screening_arm_2',
+        'events[2]': 'baseline_arm_1',
+        'events[3]': 'baseline_arm_2',
+        'filterLogic': f"contains([{redcap_id_colname}],'{site_code_study}')"
+    }
 
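For reference, this is what the filtered export looks like as a standalone script. `fields[n]`, `events[n]`, and `filterLogic` are standard REDCap record-export parameters; the URL, token, and field name below are placeholders:

    import requests

    REDCAP_URL = 'https://redcap.example.edu/api/'  # placeholder endpoint
    API_TOKEN = 'ABC123'                            # placeholder token
    ID_FIELD = 'chric_record_id'                    # assumed ID field name

    payload = {
        'token': API_TOKEN,
        'content': 'record',
        'format': 'json',
        'fields[0]': ID_FIELD,
        'events[0]': 'screening_arm_1',
        'events[1]': 'baseline_arm_1',
        'filterLogic': f"contains([{ID_FIELD}],'LA')",
    }
    records = requests.post(REDCAP_URL, data=payload).json()
    # records is a list of dicts, one per record x event combination

Filtering server-side is what makes the new query more efficient: it pulls only a few fields for one site's screening and baseline events, instead of every field for every site.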
-    # only pull source_names
-    # mindlamp id is manually added to "chrdig_lamp_id" field
-    field_num = 2
-    for source, (source_name, source_field_name) in \
-            source_source_name_dict.items():
-        record_query[f"fields[{field_num}]"] = source_field_name
+    # to add mindlamp source ID to the record query
+    for num, (source, (source_name, source_field_name)) in \
+            enumerate(source_source_name_dict.items()):
+        record_query[f"fields[{2+num}]"] = source_field_name
 
     # pull all records from the project's REDCap repo
-    try:
-        content = post_to_redcap(api_url,
-                                 record_query,
-                                 f'initializing data {study_name}')
-    except:  # if subject ID field name are not set, above will raise error
-        record_query = {
-            'token': api_key,
-            'content': 'record',
-            'format': 'json',
-        }
-        content = post_to_redcap(api_url,
-                                 record_query,
-                                 f'initializing data {study_name}')
+    content = post_to_redcap(api_url,
+                             record_query,
+                             f'initializing data {study_name}')

     # load pulled information as a list of dictionaries
     with tf.NamedTemporaryFile(suffix='tmp.json') as tmpfilename:
         lochness.atomic_write(tmpfilename.name, content)
         with open(tmpfilename.name, 'r') as f:
             data = json.load(f)
 
-    df = pd.DataFrame()
+    # replace empty string as None
+    df = pd.DataFrame(data).replace('', None)
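Blank REDCap fields arrive as empty strings; converting them to missing values is what lets the later `isnull()`/`dropna()` calls work. A tiny self-contained demo of the idea, written with `np.nan` (pandas's usual missing marker for strings) and made-up field names:

    import numpy as np
    import pandas as pd

    data = [{'chric_record_id': 'LA00001', 'chric_consent_date': '2022-06-01'},
            {'chric_record_id': 'LA00001', 'chric_consent_date': ''}]
    df = pd.DataFrame(data).replace('', np.nan)
    print(df['chric_consent_date'].isnull().tolist())  # [False, True]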

-    # extract subject ID and source IDs for each sources
-    for item in data:
-        # filter out data from other sites (if multistudy removed)
-        site_code_redcap_id = item[redcap_id_colname][:2]
-        if site_code_redcap_id != site_code_study:
-            continue
-
-        subject_dict = {'Subject ID': item[redcap_id_colname]}
+    # if empty REDCap
+    if len(df) == 0:
+        logger.warn(f'There are no records for {site_code_study}')
+        remove_file_that_may_exist(metadata_study)
+        return
+
+    # only keep AMPSCZ rows
+    df = df[df[redcap_id_colname].str.match('[A-Z]{2}\d{5}')]
+
+    # if no data matches AMPSCZ ID
+    if len(df) == 0:
+        logger.warn(f'There are no records for {site_code_study}')
+        remove_file_that_may_exist(metadata_study)
+        return
+
+    # make a single row for each subject record
+    # redcap_event_name column contains timepoint information (different arms)
+    df.drop('redcap_event_name', axis=1, inplace=True)
+
+    df_final = pd.DataFrame()
+    for subject, df_tmp in df.groupby(redcap_id_colname):
+        df_new = pd.concat(
+            [df_tmp[col].dropna().reset_index(drop=True) for col in df_tmp],
+            axis=1)
+        df_final = pd.concat([df_final, df_new], axis=0)
+
+    # drop if consent date is missing
+    df_final = df_final[
+        ~df_final[redcap_consent_colname].isnull()].reset_index()
+
+    # skip no data has consent date
+    if len(df_final) == 0:
+        logger.warn(f'There are no records for {site_code_study}')
+        remove_file_that_may_exist(metadata_study)
+        return
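Each subject shows up once per REDCap event, with different fields filled on different rows; the groupby/concat above stacks each column's non-missing values so the first row carries everything, and the consent filter then drops the leftover padding rows. A toy run of the same collapse:

    import numpy as np
    import pandas as pd

    # Two event rows for one subject: consent on one, mindlamp ID on the other.
    df = pd.DataFrame({'id': ['LA00001', 'LA00001'],
                       'consent': ['2022-06-01', np.nan],
                       'lamp_id': [np.nan, 'U1234']})

    collapsed = pd.DataFrame()
    for subject, df_tmp in df.groupby('id'):
        df_new = pd.concat(
            [df_tmp[col].dropna().reset_index(drop=True) for col in df_tmp],
            axis=1)
        collapsed = pd.concat([collapsed, df_new], axis=0)

    # Rows without a consent date are padding left over from the collapse.
    collapsed = collapsed[~collapsed['consent'].isnull()].reset_index(drop=True)
    print(collapsed)
    #         id     consent lamp_id
    # 0  LA00001  2022-06-01   U1234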

+    df = pd.DataFrame()
+    # extract subject ID and source IDs for each sources
+    for index, row in df_final.iterrows():
+        subject_id = row[redcap_id_colname]
+        # Subject ID
+        subject_dict = {'Subject ID': subject_id}

         # Consent date
-        if item[redcap_consent_colname] != '':
-            subject_dict['Consent'] = item[redcap_consent_colname]
-        else:
-            # subject_dict['Consent'] = '2021-10-01'
-            continue  ## subject without consent date will be ignored
+        subject_dict['Consent'] = row[redcap_consent_colname]
 
         # Redcap default information
         subject_dict['REDCap'] = \
-            f'redcap.{project_name}:{item[redcap_id_colname]}'
-        if upenn:
-            subject_dict['REDCap'] += \
-                f';redcap.UPENN:{item[redcap_id_colname]}'  # UPENN REDCAP
-
-        subject_dict['Box'] = f'box.{study_name}:{item[redcap_id_colname]}'
-        subject_dict['XNAT'] = f'xnat.{study_name}:*:{item[redcap_id_colname]}'
+            f'redcap.{project_name}:{subject_id}'
+        subject_dict['REDCap'] += \
+            f';redcap.UPENN:{row[redcap_id_colname]}'  # UPENN REDCAP
+        subject_dict['Box'] = f'box.{study_name}:{subject_id}'
+        subject_dict['XNAT'] = f'xnat.{study_name}:*:{subject_id}'

         # for the datatype, which requires ID extraction from REDCap
         for source, (source_name, source_field_name) \
                 in source_source_name_dict.items():
-            # if mindlamp_id field is available in REDCap record
-            if source_field_name in item:
-                source_id = item[source_field_name]
-                if source_id != '':
-                    subject_dict[source_name] = \
-                        f"{source}.{study_name}:{source_id}"
-            else:
-                pass
-
-        df_tmp = pd.DataFrame.from_dict(subject_dict, orient='index')
-        df = pd.concat([df, df_tmp.T])
-
-    if len(df) == 0:
-        logger.warn(f'There are no records for {site_code_study}')
-        return
-
-    # Each subject may have more than one arms, which will result in more than
-    # single item for the subject in the redcap pulled `content`
-    # remove empty lables
-    df_final = pd.DataFrame()
-    for _, table in df.groupby(['Subject ID']):
-        pad_filled = table.fillna(
-            method='ffill').fillna(method='bfill').iloc[0]
-
-        df_final = pd.concat([df_final, pad_filled], axis=1)
-
-    df_final = df_final.T
+            if pd.isnull(row[source_field_name]):
+                pass
+            else:
+                value = row[source_field_name]
+                subject_dict[source_name] = f"{source}.{study_name}:{value}"
+
+        subject_df_tmp = pd.DataFrame.from_dict(subject_dict, orient='index')
+        df = pd.concat([df, subject_df_tmp.T])
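Each `subject_dict` becomes one metadata row via `from_dict(orient='index')` (keys as the index, a single column) followed by a transpose. A toy construction with made-up values:

    import pandas as pd

    subject_dict = {'Subject ID': 'LA00001',
                    'Consent': '2022-06-01',
                    'REDCap': 'redcap.Pronet:LA00001;redcap.UPENN:LA00001',
                    'Box': 'box.PronetLA:LA00001',
                    'XNAT': 'xnat.PronetLA:*:LA00001'}
    row = pd.DataFrame.from_dict(subject_dict, orient='index').T
    print(row.columns.tolist())
    # ['Subject ID', 'Consent', 'REDCap', 'Box', 'XNAT']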

     # register all of the lables as active
-    df_final['Active'] = 1
+    df['Active'] = 1
 
-    # reorder columns
+    # reorder columns to match lochness metadata format
     main_cols = ['Active', 'Consent', 'Subject ID']
-    df_final = df_final[main_cols + \
-        [x for x in df_final.columns if x not in main_cols]]
+    df = df[main_cols + \
+        [x for x in df.columns if x not in main_cols]]
 
-    general_path = Path(Lochness['phoenix_root']) / 'GENERAL'
-    metadata_study = general_path / study_name / f"{study_name}_metadata.csv"
-    df_final.to_csv(metadata_study, index=False)
+    # only overwrite when there is an update in the data
+    target_df = pd.read_csv(metadata_study)
+    same_df = df.reset_index(drop=True).equals(target_df)
+    if same_df:
+        pass
+    else:
+        df.to_csv(metadata_study, index=False)
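Comparing before writing means an unchanged metadata file keeps its old modification time, so downstream steps see no spurious update. One subtlety: `pd.read_csv(metadata_study)` presumes the CSV already exists. A guarded variant of the same idea (illustrative only, not part of this commit):

    import pandas as pd
    from pathlib import Path

    def write_if_changed(df: pd.DataFrame, metadata_study: Path) -> None:
        '''Overwrite the metadata CSV only when its content has changed.'''
        if metadata_study.is_file():
            target_df = pd.read_csv(metadata_study)
            if df.reset_index(drop=True).equals(target_df):
                return  # identical content: leave the file untouched
        df.to_csv(metadata_study, index=False)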


def get_run_sheets_for_datatypes(api_url, api_key,
4 changes: 2 additions & 2 deletions lochness/transfer/__init__.py
@@ -375,8 +375,8 @@ def create_s3_transfer_table(Lochness, rewrite=False) -> None:
                 continue
             try:
                 source = re.search(r'upload: (\S+)', line).group(1)
-            # do not save metadata.csv update since it
-            # gets updated every pull
+                # do not save metadata.csv update since it
+                # gets updated every pull
                 if 'metadata.csv' in source:
                     continue
                 target = re.search(r'upload: (\S+) to (\S+)',
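The surrounding code parses `aws s3 sync`-style log lines with the two `re.search` patterns visible above. A quick illustration with a made-up log line:

    import re

    line = ('upload: PHOENIX/GENERAL/PronetLA/PronetLA_metadata.csv '
            'to s3://example-bucket/GENERAL/PronetLA/PronetLA_metadata.csv')

    match = re.search(r'upload: (\S+) to (\S+)', line)
    source, target = match.group(1), match.group(2)
    print(source)  # PHOENIX/GENERAL/PronetLA/PronetLA_metadata.csv
    print(target)  # s3://example-bucket/GENERAL/PronetLA/PronetLA_metadata.csv

The metadata.csv skip pairs with the redcap change above: since the metadata file can be rewritten on every pull, its uploads are deliberately left out of the transfer table.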