Merge pull request #105 from AMP-SCZ/kcho/pronet
metadata initiation using REDCap is updated to be more effective
kcho authored Jun 27, 2022
2 parents 13a7da5 + ed34f68 commit b8426ef
Showing 4 changed files with 218 additions and 96 deletions.
8 changes: 6 additions & 2 deletions lochness/email/__init__.py
@@ -75,7 +75,12 @@ def send_detail(Lochness,
 
     server_name = Lochness['project_name'] \
         if 'project_name' in Lochness else 'Data aggregation server'
-    title = f'{server_name}: {title} {datetime.now(tz).date()}'
+
+    if Lochness.get('production', False):
+        title = f'{server_name} Production: {title} {datetime.now(tz).date()}'
+    else:
+        title = f'{server_name}: {title} {datetime.now(tz).date()}'
+
 
     html_str = template.render(title=title,
                                subtitle=subtitle,
@@ -88,7 +93,6 @@ def send_detail(Lochness,
                                username=getpass.getuser())
 
     msg = MIMEText(html_str, 'html')
-    print(title)
     msg['Subject'] = title
     msg['From'] = sender
     msg['To'] = recipients[0]
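The new branch keys off a top-level `production` flag in the Lochness configuration. A minimal sketch of the resulting behavior, assuming the config loads into a plain dict (all values here are made up):

    from datetime import datetime

    # Hypothetical config; 'production' is the flag the new branch checks.
    Lochness = {'project_name': 'ProNET', 'production': True}

    server_name = Lochness['project_name'] \
        if 'project_name' in Lochness else 'Data aggregation server'
    title = 'Daily updates'
    if Lochness.get('production', False):
        title = f'{server_name} Production: {title} {datetime.now().date()}'
    else:
        title = f'{server_name}: {title} {datetime.now().date()}'
    print(title)  # e.g. "ProNET Production: Daily updates 2022-06-27"

With the flag set, emails from a production server are immediately distinguishable from development-server emails in the subject line.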
182 changes: 103 additions & 79 deletions lochness/redcap/__init__.py
@@ -51,6 +51,14 @@ def get_field_names_from_redcap(api_url: str,
     return field_names
 
 
+def remove_file_that_may_exist(file_path: Path) -> None:
+    '''Remove a file that may exist'''
+    try:
+        os.remove(file_path)
+    except:
+        pass
+
+
 def initialize_metadata(Lochness: 'Lochness object',
                         study_name: str,
                         redcap_id_colname: str,
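A side note on the new helper: the bare `except` swallows every error (permissions problems, a directory at that path, ...), not only a missing file. A narrower variant, shown for comparison only and not part of this commit:

    import os
    from pathlib import Path

    def remove_file_if_exists(file_path: Path) -> None:
        '''Remove a file, ignoring only the file-not-found case.'''
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass
        # On Python 3.8+, Path(file_path).unlink(missing_ok=True) is equivalent.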
@@ -72,117 +80,133 @@ def initialize_metadata(Lochness: 'Lochness object',
     site_code_study = study_name[-2:]  # 'LA'
     project_name = study_name.split(site_code_study)[0]  # 'Pronet'
 
+    # metadata study location
+    general_path = Path(Lochness['phoenix_root']) / 'GENERAL'
+    metadata_study = general_path / study_name / f"{study_name}_metadata.csv"
+
     # use redcap_project function to load the redcap keyrings for the project
     _, api_url, api_key = next(redcap_projects(
         Lochness, study_name, f'redcap.{project_name}'))
 
     # sources to add to the metadata, apart from REDCap, XNAT, and Box
     source_source_name_dict = {'mindlamp': ['Mindlamp', 'chrdbb_lamp_id']}
 
-    record_query = {'token': api_key,
-                    'content': 'record',
-                    'format': 'json',
-                    'fields[0]': redcap_id_colname,
-                    'fields[1]': redcap_consent_colname,
-                    }
+    # to extract ID and consent form for all the records that
+    # belong to the site from screening & baseline arms
+    record_query = {
+        'token': api_key,
+        'content': 'record',
+        'format': 'json',
+        'fields[0]': redcap_id_colname,
+        'fields[1]': redcap_consent_colname,
+        'events[0]': 'screening_arm_1',
+        'events[1]': 'screening_arm_2',
+        'events[2]': 'baseline_arm_1',
+        'events[3]': 'baseline_arm_2',
+        'filterLogic': f"contains([{redcap_id_colname}],'{site_code_study}')"
+    }
 
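For reference, this is what the filtered export looks like as a standalone script. `fields[n]`, `events[n]`, and `filterLogic` are standard REDCap record-export parameters; the URL, token, and field name below are placeholders:

    import requests

    REDCAP_URL = 'https://redcap.example.edu/api/'  # placeholder endpoint
    API_TOKEN = 'ABC123'                            # placeholder token
    ID_FIELD = 'chric_record_id'                    # assumed ID field name

    payload = {
        'token': API_TOKEN,
        'content': 'record',
        'format': 'json',
        'fields[0]': ID_FIELD,
        'events[0]': 'screening_arm_1',
        'events[1]': 'baseline_arm_1',
        'filterLogic': f"contains([{ID_FIELD}],'LA')",
    }
    records = requests.post(REDCAP_URL, data=payload).json()
    # records is a list of dicts, one per record x event combination

Filtering server-side is what makes the new query more efficient: it pulls only a few fields for one site's screening and baseline events, instead of every field for every site.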
-    # only pull source_names
-    # mindlamp id is manually added to "chrdig_lamp_id" field
-    field_num = 2
-    for source, (source_name, source_field_name) in \
-            source_source_name_dict.items():
-        record_query[f"fields[{field_num}]"] = source_field_name
+    # to add mindlamp source ID to the record query
+    for num, (source, (source_name, source_field_name)) in \
+            enumerate(source_source_name_dict.items()):
+        record_query[f"fields[{2+num}]"] = source_field_name
 
     # pull all records from the project's REDCap repo
-    try:
-        content = post_to_redcap(api_url,
-                                 record_query,
-                                 f'initializing data {study_name}')
-    except:  # if subject ID field name are not set, above will raise error
-        record_query = {
-            'token': api_key,
-            'content': 'record',
-            'format': 'json',
-        }
-        content = post_to_redcap(api_url,
-                                 record_query,
-                                 f'initializing data {study_name}')
+    content = post_to_redcap(api_url,
+                             record_query,
+                             f'initializing data {study_name}')

     # load pulled information as a list of dictionaries
     with tf.NamedTemporaryFile(suffix='tmp.json') as tmpfilename:
         lochness.atomic_write(tmpfilename.name, content)
         with open(tmpfilename.name, 'r') as f:
             data = json.load(f)
 
-    df = pd.DataFrame()
+    # replace empty string as None
+    df = pd.DataFrame(data).replace('', None)
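Blank REDCap fields arrive as empty strings; converting them to missing values is what lets the later `isnull()`/`dropna()` calls work. A tiny self-contained demo of the idea, written with `np.nan` (pandas's usual missing marker for strings) and made-up field names:

    import numpy as np
    import pandas as pd

    data = [{'chric_record_id': 'LA00001', 'chric_consent_date': '2022-06-01'},
            {'chric_record_id': 'LA00001', 'chric_consent_date': ''}]
    df = pd.DataFrame(data).replace('', np.nan)
    print(df['chric_consent_date'].isnull().tolist())  # [False, True]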

-    # extract subject ID and source IDs for each sources
-    for item in data:
-        # filter out data from other sites (if multistudy removed)
-        site_code_redcap_id = item[redcap_id_colname][:2]
-        if site_code_redcap_id != site_code_study:
-            continue
-
-        subject_dict = {'Subject ID': item[redcap_id_colname]}
+    # if empty REDCap
+    if len(df) == 0:
+        logger.warn(f'There are no records for {site_code_study}')
+        remove_file_that_may_exist(metadata_study)
+        return
+
+    # only keep AMPSCZ rows
+    df = df[df[redcap_id_colname].str.match('[A-Z]{2}\d{5}')]
+
+    # if no data matches AMPSCZ ID
+    if len(df) == 0:
+        logger.warn(f'There are no records for {site_code_study}')
+        remove_file_that_may_exist(metadata_study)
+        return
+
+    # make a single row for each subject record
+    # redcap_event_name column contains timepoint information (different arms)
+    df.drop('redcap_event_name', axis=1, inplace=True)
+
+    df_final = pd.DataFrame()
+    for subject, df_tmp in df.groupby(redcap_id_colname):
+        df_new = pd.concat(
+            [df_tmp[col].dropna().reset_index(drop=True) for col in df_tmp],
+            axis=1)
+        df_final = pd.concat([df_final, df_new], axis=0)
+
+    # drop if consent date is missing
+    df_final = df_final[
+        ~df_final[redcap_consent_colname].isnull()].reset_index()
+
+    # skip no data has consent date
+    if len(df_final) == 0:
+        logger.warn(f'There are no records for {site_code_study}')
+        remove_file_that_may_exist(metadata_study)
+        return
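Each subject shows up once per REDCap event, with different fields filled on different rows; the groupby/concat above stacks each column's non-missing values so the first row carries everything, and the consent filter then drops the leftover padding rows. A toy run of the same collapse:

    import numpy as np
    import pandas as pd

    # Two event rows for one subject: consent on one, mindlamp ID on the other.
    df = pd.DataFrame({'id': ['LA00001', 'LA00001'],
                       'consent': ['2022-06-01', np.nan],
                       'lamp_id': [np.nan, 'U1234']})

    collapsed = pd.DataFrame()
    for subject, df_tmp in df.groupby('id'):
        df_new = pd.concat(
            [df_tmp[col].dropna().reset_index(drop=True) for col in df_tmp],
            axis=1)
        collapsed = pd.concat([collapsed, df_new], axis=0)

    # Rows without a consent date are padding left over from the collapse.
    collapsed = collapsed[~collapsed['consent'].isnull()].reset_index(drop=True)
    print(collapsed)
    #         id     consent lamp_id
    # 0  LA00001  2022-06-01   U1234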

+    df = pd.DataFrame()
+    # extract subject ID and source IDs for each sources
+    for index, row in df_final.iterrows():
+        subject_id = row[redcap_id_colname]
+        # Subject ID
+        subject_dict = {'Subject ID': subject_id}

         # Consent date
-        if item[redcap_consent_colname] != '':
-            subject_dict['Consent'] = item[redcap_consent_colname]
-        else:
-            # subject_dict['Consent'] = '2021-10-01'
-            continue  ## subject without consent date will be ignored
+        subject_dict['Consent'] = row[redcap_consent_colname]
 
         # Redcap default information
         subject_dict['REDCap'] = \
-            f'redcap.{project_name}:{item[redcap_id_colname]}'
-        if upenn:
-            subject_dict['REDCap'] += \
-                f';redcap.UPENN:{item[redcap_id_colname]}'  # UPENN REDCAP
-
-        subject_dict['Box'] = f'box.{study_name}:{item[redcap_id_colname]}'
-        subject_dict['XNAT'] = f'xnat.{study_name}:*:{item[redcap_id_colname]}'
+            f'redcap.{project_name}:{subject_id}'
+        subject_dict['REDCap'] += \
+            f';redcap.UPENN:{row[redcap_id_colname]}'  # UPENN REDCAP
+        subject_dict['Box'] = f'box.{study_name}:{subject_id}'
+        subject_dict['XNAT'] = f'xnat.{study_name}:*:{subject_id}'

         # for the datatype, which requires ID extraction from REDCap
         for source, (source_name, source_field_name) \
                 in source_source_name_dict.items():
-            # if mindlamp_id field is available in REDCap record
-            if source_field_name in item:
-                source_id = item[source_field_name]
-                if source_id != '':
-                    subject_dict[source_name] = \
-                        f"{source}.{study_name}:{source_id}"
-            else:
-                pass
-
-        df_tmp = pd.DataFrame.from_dict(subject_dict, orient='index')
-        df = pd.concat([df, df_tmp.T])
-
-    if len(df) == 0:
-        logger.warn(f'There are no records for {site_code_study}')
-        return
-
-    # Each subject may have more than one arms, which will result in more than
-    # single item for the subject in the redcap pulled `content`
-    # remove empty lables
-    df_final = pd.DataFrame()
-    for _, table in df.groupby(['Subject ID']):
-        pad_filled = table.fillna(
-            method='ffill').fillna(method='bfill').iloc[0]
-
-        df_final = pd.concat([df_final, pad_filled], axis=1)
-
-    df_final = df_final.T
+            if pd.isnull(row[source_field_name]):
+                pass
+            else:
+                value = row[source_field_name]
+                subject_dict[source_name] = f"{source}.{study_name}:{value}"
+
+        subject_df_tmp = pd.DataFrame.from_dict(subject_dict, orient='index')
+        df = pd.concat([df, subject_df_tmp.T])
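Each `subject_dict` becomes one metadata row via `from_dict(orient='index')` (keys as the index, a single column) followed by a transpose. A toy construction with made-up values:

    import pandas as pd

    subject_dict = {'Subject ID': 'LA00001',
                    'Consent': '2022-06-01',
                    'REDCap': 'redcap.Pronet:LA00001;redcap.UPENN:LA00001',
                    'Box': 'box.PronetLA:LA00001',
                    'XNAT': 'xnat.PronetLA:*:LA00001'}
    row = pd.DataFrame.from_dict(subject_dict, orient='index').T
    print(row.columns.tolist())
    # ['Subject ID', 'Consent', 'REDCap', 'Box', 'XNAT']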

     # register all of the lables as active
-    df_final['Active'] = 1
+    df['Active'] = 1
 
-    # reorder columns
+    # reorder columns to match lochness metadata format
     main_cols = ['Active', 'Consent', 'Subject ID']
-    df_final = df_final[main_cols + \
-        [x for x in df_final.columns if x not in main_cols]]
+    df = df[main_cols + \
+        [x for x in df.columns if x not in main_cols]]
 
-    general_path = Path(Lochness['phoenix_root']) / 'GENERAL'
-    metadata_study = general_path / study_name / f"{study_name}_metadata.csv"
-    df_final.to_csv(metadata_study, index=False)
+    # only overwrite when there is an update in the data
+    target_df = pd.read_csv(metadata_study)
+    same_df = df.reset_index(drop=True).equals(target_df)
+    if same_df:
+        pass
+    else:
+        df.to_csv(metadata_study, index=False)
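Comparing before writing means an unchanged metadata file keeps its old modification time, so downstream steps see no spurious update. One subtlety: `pd.read_csv(metadata_study)` presumes the CSV already exists. A guarded variant of the same idea (illustrative only, not part of this commit):

    import pandas as pd
    from pathlib import Path

    def write_if_changed(df: pd.DataFrame, metadata_study: Path) -> None:
        '''Overwrite the metadata CSV only when its content has changed.'''
        if metadata_study.is_file():
            target_df = pd.read_csv(metadata_study)
            if df.reset_index(drop=True).equals(target_df):
                return  # identical content: leave the file untouched
        df.to_csv(metadata_study, index=False)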


def get_run_sheets_for_datatypes(api_url, api_key,
4 changes: 2 additions & 2 deletions lochness/transfer/__init__.py
@@ -375,8 +375,8 @@ def create_s3_transfer_table(Lochness, rewrite=False) -> None:
                 continue
             try:
                 source = re.search(r'upload: (\S+)', line).group(1)
-            # do not save metadata.csv update since it
-            # gets updated every pull
+                # do not save metadata.csv update since it
+                # gets updated every pull
                 if 'metadata.csv' in source:
                     continue
                 target = re.search(r'upload: (\S+) to (\S+)',
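The surrounding code parses `aws s3 sync`-style log lines with the two `re.search` patterns visible above. A quick illustration with a made-up log line:

    import re

    line = ('upload: PHOENIX/GENERAL/PronetLA/PronetLA_metadata.csv '
            'to s3://example-bucket/GENERAL/PronetLA/PronetLA_metadata.csv')

    match = re.search(r'upload: (\S+) to (\S+)', line)
    source, target = match.group(1), match.group(2)
    print(source)  # PHOENIX/GENERAL/PronetLA/PronetLA_metadata.csv
    print(target)  # s3://example-bucket/GENERAL/PronetLA/PronetLA_metadata.csv

The metadata.csv skip pairs with the redcap change above: since the metadata file can be rewritten on every pull, its uploads are deliberately left out of the transfer table.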