
Commit cef41ca

Merge pull request #5 from cgmeyer/develop
Develop
2 parents: 76c56d8 + 5d161cc

1 file changed: +38 −35 lines changed

migration/migration.py

Lines changed: 38 additions & 35 deletions
@@ -68,7 +68,7 @@ def write_tsv(self,df,project_id,node,name='temp'):
             df.to_csv(outname, sep='\t', index=False, encoding='utf-8')
             print("\tTotal of {} records written to node '{}' in file:\n\t\t{}.".format(len(df),node,outname))
         except Exception as e:
-            print("Error writing TSV file: {}".format(e))
+            print("\tError writing TSV file: {}".format(e))
         return df

     def make_temp_files(self,prefix,suffix,name='temp',overwrite=True,nodes=['all']):
@@ -121,7 +121,7 @@ def merge_nodes(self,project_id,in_nodes,out_node,name='temp'):
             in_nodes(list): List of node TSVs to merge into a single TSV.
             out_node(str): The name of the new merged TSV.
         """
-        print("Merging nodes {} to '{}'.".format(in_nodes,out_node))
+        print("\tMerging nodes {} to '{}'.".format(in_nodes,out_node))
         dfs = []
         for node in in_nodes:
             filename = "{}_{}_{}.tsv".format(name,project_id, node)
@@ -166,11 +166,11 @@ def merge_properties(self,project_id,node,properties,name='temp'):
                 df_merged = pd.concat([df_rest,df_old],ignore_index=True,sort=False)
                 df = df_merged.drop(columns=[old_prop])
                 dropped.append(old_prop)
-                print("Property '{}' merged into '{}' and dropped from '{}' TSV.".format(old_prop,prop,node))
+                print("\tProperty '{}' merged into '{}' and dropped from '{}' TSV.".format(old_prop,prop,node))
             else:
-                print("Property '{}' not found in '{}' TSV. Skipping...".format(old_prop,node))
+                print("\tProperty '{}' not found in '{}' TSV. Skipping...".format(old_prop,node))
         if len(dropped) > 0:
-            print("Properties {} merged into {}.".format(dropped,list(properties.keys())))
+            print("\tProperties {} merged into {}.".format(dropped,list(properties.keys())))
             df = self.write_tsv(df,project_id,node)
             return df
         else:
@@ -218,7 +218,7 @@ def create_missing_links(self,project_id,node,link,old_parent,properties,new_dd,
             create_missing_links(node='imaging_exam',link='visit',old_parent='cases',properties={'visit_label':'Imaging','visit_method':'In-person Visit'},new_dd=dd,old_dd=prod_dd,links=None)
             create_missing_links(node='diagnosis',link='visit',old_parent='cases',properties={'visit_label':'Unknown','visit_method':'Unknown'},new_dd=dd,old_dd=prod_dd)
         """
-        print("Creating missing '{}' records with links to '{}' for '{}'.".format(link,old_parent,node))
+        print("\tCreating missing '{}' records with links to '{}' for '{}'.".format(link,old_parent,node))

         df = self.read_tsv(project_id=project_id,node=node,name=name)
         # filename = "{}_{}_{}.tsv".format(name,project_id,node)
@@ -341,7 +341,7 @@ def batch_add_visits(self,project_id,new_dd,old_dd,links):
                 print("\tNo links to 'case' found in the '{}' TSV.".format(node))
         if len(dfs) > 0:
             df = pd.concat(dfs,ignore_index=True,sort=False)
-            print("Total of {} missing visit links created for this batch.".format(total))
+            print("\tTotal of {} missing visit links created for this batch.".format(total))
         return df

     def move_properties(self,project_id,from_node,to_node,properties,dd,parent_node=None,required_props=None,name='temp'):
@@ -358,7 +358,7 @@ def move_properties(self,project_id,from_node,to_node,properties,dd,parent_node=
             This moves the property 'military_status' from 'demographic' node to 'military_history' node, which should link to the same parent node 'case'.
             move_properties(from_node='demographic',to_node='military_history',properties=['military_status'],parent_node='case')
         """
-        print("Moving {} from '{}' to '{}'.".format(properties,from_node,to_node))
+        print("\tMoving {} from '{}' to '{}'.".format(properties,from_node,to_node))

         from_name = "{}_{}_{}.tsv".format(name,project_id,from_node) #from imaging_exam
         try:
@@ -423,10 +423,10 @@ def move_properties(self,project_id,from_node,to_node,properties,dd,parent_node=
                     if len(vals) == 1:
                         case_data.loc[case_data['submitter_id']==case_id,header] = vals
                     elif len(vals) > 1:
-                        print("{}: {}".format(header,vals))
+                        print("\t{}: {}".format(header,vals))
                         if header == 'age_at_enrollment': # special case hard-coded for BRAIN Commons migration
                             lowest_val = min(vals, key=float)
-                            print("Selecting lowest value '{}' from {}.".format(lowest_val,vals))
+                            print("\tSelecting lowest value '{}' from {}.".format(lowest_val,vals))
                             case_data.loc[case_data['submitter_id']==case_id,header] = lowest_val
                 count += 1
             all_to = pd.merge(df_to,case_data,on='submitter_id', how='left')
@@ -450,10 +450,10 @@ def move_properties(self,project_id,from_node,to_node,properties,dd,parent_node=
         for prop in to_required:
             if prop in list(required_props.keys()):
                 all_to[prop] = required_props[prop]
-                print("Missing required property '{}' added to new '{}' TSV with all {} values.".format(prop,to_node,required_props[prop]))
+                print("\tMissing required property '{}' added to new '{}' TSV with all {} values.".format(prop,to_node,required_props[prop]))
             else:
                 all_to[prop] = np.nan
-                print("Missing required property '{}' added to new '{}' TSV with all null values.".format(prop,to_node))
+                print("\tMissing required property '{}' added to new '{}' TSV with all null values.".format(prop,to_node))

         all_to.to_csv(to_name,sep='\t',index=False,encoding='utf-8')
         print("\tProperties moved to '{}' node from '{}'. Data saved in file:\n\t{}".format(to_node,from_node,to_name))
@@ -468,7 +468,7 @@ def add_property(self,project_id,node,properties):
             if prop not in list(df):
                 df[prop] = properties[prop]
             else:
-                print("Property '{}' already in the TSV for node '{}'.".format(prop,node))
+                print("\tProperty '{}' already in the TSV for node '{}'.".format(prop,node))

         df.to_csv(filename,sep='\t',index=False,encoding='utf-8')
         return df
@@ -667,7 +667,7 @@ def merge_links(self,project_id,node,link,links_to_merge,name='temp'):
             sid = "{}.submitter_id".format(sublink)
             df.loc[df[link_name].isnull(), link_name] = df[sid]
         df.to_csv(filename,sep='\t',index=False,encoding='utf-8')
-        print("Links merged to '{}' and data written to TSV file: \n\t{}".format(link,filename))
+        print("\tLinks merged to '{}' and data written to TSV file: \n\t\t{}".format(link,filename))
         return df

     def drop_ids(self,project_id,node,name='temp'):
@@ -702,7 +702,7 @@ def batch_drop_ids(self,project_id,suborder,name='temp'):
         for node_order in suborder:

             node = node_order[0]
-            print(node)
+            print("\t{}:".format(node))

             df = self.read_tsv(project_id=project_id,node=node,name=name)
             # filename = "{}_{}_{}.tsv".format(name,project_id,node)
@@ -733,7 +733,7 @@ def create_project(self,program,project):
         }}""".format(program,program)
         prog_json = json.loads(prog_txt)
         data = self.sub.create_program(json=prog_json)
-        print(data)
+        print("\t{}".format(data))
         proj_txt = """{{
         "type": "project",
         "code": "{}",
@@ -742,7 +742,7 @@ def create_project(self,program,project):
         }}""".format(project,project,project)
         proj_json = json.loads(proj_txt)
         data = self.sub.create_project(program=program,json=proj_json)
-        print(data)
+        print("\t{}".format(data))

     def remove_special_chars(self,project_id,node,name='temp'):
         """ Replace a special character in 'Parkinson's Disease'
@@ -758,10 +758,10 @@ def remove_special_chars(self,project_id,node,name='temp'):
             df_txt2 = re.sub(substring,"Parkinson's Disease",df_txt)
             df = pd.read_csv(StringIO(df_txt2),sep='\t',dtype=str) # this converts int to float (adds .0 to int)
             df.to_csv(filename,sep='\t',index=False, encoding='utf-8')
-            print("Special chars removed from: {}".format(filename))
+            print("\tSpecial chars removed from: {}".format(filename))

         else:
-            print("No special chars found in {}".format(filename))
+            print("\tNo special chars found in {}".format(filename))

         return df

@@ -773,7 +773,7 @@ def floats_to_integers(self,project_id,node,prop,name='temp'):

         df[prop] = df[prop].str.extract(r'^(\d+).0$', expand=True)
         df.to_csv(filename,sep='\t',index=False, encoding='utf-8')
-        print("Trailing '.0' decimals removed from: {}".format(filename))
+        print("\tTrailing '.0' decimals removed from: {}".format(filename))
         return df

     def get_submission_order(self,dd,project_id,name='temp',suffix='tsv',missing_nodes=['project','study','case','visit']):
@@ -798,7 +798,7 @@ def get_submission_order(self,dd,project_id,name='temp',suffix='tsv',missing_nod
             else:
                 print("\tThe node '{}' is not in the data dictionary! Skipping...".format(node))

-        print("\tFound the following nodes:\n\t{}".format(all_nodes))
+        print("\tFound the following nodes:\n\t\t{}".format(all_nodes))

         # Check for the common missing root nodes
         for missing_node in missing_nodes:
@@ -809,7 +809,7 @@ def get_submission_order(self,dd,project_id,name='temp',suffix='tsv',missing_nod
         while len(all_nodes) > 0:

             node = all_nodes.pop(0)
-            print("\tDetermining order for node '{}'.".format(node))
+            #print("\tDetermining order for node '{}'.".format(node)) # for trouble-shooting

             node_links = dd[node]['links']
             for link in node_links:
@@ -840,7 +840,7 @@ def get_submission_order(self,dd,project_id,name='temp',suffix='tsv',missing_nod
                     else: #skip it for now
                         all_nodes.append(node)
                 else:
-                    print("No link target_type found for node '{}'".format(node))
+                    print("\tNo link target_type found for node '{}'".format(node))
         #suborder = sorted(suborder.items(), key=operator.itemgetter(1))
         suborder = {key:val for key, val in suborder.items() if val > 0}
         print("\tSubmission Order: \n\t\t{}".format(suborder))
@@ -873,32 +873,34 @@ def submit_tsvs(self,project_id,suborder,check_done=False,name='temp'):
                     data = self.sub.submit_file(project_id=project_id,filename=filename,chunk_size=1000)
                     #print("data: {}".format(data)) #for trouble-shooting
                     logfile.write(filename + '\n' + json.dumps(data)+'\n\n') #put in log file
+
                     if len(data['invalid']) == 0 and len(data['succeeded']) > 0:
-                        cmd = ['mv',filename,'done']
+                        mv_done_cmd = ['mv',filename,'done']
                         try:
-                            output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('UTF-8')
-                            print("Submission successful. Moving file to done:\n\t{}\n\n".format(filename))
+                            output = subprocess.check_output(mv_done_cmd, stderr=subprocess.STDOUT).decode('UTF-8')
+                            print("Submission successful. Moving file to done:\n\t\t{}\n\n".format(filename))
                         except Exception as e:
                             output = e.output.decode('UTF-8')
                             print("ERROR:" + output)
                     else:
                         if len(data['invalid'])>0:
                             invalid_records = list(data['invalid'].keys())[0:10]
                             for i in invalid_records:
-                                print(data['invalid'][i])
-                            print("Need to fix errors in {}".format(filename))
-                            cmd = ['mv',filename,'failed']
+                                print("{}".format(data['invalid'][i]))
+                            print("Need to fix {} errors in '{}'".format(len(invalid_records),filename))
+
+                            mv_failed_cmd = ['mv',filename,'failed']
                             try:
-                                output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode('UTF-8')
-                                print("Submission successful. Moving file to done:\n\t{}\n\n".format(filename))
+                                output = subprocess.check_output(mv_failed_cmd, stderr=subprocess.STDOUT).decode('UTF-8')
+                                print("Submission failed. Moving file to failed:\n\t\t{}".format(filename))
                             except Exception as e:
                                 output = e.output.decode('UTF-8')
                                 print("ERROR:" + output)

                 except Exception as e:
-                    print(e)
+                    print("\t{}".format(e))
             else:
-                print("\nPreviously submitted file already exists in done directory:\n\t{}\n".format(done_file))
+                print("\tPreviously submitted file already exists in done directory:\n\t\t{}\n".format(done_file))

     def check_migration_counts(self, projects=None, overwrite=False):
         """ Gets counts and downloads TSVs for all nodes for every project.
@@ -925,11 +927,12 @@ def check_migration_counts(self, projects=None, overwrite=False):
                 query_txt = """{_%s_count (project_id:"%s")}""" % (node,project_id)
                 res = self.sub.query(query_txt)
                 count = res['data'][str('_'+node+'_count')]
-                print(str(count) + ' records found in node ' + node + ' in project ' + project_id)
+                print("\t{} records found in node '{}' in project '{}'.".format(str(count),node,project_id))
+
                 if count > 0:
                     filename = str(mydir+'/'+project_id+'_'+node+'.tsv')
                     if (os.path.isfile(filename)) and (overwrite is False):
-                        print('Previously downloaded '+ filename )
+                        print('\tPreviously downloaded '+ filename )
                     else:
                         prog,proj = project_id.split('-',1)
                         self.sub.export_node(prog,proj,node,'tsv',filename)
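
Beyond the tab-indented print messages, the main behavioral fix in this commit is in submit_tsvs: the single reused cmd list is split into mv_done_cmd and mv_failed_cmd, and the failure branch now reports "Submission failed. Moving file to failed" instead of repeating the success message. The following is a minimal standalone sketch (not part of the commit) of that done/failed routing; the route_submission() helper is hypothetical, and only the 'succeeded'/'invalid' result keys and the mv_done_cmd/mv_failed_cmd names come from the diff.

# Hypothetical helper illustrating the routing that the submit_tsvs hunk refactors.
import subprocess

def route_submission(filename, result):
    """Move a submitted TSV to 'done' or 'failed' based on the submission result dict."""
    if len(result.get('invalid', {})) == 0 and len(result.get('succeeded', [])) > 0:
        mv_done_cmd = ['mv', filename, 'done']       # success path gets its own command name
        subprocess.check_output(mv_done_cmd, stderr=subprocess.STDOUT)
        print("Submission successful. Moving file to done:\n\t\t{}".format(filename))
    else:
        for i in list(result.get('invalid', {}).keys())[0:10]:
            print("{}".format(result['invalid'][i]))  # show up to 10 invalid records
        mv_failed_cmd = ['mv', filename, 'failed']   # failure path no longer reuses 'cmd'
        subprocess.check_output(mv_failed_cmd, stderr=subprocess.STDOUT)
        print("Submission failed. Moving file to failed:\n\t\t{}".format(filename))

Keeping the two command lists under separate names makes it harder to repeat the copy-paste mistake the diff fixes, where the failure branch still printed the "done" message.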
