Commit bb7b8ae: gfkg and pakg bug fixes

Devon Joseph Leadman committed Jul 22, 2024
1 parent 3512473 commit bb7b8ae

Showing 9 changed files with 182 additions and 88 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -113,3 +113,8 @@ RDAS_CTKG/eligibility_extraction/
RDAS_CTKG/metamap_cond_out.json
RDAS_CTKG/metamap_cond.txt
RDAS_GFKG/convert_csv_fields.py
fix_normmap_file_sep.py
project_check.py
project_check_missing.csv
project_check_new.csv
project_check_old.csv
4 changes: 2 additions & 2 deletions RDAS_GARD/methods.py
Expand Up @@ -118,7 +118,7 @@ def create_disease_node(db, data, xrefs): # Include xrefs into GARD node instead
"syns":data[6],
"orpha":results['Orphanet'] if 'Orphanet' in results else None,
"icd10":results['ICD-10'] if 'ICD-10' in results else None,
"umls":results['UMLS'] if 'UMLS' in results else None,
"umls":list(set(results['UMLS'])) if 'UMLS' in results else None,
"omim":results['OMIM'] if 'OMIM' in results else None,
"snomed":results['SNOMED-CT'] if 'SNOMED-CT' in results else None,
"diseaseontology":results['DiseaseOntology'] if 'DiseaseOntology' in results else None,
@@ -315,7 +315,7 @@ def get_remaining_umls(db, umls_update=True):
INSTANCE.form['SingLinePMID'] = True

print('GATHERING GARD UMLS DATA')
db.run('MATCH (x:GARD) WHERE x.UMLS IS NOT NULL SET x.UMLS_Source = "DATALAKE"')
db.run('MATCH (x:GARD) WHERE x.UMLS IS NOT NULL SET x.UMLS_Source = "GARD"')
res = db.run('MATCH (x:GARD) WHERE x.UMLS IS NULL SET x.UMLS_Source = "METAMAP" RETURN x.GardId AS gard_id, x.GardName as gard_name').data()

gard_strs = [f"{i['gard_id'].replace('GARD:','')}|{normalize(i['gard_name'])}\n" for i in res if i['gard_name']]
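The provenance label for GARD nodes that already carry UMLS codes changes from "DATALAKE" to "GARD"; nodes without codes are still tagged "METAMAP" and written out in the pipe-separated single-line format MetaMap expects. A sketch of that format, with a made-up record and a stand-in for the pipeline's normalize helper:

```python
# Stand-in for the normalize() helper used in the real pipeline.
def normalize(name: str) -> str:
    return name.lower().strip()

# Hypothetical row shaped like the Cypher RETURN above.
res = [{'gard_id': 'GARD:0000001', 'gard_name': 'Example Disease'}]

# One "id|name" line per disease, matching the gard_strs expression.
gard_strs = [
    f"{i['gard_id'].replace('GARD:', '')}|{normalize(i['gard_name'])}\n"
    for i in res if i['gard_name']
]
print(gard_strs)  # ['0000001|example disease\n']
```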
6 changes: 3 additions & 3 deletions RDAS_GFKG/methods.py
@@ -764,7 +764,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
else: return normalize_combined_dictionary(title_,title_,name,{},{},{},1,1,'title')

if Public_health_relevance_statement and isinstance(Public_health_relevance_statement, str):
A, B, C,D = check_sen(Public_health_relevance_statement, nlp)
A, B, C,D = check_sen(Public_health_relevance_statement)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
@@ -773,15 +773,15 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
if name and (name !={}): return name

if abstract_ and isinstance(abstract_, str):
A, B, C , D = check_sen(abstract_, nlp)
A, B, C , D = check_sen(abstract_)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
name4 = get_gard_abstract_stem_exact(D)
name=normalize_combined_dictionary(abstract_,title_,name1,name2,name3,name4,0,0.7,'abstract')
if name and (name !={}): return name

def GardNameExtractor(project_title,phr_text,abstract_text, nlp):
def GardNameExtractor(project_title,phr_text,abstract_text):
#Abstract1['Gard_name']=Abstract1.apply(lambda x: gard_id(x['project_title'],x['phr_text'],x['abstract_text']), axis=1)
gard_ids = grad_id(project_title,phr_text,abstract_text)
if gard_ids:
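All three edits in this file drop the nlp parameter, so check_sen and GardNameExtractor no longer need a spaCy model threaded through by every caller. A minimal reproduction of the failure mode the old call style would now hit, using a stub check_sen with the new one-argument signature:

```python
# Stub with the new one-argument signature; the real check_sen splits
# the text four ways for stem matching.
def check_sen(text):
    parts = (text.split('.') + ['', '', '', ''])[:4]
    return parts[0], parts[1], parts[2], parts[3]

try:
    check_sen('some abstract text', object())  # old call style, with nlp
except TypeError as err:
    print(err)  # check_sen() takes 1 positional argument but 2 were given

A, B, C, D = check_sen('some abstract text')   # fixed call style
```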
33 changes: 22 additions & 11 deletions RDAS_GFKG/prep_neo4j_data.py
@@ -152,25 +152,32 @@ def run_normmap():
print(abs_file, ' -merged- ',prj_file)
tmp = pd.read_csv(('{filename}'.format(filename=abs_file)),index_col=False, encoding = "ISO-8859-1")
tmp2 = pd.read_csv(('{filename}'.format(filename=prj_file)),index_col=False, usecols=['APPLICATION_ID','PHR', 'PROJECT_TITLE'], encoding = "ISO-8859-1", low_memory=False)
merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'], how='outer', indicator='EXISTS_IN_ABSTRACT_FILE')
#merged_df.fillna('', inplace=True)
merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)
merged_df.to_csv(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv'), index=False)

norm_files = glob.glob(data_raw('normmap') + '/*.csv')
norm_files = sorted(norm_files)
for norm_file in norm_files:
year = re.findall(r'\d+', norm_file)[0]
if os.path.exists(data_neo4j(f'normmap/normmap_results_{year}.csv')):

if os.path.exists(data_neo4j(f'normmap/normmap_results_{year}.csv')): #COMMENTED OUT FOR TESTING
print(f'{year} Gard-Project mapping file already exists... bypassing')
continue

# Create CSV files headers
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "w") as f:
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "w") as f: #COMMENTED OUT FOR TESTING
f.writelines(['ID|GARD_id|CONF_SCORE|SEM_SIM\n'])

df = pd.read_csv(norm_file, index_col=False, low_memory=False)
chunk_size = int(len(df)/5)
thread_list = list()

#df = df[df['EXISTS_IN_ABSTRACT_FILE']=='right_only'] #TEST
#df = df[['APPLICATION_ID', 'ABSTRACT_TEXT', 'PHR', 'PROJECT_TITLE']] #TEST

chunk_size = int(len(df)/5)

list_df = [df[i:i+chunk_size] for i in range(0,len(df),chunk_size)]

# Create threads to process results
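The merge change is the core fix in this hunk: the old inner join silently dropped any application present in only one of the abstract and project files, while how='outer' keeps both sides and indicator='EXISTS_IN_ABSTRACT_FILE' adds a column marking each row as 'both', 'left_only', or 'right_only' (the commented TEST filter selects 'right_only', i.e. projects with no abstract row). The added astype(int) re-asserts the key's integer dtype after the merge. A toy sketch:

```python
import pandas as pd

tmp = pd.DataFrame({'APPLICATION_ID': [1, 2], 'ABSTRACT_TEXT': ['a', 'b']})
tmp2 = pd.DataFrame({'APPLICATION_ID': [2, 3],
                     'PHR': ['p2', 'p3'],
                     'PROJECT_TITLE': ['t2', 't3']})

merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'],
                     how='outer', indicator='EXISTS_IN_ABSTRACT_FILE')
print(merged_df['EXISTS_IN_ABSTRACT_FILE'].tolist())
# ['left_only', 'both', 'right_only']; an inner join would have kept
# only APPLICATION_ID 2.
```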
@@ -185,10 +192,13 @@ def run_normmap():
combine_normmap_results()
print('GARD to Project connections made')



def get_RD_project_ids():
# Get GARD to Project mappings
run_normmap()
aggregate_disease_data()

apps = pd.read_csv(data_neo4j("normmap_results.csv"), usecols=["ID"])

# Drop duplicate results and sort by Application ID
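get_RD_project_ids now also calls aggregate_disease_data() after the mapping pass before reading the results back. The per-year result files written above are pipe-delimited, so reading one back might look like this sketch (filename and separator assumed from the 'ID|GARD_id|CONF_SCORE|SEM_SIM' header written earlier):

```python
import pandas as pd

# Assumed per-year filename; the pipe-separated header written above
# implies sep='|'.
df = pd.read_csv('normmap_results_2024.csv', sep='|')

# Mirrors the comment above: drop duplicates, sort by Application ID.
apps = df[['ID']].drop_duplicates().sort_values('ID')
```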
@@ -275,7 +285,7 @@ def find_RD_apps(input_file, rd_ids):
print('Finished ', output_file)

def clean_pi (pi_info):
pi_info = pi_info[:len(pi_info)-1]
pi_info = pi_info.replace(";","")
return pi_info

def cleanup_project_IC_NAME_totalcost():
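The old clean_pi blindly removed the last character of the field, which only works when the string happens to end with a separator; the new version strips every ';' instead (note the related steps.py change below, which stops splitting the PI fields on ';'). A quick before/after with a made-up PI field:

```python
def clean_pi_old(pi_info):
    return pi_info[:len(pi_info) - 1]

def clean_pi_new(pi_info):
    return pi_info.replace(';', '')

print(clean_pi_old('SMITH, JANE;'))  # SMITH, JANE   (works by luck)
print(clean_pi_old('SMITH, JANE'))   # SMITH, JAN    (eats a real character)
print(clean_pi_new('SMITH, JANE;'))  # SMITH, JANE
print(clean_pi_new('SMITH, JANE'))   # SMITH, JANE
```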
@@ -608,13 +618,14 @@ def annotate_grant_abstracts():


# Annotate text with four scispaCy models
for model in MODELS:
for model in MODELS[2:]:
print(f'*** Annotate with {model} model ***')

nlp = load_model(model)
for file in input_files:
year = file[-8:-4]

if int(year) < 2006 and model == 'en_ner_bc5cdr_md':
continue
try:
text = pd.read_csv(file, encoding=ENCODING, dtype={'APPLICATION_ID':int, 'ABSTRACT_TEXT':str})

@@ -779,10 +790,8 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
merge_project_funding()
print("Running select_RD_projects")
select_RD_projects()
print("Running cleanup_project_IC_NAME_totalcost")
cleanup_project_IC_NAME_totalcost()
print("Running find_RD_core_projects")
find_RD_core_projects()
print("Running select_RD_patents")
@@ -797,7 +806,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
cleanup_pub_country()
print("Running select_RD_abstracts")
select_RD_abstracts()
"""
# The below stages are extremely slow, so we will only run them for
# years that have changed data.

@@ -807,6 +816,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
and v in [pygit2.GIT_STATUS_WT_MODIFIED, pygit2.GIT_STATUS_WT_NEW]}
'''

"""
print("Running annotation_preprocess_grant")
annotation_preprocess_grant()
@@ -818,6 +828,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
clean_annotation_source()
print("Running map_semantic_types")
map_semantic_types()
print("Running fix_escaped_endings")
fix_escaped_endings()
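The shuffled triple-quote markers in prep_data act as a block-comment toggle: a bare triple-quoted string is just an unused expression, so everything between the opening and closing quotes is parsed but never executed, and moving the opening marker changes which pipeline stages run. A reduced sketch of the pattern:

```python
def prep_data_sketch():
    print('Running select_RD_abstracts')   # executes

    """
    print('disabled stage')                # inside the string literal,
    print('another disabled stage')        # never executed
    """

    print('Running fix_escaped_endings')   # executes again

prep_data_sketch()
```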
9 changes: 3 additions & 6 deletions RDAS_GFKG/steps.py
@@ -310,13 +310,10 @@
""",
"query":
"""
WITH split(data.PI_IDS, ';') as ids,
split(data.PI_NAMEs, ';') as names, data
UNWIND [x in range(0, coalesce(size(ids) - 1, -1)) |
[trim(split(ids[x], '(')[0]), trim(split(names[x], '(')[0])]
] as pi_data
WITH [data.PI_IDS] as ids, [data.PI_NAMEs] as names, data
UNWIND [x in range(0, coalesce(size(ids) - 1, -1)) | [trim(split(ids[x], '(')[0]), trim(split(names[x], '(')[0])]] as pi_data
MERGE (p:PrincipalInvestigator {
pi_id: pi_data[0],
pi_id: coalesce(pi_data[0], ""),
pi_name: coalesce(pi_data[1], ""),
org_state: coalesce(data.ORG_STATE, ""),
org_name: coalesce(data.ORG_NAME, "")})
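The rewritten query stops splitting PI_IDS and PI_NAMEs on ';': wrapping each raw field in a one-element list yields a single PrincipalInvestigator per project row, cut at the first '(', and coalesce(pi_data[0], "") keeps MERGE from failing on a null id. A Python sketch of what the old and new list expressions do to a made-up row:

```python
data = {'PI_IDS': '1234567 (contact); 7654321;',
        'PI_NAMEs': 'DOE, JOHN (contact); SMITH, JANE;'}

def pi_pairs(ids, names):
    # Mirrors [trim(split(ids[x],'(')[0]), trim(split(names[x],'(')[0])]
    return [(ids[x].split('(')[0].strip(), names[x].split('(')[0].strip())
            for x in range(len(ids))]

# Old: split(field, ';') -> one pair per segment, plus an empty pair
# from the trailing separator.
print(pi_pairs(data['PI_IDS'].split(';'), data['PI_NAMEs'].split(';')))
# [('1234567', 'DOE, JOHN'), ('7654321', 'SMITH, JANE'), ('', '')]

# New: [field] -> exactly one pair per row.
print(pi_pairs([data['PI_IDS']], [data['PI_NAMEs']]))
# [('1234567', 'DOE, JOHN')]
```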
4 changes: 2 additions & 2 deletions RDAS_GFKG/update_grant.py
@@ -58,6 +58,6 @@ def main(db: AlertCypher, restart_raw=False, restart_processed=False):
fta = prep_data(f"{sysvars.gnt_files_path}raw", f"{sysvars.gnt_files_path}processed")

# run database upgrade steps on only new/modified files
for step in steps[10:]:
for step in steps[11:]:
print("\n\n" + step["description"] + "...")
step_to_fn(**step)(db, fta)
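steps is the ordered list of database-update stages defined in RDAS_GFKG/steps.py, and the slice start sets where the run resumes; the bump from 10 to 11, presumably needed because a stage earlier in the list was added or reordered, keeps the resume point aligned. A toy illustration with hypothetical stage descriptions:

```python
# Hypothetical stage list; the real steps live in RDAS_GFKG/steps.py.
steps = [{'description': f'stage {i}'} for i in range(13)]

# steps[10:] would re-run one stage too many once a new stage shifts
# the list; steps[11:] resumes at the intended point.
for step in steps[11:]:
    print(step['description'])
```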