
Commit 2015787

Merge branch 'master' into devon_dev

devonleadman authored Jun 18, 2024
2 parents f8b0a01 + 3553adf
Showing 8 changed files with 180 additions and 9 deletions.
2 changes: 1 addition & 1 deletion RDAS_CTKG/methods.py
@@ -1079,6 +1079,7 @@ def condition_map(db, update_metamap=True):
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
@@ -1088,7 +1089,6 @@ def condition_map(db, update_metamap=True):
db.run('MATCH (x:Condition) SET x.METAMAP_PREFERRED_TERM = NULL SET x.METAMAP_OUTPUT = NULL SET x.FUZZY_SCORE = NULL SET x.METAMAP_SCORE = NULL')



def drug_normalize(drug):
"""
Normalize a drug name by removing non-ASCII characters and replacing non-word characters with spaces.
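The two db.run() calls above splice values into the Cypher text with str.format(), which forces the escaped quotes around {umls} and {pref}. A minimal sketch of the same MERGE using driver-level parameters instead; this assumes the official neo4j Python driver, since the signature of the project's AlertCypher.run() wrapper is not shown in this diff:

    # Hypothetical rewrite with bound parameters; illustrative only, since
    # AlertCypher's API is not part of this commit.
    from neo4j import GraphDatabase

    driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'password'))

    def annotate_condition(tx, gard_id, cond_id, umls, pref, sems, fuzz, meta):
        # $-parameters are bound by the driver, so no manual quote-escaping
        # is needed and odd characters in names cannot break the query.
        tx.run('MATCH (z:GARD) WHERE z.GardId = $gard_id '
               'MATCH (y:Condition) WHERE ID(y) = $cond_id '
               'MERGE (x:ConditionAnnotation {UMLS_CUI: $umls, '
               'UMLSPreferredName: $pref, SEMANTIC_TYPE: $sems, '
               'MATCH_TYPE: "METAMAP"}) '
               'MERGE (x)<-[:has_annotation {FUZZY_SCORE: $fuzz, '
               'METAMAP_SCORE: $meta}]-(y) '
               'MERGE (z)<-[:mapped_to_gard]-(x)',
               gard_id=gard_id, cond_id=cond_id, umls=umls, pref=pref,
               sems=sems, fuzz=fuzz, meta=meta)

Parameter maps also let Neo4j cache one query plan instead of re-parsing every formatted variant.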
91 changes: 88 additions & 3 deletions RDAS_GFKG/methods.py
@@ -27,6 +27,12 @@
from transformers import AutoTokenizer, AutoModel
import torch
import glob
<<<<<<<< HEAD:RDAS.GFKG/methods.py

def start(db, restart_raw=False, restart_processed=False):
update_grant.main(db, restart_raw=restart_raw, restart_processed=restart_processed)

========
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
@@ -46,6 +52,7 @@ def start(db, restart_raw=False, restart_processed=False):
update_grant.main(db, restart_raw=restart_raw, restart_processed=restart_processed)
db.setConf('UPDATE_PROGRESS','grant_in_progress','False')

>>>>>>>> devon_dev:RDAS_GFKG/methods.py
def download_nih_data(restart_raw=False):
current_year = int(datetime.today().year)

@@ -84,7 +91,11 @@ def download_nih_data(restart_raw=False):

if len(os.listdir(f'{sysvars.gnt_files_path}raw/{file_dir}/')) == 1:
for i in range(1985,current_year+1):
<<<<<<<< HEAD:RDAS.GFKG/methods.py
command = f'curl -L -X GET https://reporter.nih.gov/exporter/{type}/download/{i} -o {sysvars.base_path}grant/src/raw/{file_dir}/{type}{i}.zip'
========
command = f'curl -L -X GET https://reporter.nih.gov/exporter/{type}/download/{i} -o {sysvars.gnt_files_path}raw/{file_dir}/{type}{i}.zip'
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
os.system(command)
command = f'unzip {sysvars.gnt_files_path}raw/{file_dir}/{type}{i}.zip -d {sysvars.gnt_files_path}raw/{file_dir}'
os.system(command)
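Both sides of the conflict above run the same curl-and-unzip loop; only the output directory differs (base_path grant/src/raw vs. gnt_files_path raw). A sketch of the equivalent download-and-extract step in pure Python, assuming the ExPORTER URL pattern shown in the diff, with error handling kept minimal:

    # Sketch: requests + zipfile in place of the os.system curl/unzip calls.
    import io
    import zipfile
    from datetime import datetime

    import requests

    def fetch_exporter_files(file_type, dest_dir, start_year=1985):
        for year in range(start_year, datetime.today().year + 1):
            url = f'https://reporter.nih.gov/exporter/{file_type}/download/{year}'
            resp = requests.get(url, timeout=120)
            resp.raise_for_status()
            # Extract in memory rather than writing the intermediate .zip.
            with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
                zf.extractall(dest_dir)

This also surfaces HTTP failures as exceptions, where os.system only returns an easily ignored exit code.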
@@ -251,6 +262,33 @@ def get_def(a):

source_dict = {}
def GardNamePreprocessor(Gard):
<<<<<<<< HEAD:RDAS.GFKG/methods.py
Gard['GardName'] = Gard['GardName'].apply(lambda x: str(x).replace('"', '').lower())
Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x).lower()))
Gard = remove_similar_strings(Gard)
Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x)))
Gard['Synonyms'] =Gard['GardName'].apply(lambda x: [x])+Gard['Synonyms']
#Gard['Synonyms_bow']=Gard['Synonyms'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw'] = Gard['Synonyms'].apply(lambda x: process_row_list(x)) #.apply(lambda x: process_row_list(x))
Gard['Synonyms_sw_bow']=Gard['Synonyms_sw'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw_bow']=Gard['Synonyms_sw_bow'].apply(lambda x: list(set(len_chcek(x))) )
#Gard['Synonyms_sw_nltk'] = Gard['Synonyms_sw'].apply(lambda x: process_row_list_2(x))
#Gard['Synonyms_sw_nltk']=Gard['Synonyms_sw_nltk']+Gard['Synonyms_sw']
#Gard['Synonyms_sw_nltk'] = Gard['Synonyms_sw_nltk'].apply(lambda x: list(set(x)))
#Gard['Synonyms_stem'] = Gard['Synonyms'].apply(lambda x: stem_text_list(x))
#Gard['Synonyms_stem_bow']=Gard['Synonyms_stem'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw_stem'] = Gard['Synonyms_sw'].apply(lambda x: stem_text_list(x))
Gard['Synonyms_sw_stem_bow']=Gard['Synonyms_sw_stem'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw_stem'] = Gard['Synonyms_sw_stem'].apply(lambda x:list(set(len_chcek(x))) )
Gard['Synonyms_sw_stem_bow']=Gard['Synonyms_sw_stem_bow'].apply(lambda x: list(set(len_chcek(x))) )
Gard['Synonyms_sw'] = Gard['Synonyms_sw_stem'].apply(lambda x: list(set(len_chcek(x))) )

Excluding_list = ['GARD:{:07d}'.format(int(gard_id.split(':')[1])) for gard_id in sysvars.gard_preprocessor_exclude]
Gard['GardId'] = Gard['GardId'].str.strip('"')
Gard = Gard[~Gard['GardId'].isin(Excluding_list)]

return Gard
========
print(Gard)
Gard['GardName'] = Gard['GardName'].apply(lambda x: str(x).replace('"', '').lower())
Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x).lower()))
@@ -280,51 +318,81 @@ def GardNamePreprocessor(Gard):
Gard['GardNamedef']=Gard.apply(lambda x: get_def(x['GardName']), axis=1)

return Gard
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
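Stripped of the conflict markers, the two GardNamePreprocessor variants build the same synonym features (stop-word-stripped, stemmed, and bag-of-words forms via process_row_list, stem_text_list, and generate_term_orders_list_of_sords); the HEAD side additionally filters out excluded GARD IDs. A condensed sketch of that exclusion step, assuming pandas and a sysvars.gard_preprocessor_exclude list holding IDs like 'GARD:123':

    # Sketch of the GARD-ID exclusion filter from the HEAD side above.
    import pandas as pd

    def exclude_gard_ids(gard: pd.DataFrame, exclude_ids) -> pd.DataFrame:
        # Normalize to the zero-padded 'GARD:0000123' form stored in GardId.
        excluding = ['GARD:{:07d}'.format(int(g.split(':')[1])) for g in exclude_ids]
        out = gard.copy()
        out['GardId'] = out['GardId'].str.strip('"')
        return out[~out['GardId'].isin(excluding)]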

def download_gard_data_from_db ():
db = AlertCypher(sysvars.gard_db)
in_progress = db.getConf('UPDATE_PROGRESS', 'grant_in_progress')

<<<<<<<< HEAD:RDAS.GFKG/methods.py
if not in_progress == 'True':
return None

if not os.path.exists(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv'):
response = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms').data()

myFile = open(f'{sysvars.base_path}grant/src/raw/all_gards.csv', 'w')
========
#if not in_progress == 'True':
#return None

if not os.path.exists(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv'):
response = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms').data()

myFile = open(f'{sysvars.gnt_files_path}raw/all_gards.csv', 'w')
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
writer = csv.writer(myFile)
writer.writerow(['GardId', 'GardName', 'Synonyms'])
for dictionary in response:
writer.writerow(dictionary.values())
myFile.close()
<<<<<<<< HEAD:RDAS.GFKG/methods.py
df = pd.read_csv(f'{sysvars.base_path}grant/src/raw/all_gards.csv')

df = GardNamePreprocessor(df)
df.to_csv(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv')

else:
df = pd.read_csv(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv')
========
df = pd.read_csv(f'{sysvars.gnt_files_path}raw/all_gards.csv')

df = GardNamePreprocessor(df)
df.to_csv(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv')

else:
df = pd.read_csv(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv')
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
df['Synonyms_sw'] = df['Synonyms_sw'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
df['Synonyms_sw_bow'] = df['Synonyms_sw_bow'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
df['Synonyms_sw_stem'] = df['Synonyms_sw_stem'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
df['Synonyms_sw_stem_bow'] = df['Synonyms_sw_stem_bow'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
<<<<<<<< HEAD:RDAS.GFKG/methods.py
========

help=pd.read_csv(f'{sysvars.gnt_files_path}J_GARD_master.csv')
for index, row in help.iterrows():
source_name = row['SourceName']
source_description = row['SourceDescription']
if type(source_name) ==str:
source_dict[source_name.lower()] = source_description
>>>>>>>> devon_dev:RDAS_GFKG/methods.py

return df
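Both branches of download_gard_data_from_db follow the same cache-or-build pattern: query the GARD graph once, preprocess, write all_gards_processed.csv, and read from that file on every later call. They differ only in base path and in whether the grant_in_progress guard is active. A minimal sketch of the pattern with the paths factored out; the Cypher query is taken verbatim from the diff:

    # Sketch: build the processed GARD table once, then serve it from cache.
    import os
    import pandas as pd

    def load_gard_terms(db, raw_csv, processed_csv):
        if not os.path.exists(processed_csv):
            rows = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, '
                          'x.GardName as GardName, x.Synonyms as Synonyms').data()
            pd.DataFrame(rows).to_csv(raw_csv, index=False)
            df = GardNamePreprocessor(pd.read_csv(raw_csv))
            df.to_csv(processed_csv)
        else:
            df = pd.read_csv(processed_csv)
        return df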

# Global Objects for Processing

Gard = download_gard_data_from_db()
<<<<<<<< HEAD:RDAS.GFKG/methods.py

'''
if not os.path.exists(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv'):
========
print(Gard)
'''
if not os.path.exists(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv'):
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
pass
Gard = download_gard_data_from_db()
else:
@@ -589,7 +657,11 @@ def combine_dictionaries_sent(dict1, dict2):
combined_dict[key] = value
return combined_dict
<<<<<<<< HEAD:RDAS.GFKG/methods.py
def modified_dict(combined_dict,combined_dict_sen):
========
def modified_dict(combined_dict):#,combined_dict_sen):
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
keys_to_remove = set()
for key1 in combined_dict:
for key2 in combined_dict:
@@ -729,6 +801,14 @@ def normalize_combined_dictionary(input_text,title_,dict1, dict2, dict3, dict4,m
normalized_dict = {key: value for key, value in combined_dict.items()}
result_dict = {}
for key, value in normalized_dict.items():
<<<<<<<< HEAD:RDAS.GFKG/methods.py
#if is_about_term(input_text.lower(), key) >=0.7:
result_dict[key] = [value, is_about_term(input_text.lower(), key)]
return result_dict
def gard_id(title_, Public_health_relevance_statement, abstract_, nlp):
========
#if is_about_term(input_text.lower(), key) >=0.5:
#sen_has_gard=get_sen(input_text.lower(), key,title_)
defin=get_def(key)
@@ -753,6 +833,7 @@ def normalize_combined_dictionary(input_text,title_,dict1, dict2, dict3, dict4,m
def grad_id(title_, Public_health_relevance_statement, abstract_):
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
if not isinstance(title_, str) and not isinstance(Public_health_relevance_statement, str) and not isinstance(abstract_, str):
return '' # Return default values when no string input is provided
if title_ and isinstance(title_, str):
@@ -763,7 +844,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
else: return normalize_combined_dictionary(title_,title_,name,{},{},{},1,1,'title')
if Public_health_relevance_statement and isinstance(Public_health_relevance_statement, str):
A, B, C,D = check_sen(Public_health_relevance_statement)
A, B, C,D = check_sen(Public_health_relevance_statement, nlp)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
Expand All @@ -772,17 +853,21 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
if name and (name !={}): return name
if abstract_ and isinstance(abstract_, str):
A, B, C , D = check_sen(abstract_)
A, B, C , D = check_sen(abstract_, nlp)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
name4 = get_gard_abstract_stem_exact(D)
name=normalize_combined_dictionary(abstract_,title_,name1,name2,name3,name4,0,0.7,'abstract')
if name and (name !={}): return name
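Both gard_id (HEAD, which threads an nlp argument through check_sen) and grad_id (devon_dev) walk the same cascade: try the project title, then the public-health relevance statement, then the abstract, returning the first non-empty match dictionary. A condensed sketch of the PHR and abstract legs of that cascade (the title leg uses different weights, as the calls above show); check_sen, get_gard_abstract_stem_exact, and normalize_combined_dictionary are the project helpers named in this diff, whose internals are not shown:

    # Sketch of the PHR -> abstract matching cascade, not a line-for-line port.
    def match_gard_terms(title, phr, abstract, nlp=None):
        for text, field in ((phr, 'phr'), (abstract, 'abstract')):
            if not (text and isinstance(text, str)):
                continue
            # check_sen splits the text into four candidate sentence groups.
            a, b, c, d = check_sen(text, nlp) if nlp else check_sen(text)
            names = [get_gard_abstract_stem_exact(part) for part in (a, b, c, d)]
            found = normalize_combined_dictionary(text, title, *names, 0, 0.7, field)
            if found:
                return found
        return ''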
def GardNameExtractor(project_title,phr_text,abstract_text):
def GardNameExtractor(project_title,phr_text,abstract_text, nlp):
#Abstract1['Gard_name']=Abstract1.apply(lambda x: gard_id(x['project_title'],x['phr_text'],x['abstract_text']), axis=1)
<<<<<<<< HEAD:RDAS.GFKG/methods.py
gard_ids = gard_id(project_title,phr_text,abstract_text, nlp)
========
gard_ids = grad_id(project_title,phr_text,abstract_text)
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
if gard_ids:
return update_dictionary(gard_ids)
else:
95 changes: 90 additions & 5 deletions RDAS_GFKG/prep_neo4j_data.py
@@ -109,6 +109,53 @@ def combine_normmap_results():


lock = threading.Lock()
<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
def batch_normmap(df, thr, year, nlp):
r,c = df.shape
for idx in range(r):
try:
with lock:
print(f'{idx}/{r} [{thr}]')

row = df.iloc[idx]
appl_id = row['APPLICATION_ID']
abstract = row['ABSTRACT_TEXT']
phr = row['PHR']
title = row['PROJECT_TITLE']

gard_ids = rdas.GardNameExtractor(title, phr, abstract, nlp)
if gard_ids:
for gard,add_data in gard_ids.items():
if add_data == 1:
add_data = [1,1]

with lock:
print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "a") as f:
f.writelines([f'{appl_id},{gard},{add_data[0]},{add_data[1]}\n'])

except Exception as e:
print(e)
continue

"""
def normmap_process (df):
r,c = df.shape
for idx in range(r):
try:
print(f'{idx}/{r}')
row = df.iloc[idx]
appl_id = row['APPLICATION_ID']
abstract = row['ABSTRACT_TEXT']
phr = row['PHR']
title = row['PROJECT_TITLE']
#project_data = rdas.get_project_data(appl_id).get('results')[0]
#title = project_data.get('project_title')
#phr = project_data.get('phr_text')
========
def batch_normmap(df, thr, year):
r,c = df.shape
for idx in range(r):
@@ -121,13 +168,27 @@ def batch_normmap(df, thr, year):
abstract = row['ABSTRACT_TEXT']
phr = row['PHR']
title = row['PROJECT_TITLE']
>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
gard_ids = rdas.GardNameExtractor(title, phr, abstract)
if gard_ids:
for gard,add_data in gard_ids.items():
if add_data == 1:
add_data = [1,1]
gard_ids = rdas.GardNameExtractor(title, phr, abstract)
if gard_ids:
for gard,add_data in gard_ids.items():
if add_data == 1:
add_data = [1,1]
with lock:
print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
with open(data_raw('normmap_results.csv'), "a") as f:
f.writelines([f'{appl_id},{gard},{add_data[0]},{add_data[1]}\n'])
except Exception as e:
print(e)
continue
"""


<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
========
with lock:
print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "a") as f:
@@ -172,11 +233,16 @@ def normmap_process (df):
"""


>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
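Across both versions of batch_normmap, worker threads append matched rows to a shared CSV while holding a single threading.Lock, which keeps the progress prints and the file writes from interleaving. A stripped-down sketch of that pattern, with the column order taken from the writelines call above:

    # Sketch: thread-safe append of one match row, as in batch_normmap.
    import threading

    lock = threading.Lock()

    def record_match(path, appl_id, gard, conf_score, sem_sim):
        # One lock serializes both the diagnostic print and the file append.
        with lock:
            print({'ID': appl_id, 'GARD_id': gard,
                   'CONF_SCORE': conf_score, 'SEM_SIM': sem_sim})
            with open(path, 'a') as f:
                f.write(f'{appl_id},{gard},{conf_score},{sem_sim}\n')

Reopening in append mode inside the lock, rather than keeping one handle per thread, trades a little speed for a file that is flushed row by row.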

def run_normmap():
print('Running NormMap')

<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
nlp = spacy.load("en_core_web_sm")
========
#nlp = spacy.load("en_core_web_sm")
>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py

abs_files = glob.glob(data_raw('abstracts') + '/*.csv')
abs_files = sorted(abs_files)
@@ -185,6 +251,20 @@ def run_normmap():
prj_files = sorted(prj_files)

for idx, abs_file in enumerate(abs_files):
<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
prj_file = prj_files[idx]

print(abs_file, ' -merged- ',prj_file)

tmp = pd.read_csv(('{filename}'.format(filename=abs_file)),index_col=False, encoding = "ISO-8859-1")
tmp2 = pd.read_csv(('{filename}'.format(filename=prj_file)),index_col=False, usecols=['APPLICATION_ID','PHR', 'PROJECT_TITLE'], encoding = "ISO-8859-1", low_memory=False)

merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)

year = re.findall(r'\d+', abs_file)[0]

========
year = re.findall(r'\d+', abs_file)[0]

if os.path.exists(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv')):
Expand All @@ -201,6 +281,7 @@ def run_normmap():
merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)

>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
merged_df.to_csv(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv'), index=False)
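On both sides of this conflict the per-year abstracts file is joined to its projects file on APPLICATION_ID before matching; the devon_dev side additionally skips years whose RePORTER_NORMMAP_{year}.csv already exists. A sketch of the join step, with column names and encoding taken from the diff:

    # Sketch of the per-year abstract/project merge in run_normmap.
    import pandas as pd

    def merge_year(abs_file, prj_file):
        abstracts = pd.read_csv(abs_file, index_col=False, encoding='ISO-8859-1')
        projects = pd.read_csv(prj_file, index_col=False,
                               usecols=['APPLICATION_ID', 'PHR', 'PROJECT_TITLE'],
                               encoding='ISO-8859-1', low_memory=False)
        merged = pd.merge(abstracts, projects, on=['APPLICATION_ID'])
        merged['APPLICATION_ID'] = merged['APPLICATION_ID'].astype(int)
        return merged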


@@ -226,7 +307,11 @@ def run_normmap():

# Create threads to process results
for thrnum, lst in enumerate(list_df):
<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
thread = threading.Thread(target=batch_normmap, args=(lst, thrnum, year, nlp), daemon=True)
========
thread = threading.Thread(target=batch_normmap, args=(lst, thrnum, year), daemon=True)
>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
thread_list.append(thread)

for thr in thread_list:
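The loop above hands one chunk of the merged DataFrame to each batch_normmap thread (with nlp passed through on the HEAD side only). A sketch of the split-and-spawn step; np.array_split is an assumption here, since the chunking code sits outside the visible diff:

    # Sketch: fan the merged frame out to daemon worker threads.
    import threading
    import numpy as np

    def spawn_workers(merged_df, year, nlp, n_threads=5):
        threads = []
        for thrnum, chunk in enumerate(np.array_split(merged_df, n_threads)):
            t = threading.Thread(target=batch_normmap,
                                 args=(chunk, thrnum, year, nlp), daemon=True)
            threads.append(t)
            t.start()
        for t in threads:
            t.join()

Joining every thread before returning matters because the workers are daemonic and would be killed mid-write if the main thread exited first.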
1 change: 1 addition & 0 deletions RDAS_PAKG/update.py
@@ -24,5 +24,6 @@ def main (update_from=False, update_to=False):

RDAS_PAKG.init.main(update_from=update_from, update_to=update_to)


main() #TEST
#get_node_counts()
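This hunk only adds a blank line, but it surfaces that update.py calls main() at import time, flagged #TEST. If the test call is meant to stay, a conventional guard (a suggestion, not part of this commit) scopes it to direct execution:

    # Suggested guard: run the test call only when the module is executed
    # directly, not when RDAS_PAKG.update is imported by the pipeline.
    if __name__ == '__main__':
        main()  # TEST
        # get_node_counts()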
Empty file modified backup/RDAS_CTKG/README.md (mode 100755 → 100644)
Empty file modified backup/RDAS_GARD/README.md (mode 100755 → 100644)
Empty file modified backup/RDAS_GFKG/README.md (mode 100755 → 100644)
Empty file modified backup/RDAS_PAKG/README.md (mode 100755 → 100644)
