
Commit 2015787

Merge branch 'master' into devon_dev

devonleadman authored Jun 18, 2024
2 parents f8b0a01 + 3553adf
Showing 8 changed files with 180 additions and 9 deletions.
2 changes: 1 addition & 1 deletion RDAS_CTKG/methods.py
@@ -1079,6 +1079,7 @@ def condition_map(db, update_metamap=True):
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
@@ -1088,7 +1089,6 @@ def condition_map(db, update_metamap=True):
db.run('MATCH (x:Condition) SET x.METAMAP_PREFERRED_TERM = NULL SET x.METAMAP_OUTPUT = NULL SET x.FUZZY_SCORE = NULL SET x.METAMAP_SCORE = NULL')



def drug_normalize(drug):
"""
Normalize a drug name by removing non-ASCII characters and replacing non-word characters with spaces.
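The two db.run() calls above splice values into the Cypher text with str.format(), which forces the escaped quotes around {umls} and {pref}. A minimal sketch of the same MERGE using driver-level parameters instead; this assumes the official neo4j Python driver, since the signature of the project's AlertCypher.run() wrapper is not shown in this diff:

    # Hypothetical rewrite with bound parameters; illustrative only, since
    # AlertCypher's API is not part of this commit.
    from neo4j import GraphDatabase

    driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'password'))

    def annotate_condition(tx, gard_id, cond_id, umls, pref, sems, fuzz, meta):
        # $-parameters are bound by the driver, so no manual quote-escaping
        # is needed and odd characters in names cannot break the query.
        tx.run('MATCH (z:GARD) WHERE z.GardId = $gard_id '
               'MATCH (y:Condition) WHERE ID(y) = $cond_id '
               'MERGE (x:ConditionAnnotation {UMLS_CUI: $umls, '
               'UMLSPreferredName: $pref, SEMANTIC_TYPE: $sems, '
               'MATCH_TYPE: "METAMAP"}) '
               'MERGE (x)<-[:has_annotation {FUZZY_SCORE: $fuzz, '
               'METAMAP_SCORE: $meta}]-(y) '
               'MERGE (z)<-[:mapped_to_gard]-(x)',
               gard_id=gard_id, cond_id=cond_id, umls=umls, pref=pref,
               sems=sems, fuzz=fuzz, meta=meta)

Parameter maps also let Neo4j cache one query plan instead of re-parsing every formatted variant.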
91 changes: 88 additions & 3 deletions RDAS_GFKG/methods.py
@@ -27,6 +27,12 @@
from transformers import AutoTokenizer, AutoModel
import torch
import glob
<<<<<<<< HEAD:RDAS.GFKG/methods.py

def start(db, restart_raw=False, restart_processed=False):
update_grant.main(db, restart_raw=restart_raw, restart_processed=restart_processed)

========
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
@@ -46,6 +52,7 @@ def start(db, restart_raw=False, restart_processed=False):
update_grant.main(db, restart_raw=restart_raw, restart_processed=restart_processed)
db.setConf('UPDATE_PROGRESS','grant_in_progress','False')

>>>>>>>> devon_dev:RDAS_GFKG/methods.py
def download_nih_data(restart_raw=False):
current_year = int(datetime.today().year)

@@ -84,7 +91,11 @@ def download_nih_data(restart_raw=False):

if len(os.listdir(f'{sysvars.gnt_files_path}raw/{file_dir}/')) == 1:
for i in range(1985,current_year+1):
<<<<<<<< HEAD:RDAS.GFKG/methods.py
command = f'curl -L -X GET https://reporter.nih.gov/exporter/{type}/download/{i} -o {sysvars.base_path}grant/src/raw/{file_dir}/{type}{i}.zip'
========
command = f'curl -L -X GET https://reporter.nih.gov/exporter/{type}/download/{i} -o {sysvars.gnt_files_path}raw/{file_dir}/{type}{i}.zip'
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
os.system(command)
command = f'unzip {sysvars.gnt_files_path}raw/{file_dir}/{type}{i}.zip -d {sysvars.gnt_files_path}raw/{file_dir}'
os.system(command)
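Both sides of the conflict above run the same curl-and-unzip loop; only the output directory differs (base_path grant/src/raw vs. gnt_files_path raw). A sketch of the equivalent download-and-extract step in pure Python, assuming the ExPORTER URL pattern shown in the diff, with error handling kept minimal:

    # Sketch: requests + zipfile in place of the os.system curl/unzip calls.
    import io
    import zipfile
    from datetime import datetime

    import requests

    def fetch_exporter_files(file_type, dest_dir, start_year=1985):
        for year in range(start_year, datetime.today().year + 1):
            url = f'https://reporter.nih.gov/exporter/{file_type}/download/{year}'
            resp = requests.get(url, timeout=120)
            resp.raise_for_status()
            # Extract in memory rather than writing the intermediate .zip.
            with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
                zf.extractall(dest_dir)

This also surfaces HTTP failures as exceptions, where os.system only returns an easily ignored exit code.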
@@ -251,6 +262,33 @@ def get_def(a):

source_dict = {}
def GardNamePreprocessor(Gard):
<<<<<<<< HEAD:RDAS.GFKG/methods.py
Gard['GardName'] = Gard['GardName'].apply(lambda x: str(x).replace('"', '').lower())
Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x).lower()))
Gard = remove_similar_strings(Gard)
Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x)))
Gard['Synonyms'] =Gard['GardName'].apply(lambda x: [x])+Gard['Synonyms']
#Gard['Synonyms_bow']=Gard['Synonyms'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw'] = Gard['Synonyms'].apply(lambda x: process_row_list(x)) #.apply(lambda x: process_row_list(x))
Gard['Synonyms_sw_bow']=Gard['Synonyms_sw'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw_bow']=Gard['Synonyms_sw_bow'].apply(lambda x: list(set(len_chcek(x))) )
#Gard['Synonyms_sw_nltk'] = Gard['Synonyms_sw'].apply(lambda x: process_row_list_2(x))
#Gard['Synonyms_sw_nltk']=Gard['Synonyms_sw_nltk']+Gard['Synonyms_sw']
#Gard['Synonyms_sw_nltk'] = Gard['Synonyms_sw_nltk'].apply(lambda x: list(set(x)))
#Gard['Synonyms_stem'] = Gard['Synonyms'].apply(lambda x: stem_text_list(x))
#Gard['Synonyms_stem_bow']=Gard['Synonyms_stem'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw_stem'] = Gard['Synonyms_sw'].apply(lambda x: stem_text_list(x))
Gard['Synonyms_sw_stem_bow']=Gard['Synonyms_sw_stem'].apply(lambda x: generate_term_orders_list_of_sords(x) )
Gard['Synonyms_sw_stem'] = Gard['Synonyms_sw_stem'].apply(lambda x:list(set(len_chcek(x))) )
Gard['Synonyms_sw_stem_bow']=Gard['Synonyms_sw_stem_bow'].apply(lambda x: list(set(len_chcek(x))) )
Gard['Synonyms_sw'] = Gard['Synonyms_sw_stem'].apply(lambda x: list(set(len_chcek(x))) )

Excluding_list = ['GARD:{:07d}'.format(int(gard_id.split(':')[1])) for gard_id in sysvars.gard_preprocessor_exclude]
Gard['GardId'] = Gard['GardId'].str.strip('"')
Gard = Gard[~Gard['GardId'].isin(Excluding_list)]

return Gard
========
print(Gard)
Gard['GardName'] = Gard['GardName'].apply(lambda x: str(x).replace('"', '').lower())
Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x).lower()))
@@ -280,51 +318,81 @@ def GardNamePreprocessor(Gard):
Gard['GardNamedef']=Gard.apply(lambda x: get_def(x['GardName']), axis=1)

return Gard
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
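Stripped of the conflict markers, the two GardNamePreprocessor variants build the same synonym features (stop-word-stripped, stemmed, and bag-of-words forms via process_row_list, stem_text_list, and generate_term_orders_list_of_sords); the HEAD side additionally filters out excluded GARD IDs. A condensed sketch of that exclusion step, assuming pandas and a sysvars.gard_preprocessor_exclude list holding IDs like 'GARD:123':

    # Sketch of the GARD-ID exclusion filter from the HEAD side above.
    import pandas as pd

    def exclude_gard_ids(gard: pd.DataFrame, exclude_ids) -> pd.DataFrame:
        # Normalize to the zero-padded 'GARD:0000123' form stored in GardId.
        excluding = ['GARD:{:07d}'.format(int(g.split(':')[1])) for g in exclude_ids]
        out = gard.copy()
        out['GardId'] = out['GardId'].str.strip('"')
        return out[~out['GardId'].isin(excluding)]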

def download_gard_data_from_db ():
db = AlertCypher(sysvars.gard_db)
in_progress = db.getConf('UPDATE_PROGRESS', 'grant_in_progress')

<<<<<<<< HEAD:RDAS.GFKG/methods.py
if not in_progress == 'True':
return None

if not os.path.exists(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv'):
response = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms').data()

myFile = open(f'{sysvars.base_path}grant/src/raw/all_gards.csv', 'w')
========
#if not in_progress == 'True':
#return None

if not os.path.exists(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv'):
response = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms').data()

myFile = open(f'{sysvars.gnt_files_path}raw/all_gards.csv', 'w')
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
writer = csv.writer(myFile)
writer.writerow(['GardId', 'GardName', 'Synonyms'])
for dictionary in response:
writer.writerow(dictionary.values())
myFile.close()
<<<<<<<< HEAD:RDAS.GFKG/methods.py
df = pd.read_csv(f'{sysvars.base_path}grant/src/raw/all_gards.csv')

df = GardNamePreprocessor(df)
df.to_csv(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv')

else:
df = pd.read_csv(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv')
========
df = pd.read_csv(f'{sysvars.gnt_files_path}raw/all_gards.csv')

df = GardNamePreprocessor(df)
df.to_csv(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv')

else:
df = pd.read_csv(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv')
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
df['Synonyms_sw'] = df['Synonyms_sw'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
df['Synonyms_sw_bow'] = df['Synonyms_sw_bow'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
df['Synonyms_sw_stem'] = df['Synonyms_sw_stem'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
df['Synonyms_sw_stem_bow'] = df['Synonyms_sw_stem_bow'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
<<<<<<<< HEAD:RDAS.GFKG/methods.py
========

help=pd.read_csv(f'{sysvars.gnt_files_path}J_GARD_master.csv')
for index, row in help.iterrows():
source_name = row['SourceName']
source_description = row['SourceDescription']
if type(source_name) ==str:
source_dict[source_name.lower()] = source_description
>>>>>>>> devon_dev:RDAS_GFKG/methods.py

return df
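Both branches of download_gard_data_from_db follow the same cache-or-build pattern: query the GARD graph once, preprocess, write all_gards_processed.csv, and read from that file on every later call. They differ only in base path and in whether the grant_in_progress guard is active. A minimal sketch of the pattern with the paths factored out; the Cypher query is taken verbatim from the diff:

    # Sketch: build the processed GARD table once, then serve it from cache.
    import os
    import pandas as pd

    def load_gard_terms(db, raw_csv, processed_csv):
        if not os.path.exists(processed_csv):
            rows = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, '
                          'x.GardName as GardName, x.Synonyms as Synonyms').data()
            pd.DataFrame(rows).to_csv(raw_csv, index=False)
            df = GardNamePreprocessor(pd.read_csv(raw_csv))
            df.to_csv(processed_csv)
        else:
            df = pd.read_csv(processed_csv)
        return df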

# Global Objects for Processing

Gard = download_gard_data_from_db()
<<<<<<<< HEAD:RDAS.GFKG/methods.py

'''
if not os.path.exists(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv'):
========
print(Gard)
'''
if not os.path.exists(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv'):
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
pass
Gard = download_gard_data_from_db()
else:
@@ -589,7 +657,11 @@ def combine_dictionaries_sent(dict1, dict2):
combined_dict[key] = value
return combined_dict
<<<<<<<< HEAD:RDAS.GFKG/methods.py
def modified_dict(combined_dict,combined_dict_sen):
========
def modified_dict(combined_dict):#,combined_dict_sen):
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
keys_to_remove = set()
for key1 in combined_dict:
for key2 in combined_dict:
@@ -729,6 +801,14 @@ def normalize_combined_dictionary(input_text,title_,dict1, dict2, dict3, dict4,m
normalized_dict = {key: value for key, value in combined_dict.items()}
result_dict = {}
for key, value in normalized_dict.items():
<<<<<<<< HEAD:RDAS.GFKG/methods.py
#if is_about_term(input_text.lower(), key) >=0.7:
result_dict[key] = [value, is_about_term(input_text.lower(), key)]
return result_dict
def gard_id(title_, Public_health_relevance_statement, abstract_, nlp):
========
#if is_about_term(input_text.lower(), key) >=0.5:
#sen_has_gard=get_sen(input_text.lower(), key,title_)
defin=get_def(key)
@@ -753,6 +833,7 @@ def normalize_combined_dictionary(input_text,title_,dict1, dict2, dict3, dict4,m
def grad_id(title_, Public_health_relevance_statement, abstract_):
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
if not isinstance(title_, str) and not isinstance(Public_health_relevance_statement, str) and not isinstance(abstract_, str):
return '' # Return default values when no string input is provided
if title_ and isinstance(title_, str):
@@ -763,7 +844,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
else: return normalize_combined_dictionary(title_,title_,name,{},{},{},1,1,'title')
if Public_health_relevance_statement and isinstance(Public_health_relevance_statement, str):
A, B, C,D = check_sen(Public_health_relevance_statement)
A, B, C,D = check_sen(Public_health_relevance_statement, nlp)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
Expand All @@ -772,17 +853,21 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
if name and (name !={}): return name
if abstract_ and isinstance(abstract_, str):
A, B, C , D = check_sen(abstract_)
A, B, C , D = check_sen(abstract_, nlp)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
name4 = get_gard_abstract_stem_exact(D)
name=normalize_combined_dictionary(abstract_,title_,name1,name2,name3,name4,0,0.7,'abstract')
if name and (name !={}): return name
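Both gard_id (HEAD, which threads an nlp argument through check_sen) and grad_id (devon_dev) walk the same cascade: try the project title, then the public-health relevance statement, then the abstract, returning the first non-empty match dictionary. A condensed sketch of the PHR and abstract legs of that cascade (the title leg uses different weights, as the calls above show); check_sen, get_gard_abstract_stem_exact, and normalize_combined_dictionary are the project helpers named in this diff, whose internals are not shown:

    # Sketch of the PHR -> abstract matching cascade, not a line-for-line port.
    def match_gard_terms(title, phr, abstract, nlp=None):
        for text, field in ((phr, 'phr'), (abstract, 'abstract')):
            if not (text and isinstance(text, str)):
                continue
            # check_sen splits the text into four candidate sentence groups.
            a, b, c, d = check_sen(text, nlp) if nlp else check_sen(text)
            names = [get_gard_abstract_stem_exact(part) for part in (a, b, c, d)]
            found = normalize_combined_dictionary(text, title, *names, 0, 0.7, field)
            if found:
                return found
        return ''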
def GardNameExtractor(project_title,phr_text,abstract_text):
def GardNameExtractor(project_title,phr_text,abstract_text, nlp):
#Abstract1['Gard_name']=Abstract1.apply(lambda x: gard_id(x['project_title'],x['phr_text'],x['abstract_text']), axis=1)
<<<<<<<< HEAD:RDAS.GFKG/methods.py
gard_ids = gard_id(project_title,phr_text,abstract_text, nlp)
========
gard_ids = grad_id(project_title,phr_text,abstract_text)
>>>>>>>> devon_dev:RDAS_GFKG/methods.py
if gard_ids:
return update_dictionary(gard_ids)
else:
95 changes: 90 additions & 5 deletions RDAS_GFKG/prep_neo4j_data.py
@@ -109,6 +109,53 @@ def combine_normmap_results():


lock = threading.Lock()
<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
def batch_normmap(df, thr, year, nlp):
r,c = df.shape
for idx in range(r):
try:
with lock:
print(f'{idx}/{r} [{thr}]')

row = df.iloc[idx]
appl_id = row['APPLICATION_ID']
abstract = row['ABSTRACT_TEXT']
phr = row['PHR']
title = row['PROJECT_TITLE']

gard_ids = rdas.GardNameExtractor(title, phr, abstract, nlp)
if gard_ids:
for gard,add_data in gard_ids.items():
if add_data == 1:
add_data = [1,1]

with lock:
print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "a") as f:
f.writelines([f'{appl_id},{gard},{add_data[0]},{add_data[1]}\n'])

except Exception as e:
print(e)
continue

"""
def normmap_process (df):
r,c = df.shape
for idx in range(r):
try:
print(f'{idx}/{r}')
row = df.iloc[idx]
appl_id = row['APPLICATION_ID']
abstract = row['ABSTRACT_TEXT']
phr = row['PHR']
title = row['PROJECT_TITLE']
#project_data = rdas.get_project_data(appl_id).get('results')[0]
#title = project_data.get('project_title')
#phr = project_data.get('phr_text')
========
def batch_normmap(df, thr, year):
r,c = df.shape
for idx in range(r):
@@ -121,13 +168,27 @@ def batch_normmap(df, thr, year):
abstract = row['ABSTRACT_TEXT']
phr = row['PHR']
title = row['PROJECT_TITLE']
>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
gard_ids = rdas.GardNameExtractor(title, phr, abstract)
if gard_ids:
for gard,add_data in gard_ids.items():
if add_data == 1:
add_data = [1,1]
gard_ids = rdas.GardNameExtractor(title, phr, abstract)
if gard_ids:
for gard,add_data in gard_ids.items():
if add_data == 1:
add_data = [1,1]
with lock:
print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
with open(data_raw('normmap_results.csv'), "a") as f:
f.writelines([f'{appl_id},{gard},{add_data[0]},{add_data[1]}\n'])
except Exception as e:
print(e)
continue
"""


<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
========
with lock:
print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "a") as f:
@@ -172,11 +233,16 @@ def normmap_process (df):
"""


>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
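Across both versions of batch_normmap, worker threads append matched rows to a shared CSV while holding a single threading.Lock, which keeps the progress prints and the file writes from interleaving. A stripped-down sketch of that pattern, with the column order taken from the writelines call above:

    # Sketch: thread-safe append of one match row, as in batch_normmap.
    import threading

    lock = threading.Lock()

    def record_match(path, appl_id, gard, conf_score, sem_sim):
        # One lock serializes both the diagnostic print and the file append.
        with lock:
            print({'ID': appl_id, 'GARD_id': gard,
                   'CONF_SCORE': conf_score, 'SEM_SIM': sem_sim})
            with open(path, 'a') as f:
                f.write(f'{appl_id},{gard},{conf_score},{sem_sim}\n')

Reopening in append mode inside the lock, rather than keeping one handle per thread, trades a little speed for a file that is flushed row by row.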

def run_normmap():
print('Running NormMap')

<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
nlp = spacy.load("en_core_web_sm")
========
#nlp = spacy.load("en_core_web_sm")
>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py

abs_files = glob.glob(data_raw('abstracts') + '/*.csv')
abs_files = sorted(abs_files)
@@ -185,6 +251,20 @@ def run_normmap():
prj_files = sorted(prj_files)

for idx, abs_file in enumerate(abs_files):
<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
prj_file = prj_files[idx]

print(abs_file, ' -merged- ',prj_file)

tmp = pd.read_csv(('{filename}'.format(filename=abs_file)),index_col=False, encoding = "ISO-8859-1")
tmp2 = pd.read_csv(('{filename}'.format(filename=prj_file)),index_col=False, usecols=['APPLICATION_ID','PHR', 'PROJECT_TITLE'], encoding = "ISO-8859-1", low_memory=False)

merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)

year = re.findall(r'\d+', abs_file)[0]

========
year = re.findall(r'\d+', abs_file)[0]

if os.path.exists(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv')):
Expand All @@ -201,6 +281,7 @@ def run_normmap():
merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)

>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
merged_df.to_csv(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv'), index=False)
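On both sides of this conflict the per-year abstracts file is joined to its projects file on APPLICATION_ID before matching; the devon_dev side additionally skips years whose RePORTER_NORMMAP_{year}.csv already exists. A sketch of the join step, with column names and encoding taken from the diff:

    # Sketch of the per-year abstract/project merge in run_normmap.
    import pandas as pd

    def merge_year(abs_file, prj_file):
        abstracts = pd.read_csv(abs_file, index_col=False, encoding='ISO-8859-1')
        projects = pd.read_csv(prj_file, index_col=False,
                               usecols=['APPLICATION_ID', 'PHR', 'PROJECT_TITLE'],
                               encoding='ISO-8859-1', low_memory=False)
        merged = pd.merge(abstracts, projects, on=['APPLICATION_ID'])
        merged['APPLICATION_ID'] = merged['APPLICATION_ID'].astype(int)
        return merged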


@@ -226,7 +307,11 @@ def run_normmap():

# Create threads to process results
for thrnum, lst in enumerate(list_df):
<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
thread = threading.Thread(target=batch_normmap, args=(lst, thrnum, year, nlp), daemon=True)
========
thread = threading.Thread(target=batch_normmap, args=(lst, thrnum, year), daemon=True)
>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
thread_list.append(thread)

for thr in thread_list:
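The loop above hands one chunk of the merged DataFrame to each batch_normmap thread (with nlp passed through on the HEAD side only). A sketch of the split-and-spawn step; np.array_split is an assumption here, since the chunking code sits outside the visible diff:

    # Sketch: fan the merged frame out to daemon worker threads.
    import threading
    import numpy as np

    def spawn_workers(merged_df, year, nlp, n_threads=5):
        threads = []
        for thrnum, chunk in enumerate(np.array_split(merged_df, n_threads)):
            t = threading.Thread(target=batch_normmap,
                                 args=(chunk, thrnum, year, nlp), daemon=True)
            threads.append(t)
            t.start()
        for t in threads:
            t.join()

Joining every thread before returning matters because the workers are daemonic and would be killed mid-write if the main thread exited first.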
1 change: 1 addition & 0 deletions RDAS_PAKG/update.py
@@ -24,5 +24,6 @@ def main (update_from=False, update_to=False):

RDAS_PAKG.init.main(update_from=update_from, update_to=update_to)


main() #TEST
#get_node_counts()
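This hunk only adds a blank line, but it surfaces that update.py calls main() at import time, flagged #TEST. If the test call is meant to stay, a conventional guard (a suggestion, not part of this commit) scopes it to direct execution:

    # Suggested guard: run the test call only when the module is executed
    # directly, not when RDAS_PAKG.update is imported by the pipeline.
    if __name__ == '__main__':
        main()  # TEST
        # get_node_counts()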
Empty file modified backup/RDAS_CTKG/README.md (mode 100755 → 100644)
Empty file modified backup/RDAS_GARD/README.md (mode 100755 → 100644)
Empty file modified backup/RDAS_GFKG/README.md (mode 100755 → 100644)
Empty file modified backup/RDAS_PAKG/README.md (mode 100755 → 100644)
