diff --git a/RDAS_CTKG/methods.py b/RDAS_CTKG/methods.py
index 6d749bd..fb0e00d 100755
--- a/RDAS_CTKG/methods.py
+++ b/RDAS_CTKG/methods.py
@@ -1079,6 +1079,7 @@ def condition_map(db, update_metamap=True):
                     for gard_id in gard_ids:
                         # Create Annotation nodes and connect to Condition and GARD nodes
                         db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
+
                 else:
                     # Create Annotation nodes and connect to Condition nodes
                     db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:ConditionAnnotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
@@ -1088,7 +1089,6 @@ def condition_map(db, update_metamap=True):
 
     db.run('MATCH (x:Condition) SET x.METAMAP_PREFERRED_TERM = NULL SET x.METAMAP_OUTPUT = NULL SET x.FUZZY_SCORE = NULL SET x.METAMAP_SCORE = NULL')
 
-
 def drug_normalize(drug):
     """
     Normalize a drug name by removing non-ASCII characters and replacing non-word characters with spaces.
diff --git a/RDAS_GFKG/methods.py b/RDAS_GFKG/methods.py
index 7322e55..de9745c 100755
--- a/RDAS_GFKG/methods.py
+++ b/RDAS_GFKG/methods.py
@@ -27,6 +27,12 @@ from transformers import AutoTokenizer, AutoModel
 import torch
 import glob
 
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+
+def start(db, restart_raw=False, restart_processed=False):
+    update_grant.main(db, restart_raw=restart_raw, restart_processed=restart_processed)
+
+========
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer, models
 from transformers import BertTokenizer
@@ -46,6 +52,7 @@ def start(db, restart_raw=False, restart_processed=False):
     update_grant.main(db, restart_raw=restart_raw, restart_processed=restart_processed)
     db.setConf('UPDATE_PROGRESS','grant_in_progress','False')
 
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
 
 def download_nih_data(restart_raw=False):
     current_year = int(datetime.today().year)
@@ -84,7 +91,11 @@ def download_nih_data(restart_raw=False):
         if len(os.listdir(f'{sysvars.gnt_files_path}raw/{file_dir}/')) == 1:
             for i in range(1985,current_year+1):
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+                command = f'curl -L -X GET https://reporter.nih.gov/exporter/{type}/download/{i} -o {sysvars.base_path}grant/src/raw/{file_dir}/{type}{i}.zip'
+========
                 command = f'curl -L -X GET https://reporter.nih.gov/exporter/{type}/download/{i} -o {sysvars.gnt_files_path}raw/{file_dir}/{type}{i}.zip'
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
                 os.system(command)
                 command = f'unzip {sysvars.gnt_files_path}raw/{file_dir}/{type}{i}.zip -d {sysvars.gnt_files_path}raw/{file_dir}'
                 os.system(command)
 
@@ -251,6 +262,33 @@ def get_def(a):
 source_dict = {}
 
 def GardNamePreprocessor(Gard):
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+    Gard['GardName'] = Gard['GardName'].apply(lambda x: str(x).replace('"', '').lower())
+    Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x).lower()))
+    Gard = remove_similar_strings(Gard)
+    Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x)))
+    Gard['Synonyms'] =Gard['GardName'].apply(lambda x: [x])+Gard['Synonyms']
+    #Gard['Synonyms_bow']=Gard['Synonyms'].apply(lambda x: generate_term_orders_list_of_sords(x) )
+    Gard['Synonyms_sw'] = Gard['Synonyms'].apply(lambda x: process_row_list(x)) #.apply(lambda x: process_row_list(x))
+    Gard['Synonyms_sw_bow']=Gard['Synonyms_sw'].apply(lambda x: generate_term_orders_list_of_sords(x) )
+    Gard['Synonyms_sw_bow']=Gard['Synonyms_sw_bow'].apply(lambda x: list(set(len_chcek(x))) )
+    #Gard['Synonyms_sw_nltk'] = Gard['Synonyms_sw'].apply(lambda x: process_row_list_2(x))
+    #Gard['Synonyms_sw_nltk']=Gard['Synonyms_sw_nltk']+Gard['Synonyms_sw']
+    #Gard['Synonyms_sw_nltk'] = Gard['Synonyms_sw_nltk'].apply(lambda x: list(set(x)))
+    #Gard['Synonyms_stem'] = Gard['Synonyms'].apply(lambda x: stem_text_list(x))
+    #Gard['Synonyms_stem_bow']=Gard['Synonyms_stem'].apply(lambda x: generate_term_orders_list_of_sords(x) )
+    Gard['Synonyms_sw_stem'] = Gard['Synonyms_sw'].apply(lambda x: stem_text_list(x))
+    Gard['Synonyms_sw_stem_bow']=Gard['Synonyms_sw_stem'].apply(lambda x: generate_term_orders_list_of_sords(x) )
+    Gard['Synonyms_sw_stem'] = Gard['Synonyms_sw_stem'].apply(lambda x:list(set(len_chcek(x))) )
+    Gard['Synonyms_sw_stem_bow']=Gard['Synonyms_sw_stem_bow'].apply(lambda x: list(set(len_chcek(x))) )
+    Gard['Synonyms_sw'] = Gard['Synonyms_sw_stem'].apply(lambda x: list(set(len_chcek(x))) )
+
+    Excluding_list = ['GARD:{:07d}'.format(int(gard_id.split(':')[1])) for gard_id in sysvars.gard_preprocessor_exclude]
+    Gard['GardId'] = Gard['GardId'].str.strip('"')
+    Gard = Gard[~Gard['GardId'].isin(Excluding_list)]
+
+    return Gard
+========
     print(Gard)
     Gard['GardName'] = Gard['GardName'].apply(lambda x: str(x).replace('"', '').lower())
     Gard['Synonyms'] = Gard['Synonyms'].apply(lambda x: extract_words_from_json_string(str(x).lower()))
@@ -280,11 +318,21 @@ def GardNamePreprocessor(Gard):
     Gard['GardNamedef']=Gard.apply(lambda x: get_def(x['GardName']), axis=1)
 
     return Gard
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
 
 def download_gard_data_from_db ():
     db = AlertCypher(sysvars.gard_db)
     in_progress = db.getConf('UPDATE_PROGRESS', 'grant_in_progress')
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+    if not in_progress == 'True':
+        return None
+
+    if not os.path.exists(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv'):
+        response = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms').data()
+
+        myFile = open(f'{sysvars.base_path}grant/src/raw/all_gards.csv', 'w')
+========
     #if not in_progress == 'True':
         #return None
 
@@ -292,11 +340,21 @@ def download_gard_data_from_db ():
         response = db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms').data()
         myFile = open(f'{sysvars.gnt_files_path}raw/all_gards.csv', 'w')
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
         writer = csv.writer(myFile)
         writer.writerow(['GardId', 'GardName', 'Synonyms'])
        for dictionary in response:
             writer.writerow(dictionary.values())
         myFile.close()
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+        df = pd.read_csv(f'{sysvars.base_path}grant/src/raw/all_gards.csv')
+
+        df = GardNamePreprocessor(df)
+        df.to_csv(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv')
+
+    else:
+        df = pd.read_csv(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv')
+========
 
         df = pd.read_csv(f'{sysvars.gnt_files_path}raw/all_gards.csv')
 
         df = GardNamePreprocessor(df)
@@ -304,10 +362,13 @@ def download_gard_data_from_db ():
     else:
         df = pd.read_csv(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv')
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
 
     df['Synonyms_sw'] = df['Synonyms_sw'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
     df['Synonyms_sw_bow'] = df['Synonyms_sw_bow'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
     df['Synonyms_sw_stem'] = df['Synonyms_sw_stem'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
     df['Synonyms_sw_stem_bow'] = df['Synonyms_sw_stem_bow'].apply(lambda x: extract_words_from_json_string2(str(x).lower()))
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+========
 
     help=pd.read_csv(f'{sysvars.gnt_files_path}J_GARD_master.csv')
     for index, row in help.iterrows():
@@ -315,16 +376,23 @@ def download_gard_data_from_db ():
         source_description = row['SourceDescription']
         if type(source_name) ==str:
             source_dict[source_name.lower()] = source_description
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
 
     return df
 
 
 # Global Objects for Processing
 Gard = download_gard_data_from_db()
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+
+'''
+if not os.path.exists(f'{sysvars.base_path}grant/src/processed/all_gards_processed.csv'):
+========
 print(Gard)
 
 '''
 if not os.path.exists(f'{sysvars.gnt_files_path}processed/all_gards_processed.csv'):
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
     pass
     Gard = download_gard_data_from_db()
 else:
@@ -589,7 +657,11 @@ def combine_dictionaries_sent(dict1, dict2):
             combined_dict[key] = value
     return combined_dict
 
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+def modified_dict(combined_dict,combined_dict_sen):
+========
 def modified_dict(combined_dict):#,combined_dict_sen):
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
     keys_to_remove = set()
     for key1 in combined_dict:
         for key2 in combined_dict:
@@ -729,6 +801,14 @@ def normalize_combined_dictionary(input_text,title_,dict1, dict2, dict3, dict4,m
     normalized_dict = {key: value for key, value in combined_dict.items()}
     result_dict = {}
     for key, value in normalized_dict.items():
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+        #if is_about_term(input_text.lower(), key) >=0.7:
+        result_dict[key] = [value, is_about_term(input_text.lower(), key)]
+    return result_dict
+
+
+def gard_id(title_, Public_health_relevance_statement, abstract_, nlp):
+========
         #if is_about_term(input_text.lower(), key) >=0.5:
         #sen_has_gard=get_sen(input_text.lower(), key,title_)
         defin=get_def(key)
@@ -753,6 +833,7 @@ def normalize_combined_dictionary(input_text,title_,dict1, dict2, dict3, dict4,m
 
 
 def grad_id(title_, Public_health_relevance_statement, abstract_):
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
     if not isinstance(title_, str) and not isinstance(Public_health_relevance_statement, str) and not isinstance(abstract_, str):
         return '' # Return default values when no string input is provided
     if title_ and isinstance(title_, str):
@@ -763,7 +844,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
     else:
         return normalize_combined_dictionary(title_,title_,name,{},{},{},1,1,'title')
     if Public_health_relevance_statement and isinstance(Public_health_relevance_statement, str):
-        A, B, C,D = check_sen(Public_health_relevance_statement)
+        A, B, C,D = check_sen(Public_health_relevance_statement, nlp)
         name1 = get_gard_abstract_stem_exact(A)
         name2 = get_gard_abstract_stem_exact(B)
         name3 = get_gard_abstract_stem_exact(C)
@@ -772,7 +853,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
         if name and (name !={}):
             return name
     if abstract_ and isinstance(abstract_, str):
-        A, B, C , D = check_sen(abstract_)
+        A, B, C , D = check_sen(abstract_, nlp)
         name1 = get_gard_abstract_stem_exact(A)
         name2 = get_gard_abstract_stem_exact(B)
         name3 = get_gard_abstract_stem_exact(C)
@@ -780,9 +861,13 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
         name=normalize_combined_dictionary(abstract_,title_,name1,name2,name3,name4,0,0.7,'abstract')
         if name and (name !={}):
             return name
-def GardNameExtractor(project_title,phr_text,abstract_text):
+def GardNameExtractor(project_title,phr_text,abstract_text, nlp):
     #Abstract1['Gard_name']=Abstract1.apply(lambda x: gard_id(x['project_title'],x['phr_text'],x['abstract_text']), axis=1)
+<<<<<<<< HEAD:RDAS.GFKG/methods.py
+    gard_ids = gard_id(project_title,phr_text,abstract_text, nlp)
+========
     gard_ids = grad_id(project_title,phr_text,abstract_text)
+>>>>>>>> devon_dev:RDAS_GFKG/methods.py
     if gard_ids:
         return update_dictionary(gard_ids)
     else:
diff --git a/RDAS_GFKG/prep_neo4j_data.py b/RDAS_GFKG/prep_neo4j_data.py
index d600a4d..20cd52d 100755
--- a/RDAS_GFKG/prep_neo4j_data.py
+++ b/RDAS_GFKG/prep_neo4j_data.py
@@ -109,6 +109,53 @@ def combine_normmap_results():
 
 lock = threading.Lock()
 
+<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
+def batch_normmap(df, thr, year, nlp):
+    r,c = df.shape
+    for idx in range(r):
+        try:
+            with lock:
+                print(f'{idx}/{r} [{thr}]')
+
+            row = df.iloc[idx]
+            appl_id = row['APPLICATION_ID']
+            abstract = row['ABSTRACT_TEXT']
+            phr = row['PHR']
+            title = row['PROJECT_TITLE']
+
+            gard_ids = rdas.GardNameExtractor(title, phr, abstract, nlp)
+            if gard_ids:
+                for gard,add_data in gard_ids.items():
+                    if add_data == 1:
+                        add_data = [1,1]
+
+                    with lock:
+                        print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
+                        with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "a") as f:
+                            f.writelines([f'{appl_id},{gard},{add_data[0]},{add_data[1]}\n'])
+
+        except Exception as e:
+            print(e)
+            continue
+
+"""
+def normmap_process (df):
+    r,c = df.shape
+    for idx in range(r):
+        try:
+            print(f'{idx}/{r}')
+            row = df.iloc[idx]
+            appl_id = row['APPLICATION_ID']
+            abstract = row['ABSTRACT_TEXT']
+            phr = row['PHR']
+            title = row['PROJECT_TITLE']
+
+            #project_data = rdas.get_project_data(appl_id).get('results')[0]
+
+            #title = project_data.get('project_title')
+            #phr = project_data.get('phr_text')
+
+========
 def batch_normmap(df, thr, year):
     r,c = df.shape
     for idx in range(r):
@@ -121,13 +168,27 @@ def batch_normmap(df, thr, year):
             abstract = row['ABSTRACT_TEXT']
             phr = row['PHR']
             title = row['PROJECT_TITLE']
+>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
 
-            gard_ids = rdas.GardNameExtractor(title, phr, abstract)
-            if gard_ids:
-                for gard,add_data in gard_ids.items():
-                    if add_data == 1:
-                        add_data = [1,1]
+            gard_ids = rdas.GardNameExtractor(title, phr, abstract)
+            if gard_ids:
+                for gard,add_data in gard_ids.items():
+                    if add_data == 1:
+                        add_data = [1,1]
+
+                    with lock:
+                        print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
+                        with open(data_raw('normmap_results.csv'), "a") as f:
+                            f.writelines([f'{appl_id},{gard},{add_data[0]},{add_data[1]}\n'])
+        except Exception as e:
+            print(e)
+            continue
+"""
+
+
+<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
+========
 
             with lock:
                 print({'ID': appl_id, 'GARD_id': gard, 'CONF_SCORE': add_data[0], 'SEM_SIM': add_data[1]})
                 with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "a") as f:
@@ -172,11 +233,16 @@ def normmap_process (df):
 """
+>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
 
 
 def run_normmap():
     print('Running NormMap')
 
+<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
+    nlp = spacy.load("en_core_web_sm")
+========
     #nlp = spacy.load("en_core_web_sm")
+>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
 
     abs_files = glob.glob(data_raw('abstracts') + '/*.csv')
     abs_files = sorted(abs_files)
@@ -185,6 +251,20 @@ def run_normmap():
     prj_files = sorted(prj_files)
 
     for idx, abs_file in enumerate(abs_files):
+<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
+        prj_file = prj_files[idx]
+
+        print(abs_file, ' -merged- ',prj_file)
+
+        tmp = pd.read_csv(('{filename}'.format(filename=abs_file)),index_col=False, encoding = "ISO-8859-1")
+        tmp2 = pd.read_csv(('{filename}'.format(filename=prj_file)),index_col=False, usecols=['APPLICATION_ID','PHR', 'PROJECT_TITLE'], encoding = "ISO-8859-1", low_memory=False)
+
+        merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
+        merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)
+
+        year = re.findall(r'\d+', abs_file)[0]
+
+========
         year = re.findall(r'\d+', abs_file)[0]
 
         if os.path.exists(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv')):
@@ -201,6 +281,7 @@ def run_normmap():
         merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
         merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)
 
+>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
         merged_df.to_csv(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv'), index=False)
 
 
@@ -226,7 +307,11 @@ def run_normmap():
 
         # Create threads to process results
         for thrnum, lst in enumerate(list_df):
+<<<<<<<< HEAD:RDAS.GFKG/prep_neo4j_data.py
+            thread = threading.Thread(target=batch_normmap, args=(lst, thrnum, year, nlp), daemon=True)
+========
             thread = threading.Thread(target=batch_normmap, args=(lst, thrnum, year), daemon=True)
+>>>>>>>> devon_dev:RDAS_GFKG/prep_neo4j_data.py
             thread_list.append(thread)
 
         for thr in thread_list:
diff --git a/RDAS_PAKG/update.py b/RDAS_PAKG/update.py
index 0af57f5..a7df451 100644
--- a/RDAS_PAKG/update.py
+++ b/RDAS_PAKG/update.py
@@ -24,5 +24,6 @@ def main (update_from=False, update_to=False):
 
     RDAS_PAKG.init.main(update_from=update_from, update_to=update_to)
 
+main() #TEST
 #get_node_counts()
 
diff --git a/backup/RDAS_CTKG/README.md b/backup/RDAS_CTKG/README.md
old mode 100755
new mode 100644
diff --git a/backup/RDAS_GARD/README.md b/backup/RDAS_GARD/README.md
old mode 100755
new mode 100644
diff --git a/backup/RDAS_GFKG/README.md b/backup/RDAS_GFKG/README.md
old mode 100755
new mode 100644
diff --git a/backup/RDAS_PAKG/README.md b/backup/RDAS_PAKG/README.md
old mode 100755
new mode 100644