Commit
file restructuring
Devon Joseph Leadman committed May 30, 2024
1 parent 8a02151 commit 50f42f5
Showing 106 changed files with 16,982 additions and 243 deletions.
73 changes: 40 additions & 33 deletions .gitignore
@@ -13,36 +13,37 @@

# End of https://www.gitignore.io/api/visualstudiocode
personal-config.ini
RDAS.CTKG/src/full_trial_data/
RDAS.CTKG/src/chromedriver
RDAS.CTKG/src/ctgov_nctids.json
RDAS.CTKG/src/ctgov_webscraped_names.csv
RDAS.CTKG/src/all_queries/
RDAS.CTKG/src/metamap_cond.txt
RDAS.CTKG/src/metamap_gard.txt
RDAS.CTKG/src/metamap_cond_out.json
RDAS.PAKG/src/
RDAS.CTKG/src/*.csv
RDAS.CTKG/src/*.json
RDAS.CTKG/src/*.txt

RDAS_CTKG/src/full_trial_data/
RDAS_CTKG/src/chromedriver
RDAS_CTKG/src/ctgov_nctids.json
RDAS_CTKG/src/ctgov_webscraped_names.csv
RDAS_CTKG/src/all_queries/
RDAS_CTKG/src/metamap_cond.txt
RDAS_CTKG/src/metamap_gard.txt
RDAS_CTKG/src/metamap_cond_out.json
RDAS_PAKG/src/
RDAS_CTKG/src/*.csv
RDAS_CTKG/src/*.json
RDAS_CTKG/src/*.txt
__pycache__/
RDAS.CTKG/__pycache__/
RDAS.GFKG/__pycache__/
RDAS.PAKG/__pycache__/
RDAS.PAKG/init_new.py
RDAS.PAKG/methods_new.py
RDAS.PAKG/archive/
RDAS.GFKG/archive/
RDAS.CTKG/__pycache__/
RDAS_CTKG/__pycache__/
RDAS_GFKG/__pycache__/
RDAS_PAKG/__pycache__/
RDAS_PAKG/init_new.py
RDAS_PAKG/methods_new.py
RDAS_PAKG/archive/
RDAS_GFKG/archive/
RDAS_CTKG/__pycache__/
email/__pycache__/
transfer/*.dump
transfer/*.dump
backup/*/*.dump
migrated/*.dump
approved/*.dump
RDAS.GFKG/src/**/**/*.csv
RDAS.GFKG/src/**/**/*.json
RDAS.GFKG/src/**/**/*.zip
RDAS_GFKG/src/**/**/*.csv
RDAS_GFKG/src/**/**/*.json
RDAS_GFKG/src/**/**/*.zip
grant_2024/src/**/**/*.csv
grant_2024/src/**/**/*.json
crt/*.json
@@ -75,22 +76,21 @@ add_participant.txt
epifix.py
grant_pipeline.txt
result.json
RDAS.PAKG/test_affiliation.py
RDAS.PAKG/test_genereview.py
RDAS_PAKG/test_affiliation.py
RDAS_PAKG/test_genereview.py
test_affiliation.txt
logs/
terms_mapped.csv
terms_unmapped.csv
test_term_map.py
RDAS.CTKG/src/ids_to_add.csv
RDAS.CTKG/src/ids_to_update.csv

RDAS.CTKG/src/ids_to_update_confirmed.csv
RDAS_CTKG/src/ids_to_add.csv
RDAS_CTKG/src/ids_to_update.csv
RDAS_CTKG/src/ids_to_update_confirmed.csv
GARD_disease_classification.csv
gather_RDAS.GFKG_funding_remove.py
RDAS.GFKG_funding_rdas.csv
gather_grant_funding_remove.py
grant_funding_rdas.csv
fixyear.py
fixRDAS.PAKGapi.py
fixpubmedapi.py
config.ini
Cluster 7.csv
codecamp.py
@@ -100,4 +100,11 @@ gather_grant_funding_remove.py
grant_funding_rdas.csv
thingforyanji.py
deliverablesyanji
NCATS COLLABORATION
NCATS COLLABORATION
allennlp
meta_annotate.py
genereviews_epi.csv
genereviews_epi_processed.csv
fix_pubtator_type.py
etc
RDAS_GFKG/src/FineTunned_Bert_2.pt
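
A quick way to sanity-check the renamed ignore patterns is to evaluate them with git's wildmatch rules; a minimal sketch, assuming the third-party pathspec package (not something this repo uses):

import pathspec

# Evaluate a few of the renamed patterns with git-style wildmatch rules.
patterns = [
    "RDAS_CTKG/src/*.csv",
    "RDAS_GFKG/src/**/**/*.zip",
]
spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)

print(spec.match_file("RDAS_CTKG/src/ctgov_nctids.csv"))  # True
print(spec.match_file("RDAS.CTKG/src/ctgov_nctids.csv"))  # False: the old dotted layout is no longer covered
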
10 changes: 5 additions & 5 deletions .gitmodules
100644 → 100755
@@ -1,6 +1,6 @@
[submodule "/home/leadmandj/RDAS/pubmed/NaturalHistory4GARD"]
path = pubmed/NaturalHistory4GARD
url = git@github.com:ncats/NaturalHistory4GARD.git
[submodule "pubmed/epi4GARD"]
path = pubmed/epi4GARD
[submodule "RDAS_PAKG/epi4GARD"]
path = RDAS_PAKG/epi4GARD
url = https://github.com/ncats/epi4GARD.git
[submodule "RDAS_PAKG/NaturalHistory4GARD"]
path = RDAS_PAKG/NaturalHistory4GARD
url = https://github.com/ncats/NaturalHistory4GARD.git
Empty file modified AlertCypher.py
100644 → 100755
Empty file.
1 change: 0 additions & 1 deletion RDAS.PAKG/NaturalHistory4GARD
Submodule NaturalHistory4GARD deleted from 9fff58
File renamed without changes.
108 changes: 101 additions & 7 deletions RDAS.CTKG/methods.py → RDAS_CTKG/methods.py
100644 → 100755
@@ -6,6 +6,7 @@
workspace = os.path.dirname(os.path.abspath(__file__))
print(workspace)
sys.path.append(workspace)
sys.path.append('/home/leadmandj/RDAS/')
from src import data_model as dm
import requests
import html
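
The new sys.path entry hard-codes one user's home directory, so the import only resolves on that machine; a portable sketch, assuming RDAS_CTKG/ sits directly under the repository root:

import os
import sys

# Derive the repo root from this file's location instead of hard-coding
# an absolute, user-specific path.
workspace = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(workspace)  # parent of RDAS_CTKG/
if repo_root not in sys.path:
    sys.path.append(repo_root)
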
@@ -23,7 +24,9 @@
nltk.download("punkt")
from spacy.matcher import Matcher
import spacy
import pandas as pd
from fuzzywuzzy import fuzz
import ijson
import string
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
@@ -402,7 +405,13 @@ def format_node_data(db,now,trial,node_type,NCTID,update=None,return_single=None
elif trial:
for field in fields:
if field in trial:
value = trial[field]
if field == 'Phase':
if trial[field]:
value = "; ".join(trial[field])
else:
value = "No Phase Specified"
else:
value = trial[field]
node_data[field] = value

node_data_list.append(node_data)
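
For context, ClinicalTrials.gov returns Phase as a list, and the new branch collapses it into a single string property; in miniature (the phase values here are hypothetical):

# The Phase list becomes one "; "-joined string, with a fixed fallback
# when the list is empty or missing.
for phase in (["Phase 1", "Phase 2"], []):
    value = "; ".join(phase) if phase else "No Phase Specified"
    print(value)
# Phase 1; Phase 2
# No Phase Specified
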
@@ -827,6 +836,93 @@ def umls_to_gard(db,CUI):
names.extend([gard_name])
return {'gard_id':data, 'gard_name':names}

def convert_semantic_types(type_list):
names = pd.read_csv(f'{sysvars.ct_files_path}SemanticTypes_2018AB.txt', delimiter='|', usecols=[0,2], names=['ABBR', 'FULLSEM'])
names = dict(zip(names['ABBR'], names['FULLSEM']))

temp = list()
for entry in type_list:
temp.append(names[entry])
return temp
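
convert_semantic_types expands MetaMap's abbreviations using columns 0 and 2 of the pipe-delimited SemanticTypes_2018AB.txt (abbr|id|full name); in miniature, with two standard UMLS rows:

# Miniature of the lookup convert_semantic_types builds from the file:
# abbreviation -> full semantic-type name.
names = {"dsyn": "Disease or Syndrome", "phsu": "Pharmacologic Substance"}
type_list = ["dsyn", "phsu"]
print([names[t] for t in type_list])
# ['Disease or Syndrome', 'Pharmacologic Substance']
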

def add_metamap_annotation(db, trial_info):
for k,v in trial_info.items():
concept = v['term']
score = v['score']
types = v['types']
nctid = v['nctid']
db.run(f'MATCH (y:ClinicalTrial) WHERE y.NCTId = \'{nctid}\' MERGE (x:Trial_Annotation {{umls_cui:\'{k}\', umls_concept:\'{concept}\', umls_types:{types}}}) MERGE (y)-[:has_metamap_annotation {{umls_score:{score}}}]->(x)')
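
add_metamap_annotation splices values into Cypher with an f-string, which is why the term's single quotes get stripped upstream; a parameterized sketch avoids the escaping entirely (assuming the db wrapper can forward a parameters dict the way the official neo4j driver's session.run does):

def add_metamap_annotation_param(db, trial_info):
    # Same MERGE logic, but values travel as query parameters, so no
    # quote stripping or manual escaping is needed.
    query = (
        "MATCH (y:ClinicalTrial) WHERE y.NCTId = $nctid "
        "MERGE (x:Trial_Annotation {umls_cui: $cui, umls_concept: $concept, "
        "umls_types: $types}) "
        "MERGE (y)-[:has_metamap_annotation {umls_score: $score}]->(x)"
    )
    for cui, v in trial_info.items():
        db.run(query, {"nctid": v["nctid"], "cui": cui, "concept": v["term"],
                       "types": v["types"], "score": v["score"]})
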

def metamap_trial_annotation(db, trial_info, update_metamap=True):
INSTANCE = Submission(os.environ['METAMAP_EMAIL'],os.environ['METAMAP_KEY'])
INSTANCE.init_generic_batch('metamap','-J acab,amas,aapp,anab,antb,bact,bacs,bodm,comd,chem,clnd,cgab,diap,dsyn,elii,enzy,emod,fngs,gngm,hops,horm,imft,irda,inpo,inch,inpr,mobd,mosq,neop,nnon,nusq,orch,podg,phsu,rcpt,sosy,topp,virs,vita --JSONn') #--sldiID
INSTANCE.form['SingLinePMID'] = True

trial_strs = [f"{k}|{normalize(v)}\n" for k,v in trial_info.items()]
with open(f'{sysvars.ct_files_path}metamap_trials.txt','w') as f:
f.writelines(trial_strs)

# Update MetaMap results if required
if update_metamap:
if os.path.exists(f'{sysvars.ct_files_path}metamap_trials_out.json'):
os.remove(f'{sysvars.ct_files_path}metamap_trials_out.json')
print('INITIATING UPDATE... METAMAP_TRIALS_OUT.JSON REMOVED')

# Run MetaMap and store results
if not os.path.exists(f'{sysvars.ct_files_path}metamap_trials_out.json'):
INSTANCE.set_batch_file(f'{sysvars.ct_files_path}metamap_trials.txt') #metamap_cond.txt
print('METAMAP JOB SUBMITTED')
response = INSTANCE.submit()

try:
data = response.content.decode().replace("\n"," ")
data = re.search(r"({.+})", data).group(0)

except Exception as e:
print(e)
data = None

try:
data = json.loads(data)
with open(f'{sysvars.ct_files_path}metamap_trials_out.json','w') as f:
json.dump(data,f)
data = data['AllDocuments']

except Exception as e:
print(e)

else:
print('USING PREVIOUSLY CREATED METAMAP_TRIALS_OUT.JSON')
with open(f'{sysvars.ct_files_path}metamap_trials_out.json','r') as f:
data = ijson.items(f,'AllDocuments.item')

# Process MetaMap results and update database
for idx, entry in enumerate(data):
print(f'{str(idx)}')
utterances = entry['Document']['Utterances'][0]
utt_text = utterances['UttText']
print(utt_text)
phrases = utterances['Phrases']

nctid = utterances['PMID']

meta_single_trial = dict()
cleaned_meta_single_trial = dict()
for phrase in phrases:
if len(phrase['Mappings']) > 0:
for phr in phrase['Mappings']:
meta_term = phr['MappingCandidates'][0]['CandidatePreferred']
meta_cui = phr['MappingCandidates'][0]['CandidateCUI']
meta_score = int(phr['MappingScore'][1:])
meta_types = convert_semantic_types(phr['MappingCandidates'][0]['SemTypes'])
meta_single_trial[meta_cui] = {'term':meta_term.replace('\'',''), 'score':meta_score, 'types':meta_types, 'nctid':nctid}

for k,v in meta_single_trial.items():
if not k in cleaned_meta_single_trial:
cleaned_meta_single_trial[k] = v

add_metamap_annotation(db, cleaned_meta_single_trial)
print('------------------------')
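
Note the cached branch streams documents with ijson rather than loading the whole MetaMap result at once, which keeps memory flat on large batches; in miniature:

import io
import ijson

# ijson yields one AllDocuments entry at a time instead of materializing
# the full JSON payload in memory.
payload = io.BytesIO(b'{"AllDocuments": [{"Document": {"id": 1}}, {"Document": {"id": 2}}]}')
for entry in ijson.items(payload, "AllDocuments.item"):
    print(entry["Document"]["id"])  # 1, then 2
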



@@ -872,7 +968,7 @@ def condition_map(db, update_metamap=True):

print('RUNNING METAMAP')
# Fetch conditions from the database that haven't already been annotated and are not acronyms
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Condition_Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
cond_strs = [f"{i['cond_id']}|{normalize(i['condition'])}\n" for i in res if not is_acronym(i['condition'])]

# Write condition strings to a file for MetaMap processing
@@ -937,8 +1033,6 @@ def condition_map(db, update_metamap=True):
db.run(query)

print('CREATING AND CONNECTING METAMAP ANNOTATIONS')
# Delete existing annotations DONT NEED, REMOVE STEP
#db.run('MATCH (x:Annotation) DETACH DELETE x')
# Fetch relevant data from Condition nodes
res = db.run('MATCH (x:Condition) WHERE x.METAMAP_OUTPUT IS NOT NULL RETURN ID(x) AS cond_id, x.METAMAP_OUTPUT AS cumls, x.METAMAP_PREFERRED_TERM AS prefs, x.FUZZY_SCORE as fuzz, x.METAMAP_SCORE as meta').data()

@@ -963,10 +1057,10 @@
gard_ids = gard_ids['gard_id']
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Condition_Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Condition_Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

print('REMOVING UNNEEDED PROPERTIES')
# Remove unnecessary properties from Condition nodes that were used during processing
@@ -980,7 +1074,7 @@ def condition_map(db, update_metamap=True):
for entry in res:
cond_id = entry['cond_id']
cond = entry['cond']
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Condition_Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
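
The FUZZY_SCORE carried on has_annotation comes from fuzzywuzzy-style string similarity (the module imports fuzz above); a sketch of that kind of scoring, since the exact call isn't shown in these hunks:

from fuzzywuzzy import fuzz

# token_sort_ratio ignores token order and punctuation, so reordered
# condition names still score as exact matches.
gard_name = "Duchenne muscular dystrophy"
condition = "Muscular Dystrophy, Duchenne"
print(fuzz.token_sort_ratio(gard_name, condition))  # 100
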



File renamed without changes.
15 changes: 12 additions & 3 deletions RDAS.CTKG/update.py → RDAS_CTKG/update.py
100644 → 100755
@@ -2,6 +2,7 @@
import sys
workspace = os.path.dirname(os.path.abspath(__file__))
sys.path.append(workspace)
sys.path.append('/home/leadmandj/RDAS/')
from AlertCypher import AlertCypher
from src import data_model as dm
from datetime import date,datetime
@@ -168,6 +169,7 @@ def main():

# Add brand new trials
print('Adding non existent trials in database')
metamap_trials = dict()
for idx,ID in enumerate(ids_to_add):
if idx < clinical_add_progress:
continue
@@ -176,13 +178,18 @@
print(idx, ID)

trial_info = rdas.extract_fields(ID)
metamap_trials[ID] = trial_info['OfficialTitle'] + ' ' + trial_info['BriefSummary']

if trial_info:
print(f'Adding {ID}...')
for node_type in dm.node_names:
data_string = rdas.format_node_data(db,today,trial_info,node_type,ID)
else:
print('Error in add for finding full trial data for ' + ID)

# Generates MetaMap Annotations for all of the new Clinical Trials
rdas.metamap_trial_annotation(db, metamap_trials)

# Update trials already in the database
print('Updating trials already in database')
# Starts a new file if file exists but in_progress is false
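
One caveat that applies to both this add loop and the update loop below: metamap_trials[ID] is built from trial_info before the if trial_info: guard, so an ID whose extract_fields call returns nothing would raise a TypeError before the error branch ever prints. A defensive sketch, reusing the names from this hunk and assuming extract_fields returns a dict or None:

trial_info = rdas.extract_fields(ID)
if trial_info:
    # Build the MetaMap input only once the trial data is known to exist.
    metamap_trials[ID] = (trial_info.get("OfficialTitle", "") + " "
                          + trial_info.get("BriefSummary", ""))
    for node_type in dm.node_names:
        data_string = rdas.format_node_data(db, today, trial_info, node_type, ID)
else:
    print("Error in add for finding full trial data for " + ID)
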
@@ -211,24 +218,26 @@ def main():
wr = csv.writer(f,delimiter="\n")
wr.writerow([ID])

metamap_trials = dict()
for idx,ID in enumerate(required_updates_nctids):
if idx < clinical_update_progress:
continue
db.setConf('UPDATE_PROGRESS', 'clinical_update_progress', str(idx))
print(idx, ID)

trial_info = rdas.extract_fields(ID)
metamap_trials[ID] = trial_info['OfficialTitle'] + ' ' + trial_info['BriefSummary']

if trial_info:
for node_type in dm.node_names:
data_string = rdas.format_node_data(db,today,trial_info,node_type,ID,update=True)
else:
print('Error in add for finding full trial data for ' + ID)


#BELOW CREATES HISTORY NODE, POSTPONED FOR NOW

#create_history_query = 'MATCH (x:ClinicalTrial {{NCTId:\"{ID}\"}}) CREATE (y:History) SET y=properties(x) CREATE (z:ClinicalTrial {data_string}) MERGE (y)<-[:updated_from]-(x) SET x=properties(z) SET x.DateCreatedRDAS=\"{today}\" SET x.LastUpdatedRDAS=\"{today}\" DELETE z return y'.format(ID=ID,data_string=data_string,today=today)
#db.run(create_history_query)

rdas.metamap_trial_annotation(db, metamap_trials)

# Perform condition mapping
if clinical_current_step == '':
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
