Commit
file restructuring
Devon Joseph Leadman committed May 30, 2024
1 parent 8a02151 commit 50f42f5
Showing 106 changed files with 16,982 additions and 243 deletions.
73 changes: 40 additions & 33 deletions .gitignore
@@ -13,36 +13,37 @@

# End of https://www.gitignore.io/api/visualstudiocode
personal-config.ini
RDAS.CTKG/src/full_trial_data/
RDAS.CTKG/src/chromedriver
RDAS.CTKG/src/ctgov_nctids.json
RDAS.CTKG/src/ctgov_webscraped_names.csv
RDAS.CTKG/src/all_queries/
RDAS.CTKG/src/metamap_cond.txt
RDAS.CTKG/src/metamap_gard.txt
RDAS.CTKG/src/metamap_cond_out.json
RDAS.PAKG/src/
RDAS.CTKG/src/*.csv
RDAS.CTKG/src/*.json
RDAS.CTKG/src/*.txt

RDAS_CTKG/src/full_trial_data/
RDAS_CTKG/src/chromedriver
RDAS_CTKG/src/ctgov_nctids.json
RDAS_CTKG/src/ctgov_webscraped_names.csv
RDAS_CTKG/src/all_queries/
RDAS_CTKG/src/metamap_cond.txt
RDAS_CTKG/src/metamap_gard.txt
RDAS_CTKG/src/metamap_cond_out.json
RDAS_PAKG/src/
RDAS_CTKG/src/*.csv
RDAS_CTKG/src/*.json
RDAS_CTKG/src/*.txt
__pycache__/
RDAS.CTKG/__pycache__/
RDAS.GFKG/__pycache__/
RDAS.PAKG/__pycache__/
RDAS.PAKG/init_new.py
RDAS.PAKG/methods_new.py
RDAS.PAKG/archive/
RDAS.GFKG/archive/
RDAS.CTKG/__pycache__/
RDAS_CTKG/__pycache__/
RDAS_GFKG/__pycache__/
RDAS_PAKG/__pycache__/
RDAS_PAKG/init_new.py
RDAS_PAKG/methods_new.py
RDAS_PAKG/archive/
RDAS_GFKG/archive/
RDAS_CTKG/__pycache__/
email/__pycache__/
transfer/*.dump
transfer/*.dump
backup/*/*.dump
migrated/*.dump
approved/*.dump
RDAS.GFKG/src/**/**/*.csv
RDAS.GFKG/src/**/**/*.json
RDAS.GFKG/src/**/**/*.zip
RDAS_GFKG/src/**/**/*.csv
RDAS_GFKG/src/**/**/*.json
RDAS_GFKG/src/**/**/*.zip
grant_2024/src/**/**/*.csv
grant_2024/src/**/**/*.json
crt/*.json
@@ -75,22 +76,21 @@ add_participant.txt
epifix.py
grant_pipeline.txt
result.json
RDAS.PAKG/test_affiliation.py
RDAS.PAKG/test_genereview.py
RDAS_PAKG/test_affiliation.py
RDAS_PAKG/test_genereview.py
test_affiliation.txt
logs/
terms_mapped.csv
terms_unmapped.csv
test_term_map.py
RDAS.CTKG/src/ids_to_add.csv
RDAS.CTKG/src/ids_to_update.csv

RDAS.CTKG/src/ids_to_update_confirmed.csv
RDAS_CTKG/src/ids_to_add.csv
RDAS_CTKG/src/ids_to_update.csv
RDAS_CTKG/src/ids_to_update_confirmed.csv
GARD_disease_classification.csv
gather_RDAS.GFKG_funding_remove.py
RDAS.GFKG_funding_rdas.csv
gather_grant_funding_remove.py
grant_funding_rdas.csv
fixyear.py
fixRDAS.PAKGapi.py
fixpubmedapi.py
config.ini
Cluster 7.csv
codecamp.py
@@ -100,4 +100,11 @@ gather_grant_funding_remove.py
grant_funding_rdas.csv
thingforyanji.py
deliverablesyanji
NCATS COLLABORATION
NCATS COLLABORATION
allennlp
meta_annotate.py
genereviews_epi.csv
genereviews_epi_processed.csv
fix_pubtator_type.py
etc
RDAS_GFKG/src/FineTunned_Bert_2.pt
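
A quick way to sanity-check the renamed ignore patterns is to evaluate them with git's wildmatch rules; a minimal sketch, assuming the third-party pathspec package (not something this repo uses):

import pathspec

# Evaluate a few of the renamed patterns with git-style wildmatch rules.
patterns = [
    "RDAS_CTKG/src/*.csv",
    "RDAS_GFKG/src/**/**/*.zip",
]
spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns)

print(spec.match_file("RDAS_CTKG/src/ctgov_nctids.csv"))  # True
print(spec.match_file("RDAS.CTKG/src/ctgov_nctids.csv"))  # False: the old dotted layout is no longer covered
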
10 changes: 5 additions & 5 deletions .gitmodules
100644 → 100755
@@ -1,6 +1,6 @@
[submodule "/home/leadmandj/RDAS/pubmed/NaturalHistory4GARD"]
path = pubmed/NaturalHistory4GARD
url = git@github.com:ncats/NaturalHistory4GARD.git
[submodule "pubmed/epi4GARD"]
path = pubmed/epi4GARD
[submodule "RDAS_PAKG/epi4GARD"]
path = RDAS_PAKG/epi4GARD
url = https://github.com/ncats/epi4GARD.git
[submodule "RDAS_PAKG/NaturalHistory4GARD"]
path = RDAS_PAKG/NaturalHistory4GARD
url = https://github.com/ncats/NaturalHistory4GARD.git
Empty file modified AlertCypher.py
100644 → 100755
Empty file.
1 change: 0 additions & 1 deletion RDAS.PAKG/NaturalHistory4GARD
Submodule NaturalHistory4GARD deleted from 9fff58
File renamed without changes.
108 changes: 101 additions & 7 deletions RDAS.CTKG/methods.py → RDAS_CTKG/methods.py
100644 → 100755
@@ -6,6 +6,7 @@
workspace = os.path.dirname(os.path.abspath(__file__))
print(workspace)
sys.path.append(workspace)
sys.path.append('/home/leadmandj/RDAS/')
from src import data_model as dm
import requests
import html
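
The new sys.path entry hard-codes one user's home directory, so the import only resolves on that machine; a portable sketch, assuming RDAS_CTKG/ sits directly under the repository root:

import os
import sys

# Derive the repo root from this file's location instead of hard-coding
# an absolute, user-specific path.
workspace = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(workspace)  # parent of RDAS_CTKG/
if repo_root not in sys.path:
    sys.path.append(repo_root)
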
@@ -23,7 +24,9 @@
nltk.download("punkt")
from spacy.matcher import Matcher
import spacy
import pandas as pd
from fuzzywuzzy import fuzz
import ijson
import string
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
@@ -402,7 +405,13 @@ def format_node_data(db,now,trial,node_type,NCTID,update=None,return_single=None
elif trial:
for field in fields:
if field in trial:
value = trial[field]
if field == 'Phase':
if trial[field]:
value = "; ".join(trial[field])
else:
value = "No Phase Specified"
else:
value = trial[field]
node_data[field] = value

node_data_list.append(node_data)
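
For context, ClinicalTrials.gov returns Phase as a list, and the new branch collapses it into a single string property; in miniature (the phase values here are hypothetical):

# The Phase list becomes one "; "-joined string, with a fixed fallback
# when the list is empty or missing.
for phase in (["Phase 1", "Phase 2"], []):
    value = "; ".join(phase) if phase else "No Phase Specified"
    print(value)
# Phase 1; Phase 2
# No Phase Specified
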
@@ -827,6 +836,93 @@ def umls_to_gard(db,CUI):
names.extend([gard_name])
return {'gard_id':data, 'gard_name':names}

def convert_semantic_types(type_list):
names = pd.read_csv(f'{sysvars.ct_files_path}SemanticTypes_2018AB.txt', delimiter='|', usecols=[0,2], names=['ABBR', 'FULLSEM'])
names = dict(zip(names['ABBR'], names['FULLSEM']))

temp = list()
for entry in type_list:
temp.append(names[entry])
return temp
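
convert_semantic_types expands MetaMap's abbreviations using columns 0 and 2 of the pipe-delimited SemanticTypes_2018AB.txt (abbr|id|full name); in miniature, with two standard UMLS rows:

# Miniature of the lookup convert_semantic_types builds from the file:
# abbreviation -> full semantic-type name.
names = {"dsyn": "Disease or Syndrome", "phsu": "Pharmacologic Substance"}
type_list = ["dsyn", "phsu"]
print([names[t] for t in type_list])
# ['Disease or Syndrome', 'Pharmacologic Substance']
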

def add_metamap_annotation(db, trial_info):
for k,v in trial_info.items():
concept = v['term']
score = v['score']
types = v['types']
nctid = v['nctid']
db.run(f'MATCH (y:ClinicalTrial) WHERE y.NCTId = \'{nctid}\' MERGE (x:Trial_Annotation {{umls_cui:\'{k}\', umls_concept:\'{concept}\', umls_types:{types}}}) MERGE (y)-[:has_metamap_annotation {{umls_score:{score}}}]->(x)')
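
add_metamap_annotation splices values into Cypher with an f-string, which is why the term's single quotes get stripped upstream; a parameterized sketch avoids the escaping entirely (assuming the db wrapper can forward a parameters dict the way the official neo4j driver's session.run does):

def add_metamap_annotation_param(db, trial_info):
    # Same MERGE logic, but values travel as query parameters, so no
    # quote stripping or manual escaping is needed.
    query = (
        "MATCH (y:ClinicalTrial) WHERE y.NCTId = $nctid "
        "MERGE (x:Trial_Annotation {umls_cui: $cui, umls_concept: $concept, "
        "umls_types: $types}) "
        "MERGE (y)-[:has_metamap_annotation {umls_score: $score}]->(x)"
    )
    for cui, v in trial_info.items():
        db.run(query, {"nctid": v["nctid"], "cui": cui, "concept": v["term"],
                       "types": v["types"], "score": v["score"]})
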

def metamap_trial_annotation(db, trial_info, update_metamap=True):
INSTANCE = Submission(os.environ['METAMAP_EMAIL'],os.environ['METAMAP_KEY'])
INSTANCE.init_generic_batch('metamap','-J acab,amas,aapp,anab,antb,bact,bacs,bodm,comd,chem,clnd,cgab,diap,dsyn,elii,enzy,emod,fngs,gngm,hops,horm,imft,irda,inpo,inch,inpr,mobd,mosq,neop,nnon,nusq,orch,podg,phsu,rcpt,sosy,topp,virs,vita --JSONn') #--sldiID
INSTANCE.form['SingLinePMID'] = True

trial_strs = [f"{k}|{normalize(v)}\n" for k,v in trial_info.items()]
with open(f'{sysvars.ct_files_path}metamap_trials.txt','w') as f:
f.writelines(trial_strs)

# Update MetaMap results if required
if update_metamap:
if os.path.exists(f'{sysvars.ct_files_path}metamap_trials_out.json'):
os.remove(f'{sysvars.ct_files_path}metamap_trials_out.json')
print('INITIATING UPDATE... METAMAP_TRIALS_OUT.JSON REMOVED')

# Run MetaMap and store results
if not os.path.exists(f'{sysvars.ct_files_path}metamap_trials_out.json'):
INSTANCE.set_batch_file(f'{sysvars.ct_files_path}metamap_trials.txt') #metamap_cond.txt
print('METAMAP JOB SUBMITTED')
response = INSTANCE.submit()

try:
data = response.content.decode().replace("\n"," ")
data = re.search(r"({.+})", data).group(0)

except Exception as e:
print(e)
data = None

try:
data = json.loads(data)
with open(f'{sysvars.ct_files_path}metamap_trials_out.json','w') as f:
json.dump(data,f)
data = data['AllDocuments']

except Exception as e:
print(e)

else:
print('USING PREVIOUSLY CREATED METAMAP_TRIALS_OUT.JSON')
with open(f'{sysvars.ct_files_path}metamap_trials_out.json','r') as f:
data = ijson.items(f,'AllDocuments.item')

# Process MetaMap results and update database
for idx, entry in enumerate(data):
print(f'{str(idx)}')
utterances = entry['Document']['Utterances'][0]
utt_text = utterances['UttText']
print(utt_text)
phrases = utterances['Phrases']

nctid = utterances['PMID']

meta_single_trial = dict()
cleaned_meta_single_trial = dict()
for phrase in phrases:
if len(phrase['Mappings']) > 0:
for phr in phrase['Mappings']:
meta_term = phr['MappingCandidates'][0]['CandidatePreferred']
meta_cui = phr['MappingCandidates'][0]['CandidateCUI']
meta_score = int(phr['MappingScore'][1:])
meta_types = convert_semantic_types(phr['MappingCandidates'][0]['SemTypes'])
meta_single_trial[meta_cui] = {'term':meta_term.replace('\'',''), 'score':meta_score, 'types':meta_types, 'nctid':nctid}

for k,v in meta_single_trial.items():
if not k in cleaned_meta_single_trial:
cleaned_meta_single_trial[k] = v

add_metamap_annotation(db, cleaned_meta_single_trial)
print('------------------------')
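
Note the cached branch streams documents with ijson rather than loading the whole MetaMap result at once, which keeps memory flat on large batches; in miniature:

import io
import ijson

# ijson yields one AllDocuments entry at a time instead of materializing
# the full JSON payload in memory.
payload = io.BytesIO(b'{"AllDocuments": [{"Document": {"id": 1}}, {"Document": {"id": 2}}]}')
for entry in ijson.items(payload, "AllDocuments.item"):
    print(entry["Document"]["id"])  # 1, then 2
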



@@ -872,7 +968,7 @@ def condition_map(db, update_metamap=True):

print('RUNNING METAMAP')
# Fetch conditions from the database that haven't already been annotated and are not acronyms
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Condition_Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
cond_strs = [f"{i['cond_id']}|{normalize(i['condition'])}\n" for i in res if not is_acronym(i['condition'])]

# Write condition strings to a file for MetaMap processing
@@ -937,8 +1033,6 @@ def condition_map(db, update_metamap=True):
db.run(query)

print('CREATING AND CONNECTING METAMAP ANNOTATIONS')
# Delete existing annotations DONT NEED, REMOVE STEP
#db.run('MATCH (x:Annotation) DETACH DELETE x')
# Fetch relevant data from Condition nodes
res = db.run('MATCH (x:Condition) WHERE x.METAMAP_OUTPUT IS NOT NULL RETURN ID(x) AS cond_id, x.METAMAP_OUTPUT AS cumls, x.METAMAP_PREFERRED_TERM AS prefs, x.FUZZY_SCORE as fuzz, x.METAMAP_SCORE as meta').data()

@@ -963,10 +1057,10 @@
gard_ids = gard_ids['gard_id']
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Condition_Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Condition_Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

print('REMOVING UNNEEDED PROPERTIES')
# Remove unnecessary properties from Condition nodes that were used during processing
@@ -980,7 +1074,7 @@ def condition_map(db, update_metamap=True):
for entry in res:
cond_id = entry['cond_id']
cond = entry['cond']
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Condition_Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
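
The FUZZY_SCORE carried on has_annotation comes from fuzzywuzzy-style string similarity (the module imports fuzz above); a sketch of that kind of scoring, since the exact call isn't shown in these hunks:

from fuzzywuzzy import fuzz

# token_sort_ratio ignores token order and punctuation, so reordered
# condition names still score as exact matches.
gard_name = "Duchenne muscular dystrophy"
condition = "Muscular Dystrophy, Duchenne"
print(fuzz.token_sort_ratio(gard_name, condition))  # 100
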



File renamed without changes.
15 changes: 12 additions & 3 deletions RDAS.CTKG/update.py → RDAS_CTKG/update.py
100644 → 100755
@@ -2,6 +2,7 @@
import sys
workspace = os.path.dirname(os.path.abspath(__file__))
sys.path.append(workspace)
sys.path.append('/home/leadmandj/RDAS/')
from AlertCypher import AlertCypher
from src import data_model as dm
from datetime import date,datetime
@@ -168,6 +169,7 @@ def main():

# Add brand new trials
print('Adding non existent trials in database')
metamap_trials = dict()
for idx,ID in enumerate(ids_to_add):
if idx < clinical_add_progress:
continue
@@ -176,13 +178,18 @@
print(idx, ID)

trial_info = rdas.extract_fields(ID)
metamap_trials[ID] = trial_info['OfficialTitle'] + ' ' + trial_info['BriefSummary']

if trial_info:
print(f'Adding {ID}...')
for node_type in dm.node_names:
data_string = rdas.format_node_data(db,today,trial_info,node_type,ID)
else:
print('Error in add for finding full trial data for ' + ID)

# Generates MetaMap Annotations for all of the new Clinical Trials
rdas.metamap_trial_annotation(db, metamap_trials)

# Update trials already in the database
print('Updating trials already in database')
# Starts a new file if file exists but in_progress is false
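
One caveat that applies to both this add loop and the update loop below: metamap_trials[ID] is built from trial_info before the if trial_info: guard, so an ID whose extract_fields call returns nothing would raise a TypeError before the error branch ever prints. A defensive sketch, reusing the names from this hunk and assuming extract_fields returns a dict or None:

trial_info = rdas.extract_fields(ID)
if trial_info:
    # Build the MetaMap input only once the trial data is known to exist.
    metamap_trials[ID] = (trial_info.get("OfficialTitle", "") + " "
                          + trial_info.get("BriefSummary", ""))
    for node_type in dm.node_names:
        data_string = rdas.format_node_data(db, today, trial_info, node_type, ID)
else:
    print("Error in add for finding full trial data for " + ID)
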
@@ -211,24 +218,26 @@ def main():
wr = csv.writer(f,delimiter="\n")
wr.writerow([ID])

metamap_trials = dict()
for idx,ID in enumerate(required_updates_nctids):
if idx < clinical_update_progress:
continue
db.setConf('UPDATE_PROGRESS', 'clinical_update_progress', str(idx))
print(idx, ID)

trial_info = rdas.extract_fields(ID)
metamap_trials[ID] = trial_info['OfficialTitle'] + ' ' + trial_info['BriefSummary']

if trial_info:
for node_type in dm.node_names:
data_string = rdas.format_node_data(db,today,trial_info,node_type,ID,update=True)
else:
print('Error in add for finding full trial data for ' + ID)


#BELOW CREATES HISTORY NODE, POSTPONED FOR NOW

#create_history_query = 'MATCH (x:ClinicalTrial {{NCTId:\"{ID}\"}}) CREATE (y:History) SET y=properties(x) CREATE (z:ClinicalTrial {data_string}) MERGE (y)<-[:updated_from]-(x) SET x=properties(z) SET x.DateCreatedRDAS=\"{today}\" SET x.LastUpdatedRDAS=\"{today}\" DELETE z return y'.format(ID=ID,data_string=data_string,today=today)
#db.run(create_history_query)

rdas.metamap_trial_annotation(db, metamap_trials)

# Perform condition mapping
if clinical_current_step == '':
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
