Devon_dev #57

Merged 16 commits on May 30, 2024
70 changes: 44 additions & 26 deletions .gitignore
@@ -13,37 +13,36 @@

# End of https://www.gitignore.io/api/visualstudiocode
personal-config.ini
environment_setup.py
clinical/src/full_trial_data/
clinical/src/chromedriver
clinical/src/ctgov_nctids.json
clinical/src/ctgov_webscraped_names.csv
clinical/src/all_queries/
clinical/src/metamap_cond.txt
clinical/src/metamap_gard.txt
clinical/src/metamap_cond_out.json
pubmed/src/
gard/src/*.csv
gard/src/*.json
gard/src/*.txt
RDAS.CTKG/src/full_trial_data/
RDAS.CTKG/src/chromedriver
RDAS.CTKG/src/ctgov_nctids.json
RDAS.CTKG/src/ctgov_webscraped_names.csv
RDAS.CTKG/src/all_queries/
RDAS.CTKG/src/metamap_cond.txt
RDAS.CTKG/src/metamap_gard.txt
RDAS.CTKG/src/metamap_cond_out.json
RDAS.PAKG/src/
RDAS.CTKG/src/*.csv
RDAS.CTKG/src/*.json
RDAS.CTKG/src/*.txt
__pycache__/
clinical/__pycache__/
grant/__pycache__/
pubmed/__pycache__/
pubmed/init_new.py
pubmed/methods_new.py
pubmed/archive/
grant/archive/
gard/__pycache__/
RDAS.CTKG/__pycache__/
RDAS.GFKG/__pycache__/
RDAS.PAKG/__pycache__/
RDAS.PAKG/init_new.py
RDAS.PAKG/methods_new.py
RDAS.PAKG/archive/
RDAS.GFKG/archive/
RDAS.CTKG/__pycache__/
email/__pycache__/
transfer/*.dump
backup/*/*.dump
migrated/*.dump
approved/*.dump
grant/src/**/**/*.csv
grant/src/**/**/*.json
grant/src/**/**/*.zip
RDAS.GFKG/src/**/**/*.csv
RDAS.GFKG/src/**/**/*.json
RDAS.GFKG/src/**/**/*.zip
grant_2024/src/**/**/*.csv
grant_2024/src/**/**/*.json
crt/*.json
@@ -76,10 +75,29 @@ add_participant.txt
epifix.py
grant_pipeline.txt
result.json
pubmed/test_affiliation.py
pubmed/test_genereview.py
RDAS.PAKG/test_affiliation.py
RDAS.PAKG/test_genereview.py
test_affiliation.txt
logs/
terms_mapped.csv
terms_unmapped.csv
test_term_map.py
RDAS.CTKG/src/ids_to_add.csv
RDAS.CTKG/src/ids_to_update.csv

RDAS.CTKG/src/ids_to_update_confirmed.csv
GARD_disease_classification.csv
gather_RDAS.GFKG_funding_remove.py
RDAS.GFKG_funding_rdas.csv
fixyear.py
fixRDAS.PAKGapi.py
config.ini
Cluster 7.csv
codecamp.py
cluster_trials.csv
fixpubmedapi.py
gather_grant_funding_remove.py
grant_funding_rdas.csv
thingforyanji.py
deliverablesyanji
NCATS COLLABORATION
File renamed without changes.
84 changes: 57 additions & 27 deletions clinical/methods.py → RDAS.CTKG/methods.py
@@ -22,6 +22,7 @@
from nltk.stem import PorterStemmer
nltk.download("punkt")
from spacy.matcher import Matcher
import spacy
from fuzzywuzzy import fuzz
import string
from transformers import AutoTokenizer, AutoModelForTokenClassification
@@ -128,23 +129,29 @@ def get_nctids(name_list):
response = requests.get(initial_query + query_end1).text.splitlines()
total_trials = int(response[4][16:-1])

# Add trials to a temporary list
trials = list()
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Break into extra queries of 1000 trials if necessary
for rank in range(1, total_trials//1000 + 1):
# Get next 1000 trials
query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
response = requests.get(initial_query + query_end2).text.splitlines()

# Add trials to the temporary list
try:
# Add trials to a temporary list
trials = list()
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Add the trials from the temporary list to the overall list
all_trials += trials
# Break into extra queries of 1000 trials if necessary
for rank in range(1, total_trials//1000 + 1):
# Get next 1000 trials
query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
response = requests.get(initial_query + query_end2).text.splitlines()

# Add trials to the temporary list
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Add the trials from the temporary list to the overall list
all_trials += trials

except Exception as e:
print(e)
print(initial_query + query_end2)
print(trial)

# Return the list of all retrieved NCTIDs
return all_trials
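
For reference, the pagination above walks the legacy ClinicalTrials.gov query API in 1,000-record windows via `min_rnk`/`max_rnk`. A minimal standalone sketch of the same pattern against the classic `study_fields` endpoint (the search expression and field list are illustrative, not this module's exact query):

```python
# Minimal pagination sketch for the legacy ClinicalTrials.gov study_fields
# endpoint; the search expression and field list are illustrative.
import requests

def fetch_nctids(expr, page_size=1000):
    base = ('https://clinicaltrials.gov/api/query/study_fields'
            f'?expr={expr}&fields=NCTId&fmt=json')
    nctids, rank = [], 1
    while True:
        url = f'{base}&min_rnk={rank}&max_rnk={rank + page_size - 1}'
        payload = requests.get(url).json()['StudyFieldsResponse']
        studies = payload.get('StudyFields', [])
        if not studies:
            break  # past the last page of results
        for study in studies:
            nctids.extend(study['NCTId'])
        rank += page_size
    return nctids
```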
@@ -256,6 +263,19 @@ def extract_fields(nctid):
return full_trial


def get_lastupdated_postdate (ID):
postdate_query = f'https://clinicaltrials.gov/api/query/field_values?expr={ID}&field=LastUpdatePostDate&fmt=json'
try:
# Make the API request and parse the JSON response
full_response = requests.get(postdate_query).json()
postdate = full_response['FieldValuesResponse']['FieldValues'][0]['FieldValue']

return postdate

except ValueError:
# Return None if there is an issue with the JSON response
return None
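
A short usage sketch of the new helper (the NCT IDs are placeholders); note that only `ValueError` is caught, so the `None` fallback covers malformed JSON rather than a missing key in the payload:

```python
# Usage sketch; the NCT IDs are placeholders.
for nctid in ['NCT00000102', 'NCT04000165']:
    postdate = get_lastupdated_postdate(nctid)
    if postdate:
        print(f'{nctid}: last update posted {postdate}')
    else:
        print(f'{nctid}: no LastUpdatePostDate returned')
```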



def cypher_generate(db,now,NCTID,data,node_type,update=None,return_single=None):
@@ -432,6 +452,7 @@ def unpack_nested_data (db, now, nctid, trial, node_type):
#ALSO POSTPONED
#create_leaf_nodes(db, trial, node_id, node_type)
"""
queries = None

if node_type == 'ClinicalTrial':
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
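
For context, dslim/bert-base-NER is normally paired with its token-classification model and wrapped in a pipeline; a standalone sketch of standard Hugging Face usage (not this module's exact flow), with an illustrative input sentence:

```python
# Standard Hugging Face token-classification setup for dslim/bert-base-NER;
# the input sentence is illustrative.
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')
model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
ner = pipeline('ner', model=model, tokenizer=tokenizer,
               aggregation_strategy='simple')

for entity in ner('A phase 2 trial sponsored by the Mayo Clinic in Rochester, Minnesota.'):
    print(entity['entity_group'], entity['word'], round(float(entity['score']), 3))
```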
@@ -835,22 +856,23 @@ def condition_map(db, update_metamap=True):

print('RUNNING GARD POPULATION')
# Fetch GARD entries from the database
gard_res = gard_db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms, x.UMLS as gUMLS, x.UMLS_Source as usource')
gard_res = gard_db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.UMLS as gUMLS, x.GardName as GardName, x.Synonyms as Synonyms, x.UMLS_Source as usource')
for gres in gard_res.data():
gUMLS = gres['gUMLS']
name = gres['GardName']
gard_id = gres['GardId']
syns = gres['Synonyms']
usource = gres['usource']

# Check if UMLS data is present and create GARD node accordingly
if gUMLS:
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS:{gUMLS},UMLS_Source:\"{usource}\"}})'.format(name=gres['GardName'],gard_id=gres['GardId'],syns=gres['Synonyms'],gUMLS=gres['gUMLS'],usource=gres['usource']))
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS:{gUMLS},UMLS_Source:\"{usource}\"}})'.format(name=name,gard_id=gard_id,syns=syns,gUMLS=gUMLS,usource=usource))
else:
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS_Source:\"{usource}\"}})'.format(name=gres['GardName'],gard_id=gres['GardId'],syns=gres['Synonyms'],usource=gres['usource']))
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS_Source:\"{usource}\"}})'.format(name=name,gard_id=gard_id,syns=syns,usource=usource))

print('RUNNING METAMAP')
# Fetch conditions from the database
res = db.run('MATCH (c:Condition) RETURN c.Condition as condition, ID(c) as cond_id')
# Fetch conditions from the database that haven't already been annotated and are not acronyms
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
cond_strs = [f"{i['cond_id']}|{normalize(i['condition'])}\n" for i in res if not is_acronym(i['condition'])]

# Write condition strings to a file for MetaMap processing
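
As an aside, the MERGE statements above interpolate values with `str.format` and escaped quotes, which is fragile when disease names contain quotes. Where the driver allows it, passing parameters is safer; a sketch assuming the official neo4j Python driver (the repo's `db.run` wrapper may not accept parameters), with placeholder credentials and values:

```python
# Parameterized-Cypher sketch; assumes the official neo4j driver and uses
# placeholder credentials and values.
from neo4j import GraphDatabase

driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'password'))
with driver.session() as session:
    session.run(
        'MERGE (x:GARD {GardId: $gard_id}) '
        'SET x.GardName = $name, x.Synonyms = $syns, '
        'x.UMLS = $gumls, x.UMLS_Source = $usource',
        gard_id='0000001', name='Example disease',
        syns=['example synonym'], gumls=['C0000001'], usource='UMLS_API',
    )
driver.close()
```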
@@ -915,8 +937,8 @@ def condition_map(db, update_metamap=True):
db.run(query)

print('CREATING AND CONNECTING METAMAP ANNOTATIONS')
# Delete existing annotations
db.run('MATCH (x:Annotation) DETACH DELETE x')
# Delete existing annotations (not needed anymore; step removed)
#db.run('MATCH (x:Annotation) DETACH DELETE x')
# Fetch relevant data from Condition nodes
res = db.run('MATCH (x:Condition) WHERE x.METAMAP_OUTPUT IS NOT NULL RETURN ID(x) AS cond_id, x.METAMAP_OUTPUT AS cumls, x.METAMAP_PREFERRED_TERM AS prefs, x.FUZZY_SCORE as fuzz, x.METAMAP_SCORE as meta').data()

@@ -941,10 +963,10 @@
gard_ids = gard_ids['gard_id']
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS: \"{umls}\", CandidatePreferred: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS: \"{umls}\", CandidatePreferred: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

print('REMOVING UNNEEDED PROPERTIES')
# Remove unnecessary properties from Condition nodes that were used during processing
@@ -958,7 +980,7 @@ def condition_map(db, update_metamap=True):
for entry in res:
cond_id = entry['cond_id']
cond = entry['cond']
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{CandidatePreferred: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))



@@ -1006,6 +1028,7 @@ def create_drug_connection(db,rxdata,drug_id,wspacy=False):

# Create or merge Drug node with RxNormID
db.run('MATCH (x:Intervention) WHERE ID(x)={drug_id} MERGE (y:Drug {{RxNormID:{rxnormid}}}) MERGE (y)<-[:mapped_to_rxnorm {{WITH_SPACY: {wspacy}}}]-(x)'.format(rxnormid=rxnormid, drug_id=drug_id, wspacy=wspacy))
print(f'MAPPED {rxnormid}')

# Set additional properties on the Drug node
for k,v in rxdata.items():
@@ -1090,7 +1113,7 @@ def nlp_to_drug(db,doc,matches,drug_name,drug_id):



def rxnorm_map(db):
def rxnorm_map(db, rxnorm_progress):
"""
Map RxNorm data to Drug Interventions in the Neo4j database.

@@ -1109,11 +1132,18 @@ def rxnorm_map(db):
matcher = Matcher(nlp.vocab)
matcher.add('DRUG',[pattern])

# Retrieve drug interventions from the database
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" RETURN x.InterventionName, ID(x)').data()
# Retrieve drug interventions from the database that do NOT already have a Drug node attached
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data()
length = len(results)

# Iterate over drug interventions and map RxNorm data
for idx,res in enumerate(results):
if idx < rxnorm_progress:
continue

print(f'{str(idx)}/{length}')
db.setConf('UPDATE_PROGRESS', 'clinical_rxnorm_progress', str(idx))

drug_id = res['ID(x)']
drug = res['x.InterventionName']

File renamed without changes.