Devon_dev (#57)
* Updated grant GardPreprocessor code

* remove comment

* clinical and grant pipeline improvements

* Update to File Transfer Code

* file transfer v2

* File Transfer Fix v3

* File Transfer Fix v4

* File Transfer Fix v4 - Typo

* neo4j-test transfer detect fix

* cluster seeding modification

* more transfer pipeline fixes

* config and sysvar file change

* small pubmed pipeline fix

* dev-test-prod pipeline done

* Database Renaming

---------

Co-authored-by: Devon Joseph Leadman <[email protected]>
Co-authored-by: Devon Joseph Leadman-m <[email protected]>
3 people authored May 30, 2024
1 parent daf8966 commit d410a7e
Showing 73 changed files with 281,323 additions and 497 deletions.
70 changes: 44 additions & 26 deletions .gitignore
@@ -13,37 +13,36 @@

# End of https://www.gitignore.io/api/visualstudiocode
personal-config.ini
environment_setup.py
clinical/src/full_trial_data/
clinical/src/chromedriver
clinical/src/ctgov_nctids.json
clinical/src/ctgov_webscraped_names.csv
clinical/src/all_queries/
clinical/src/metamap_cond.txt
clinical/src/metamap_gard.txt
clinical/src/metamap_cond_out.json
pubmed/src/
gard/src/*.csv
gard/src/*.json
gard/src/*.txt
RDAS.CTKG/src/full_trial_data/
RDAS.CTKG/src/chromedriver
RDAS.CTKG/src/ctgov_nctids.json
RDAS.CTKG/src/ctgov_webscraped_names.csv
RDAS.CTKG/src/all_queries/
RDAS.CTKG/src/metamap_cond.txt
RDAS.CTKG/src/metamap_gard.txt
RDAS.CTKG/src/metamap_cond_out.json
RDAS.PAKG/src/
RDAS.CTKG/src/*.csv
RDAS.CTKG/src/*.json
RDAS.CTKG/src/*.txt
__pycache__/
clinical/__pycache__/
grant/__pycache__/
pubmed/__pycache__/
pubmed/init_new.py
pubmed/methods_new.py
pubmed/archive/
grant/archive/
gard/__pycache__/
RDAS.CTKG/__pycache__/
RDAS.GFKG/__pycache__/
RDAS.PAKG/__pycache__/
RDAS.PAKG/init_new.py
RDAS.PAKG/methods_new.py
RDAS.PAKG/archive/
RDAS.GFKG/archive/
RDAS.CTKG/__pycache__/
email/__pycache__/
transfer/*.dump
backup/*/*.dump
migrated/*.dump
approved/*.dump
grant/src/**/**/*.csv
grant/src/**/**/*.json
grant/src/**/**/*.zip
RDAS.GFKG/src/**/**/*.csv
RDAS.GFKG/src/**/**/*.json
RDAS.GFKG/src/**/**/*.zip
grant_2024/src/**/**/*.csv
grant_2024/src/**/**/*.json
crt/*.json
@@ -76,10 +75,29 @@ add_participant.txt
epifix.py
grant_pipeline.txt
result.json
pubmed/test_affiliation.py
pubmed/test_genereview.py
RDAS.PAKG/test_affiliation.py
RDAS.PAKG/test_genereview.py
test_affiliation.txt
logs/
terms_mapped.csv
terms_unmapped.csv
test_term_map.py
RDAS.CTKG/src/ids_to_add.csv
RDAS.CTKG/src/ids_to_update.csv

RDAS.CTKG/src/ids_to_update_confirmed.csv
GARD_disease_classification.csv
gather_RDAS.GFKG_funding_remove.py
RDAS.GFKG_funding_rdas.csv
fixyear.py
fixRDAS.PAKGapi.py
config.ini
Cluster 7.csv
codecamp.py
cluster_trials.csv
fixpubmedapi.py
gather_grant_funding_remove.py
grant_funding_rdas.csv
thingforyanji.py
deliverablesyanji
NCATS COLLABORATION
File renamed without changes.
84 changes: 57 additions & 27 deletions clinical/methods.py → RDAS.CTKG/methods.py
@@ -22,6 +22,7 @@
from nltk.stem import PorterStemmer
nltk.download("punkt")
from spacy.matcher import Matcher
import spacy
from fuzzywuzzy import fuzz
import string
from transformers import AutoTokenizer, AutoModelForTokenClassification
@@ -128,23 +129,29 @@ def get_nctids(name_list):
response = requests.get(initial_query + query_end1).text.splitlines()
total_trials = int(response[4][16:-1])

# Add trials to a temporary list
trials = list()
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Break into extra queries of 1000 trials if necessary
for rank in range(1, total_trials//1000 + 1):
# Get next 1000 trials
query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
response = requests.get(initial_query + query_end2).text.splitlines()

# Add trials to the temporary list
try:
# Add trials to a temporary list
trials = list()
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Add the trials from the temporary list to the overall list
all_trials += trials
# Break into extra queries of 1000 trials if necessary
for rank in range(1, total_trials//1000 + 1):
# Get next 1000 trials
query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
response = requests.get(initial_query + query_end2).text.splitlines()

# Add trials to the temporary list
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Add the trials from the temporary list to the overall list
all_trials += trials

except Exception as e:
print(e)
print(initial_query + query_end2)
print(trial)

# Return the list of all retrieved NCTIDs
return all_trials
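For context, the loop above pages through the classic (pre-v2) ClinicalTrials.gov API, which returns at most 1,000 records per request; a minimal sketch of the rank-window arithmetic it relies on (rank_windows is an illustrative helper, not part of this repo):

# Illustrative helper mirroring the paging arithmetic in get_nctids above:
# windows are 1-1000, 1001-2000, ... until total_trials is covered.
def rank_windows(total_trials, page_size=1000):
    """Yield (min_rnk, max_rnk) pairs covering total_trials records."""
    for rank in range(total_trials // page_size + 1):
        yield rank * page_size + 1, (rank + 1) * page_size

print(list(rank_windows(2345)))  # [(1, 1000), (1001, 2000), (2001, 3000)]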
@@ -256,6 +263,19 @@ def extract_fields(nctid):
return full_trial


def get_lastupdated_postdate (ID):
postdate_query = f'https://clinicaltrials.gov/api/query/field_values?expr={ID}&field=LastUpdatePostDate&fmt=json'
try:
# Make the API request and parse the JSON response
full_response = requests.get(postdate_query).json()
postdate = full_response['FieldValuesResponse']['FieldValues'][0]['FieldValue']

return postdate

except ValueError:
# Return None if there is an issue with the JSON response
return None
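The helper's JSON parse assumes the classic field_values response nesting; a hedged illustration with a fabricated payload (structure only, not captured API output):

# Fabricated example of the nesting get_lastupdated_postdate navigates;
# only the key structure is meaningful here.
sample = {
    'FieldValuesResponse': {
        'FieldValues': [
            {'FieldValue': 'May 30, 2024'},  # hypothetical post date
        ],
    },
}
print(sample['FieldValuesResponse']['FieldValues'][0]['FieldValue'])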



def cypher_generate(db,now,NCTID,data,node_type,update=None,return_single=None):
@@ -432,6 +452,7 @@ def unpack_nested_data (db, now, nctid, trial, node_type):
#ALSO POSTPONED
#create_leaf_nodes(db, trial, node_id, node_type)
"""
queries = None

if node_type == 'ClinicalTrial':
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
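For reference, the checkpoint loaded above for ClinicalTrial nodes is the dslim/bert-base-NER token classifier; a minimal hedged sketch of exercising it through the transformers pipeline helper (the input sentence is illustrative, and the model downloads on first run):

# Minimal sketch using the same checkpoint the hunk above loads.
# aggregation_strategy='simple' merges word-piece tokens into entity spans.
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')
model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='simple')
print(ner('The trial enrolled patients at Mayo Clinic in Rochester.'))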
@@ -835,22 +856,23 @@ def condition_map(db, update_metamap=True):

print('RUNNING GARD POPULATION')
# Fetch GARD entries from the database
gard_res = gard_db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms, x.UMLS as gUMLS, x.UMLS_Source as usource')
gard_res = gard_db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.UMLS as gUMLS, x.GardName as GardName, x.Synonyms as Synonyms, x.UMLS_Source as usource')
for gres in gard_res.data():
gUMLS = gres['gUMLS']
name = gres['GardName']
gard_id = gres['GardId']
syns = gres['Synonyms']
usource = gres['usource']

# Check if UMLS data is present and create GARD node accordingly
if gUMLS:
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS:{gUMLS},UMLS_Source:\"{usource}\"}})'.format(name=gres['GardName'],gard_id=gres['GardId'],syns=gres['Synonyms'],gUMLS=gres['gUMLS'],usource=gres['usource']))
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS:{gUMLS},UMLS_Source:\"{usource}\"}})'.format(name=name,gard_id=gard_id,syns=syns,gUMLS=gUMLS,usource=usource))
else:
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS_Source:\"{usource}\"}})'.format(name=gres['GardName'],gard_id=gres['GardId'],syns=gres['Synonyms'],usource=gres['usource']))
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS_Source:\"{usource}\"}})'.format(name=name,gard_id=gard_id,syns=syns,usource=usource))

print('RUNNING METAMAP')
# Fetch conditions from the database
res = db.run('MATCH (c:Condition) RETURN c.Condition as condition, ID(c) as cond_id')
# Fetch conditions from the database that haven't already been annotated and are not acronyms
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
cond_strs = [f"{i['cond_id']}|{normalize(i['condition'])}\n" for i in res if not is_acronym(i['condition'])]

# Write condition strings to a file for MetaMap processing
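The comprehension above emits MetaMap's pipe-delimited id|term input; a hedged sketch of that format, with simplified stand-ins for the repo's normalize and is_acronym helpers:

# Hedged sketch of the id|term lines written for MetaMap; normalize and
# is_acronym below are simplified stand-ins, not the repo's implementations.
rows = [{'cond_id': 101, 'condition': 'Fabry Disease'},
        {'cond_id': 102, 'condition': 'ALS'}]

def normalize(s):
    return s.lower().strip()

def is_acronym(s):
    return s.isupper() and len(s) <= 5

cond_strs = [f"{r['cond_id']}|{normalize(r['condition'])}\n"
             for r in rows if not is_acronym(r['condition'])]
print(cond_strs)  # ['101|fabry disease\n']  (the acronym 'ALS' is skipped)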
@@ -915,8 +937,8 @@ def condition_map(db, update_metamap=True):
db.run(query)

print('CREATING AND CONNECTING METAMAP ANNOTATIONS')
# Delete existing annotations
db.run('MATCH (x:Annotation) DETACH DELETE x')
# Deleting existing annotations is no longer needed; this step has been removed
#db.run('MATCH (x:Annotation) DETACH DELETE x')
# Fetch relevant data from Condition nodes
res = db.run('MATCH (x:Condition) WHERE x.METAMAP_OUTPUT IS NOT NULL RETURN ID(x) AS cond_id, x.METAMAP_OUTPUT AS cumls, x.METAMAP_PREFERRED_TERM AS prefs, x.FUZZY_SCORE as fuzz, x.METAMAP_SCORE as meta').data()

@@ -941,10 +963,10 @@
gard_ids = gard_ids['gard_id']
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS: \"{umls}\", CandidatePreferred: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS: \"{umls}\", CandidatePreferred: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

print('REMOVING UNNEEDED PROPERTIES')
# Remove unnecessary properties from Condition nodes that were used during processing
Expand All @@ -958,7 +980,7 @@ def condition_map(db, update_metamap=True):
for entry in res:
cond_id = entry['cond_id']
cond = entry['cond']
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{CandidatePreferred: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
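Design note: the Cypher in condition_map is interpolated with str.format, which forces the escaped quoting seen above; the same string-match MERGE can instead be written with query parameters. A hedged sketch using the official neo4j Python driver (URI, credentials, and values are placeholders, not this repo's config):

# Hedged sketch: the string-match Annotation MERGE above, rewritten with
# driver parameters. Connection details and values are placeholders.
from neo4j import GraphDatabase

driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'password'))
query = (
    'MATCH (x:GARD) WHERE toLower(x.GardName) = toLower($cond) '
    'MATCH (y:Condition) WHERE ID(y) = $cond_id '
    'MERGE (z:Annotation {UMLSPreferredName: $cond, MATCH_TYPE: "STRING"}) '
    'MERGE (z)<-[:has_annotation]-(y) '
    'MERGE (x)<-[:mapped_to_gard]-(z)'
)
with driver.session() as session:
    session.run(query, cond='cystic fibrosis', cond_id=42)  # illustrative values
driver.close()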



@@ -1006,6 +1028,7 @@ def create_drug_connection(db,rxdata,drug_id,wspacy=False):

# Create or merge Drug node with RxNormID
db.run('MATCH (x:Intervention) WHERE ID(x)={drug_id} MERGE (y:Drug {{RxNormID:{rxnormid}}}) MERGE (y)<-[:mapped_to_rxnorm {{WITH_SPACY: {wspacy}}}]-(x)'.format(rxnormid=rxnormid, drug_id=drug_id, wspacy=wspacy))
print(f'MAPPED {rxnormid}')

# Set additional properties on the Drug node
for k,v in rxdata.items():
@@ -1090,7 +1113,7 @@ def nlp_to_drug(db,doc,matches,drug_name,drug_id):



def rxnorm_map(db):
def rxnorm_map(db, rxnorm_progress):
"""
Map RxNorm data to Drug Interventions in the Neo4j database.
@@ -1109,11 +1132,18 @@ def rxnorm_map(db):
matcher = Matcher(nlp.vocab)
matcher.add('DRUG',[pattern])

# Retrieve drug interventions from the database
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" RETURN x.InterventionName, ID(x)').data()
# Retrieve drug interventions from the database that do NOT already have a Drug node attached
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data()
length = len(results)

# Iterate over drug interventions and map RxNorm data
for idx,res in enumerate(results):
if idx < rxnorm_progress:
continue

print(f'{str(idx)}/{length}')
db.setConf('UPDATE_PROGRESS', 'clinical_rxnorm_progress', str(idx))

drug_id = res['ID(x)']
drug = res['x.InterventionName']

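The new rxnorm_progress parameter and setConf call make the mapper resumable: rows finished in an earlier run are skipped, and the loop persists its index as it goes. A minimal standalone sketch of that checkpoint pattern (the config path, section, and key are illustrative, not this repo's sysvars file):

# Minimal sketch of the checkpoint/resume loop used by rxnorm_map above.
# Progress is persisted per item so an interrupted run can pick up where it left off.
import configparser

CONF_PATH = 'progress.ini'  # illustrative path

def read_progress(section='UPDATE_PROGRESS', key='clinical_rxnorm_progress'):
    conf = configparser.ConfigParser()
    conf.read(CONF_PATH)
    return conf.getint(section, key, fallback=0)

def write_progress(idx, section='UPDATE_PROGRESS', key='clinical_rxnorm_progress'):
    conf = configparser.ConfigParser()
    conf.read(CONF_PATH)
    if not conf.has_section(section):
        conf.add_section(section)
    conf.set(section, key, str(idx))
    with open(CONF_PATH, 'w') as f:
        conf.write(f)

items = ['aspirin', 'ibuprofen', 'acetaminophen']  # stand-ins for Intervention rows
start = read_progress()
for idx, item in enumerate(items):
    if idx < start:
        continue            # already processed in a previous run
    write_progress(idx)     # checkpoint before doing the work
    print(f'{idx}/{len(items)} {item}')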
File renamed without changes.