Devon_dev (#57)
* Updated grant GardPreprocessor code

* remove comment

* clinical and grant pipeline improvements

* Update to File Transfer Code

* file transfer v2

* File Transfer Fix v3

* File Transfer Fix v4

* File Transfer Fix v4 - Typo

* neo4j-test transfer detect fix

* cluster seeding modification

* more transfer pipeline fixes

* config and sysvar file change

* small pubmed pipeline fix

* dev-test-prod pipeline done

* Database Renaming

---------

Co-authored-by: Devon Joseph Leadman <[email protected]>
Co-authored-by: Devon Joseph Leadman-m <[email protected]>
3 people authored May 30, 2024
1 parent daf8966 commit d410a7e
Showing 73 changed files with 281,323 additions and 497 deletions.
70 changes: 44 additions & 26 deletions .gitignore
@@ -13,37 +13,36 @@

# End of https://www.gitignore.io/api/visualstudiocode
personal-config.ini
environment_setup.py
clinical/src/full_trial_data/
clinical/src/chromedriver
clinical/src/ctgov_nctids.json
clinical/src/ctgov_webscraped_names.csv
clinical/src/all_queries/
clinical/src/metamap_cond.txt
clinical/src/metamap_gard.txt
clinical/src/metamap_cond_out.json
pubmed/src/
gard/src/*.csv
gard/src/*.json
gard/src/*.txt
RDAS.CTKG/src/full_trial_data/
RDAS.CTKG/src/chromedriver
RDAS.CTKG/src/ctgov_nctids.json
RDAS.CTKG/src/ctgov_webscraped_names.csv
RDAS.CTKG/src/all_queries/
RDAS.CTKG/src/metamap_cond.txt
RDAS.CTKG/src/metamap_gard.txt
RDAS.CTKG/src/metamap_cond_out.json
RDAS.PAKG/src/
RDAS.CTKG/src/*.csv
RDAS.CTKG/src/*.json
RDAS.CTKG/src/*.txt
__pycache__/
clinical/__pycache__/
grant/__pycache__/
pubmed/__pycache__/
pubmed/init_new.py
pubmed/methods_new.py
pubmed/archive/
grant/archive/
gard/__pycache__/
RDAS.CTKG/__pycache__/
RDAS.GFKG/__pycache__/
RDAS.PAKG/__pycache__/
RDAS.PAKG/init_new.py
RDAS.PAKG/methods_new.py
RDAS.PAKG/archive/
RDAS.GFKG/archive/
RDAS.CTKG/__pycache__/
email/__pycache__/
transfer/*.dump
backup/*/*.dump
migrated/*.dump
approved/*.dump
grant/src/**/**/*.csv
grant/src/**/**/*.json
grant/src/**/**/*.zip
RDAS.GFKG/src/**/**/*.csv
RDAS.GFKG/src/**/**/*.json
RDAS.GFKG/src/**/**/*.zip
grant_2024/src/**/**/*.csv
grant_2024/src/**/**/*.json
crt/*.json
@@ -76,10 +75,29 @@ add_participant.txt
epifix.py
grant_pipeline.txt
result.json
pubmed/test_affiliation.py
pubmed/test_genereview.py
RDAS.PAKG/test_affiliation.py
RDAS.PAKG/test_genereview.py
test_affiliation.txt
logs/
terms_mapped.csv
terms_unmapped.csv
test_term_map.py
RDAS.CTKG/src/ids_to_add.csv
RDAS.CTKG/src/ids_to_update.csv

RDAS.CTKG/src/ids_to_update_confirmed.csv
GARD_disease_classification.csv
gather_RDAS.GFKG_funding_remove.py
RDAS.GFKG_funding_rdas.csv
fixyear.py
fixRDAS.PAKGapi.py
config.ini
Cluster 7.csv
codecamp.py
cluster_trials.csv
fixpubmedapi.py
gather_grant_funding_remove.py
grant_funding_rdas.csv
thingforyanji.py
deliverablesyanji
NCATS COLLABORATION
File renamed without changes.
84 changes: 57 additions & 27 deletions clinical/methods.py → RDAS.CTKG/methods.py
@@ -22,6 +22,7 @@
from nltk.stem import PorterStemmer
nltk.download("punkt")
from spacy.matcher import Matcher
import spacy
from fuzzywuzzy import fuzz
import string
from transformers import AutoTokenizer, AutoModelForTokenClassification
@@ -128,23 +129,29 @@ def get_nctids(name_list):
response = requests.get(initial_query + query_end1).text.splitlines()
total_trials = int(response[4][16:-1])

# Add trials to a temporary list
trials = list()
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Break into extra queries of 1000 trials if necessary
for rank in range(1, total_trials//1000 + 1):
# Get next 1000 trials
query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
response = requests.get(initial_query + query_end2).text.splitlines()

# Add trials to the temporary list
try:
# Add trials to a temporary list
trials = list()
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Add the trials from the temporary list to the overall list
all_trials += trials
# Break into extra queries of 1000 trials if necessary
for rank in range(1, total_trials//1000 + 1):
# Get next 1000 trials
query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
response = requests.get(initial_query + query_end2).text.splitlines()

# Add trials to the temporary list
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Add the trials from the temporary list to the overall list
all_trials += trials

except Exception as e:
print(e)
print(initial_query + query_end2)
print(trial)

# Return the list of all retrieved NCTIDs
return all_trials
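For context, the loop above pages through the classic (pre-v2) ClinicalTrials.gov API, which returns at most 1,000 records per request; a minimal sketch of the rank-window arithmetic it relies on (rank_windows is an illustrative helper, not part of this repo):

# Illustrative helper mirroring the paging arithmetic in get_nctids above:
# windows are 1-1000, 1001-2000, ... until total_trials is covered.
def rank_windows(total_trials, page_size=1000):
    """Yield (min_rnk, max_rnk) pairs covering total_trials records."""
    for rank in range(total_trials // page_size + 1):
        yield rank * page_size + 1, (rank + 1) * page_size

print(list(rank_windows(2345)))  # [(1, 1000), (1001, 2000), (2001, 3000)]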
@@ -256,6 +263,19 @@ def extract_fields(nctid):
return full_trial


def get_lastupdated_postdate (ID):
postdate_query = f'https://clinicaltrials.gov/api/query/field_values?expr={ID}&field=LastUpdatePostDate&fmt=json'
try:
# Make the API request and parse the JSON response
full_response = requests.get(postdate_query).json()
postdate = full_response['FieldValuesResponse']['FieldValues'][0]['FieldValue']

return postdate

except ValueError:
# Return None if there is an issue with the JSON response
return None
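The helper's JSON parse assumes the classic field_values response nesting; a hedged illustration with a fabricated payload (structure only, not captured API output):

# Fabricated example of the nesting get_lastupdated_postdate navigates;
# only the key structure is meaningful here.
sample = {
    'FieldValuesResponse': {
        'FieldValues': [
            {'FieldValue': 'May 30, 2024'},  # hypothetical post date
        ],
    },
}
print(sample['FieldValuesResponse']['FieldValues'][0]['FieldValue'])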



def cypher_generate(db,now,NCTID,data,node_type,update=None,return_single=None):
@@ -432,6 +452,7 @@ def unpack_nested_data (db, now, nctid, trial, node_type):
#ALSO POSTPONED
#create_leaf_nodes(db, trial, node_id, node_type)
"""
queries = None

if node_type == 'ClinicalTrial':
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
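For reference, the checkpoint loaded above for ClinicalTrial nodes is the dslim/bert-base-NER token classifier; a minimal hedged sketch of exercising it through the transformers pipeline helper (the input sentence is illustrative, and the model downloads on first run):

# Minimal sketch using the same checkpoint the hunk above loads.
# aggregation_strategy='simple' merges word-piece tokens into entity spans.
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained('dslim/bert-base-NER')
model = AutoModelForTokenClassification.from_pretrained('dslim/bert-base-NER')
ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy='simple')
print(ner('The trial enrolled patients at Mayo Clinic in Rochester.'))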
@@ -835,22 +856,23 @@ def condition_map(db, update_metamap=True):

print('RUNNING GARD POPULATION')
# Fetch GARD entries from the database
gard_res = gard_db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.GardName as GardName, x.Synonyms as Synonyms, x.UMLS as gUMLS, x.UMLS_Source as usource')
gard_res = gard_db.run('MATCH (x:GARD) RETURN x.GardId as GardId, x.UMLS as gUMLS, x.GardName as GardName, x.Synonyms as Synonyms, x.UMLS_Source as usource')
for gres in gard_res.data():
gUMLS = gres['gUMLS']
name = gres['GardName']
gard_id = gres['GardId']
syns = gres['Synonyms']
usource = gres['usource']

# Check if UMLS data is present and create GARD node accordingly
if gUMLS:
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS:{gUMLS},UMLS_Source:\"{usource}\"}})'.format(name=gres['GardName'],gard_id=gres['GardId'],syns=gres['Synonyms'],gUMLS=gres['gUMLS'],usource=gres['usource']))
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS:{gUMLS},UMLS_Source:\"{usource}\"}})'.format(name=name,gard_id=gard_id,syns=syns,gUMLS=gUMLS,usource=usource))
else:
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS_Source:\"{usource}\"}})'.format(name=gres['GardName'],gard_id=gres['GardId'],syns=gres['Synonyms'],usource=gres['usource']))
db.run('MERGE (x:GARD {{GardId:\"{gard_id}\",GardName:\"{name}\",Synonyms:{syns},UMLS_Source:\"{usource}\"}})'.format(name=name,gard_id=gard_id,syns=syns,usource=usource))

print('RUNNING METAMAP')
# Fetch conditions from the database
res = db.run('MATCH (c:Condition) RETURN c.Condition as condition, ID(c) as cond_id')
# Fetch conditions from the database that haven't already been annotated and are not acronyms
res = db.run('MATCH (c:Condition) WHERE NOT EXISTS((c)--(:Annotation)) RETURN c.Condition as condition, ID(c) as cond_id')
cond_strs = [f"{i['cond_id']}|{normalize(i['condition'])}\n" for i in res if not is_acronym(i['condition'])]

# Write condition strings to a file for MetaMap processing
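The comprehension above emits MetaMap's pipe-delimited id|term input; a hedged sketch of that format, with simplified stand-ins for the repo's normalize and is_acronym helpers:

# Hedged sketch of the id|term lines written for MetaMap; normalize and
# is_acronym below are simplified stand-ins, not the repo's implementations.
rows = [{'cond_id': 101, 'condition': 'Fabry Disease'},
        {'cond_id': 102, 'condition': 'ALS'}]

def normalize(s):
    return s.lower().strip()

def is_acronym(s):
    return s.isupper() and len(s) <= 5

cond_strs = [f"{r['cond_id']}|{normalize(r['condition'])}\n"
             for r in rows if not is_acronym(r['condition'])]
print(cond_strs)  # ['101|fabry disease\n']  (the acronym 'ALS' is skipped)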
@@ -915,8 +937,8 @@ def condition_map(db, update_metamap=True):
db.run(query)

print('CREATING AND CONNECTING METAMAP ANNOTATIONS')
# Delete existing annotations
db.run('MATCH (x:Annotation) DETACH DELETE x')
# Deleting existing annotations is no longer needed; this step has been removed
#db.run('MATCH (x:Annotation) DETACH DELETE x')
# Fetch relevant data from Condition nodes
res = db.run('MATCH (x:Condition) WHERE x.METAMAP_OUTPUT IS NOT NULL RETURN ID(x) AS cond_id, x.METAMAP_OUTPUT AS cumls, x.METAMAP_PREFERRED_TERM AS prefs, x.FUZZY_SCORE as fuzz, x.METAMAP_SCORE as meta').data()

@@ -941,10 +963,10 @@
gard_ids = gard_ids['gard_id']
for gard_id in gard_ids:
# Create Annotation nodes and connect to Condition and GARD nodes
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS: \"{umls}\", CandidatePreferred: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (z:GARD) WHERE z.GardId = \"{gard_id}\" MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y) MERGE (z)<-[:mapped_to_gard]-(x)'.format(gard_id=gard_id,cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
else:
# Create Annotation nodes and connect to Condition nodes
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS: \"{umls}\", CandidatePreferred: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))
db.run('MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (x:Annotation {{UMLS_CUI: \"{umls}\", UMLSPreferredName: \"{pref}\", SEMANTIC_TYPE: {sems}, MATCH_TYPE: \"METAMAP\"}}) MERGE (x)<-[:has_annotation {{FUZZY_SCORE: {fuzz}, METAMAP_SCORE: {meta}}}]-(y)'.format(cond_id=cond_id,umls=umls,pref=prefs[idx],sems=sems[idx],fuzz=fuzzy_scores[idx],meta=meta_scores[idx]))

print('REMOVING UNNEEDED PROPERTIES')
# Remove unnecessary properties from Condition nodes that were used during processing
Expand All @@ -958,7 +980,7 @@ def condition_map(db, update_metamap=True):
for entry in res:
cond_id = entry['cond_id']
cond = entry['cond']
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{CandidatePreferred: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
db.run('MATCH (x:GARD) WHERE toLower(x.GardName) = toLower(\"{cond}\") MATCH (y:Condition) WHERE ID(y) = {cond_id} MERGE (z:Annotation {{UMLSPreferredName: \"{cond}\", MATCH_TYPE: \"STRING\"}}) MERGE (z)<-[:has_annotation]-(y) MERGE (x)<-[:mapped_to_gard]-(z)'.format(cond=cond,cond_id=cond_id))
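Design note: the Cypher in condition_map is interpolated with str.format, which forces the escaped quoting seen above; the same string-match MERGE can instead be written with query parameters. A hedged sketch using the official neo4j Python driver (URI, credentials, and values are placeholders, not this repo's config):

# Hedged sketch: the string-match Annotation MERGE above, rewritten with
# driver parameters. Connection details and values are placeholders.
from neo4j import GraphDatabase

driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'password'))
query = (
    'MATCH (x:GARD) WHERE toLower(x.GardName) = toLower($cond) '
    'MATCH (y:Condition) WHERE ID(y) = $cond_id '
    'MERGE (z:Annotation {UMLSPreferredName: $cond, MATCH_TYPE: "STRING"}) '
    'MERGE (z)<-[:has_annotation]-(y) '
    'MERGE (x)<-[:mapped_to_gard]-(z)'
)
with driver.session() as session:
    session.run(query, cond='cystic fibrosis', cond_id=42)  # illustrative values
driver.close()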



@@ -1006,6 +1028,7 @@ def create_drug_connection(db,rxdata,drug_id,wspacy=False):

# Create or merge Drug node with RxNormID
db.run('MATCH (x:Intervention) WHERE ID(x)={drug_id} MERGE (y:Drug {{RxNormID:{rxnormid}}}) MERGE (y)<-[:mapped_to_rxnorm {{WITH_SPACY: {wspacy}}}]-(x)'.format(rxnormid=rxnormid, drug_id=drug_id, wspacy=wspacy))
print(f'MAPPED {rxnormid}')

# Set additional properties on the Drug node
for k,v in rxdata.items():
@@ -1090,7 +1113,7 @@ def nlp_to_drug(db,doc,matches,drug_name,drug_id):



def rxnorm_map(db):
def rxnorm_map(db, rxnorm_progress):
"""
Map RxNorm data to Drug Interventions in the Neo4j database.
@@ -1109,11 +1132,18 @@ def rxnorm_map(db):
matcher = Matcher(nlp.vocab)
matcher.add('DRUG',[pattern])

# Retrieve drug interventions from the database
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" RETURN x.InterventionName, ID(x)').data()
# Retrieve drug interventions from the database that do NOT already have a Drug node attached
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data()
length = len(results)

# Iterate over drug interventions and map RxNorm data
for idx,res in enumerate(results):
if idx < rxnorm_progress:
continue

print(f'{str(idx)}/{length}')
db.setConf('UPDATE_PROGRESS', 'clinical_rxnorm_progress', str(idx))

drug_id = res['ID(x)']
drug = res['x.InterventionName']

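The new rxnorm_progress parameter and setConf call make the mapper resumable: rows finished in an earlier run are skipped, and the loop persists its index as it goes. A minimal standalone sketch of that checkpoint pattern (the config path, section, and key are illustrative, not this repo's sysvars file):

# Minimal sketch of the checkpoint/resume loop used by rxnorm_map above.
# Progress is persisted per item so an interrupted run can pick up where it left off.
import configparser

CONF_PATH = 'progress.ini'  # illustrative path

def read_progress(section='UPDATE_PROGRESS', key='clinical_rxnorm_progress'):
    conf = configparser.ConfigParser()
    conf.read(CONF_PATH)
    return conf.getint(section, key, fallback=0)

def write_progress(idx, section='UPDATE_PROGRESS', key='clinical_rxnorm_progress'):
    conf = configparser.ConfigParser()
    conf.read(CONF_PATH)
    if not conf.has_section(section):
        conf.add_section(section)
    conf.set(section, key, str(idx))
    with open(CONF_PATH, 'w') as f:
        conf.write(f)

items = ['aspirin', 'ibuprofen', 'acetaminophen']  # stand-ins for Intervention rows
start = read_progress()
for idx, item in enumerate(items):
    if idx < start:
        continue            # already processed in a previous run
    write_progress(idx)     # checkpoint before doing the work
    print(f'{idx}/{len(items)} {item}')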
File renamed without changes.