Devon_dev (#73)

* Grant Pipeline-New GARD Mapping Process * Change get_node_counts * Removal of leftover merge text * More merge text removal * gfkg and pakg bug fixes * New CTKG pipeline --------- Co-authored-by: Devon Joseph Leadman <[email protected]>
ncats · Aug 28, 2024 · 146ecff · 146ecff
1 parent e1da955
commit 146ecff
Show file tree

Hide file tree

Showing 23 changed files with 1,856 additions and 193 deletions.
diff --git a/.gitignore b/.gitignore
@@ -113,3 +113,8 @@ RDAS_CTKG/eligibility_extraction/
 RDAS_CTKG/metamap_cond_out.json
 RDAS_CTKG/metamap_cond.txt
 RDAS_GFKG/convert_csv_fields.py
+fix_normmap_file_sep.py
+project_check.py
+project_check_missing.csv
+project_check_new.csv
+project_check_old.csv
diff --git a/RDAS_CTKG/methods.py b/RDAS_CTKG/methods.py
@@ -91,74 +91,46 @@ def webscrape_ctgov_diseases():
 
 
 
+def call_get_nctids (query, pageToken=None):
+    try:
+        if pageToken: query += f'&pageToken={pageToken}'
+        response = requests.get(query)
+        response_txt = response.json()
+    except Exception as e:
+        print('Unable to Process Query')
+        response_txt = None
+    return response_txt
 
-def get_nctids(name_list):
-    """
-    Retrieves ClinicalTrials.gov Identifiers (NCTIDs) for a list of rare disease names.
-
-    Args:
-        name_list (list): List of rare disease names.
-
-    Returns:
-        list: List of ClinicalTrials.gov Identifiers (NCTIDs) associated with the provided rare disease names.
-
-    Example:
-        disease_names = ["Disease1", "Disease2", ...]
-        nct_ids = get_nctids(disease_names)
-        print(nct_ids)
-        # Output: ["NCT123", "NCT456", ...]
-    """
-
-    # Initialize a list to store all retrieved NCTIDs
+def get_nctids(names,lastupdate):
+    # Date format: 05/01/1975
     all_trials = list()
-
-    # Iterate through each rare disease name
-    for name in name_list:
-        # Replace double quotes to prevent issues with the URL
+    for name in names:
+        trials = list()
         name = name.replace('"','\"')
 
-        # Construct the initial API query to get the total number of trials
-        initial_query = 'https://clinicaltrials.gov/api/query/study_fields?expr=AREA[ConditionBrowseBranchAbbrev] Rare AND \"' + name + '\"&fields=NCTId&'
-        query_end1 = 'min_rnk=1&max_rnk=1000&fmt=csv'
-
-        try:
-            # Make the API request to get the total number of trials
-            response = requests.get(initial_query + query_end1).text.splitlines()
-            total_trials = int(response[4][16:-1])
-        except Exception as e:
-            # Retry in case of an error
-            print('ERROR in retrieving NCTIDS, retrying...')
-            print(response)
-            response = requests.get(initial_query + query_end1).text.splitlines()
-            total_trials = int(response[4][16:-1])
-
+        initial_query = f'https://clinicaltrials.gov/api/v2/studies?query.cond=(EXPANSION[Concept]{name} OR AREA[DetailedDescription]EXPANSION[Concept]{name} OR AREA[BriefSummary]EXPANSION[Concept]{name}) AND AREA[LastUpdatePostDate]RANGE[{lastupdate},MAX]&fields=NCTId&pageSize=1000&countTotal=true'
         try:
-            # Add trials to a temporary list
-            trials = list()
-            for trial in response[11:]:
-                trials.append(trial.split(',')[1][1:-1])
-
-            # Break into extra queries of 1000 trials if necessary
-            for rank in range(1, total_trials//1000 + 1):
-                # Get next 1000 trials
-                query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
-                response = requests.get(initial_query + query_end2).text.splitlines()
-
-                # Add trials to the temporary list
-                for trial in response[11:]:
-                    trials.append(trial.split(',')[1][1:-1])
-
-            # Add the trials from the temporary list to the overall list
-            all_trials += trials
-
+            pageToken = None
+            while True:
+                response_txt = call_get_nctids(initial_query, pageToken=pageToken)
+                if response_txt:
+                    trials_list = response_txt['studies']
+
+                    for trial in trials_list:
+                        nctid = trial['protocolSection']['identificationModule']['nctId']
+                        trials.append(nctid)
+                    all_trials += trials
+                    if not 'nextPageToken' in response_txt:
+                        break
+                    else:
+                        pageToken = response_txt['nextPageToken']
+                else:
+                    break
+
         except Exception as e:
             print(e)
-            print(initial_query + query_end2)
-            print(trial)
-
-    # Return the list of all retrived NCTIDs
-    return all_trials
 
+    return [list(set(all_trials))]
 
 
 
@@ -247,17 +219,14 @@ def extract_fields(nctid):
         print(trial_fields)
         # Output: {"field1": "value1", "field2": {"nested_field": "nested_value"}}
     """
-
-    # Contruct the API query to retrieve full study information
-    full_trial_query = 'https://clinicaltrials.gov/api/query/full_studies?expr=' + nctid + '&min_rnk=1&max_rnk=1&fmt=json'
-    sleep(0.5)
-
     try:
         # Make the API request and parse the JSON response
-        full_trial_response = requests.get(full_trial_query).json()
-
+        query = f'https://clinicaltrials.gov/api/v2/studies/{nctid}'
+        response = requests.get(query)
+        response_txt = response.json()
+        sleep(0.34)
         # Use the parse_trial_fields function to flatten the nested structure
-        full_trial = parse_trial_fields(full_trial_response)
+        full_trial = parse_trial_fields(response_txt)
     except ValueError:
         # Return None if there is an issue with the JSON response
         return None
@@ -279,6 +248,17 @@ def get_lastupdated_postdate (ID):
         # Return None if there is an issue with the JSON response
         return None
 
+def check_neo4j_trial_updates(db, nctids):
+    new_trials = list()
+    trials_to_check = list()
+    for nctid in nctids:
+        response = db.run(f'MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\" RETURN x.NCTId').data()
+        if len(response) > 0:
+            new_trials.append(nctid)
+        else:
+            trials_to_check.append(nctid)
+
+    return [new_trials, trials_to_check]
 
 
 def cypher_generate(db,now,NCTID,data,node_type,update=None,return_single=None):
@@ -675,7 +655,7 @@ def mask_name(nlp, name):
 
 
 
-def is_acronym(word):
+def is_acronym(words):
     """
     Checks if a word is an acronym.
 
@@ -691,13 +671,14 @@ def is_acronym(word):
     """
 
     # Check if the word contains spaces
-    if len(word.split(' ')) > 1:
-        return False
-    # Check if the word follows the pattern of an acronym
-    elif bool(re.match(r'\w*[A-Z]\w*', word[:len(word)-1])) and (word[len(word)-1].isupper() or word[len(word)-1].isnumeric()):
-        return True
-    else:
-        return False
+    if len(words.split()) > 1: return False
+
+    for word in words.split():
+        # Check if the word follows the pattern of an acronym
+        if bool(re.match(r'\w*[A-Z]\w*', word[:len(word)-1])) and (word[len(word)-1].isupper() or word[len(word)-1].isnumeric()): # aGG2
+            print('ACRONYM REMOVED::', words)
+            return True
+    return False
 
 
 
@@ -1237,7 +1218,7 @@ def rxnorm_map(db, rxnorm_progress):
     matcher.add('DRUG',[pattern])
 
     # Retrieve drug interventions from the database that do NOT already have a Drug node attached
-    results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data()
+    results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "DRUG" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data()
     length = len(results)
 
     # Iterate over drug interventions and map RxNorm data

diff --git a/RDAS_CTKG/src/data_model.py b/RDAS_CTKG/src/data_model.py
@@ -65,7 +65,7 @@
 ##################################################################
 #new
 
-node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference']
+"""node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference']
 
 abbreviations = {
     'ClinicalTrial': 'ct',
@@ -181,4 +181,122 @@
 # Propeties that are in list form
 fields_as_properties = {
 'ClinicalTrial': ['Phase']
+}"""
+
+node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference']
+
+abbreviations = {
+    'ClinicalTrial': 'ct',
+    'IndividualPatientData': 'ind',
+    'Organization': 'org',
+    'PrimaryOutcome': 'pout',
+    'Investigator': 'inv',
+    #'Sponsor': 'spo',
+    #'Collaborator': 'col',
+    'Condition': 'con',
+    'StudyDesign': 'stu',
+    'Participant': 'par',
+    'ExpandedAccess': 'exp',
+    'Intervention': 'int',
+    'Location': 'loc',
+    'PatientRegistry': 'pat',
+    'Reference': 'ref'
+}
+
+relationships = {
+    'IndividualPatientData': 'has_individual_patient_data',
+    #'Sponsor': 'sponsored_by',
+    #'Collaborator': 'collaborated_with',
+    'Organization': 'conducted_by',
+    'PrimaryOutcome': 'has_outcome',
+    'Investigator': 'investigated_by',
+    'Condition': 'investigates_condition',
+    'StudyDesign': 'has_study_design',
+    'Participant': 'has_participant_info',
+    'ExpandedAccess': 'expanded_access_info',
+    'Intervention': 'has_intervention',
+    'Location': 'in_locations',
+    'PatientRegistry': 'patient_registry_info',
+    'Reference': 'is_about'
+}
+
+rel_directions = {
+    'IndividualPatientData': ['-','->'],
+    #'Sponsor': ['-','->'],
+    #'Collaborator': ['-','->'],
+    'Organization': ['-','->'],
+    'PrimaryOutcome': ['-','->'],
+    'Investigator': ['-','->'],
+    'Condition': ['-','->'],
+    'StudyDesign': ['-','->'],
+    'Participant': ['-','->'],
+    'ExpandedAccess': ['-','->'],
+    'Intervention': ['-','->'],
+    'Location': ['-','->'],
+    'PatientRegistry': ['-','->'],
+    'Reference': ['<-','-']
+
+}
+
+fields = {
+    'ClinicalTrial': ['acronym','briefSummary','briefTitle','CompletionDate','CompletionDateType','LastKnownStatus',
+    'LastUpdatePostDate','LastUpdatePostDateType','LastUpdateSubmitDate','nctId','NCTIdAlias','OfficialTitle',
+    'OverallStatus','Phase','PrimaryCompletionDate','PrimaryCompletionDateType','ResultsFirstPostDate',
+    'ResultsFirstPostDateType','ResultsFirstPostedQCCommentsDate','ResultsFirstPostedQCCommentsDateType',
+    'StartDate','StartDateType','StudyFirstPostDate','StudyFirstPostDateType','StudyType'],
+
+    'IndividualPatientData': ['AvailIPDComment','AvailIPDId','AvailIPDType','AvailIPDURL','IPDSharing',
+    'IPDSharingAccessCriteria','IPDSharingDescription','IPDSharingInfoType','IPDSharingTimeFrame','IPDSharingURL'],
+
+    #'Sponsor': ['LeadSponsorName','LeadSponsorClass'],
+
+    #'Collaborator': ['CollaboratorName','CollaboratorClass'],
+
+    'Organization': ['OrgName', 'OrgClass', 'OrgType'],
+
+    'Investigator': ['OfficialName', 'ContactEmail', 'OfficialAffiliation', 'ContactPhone', 'OfficialRole'],
+
+    'Condition': ['Condition'], #'ConditionAncestorId', 'ConditionAncestorTerm', 'ConditionBrowseBranchAbbrev', 'ConditionBrowseBranchName', 'ConditionBrowseLeafAsFound', 'ConditionBrowseLeafId', 'ConditionBrowseLeafName', 'ConditionBrowseLeafRelevance', 'ConditionMeshId', 'ConditionMeshTerm'
+
+    'StudyDesign': ['DesignAllocation','DesignInterventionModel','DesignInterventionModelDescription','DesignMasking',
+    'DesignMaskingDescription','DesignObservationalModel','DesignPrimaryPurpose','DesignTimePerspective',
+    'DetailedDescription','SamplingMethod'],
+
+    'PrimaryOutcome': ['PrimaryOutcomeDescription', 'PrimaryOutcomeMeasure', 'PrimaryOutcomeTimeFrame'],
+
+    'Participant': ['EligibilityCriteria', 'EnrollmentCount', 'EnrollmentType', 'Gender', 'GenderBased', 'GenderDescription',
+                'HealthyVolunteers', 'MaxiumumAge', 'MinimumAge', 'StdAge', 'StudyPopulation'],
+
+    'ExpandedAccess': ['ExpAccTypeIndividual','ExpAccTypeIntermediate','ExpAccTypeTreatment','ExpandedAccessNCTId',
+    'ExpandedAccessStatusForNCTId','HasExpandedAccess'],
+
+    'Intervention': ['InterventionName','InterventionType','InterventionDescription', 'InterventionOtherName', 'IsFDARegulatedDevice', 'IsFDARegulatedDrug'], # 'InterventionBrowseLeafId', 'InterventionBrowseLeafName', 'InterventionBrowseLeafRelevance', 'InterventionMeshId', 'InterventionMeshTerm', 'InterventionOtherName'
+
+    'Location': ['LocationCity','LocationCountry','LocationFacility','LocationState',
+    'LocationStatus','LocationZip'],
+
+    'PatientRegistry': ['PatientRegistry'],
+
+    'Reference': ['Citation','ReferencePMID','ReferenceType']
+
+}
+
+# Nodes that need additional processing to create additional nodes
+process_nodes = ['Intervention', 'Condition', 'ClinicalTrial', 'Organization', 'PrimaryOutcome']
+
+# Types of nodes that contain more than one entry
+lists_of_nodes = {
+'Collaborator': 'Collaborator',
+'Condition': 'Condition',
+'Intervention': 'Intervention',
+'Location': 'Location',
+'Reference': 'Reference',
+'PrimaryOutcome':'PrimaryOutcome',
+
+'Unassigned': ['SecondaryIdInfo','ArmGroup','SecondaryOutcome','OtherOutcome','StdAge','OverallOfficial','IPDSharingInfoType', 'ConditionMesh', 'ConditionAncestor','ConditionBrowseLeaf','ConditionBrowseBranch','InterventionMesh','InterventionAncestor','InterventionBrowseLeaf','InterventionBrowseBranch']
 }
+
+# Properties that are in list form
+fields_as_properties = {
+'ClinicalTrial': ['Phase']
+}