diff --git a/.gitignore b/.gitignore index 38b4cb6..ad90dfc 100644 --- a/.gitignore +++ b/.gitignore @@ -113,3 +113,8 @@ RDAS_CTKG/eligibility_extraction/ RDAS_CTKG/metamap_cond_out.json RDAS_CTKG/metamap_cond.txt RDAS_GFKG/convert_csv_fields.py +fix_normmap_file_sep.py +project_check.py +project_check_missing.csv +project_check_new.csv +project_check_old.csv diff --git a/RDAS_CTKG/methods.py b/RDAS_CTKG/methods.py index fb0e00d..ef8b33f 100755 --- a/RDAS_CTKG/methods.py +++ b/RDAS_CTKG/methods.py @@ -91,74 +91,46 @@ def webscrape_ctgov_diseases(): +def call_get_nctids (query, pageToken=None): + try: + if pageToken: query += f'&pageToken={pageToken}' + response = requests.get(query) + response_txt = response.json() + except Exception as e: + print('Unable to Process Query') + response_txt = None + return response_txt -def get_nctids(name_list): - """ - Retrieves ClinicalTrials.gov Identifiers (NCTIDs) for a list of rare disease names. - - Args: - name_list (list): List of rare disease names. - - Returns: - list: List of ClinicalTrials.gov Identifiers (NCTIDs) associated with the provided rare disease names. - - Example: - disease_names = ["Disease1", "Disease2", ...] - nct_ids = get_nctids(disease_names) - print(nct_ids) - # Output: ["NCT123", "NCT456", ...] 
- """ - - # Initialize a list to store all retrieved NCTIDs +def get_nctids(names,lastupdate): + # Date format: 05/01/1975 all_trials = list() - - # Iterate through each rare disease name - for name in name_list: - # Replace double quotes to prevent issues with the URL + for name in names: + trials = list() name = name.replace('"','\"') - # Construct the initial API query to get the total number of trials - initial_query = 'https://clinicaltrials.gov/api/query/study_fields?expr=AREA[ConditionBrowseBranchAbbrev] Rare AND \"' + name + '\"&fields=NCTId&' - query_end1 = 'min_rnk=1&max_rnk=1000&fmt=csv' - - try: - # Make the API request to get the total number of trials - response = requests.get(initial_query + query_end1).text.splitlines() - total_trials = int(response[4][16:-1]) - except Exception as e: - # Retry in case of an error - print('ERROR in retrieving NCTIDS, retrying...') - print(response) - response = requests.get(initial_query + query_end1).text.splitlines() - total_trials = int(response[4][16:-1]) - + initial_query = f'https://clinicaltrials.gov/api/v2/studies?query.cond=(EXPANSION[Concept]{name} OR AREA[DetailedDescription]EXPANSION[Concept]{name} OR AREA[BriefSummary]EXPANSION[Concept]{name}) AND AREA[LastUpdatePostDate]RANGE[{lastupdate},MAX]&fields=NCTId&pageSize=1000&countTotal=true' try: - # Add trials to a temporary list - trials = list() - for trial in response[11:]: - trials.append(trial.split(',')[1][1:-1]) - - # Break into extra queries of 1000 trials if necessary - for rank in range(1, total_trials//1000 + 1): - # Get next 1000 trials - query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv' - response = requests.get(initial_query + query_end2).text.splitlines() - - # Add trials to the temporary list - for trial in response[11:]: - trials.append(trial.split(',')[1][1:-1]) - - # Add the trials from the temporary list to the overall list - all_trials += trials - + pageToken = None + while True: + response_txt 
= call_get_nctids(initial_query, pageToken=pageToken) + if response_txt: + trials_list = response_txt['studies'] + + for trial in trials_list: + nctid = trial['protocolSection']['identificationModule']['nctId'] + trials.append(nctid) + all_trials += trials + if not 'nextPageToken' in response_txt: + break + else: + pageToken = response_txt['nextPageToken'] + else: + break + except Exception as e: print(e) - print(initial_query + query_end2) - print(trial) - - # Return the list of all retrived NCTIDs - return all_trials + return [list(set(all_trials))] @@ -247,17 +219,14 @@ def extract_fields(nctid): print(trial_fields) # Output: {"field1": "value1", "field2": {"nested_field": "nested_value"}} """ - - # Contruct the API query to retrieve full study information - full_trial_query = 'https://clinicaltrials.gov/api/query/full_studies?expr=' + nctid + '&min_rnk=1&max_rnk=1&fmt=json' - sleep(0.5) - try: # Make the API request and parse the JSON response - full_trial_response = requests.get(full_trial_query).json() - + query = f'https://clinicaltrials.gov/api/v2/studies/{nctid}' + response = requests.get(query) + response_txt = response.json() + sleep(0.34) # Use the parse_trial_fields function to flatten the nested structure - full_trial = parse_trial_fields(full_trial_response) + full_trial = parse_trial_fields(response_txt) except ValueError: # Return None if there is an issue with the JSON response return None @@ -279,6 +248,17 @@ def get_lastupdated_postdate (ID): # Return None if there is an issue with the JSON response return None +def check_neo4j_trial_updates(db, nctids): + new_trials = list() + trials_to_check = list() + for nctid in nctids: + response = db.run(f'MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\" RETURN x.NCTId').data() + if len(response) > 0: + new_trials.append(nctid) + else: + trials_to_check.append(nctid) + + return [new_trials, trials_to_check] def cypher_generate(db,now,NCTID,data,node_type,update=None,return_single=None): @@ -675,7 
+655,7 @@ def mask_name(nlp, name): -def is_acronym(word): +def is_acronym(words): """ Checks if a word is an acronym. @@ -691,13 +671,14 @@ def is_acronym(word): """ # Check if the word contains spaces - if len(word.split(' ')) > 1: - return False - # Check if the word follows the pattern of an acronym - elif bool(re.match(r'\w*[A-Z]\w*', word[:len(word)-1])) and (word[len(word)-1].isupper() or word[len(word)-1].isnumeric()): - return True - else: - return False + if len(words.split()) > 1: return False + + for word in words.split(): + # Check if the word follows the pattern of an acronym + if bool(re.match(r'\w*[A-Z]\w*', word[:len(word)-1])) and (word[len(word)-1].isupper() or word[len(word)-1].isnumeric()): # aGG2 + print('ACRONYM REMOVED::', words) + return True + return False @@ -1237,7 +1218,7 @@ def rxnorm_map(db, rxnorm_progress): matcher.add('DRUG',[pattern]) # Retrieve drug interventions from the database that do NOT already have a Drug node attached - results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data() + results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "DRUG" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data() length = len(results) # Iterate over drug interventions and map RxNorm data diff --git a/RDAS_CTKG/src/data_model.py b/RDAS_CTKG/src/data_model.py index 15040f4..9222512 100755 --- a/RDAS_CTKG/src/data_model.py +++ b/RDAS_CTKG/src/data_model.py @@ -65,7 +65,7 @@ ################################################################## #new -node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference'] +"""node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 
'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference'] abbreviations = { 'ClinicalTrial': 'ct', @@ -181,4 +181,122 @@ # Propeties that are in list form fields_as_properties = { 'ClinicalTrial': ['Phase'] +}""" + +node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference'] + +abbreviations = { + 'ClinicalTrial': 'ct', + 'IndividualPatientData': 'ind', + 'Organization': 'org', + 'PrimaryOutcome': 'pout', + 'Investigator': 'inv', + #'Sponsor': 'spo', + #'Collaborator': 'col', + 'Condition': 'con', + 'StudyDesign': 'stu', + 'Participant': 'par', + 'ExpandedAccess': 'exp', + 'Intervention': 'int', + 'Location': 'loc', + 'PatientRegistry': 'pat', + 'Reference': 'ref' +} + +relationships = { + 'IndividualPatientData': 'has_individual_patient_data', + #'Sponsor': 'sponsored_by', + #'Collaborator': 'collaborated_with', + 'Organization': 'conducted_by', + 'PrimaryOutcome': 'has_outcome', + 'Investigator': 'investigated_by', + 'Condition': 'investigates_condition', + 'StudyDesign': 'has_study_design', + 'Participant': 'has_participant_info', + 'ExpandedAccess': 'expanded_access_info', + 'Intervention': 'has_intervention', + 'Location': 'in_locations', + 'PatientRegistry': 'patient_registry_info', + 'Reference': 'is_about' +} + +rel_directions = { + 'IndividualPatientData': ['-','->'], + #'Sponsor': ['-','->'], + #'Collaborator': ['-','->'], + 'Organization': ['-','->'], + 'PrimaryOutcome': ['-','->'], + 'Investigator': ['-','->'], + 'Condition': ['-','->'], + 'StudyDesign': ['-','->'], + 'Participant': ['-','->'], + 'ExpandedAccess': ['-','->'], + 'Intervention': ['-','->'], + 'Location': ['-','->'], + 'PatientRegistry': ['-','->'], + 'Reference': ['<-','-'] + +} + +fields = { + 'ClinicalTrial': ['acronym','briefSummary','briefTitle','CompletionDate','CompletionDateType','LastKnownStatus', 
+ 'LastUpdatePostDate','LastUpdatePostDateType','LastUpdateSubmitDate','nctId','NCTIdAlias','OfficialTitle', + 'OverallStatus','Phase','PrimaryCompletionDate','PrimaryCompletionDateType','ResultsFirstPostDate', + 'ResultsFirstPostDateType','ResultsFirstPostedQCCommentsDate','ResultsFirstPostedQCCommentsDateType', + 'StartDate','StartDateType','StudyFirstPostDate','StudyFirstPostDateType','StudyType'], + + 'IndividualPatientData': ['AvailIPDComment','AvailIPDId','AvailIPDType','AvailIPDURL','IPDSharing', + 'IPDSharingAccessCriteria','IPDSharingDescription','IPDSharingInfoType','IPDSharingTimeFrame','IPDSharingURL'], + + #'Sponsor': ['LeadSponsorName','LeadSponsorClass'], + + #'Collaborator': ['CollaboratorName','CollaboratorClass'], + + 'Organization': ['OrgName', 'OrgClass', 'OrgType'], + + 'Investigator': ['OfficialName', 'ContactEmail', 'OfficialAffiliation', 'ContactPhone', 'OfficialRole'], + + 'Condition': ['Condition'], #'ConditionAncestorId', 'ConditionAncestorTerm', 'ConditionBrowseBranchAbbrev', 'ConditionBrowseBranchName', 'ConditionBrowseLeafAsFound', 'ConditionBrowseLeafId', 'ConditionBrowseLeafName', 'ConditionBrowseLeafRelevance', 'ConditionMeshId', 'ConditionMeshTerm' + + 'StudyDesign': ['DesignAllocation','DesignInterventionModel','DesignInterventionModelDescription','DesignMasking', + 'DesignMaskingDescription','DesignObservationalModel','DesignPrimaryPurpose','DesignTimePerspective', + 'DetailedDescription','SamplingMethod'], + + 'PrimaryOutcome': ['PrimaryOutcomeDescription', 'PrimaryOutcomeMeasure', 'PrimaryOutcomeTimeFrame'], + + 'Participant': ['EligibilityCriteria', 'EnrollmentCount', 'EnrollmentType', 'Gender', 'GenderBased', 'GenderDescription', + 'HealthyVolunteers', 'MaxiumumAge', 'MinimumAge', 'StdAge', 'StudyPopulation'], + + 'ExpandedAccess': ['ExpAccTypeIndividual','ExpAccTypeIntermediate','ExpAccTypeTreatment','ExpandedAccessNCTId', + 'ExpandedAccessStatusForNCTId','HasExpandedAccess'], + + 'Intervention': 
['InterventionName','InterventionType','InterventionDescription', 'InterventionOtherName', 'IsFDARegulatedDevice', 'IsFDARegulatedDrug'], # 'InterventionBrowseLeafId', 'InterventionBrowseLeafName', 'InterventionBrowseLeafRelevance', 'InterventionMeshId', 'InterventionMeshTerm', 'InterventionOtherName' + + 'Location': ['LocationCity','LocationCountry','LocationFacility','LocationState', + 'LocationStatus','LocationZip'], + + 'PatientRegistry': ['PatientRegistry'], + + 'Reference': ['Citation','ReferencePMID','ReferenceType'] + +} + +# Nodes that need additional processing to create additional nodes +process_nodes = ['Intervention', 'Condition', 'ClinicalTrial', 'Organization', 'PrimaryOutcome'] + +# Types of nodes that contain more than one entry +lists_of_nodes = { +'Collaborator': 'Collaborator', +'Condition': 'Condition', +'Intervention': 'Intervention', +'Location': 'Location', +'Reference': 'Reference', +'PrimaryOutcome':'PrimaryOutcome', + +'Unassigned': ['SecondaryIdInfo','ArmGroup','SecondaryOutcome','OtherOutcome','StdAge','OverallOfficial','IPDSharingInfoType', 'ConditionMesh', 'ConditionAncestor','ConditionBrowseLeaf','ConditionBrowseBranch','InterventionMesh','InterventionAncestor','InterventionBrowseLeaf','InterventionBrowseBranch'] } + +# Propeties that are in list form +fields_as_properties = { +'ClinicalTrial': ['Phase'] +} \ No newline at end of file diff --git a/RDAS_CTKG/update.py b/RDAS_CTKG/update.py index 4fb0517..14e13ff 100755 --- a/RDAS_CTKG/update.py +++ b/RDAS_CTKG/update.py @@ -50,6 +50,108 @@ def process_trial_update(thr, db, today, current_nctids, ids_to_update): """ def main(): + """ + NEW CT API WITH DATA PIPELINE PROCESS: + for each diseases and their terms x + filter our acronyms in disease terms x + then get all full trials data from the date of last update to today x + https://clinicaltrials.gov/api/v2/studies?query.cond=(EXPANSION[Concept]{name} OR AREA[DetailedDescription]EXPANSION[Concept]{name} OR 
AREA[BriefSummary]EXPANSION[Concept]{name}) AND AREA[LastUpdatePostDate]RANGE[05/01/1975,MAX]&fields=NCTId&pageSize=1000&countTotal=true + do a check to see if trial already exists + """ + print(f"[CT] Database Selected: {sysvars.ct_db}\nContinuing with script in 5 seconds...") + sleep(5) + + + # Connect to the Neo4j database + db = AlertCypher(sysvars.ct_db) + gard_db = AlertCypher(sysvars.gard_db) + + + # Get last updated date and current date + today = date.today().strftime('%m/%d/%y') + lastupdate_str = db.getConf('UPDATE_PROGRESS','rdas.ctkg_update') + lastupdate = datetime.strptime(lastupdate_str, "%m/%d/%y") + lastupdate = lastupdate.strftime('%m/%d/%Y') + + + in_progress = db.getConf('UPDATE_PROGRESS', 'clinical_in_progress') + print(f'in_progress:: {in_progress}') + if in_progress == 'True': + clinical_disease_progress = db.getConf('UPDATE_PROGRESS', 'clinical_disease_progress') + if not clinical_disease_progress == '': + clinical_disease_progress = int(clinical_disease_progress) + else: + clinical_disease_progress = 0 + + clinical_rxnorm_progress = db.getConf('UPDATE_PROGRESS', 'clinical_rxnorm_progress') + if not clinical_rxnorm_progress == '': + clinical_rxnorm_progress = int(clinical_rxnorm_progress) + else: + clinical_required_update_progress = 0 + clinical_current_step = db.getConf('UPDATE_PROGRESS', 'clinical_current_step') + else: + clinical_disease_progress = 0 + clinical_rxnorm_progress = 0 + clinical_current_step = '' + db.setConf('UPDATE_PROGRESS', 'clinical_in_progress', 'True') + + + if clinical_current_step == '': + gard_response = gard_db.run('MATCH (x:GARD) RETURN x.GardId as gid, x.GardName as gname, x.Synonyms as syns LIMIT 50').data() + for idx,response in enumerate(gard_response): + name = response['gname'] + gid = response['gid'] + syns = response['syns'] + syns = [syn for syn in syns if not rdas.is_acronym(syn)] + names = [name] + syns + + print('CURRENT DISEASE IN API QUERY::', str(idx), name, gid) + + nctids = 
rdas.get_nctids(names, lastupdate) + new_trials, trials_to_check = rdas.check_neo4j_trial_updates(db, nctids) + + # iterates through all trials that where already found in the database + for idx,nctid_check in enumerate(trials_to_check): + trial_info = rdas.extract_fields(nctid_check) + ID = trial_info['nctId'] + if trial_info: + for node_type in dm.node_names: + # parse and convert data into a neo4j query, updates an existing trial + rdas.format_node_data(db,today,trial_info,node_type,ID,update=True) + else: + print('Error in add for finding full trial data for ' + nctid_check) + + # iterates through all trials that were NOT found in the database + for idx,nctid_add in enumerate(new_trials): + trial_info = rdas.extract_fields(nctid_add) + ID = trial_info['nctId'] + if trial_info: + for node_type in dm.node_names: + # parse and convert data into a neo4j query, adds a new trial + rdas.format_node_data(db,today,trial_info,node_type,ID) + else: + print('Error in add for finding full trial data for ' + nctid_add) + + db.getConf('UPDATE_PROGRESS', 'clinical_disease_progress', str(idx)) + + db.setConf('UPDATE_PROGRESS', 'clinical_current_step', 'rxnorm_map') + + + if clinical_current_step == 'rxnorm_map': + rdas.rxnorm_map(db, clinical_rxnorm_progress) + + if clinical_current_step == 'clear_progress': + # Update config values + db.setConf('UPDATE_PROGRESS', 'clinical_update', datetime.strftime(datetime.now(),"%m/%d/%y")) + db.setConf('UPDATE_PROGRESS', 'clinical_in_progress', 'False') + db.setConf('UPDATE_PROGRESS', 'clinical_current_step', '') + db.setConf('UPDATE_PROGRESS', 'clinical_disease_progress', '') + db.setConf('UPDATE_PROGRESS', 'clinical_rxnorm_progress', '') + + + + + """ Main function for the data processing and updating of the Clinical Trial Neo4j Database. 
@@ -69,6 +171,7 @@ def main(): # Initialize variables containing NCTIDs to add and update ids_to_update = list() ids_to_add = list() + # Retrieve NCT IDs and last update dates from the database response = db.run('MATCH (x:ClinicalTrial) RETURN x.NCTId,x.LastUpdatePostDate').data() current_nctids = {i['x.NCTId']:i['x.LastUpdatePostDate'] for i in response} diff --git a/RDAS_CTKG_REMAKE/aact.py b/RDAS_CTKG_REMAKE/aact.py new file mode 100644 index 0000000..0eacc66 --- /dev/null +++ b/RDAS_CTKG_REMAKE/aact.py @@ -0,0 +1,1036 @@ +import os +import sys +import requests +workspace = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(workspace) +sys.path.append('/home/leadmandj/RDAS/') +import sysvars +from AlertCypher import AlertCypher +from bs4 import BeautifulSoup +#import RDAS_CTKG.methods as rdas +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import Select +from RDAS_CTKG.src import data_model as dm +from datetime import date,datetime +import re +import nltk +import pandas as pd +from time import sleep +from spacy.matcher import Matcher +import spacy +import string +from transformers import AutoTokenizer, AutoModelForTokenClassification +from transformers import pipeline +nltk.download('averaged_perceptron_tagger') + +def is_acronym(words): + """ + Checks if a word is an acronym. + + Args: + word (str): The word to be checked. + + Returns: + bool: True if the word is an acronym, False otherwise. 
+ + Example: + result = is_acronym("NASA") + print(result) # Output: True + """ + if len(words.split()) > 1: return False + + for word in words.split(): + # Check if the word follows the pattern of an acronym + if bool(re.match(r'\w*[A-Z]\w*', word[:len(word)-1])) and (word[len(word)-1].isupper() or word[len(word)-1].isnumeric()): # aGG2 + print('ACRONYM REMOVED::', words) + return True + return False + + +def get_full_studies(nctids): + for nctid in nctids: + query = f'https://clinicaltrials.gov/api/v2/studies/{nctid}' + response = requests.get(query) + response_txt = response.json() + #response_txt = parse_trial_fields(response_txt) + yield response_txt + + +def call_get_nctids (query, pageToken=None): + try: + if pageToken: query += f'&pageToken={pageToken}' + response = requests.get(query) + response_txt = response.json() + except Exception as e: + print('Unable to Process Query') + response_txt = None + return response_txt + +def get_nctids(names, lastupdate): + all_trials = list() + for name in names: + trials = list() + name = name.replace('"','\"') + + initial_query = f'https://clinicaltrials.gov/api/v2/studies?query.cond=(EXPANSION[Concept]{name} OR AREA[DetailedDescription]EXPANSION[Concept]{name} OR AREA[BriefSummary]EXPANSION[Concept]{name}) AND AREA[LastUpdatePostDate]RANGE[{lastupdate},MAX]&fields=NCTId&pageSize=1000&countTotal=true' + print(initial_query) + try: + pageToken = None + while True: + response_txt = call_get_nctids(initial_query, pageToken=pageToken) + if response_txt: + trials_list = response_txt['studies'] + + for trial in trials_list: + nctid = trial['protocolSection']['identificationModule']['nctId'] + trials.append(nctid) + all_trials += trials + if not 'nextPageToken' in response_txt: + break + else: + pageToken = response_txt['nextPageToken'] + else: + break + + except Exception as e: + print(e) + + return list(set(all_trials)) + + +def rxnorm_map(nlp, intervention): + def cypher_Drug(rxdata,intervention_name,wspacy=False): + 
rxnormid = rxdata['RxNormID'] + + # Create or merge Drug node with RxNormID + query = 'MERGE (x:Drug {{RxNormID: {rxnormid} }}) WITH x MATCH (y:Intervention {{InterventionName: \"{intervention_name}\" }}) MERGE (y)-[:mapped_to_rxnorm {{WITH_SPACY: {wspacy} }}]->(x)'.format(rxnormid=rxnormid, intervention_name=intervention_name, wspacy=wspacy) + yield query + + # Set additional properties on the Drug node + for k,v in rxdata.items(): + key = k.replace(' ','') + query = ('MATCH (y:Drug {{RxNormID: {rxnormid} }}) SET y.{key} = {value}'.format(rxnormid=rxnormid, key=key, value=v)) + yield query + + + def nlp_to_drug(doc,matches,drug_name): + for match_id, start, end in matches: + span = doc[start:end].text + + # Retrieve RxNorm data for the drug name + rxdata = get_rxnorm_data(span.replace(' ','+')) + + if rxdata: + # Create connections in the database using RxNorm data + for query in cypher_Drug(rxdata,drug_name,wspacy=True): yield query + else: + print('Map to RxNorm failed for intervention name: {drug_name}'.format(drug_name=drug_name)) + + def get_rxnorm_data(url): + # Form RxNav API request to get RxNormID based on drug name + rq = 'https://rxnav.nlm.nih.gov/REST/rxcui.json?name={drug}&search=2'.format(drug=url) + response = requests.get(rq) + try: + rxdata = dict() + # Extract RxNormID from the response + response = response.json()['idGroup']['rxnormId'][0] + rxdata['RxNormID'] = response + # Form RxNav API request to get all properties of the drug using RxNormID + rq2 = 'https://rxnav.nlm.nih.gov/REST/rxcui/{rxnormid}/allProperties.json?prop=codes+attributes+names+sources'.format(rxnormid=response) + response = requests.get(rq2) + response = response.json()['propConceptGroup']['propConcept'] + # Extract and organize properties of the drug + for r in response: + if r['propName'] in rxdata: + rxdata[r['propName']].append(r['propValue']) + else: + rxdata[r['propName']] = [r['propValue']] + return rxdata + + except Exception as e: + return + + def 
drug_normalize(drug_name): + # Remove non-ASCII characters + new_val = drug_name.encode("ascii", "ignore") + # Decode the bytes to string + updated_str = new_val.decode() + # Replace non-word characters with spaces + updated_str = re.sub('\W+',' ', updated_str) + return updated_str + + + drug = drug_normalize(intervention) + drug_url = drug.replace(' ','+') + + # Retrieve RxNorm data for the drug name + rxdata = get_rxnorm_data(drug_url) + + if rxdata: + # Create connections in the database using RxNorm data + for query in cypher_Drug(rxdata, drug): yield query + else: + # If RxNorm data not found, use SpaCy NLP to detect drug names and map to RxNorm + doc = nlp(drug) + matches = matcher(doc) + for query in nlp_to_drug(doc,matches,drug): yield query + + + +def clean_data_extract(data): + temp = data + for k,v in data.items(): + if v == str() or v == list() or v == dict(): + temp[k] = '\"\"' + elif type(v) == str: + text = re.sub(r'[^\w\s\-\/@.+]+', '', v) + temp[k] = f'\"{text}\"' + else: + temp[k] = v + return temp + + +def cypher_GARD_populate(): + gard_response = gard_db.run('MATCH (x:GARD) RETURN x.GardId as gid, x.GardName as gname, x.Synonyms as syns').data() + for response in gard_response: + name = response['gname'] + gid = response['gid'] + syns = response['syns'] + + gard_node = {'GardId':gid, 'GardName':name, 'Synonyms':syns} + gard_query = cypher_GARD(gard_node) + db.run(gard_query) + + +def cypher_GARD(gard_node): + gardid = gard_node['GardId'] + gardname = gard_node['GardName'] + syns = gard_node['Synonyms'] + + query = """ + MERGE (x:GARD {{GardId: \"{gardid}\"}}) + ON CREATE + SET x.GardName = \"{gardname}\" + SET x.Synonyms = {syns} + """.format( + gardid=gardid, + gardname=gardname, + syns=syns + ) + + return query + +def cypher_ClinicalTrial(db, study, gard_node, today, update=False): + data_extract = dict() + #gardid = gard_node['GardId'] + identification_module = study.get('protocolSection',dict()).get('identificationModule',dict()) + 
status_module = study.get('protocolSection',dict()).get('statusModule',dict()) + description_module = study.get('protocolSection',dict()).get('descriptionModule',dict()) + design_module = study.get('protocolSection',dict()).get('designModule',dict()) + ipd_module = study.get('protocolSection',dict()).get('ipdSharingStatementModule',dict()) + contact_module = study.get('protocolSection',dict()).get('contactsLocationsModule',dict()) + + if status_module == dict() and description_module == dict() and design_module == dict() and ipd_module == dict() and contact_module == dict(): + return None + + # Identification Module + nctid = identification_module.get('nctId', '') + data_extract['NCTIdAlias'] = identification_module.get('nctIdAliases', list()) + data_extract['Acronym'] = identification_module.get('acronym', '') + data_extract['BriefTitle'] = identification_module.get('briefTitle', '') + data_extract['OfficialTitle'] = identification_module.get('officialTitle', '') + + # Status Module + data_extract['LastKnownStatus'] = status_module.get('lastKnownStatus','') + data_extract['CompletionDate'] = status_module.get('completionDateStruct', dict()).get('date', '') + data_extract['CompletionDateType'] = status_module.get('completionDateStruct', dict()).get('type', '') + data_extract['LastUpdatePostDate'] = status_module.get('lastUpdatePostDateStruct', dict()).get('date', '') + data_extract['LastUpdatePostDateType'] = status_module.get('lastUpdatePostDateStruct', dict()).get('type', '') + data_extract['LastUpdateSubmitDate'] = status_module.get('lastUpdateSubmitDate', '') + data_extract['OverallStatus'] = status_module.get('overallStatus', '') + data_extract['PrimaryCompletionDate'] = status_module.get('completionDateStruct', dict()).get('date', '') + data_extract['PrimaryCompletionDateType'] = status_module.get('completionDateStruct', dict()).get('type', '') + data_extract['ResultsFirstPostDate'] = status_module.get('studyFirstPostDateStruct', dict()).get('date', '') + 
data_extract['ResultsFirstPostDateType'] = status_module.get('studyFirstPostDateStruct', dict()).get('type', '') + data_extract['ResultsFirstPostedQCCommentsDate'] = status_module.get('studyFirstSubmitQcDate', '') + data_extract['StartDate'] = status_module.get('startDateStruct', dict()).get('date', '') + data_extract['StartDateType'] = status_module.get('startDateStruct', dict()).get('type', '') + data_extract['StudyFirstPostDate'] = status_module.get('studyFirstPostDateStruct', dict()).get('date', '') + data_extract['StudyFirstPostDateType'] = status_module.get('studyFirstPostDateStruct', dict()).get('type', '') + + # Description Module + data_extract['BriefSummary'] = description_module.get('briefSummary', '') + + # Design Module + data_extract['Phase'] = design_module.get('phases', list()) + data_extract['StudyType'] = design_module.get('studyType','') + data_extract['PatientRegistry'] = design_module.get('patientRegistry',False) + + # IPD Module + data_extract['IPDSharing'] = ipd_module.get('ipdSharing','') + data_extract['IPDSharingDescription'] = ipd_module.get('description','') + data_extract['IPDSharingInfoType'] = ipd_module.get('infoTypes',list()) + data_extract['IPDSharingTimeFrame'] = ipd_module.get('timeFrame','') + data_extract['IPDSharingAccessCriteria'] = ipd_module.get('accessCriteria','') + + # Remove existing relationships if CT node already exists and requires an update + if update: db.run(f'MATCH (x:ClinicalTrial)-[r]-() WHERE x.NCTId = \"{nctid}\" AND NOT TYPE(r) = \"mapped_to_gard\" DELETE r') + + data_extract = clean_data_extract(data_extract) + + query = """ + MATCH (x:GARD) WHERE x.GardId = \"{gardid}\" + MERGE (y:ClinicalTrial {{NCTId: \"{nctid}\"}}) + ON CREATE + SET y.StudyType = {studyType}, + y.LastKnownStatus = {lknownstat}, + y.NCTIdAlias = {nct_alias}, + y.Acronym = {acro}, + y.BriefTitle = {btitle}, + y.BriefSummary = {bsummary}, + y.OfficialTitle = {otitle}, + y.CompletionDate = {cdate}, + y.CompletionDateType = {cdatetype}, + 
y.LastUpdatePostDate = {lupdatedate}, + y.LastUpdatePostDateType = {lupdatetype}, + y.LastUpdateSubmitDate = {lupdatesubmitdate}, + y.OverallStatus = {overall}, + y.PrimaryCompletionDate = {pcompletedate}, + y.PrimaryCompletionDateType = {pcompletetype}, + y.ResultsFirstPostDate = {rfirstdate}, + y.ResultsFirstPostDateType = {rfirsttype}, + y.ResultsFirstPostedQCCommentsDate = {qcdate}, + y.StartDate = {startdate}, + y.LastUpdatedRDAS = \"{curdate}\", + y.DateCreatedRDAS = \"{curdate}\", + y.IPDSharing = {ipd}, + y.IPDSharingDescription = {desc}, + y.IPDSharingInfoType = {info}, + y.IPDSharingTimeFrame = {frame}, + y.IPDSharingAccessCriteria = {criteria}, + y.PatientRegistry = {register}, + y.StartDateType = {startdatetype} + ON MATCH + SET y.StudyType = {studyType}, + y.LastKnownStatus = {lknownstat}, + y.NCTIdAlias = {nct_alias}, + y.Acronym = {acro}, + y.BriefTitle = {btitle}, + y.BriefSummary = {bsummary}, + y.OfficialTitle = {otitle}, + y.CompletionDate = {cdate}, + y.Phase = {phases}, + y.CompletionDateType = {cdatetype}, + y.LastUpdatePostDate = {lupdatedate}, + y.LastUpdatePostDateType = {lupdatetype}, + y.LastUpdateSubmitDate = {lupdatesubmitdate}, + y.OverallStatus = {overall}, + y.PrimaryCompletionDate = {pcompletedate}, + y.PrimaryCompletionDateType = {pcompletetype}, + y.ResultsFirstPostDate = {rfirstdate}, + y.ResultsFirstPostDateType = {rfirsttype}, + y.ResultsFirstPostedQCCommentsDate = {qcdate}, + y.StartDate = {startdate}, + y.LastUpdatedRDAS = \"{curdate}\", + y.IPDSharing = {ipd}, + y.IPDSharingDescription = {desc}, + y.IPDSharingInfoType = {info}, + y.IPDSharingTimeFrame = {frame}, + y.IPDSharingAccessCriteria = {criteria}, + y.PatientRegistry = {register}, + y.StartDateType = {startdatetype} + MERGE (x)<-[:mapped_to_gard]-(y) + RETURN ID(y) AS ct_id + """.format( + studyType = data_extract['StudyType'], + lknownstat=data_extract['LastKnownStatus'], + nct_alias=data_extract['NCTIdAlias'], + acro=data_extract['Acronym'], + 
gardid=gard_node['GardId'], + nctid=nctid, + btitle=data_extract['BriefTitle'], + bsummary=data_extract['BriefSummary'], + phases=data_extract['Phase'], + otitle=data_extract['OfficialTitle'], + cdate=data_extract['CompletionDate'], + cdatetype=data_extract['CompletionDateType'], + lupdatedate=data_extract['LastUpdatePostDate'], + lupdatetype=data_extract['LastUpdatePostDateType'], + lupdatesubmitdate=data_extract['LastUpdateSubmitDate'], + overall=data_extract['OverallStatus'], + pcompletedate=data_extract['PrimaryCompletionDate'], + pcompletetype=data_extract['PrimaryCompletionDateType'], + rfirstdate=data_extract['ResultsFirstPostDate'], + rfirsttype=data_extract['ResultsFirstPostDateType'], + qcdate=data_extract['ResultsFirstPostedQCCommentsDate'], + startdate=data_extract['StartDate'], + startdatetype=data_extract['StartDateType'], + ipd=data_extract['IPDSharing'], + desc=data_extract['IPDSharingDescription'], + info=data_extract['IPDSharingInfoType'], + frame=data_extract['IPDSharingTimeFrame'], + criteria=data_extract['IPDSharingAccessCriteria'], + register=data_extract['PatientRegistry'], + curdate=today + ) + + ct_id = db.run(query).data()[0]['ct_id'] + + centralContacts = contact_module.get('centralContacts',list()) + if centralContacts == list(): return None + + for contact in centralContacts: + data_extract['ContactName'] = contact.get('name','') + data_extract['ContactRole'] = contact.get('role','') + data_extract['ContactPhone'] = contact.get('phone','') + data_extract['ContactPhoneExt'] = contact.get('phoneExt','') + data_extract['ContactEmail'] = contact.get('email','') + + data_extract = clean_data_extract(data_extract) + + query = """ + MATCH (x:ClinicalTrial) WHERE ID(x) = {ct_id} + MERGE (y:Contact {{ + ContactName: {name}, + ContactRole: {role}, + ContactPhone: {phone}, + ContactPhoneExt: {phoneExt}, + ContactEmail: {email}, + ContactScope: \"Central\" + }}) + MERGE (x)-[:has_contact]->(y) + """.format( + ct_id=ct_id, + 
def cypher_IndividualPatientData(study):
    """Build the Cypher statement that attaches IPD-sharing details to a trial.

    Reads ``protocolSection.ipdSharingStatementModule`` from a study record
    and produces a query that MERGEs an ``IndividualPatientData`` node and
    links it to the ``ClinicalTrial`` node with the same ``NCTId``.

    Args:
        study (dict): One study record; only ``protocolSection`` is read.

    Returns:
        str | None: The Cypher query text, or ``None`` when the study has no
        (or an empty) IPD sharing statement module.
    """
    protocol_section = study.get('protocolSection', dict())
    ipd_module = protocol_section.get('ipdSharingStatementModule', dict())

    # Nothing to record for trials without an IPD sharing statement.
    if ipd_module == dict():
        return None

    nctid = protocol_section.get('identificationModule', dict()).get('nctId', '')

    # NOTE(review): the placeholders below are not quoted, so
    # clean_data_extract presumably returns Cypher-ready literals -- confirm.
    fields = clean_data_extract({
        'IPDSharing': ipd_module.get('ipdSharing', ''),
        'IPDSharingDescription': ipd_module.get('description', ''),
        'IPDSharingInfoType': ipd_module.get('infoTypes', list()),
        'IPDSharingTimeFrame': ipd_module.get('timeFrame', ''),
        'IPDSharingAccessCriteria': ipd_module.get('accessCriteria', ''),
    })

    template = """
    MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\"
    MERGE (y:IndividualPatientData {{
    IPDSharing: {ipd},
    IPDSharingDescription: {desc},
    IPDSharingInfoType: {info},
    IPDSharingTimeFrame: {frame},
    IPDSharingAccessCriteria: {criteria}
    }})
    MERGE (x)-[:has_individual_patient_data]->(y)
    """

    return template.format(
        nctid=nctid,
        ipd=fields['IPDSharing'],
        desc=fields['IPDSharingDescription'],
        info=fields['IPDSharingInfoType'],
        frame=fields['IPDSharingTimeFrame'],
        criteria=fields['IPDSharingAccessCriteria'],
    )
def cypher_Condition(db, study, gard_names_dict):
    """Create Condition nodes for a trial and yield GARD-mapping queries.

    For every entry in ``protocolSection.conditionsModule.conditions`` this
    generator immediately runs a MERGE (through ``db.run``) that creates the
    ``Condition`` node and links it to the trial. It then yields one further
    query per GARD id whose normalized term list contains the normalized
    condition text; executing those yielded queries is left to the caller.

    Args:
        db: Database handle exposing ``run(query).data()``.
        study (dict): One study record; only ``protocolSection`` is read.
        gard_names_dict (dict): Maps a GARD id to a list of normalized terms.

    Yields:
        str: Cypher query linking a Condition node to a GARD node.
    """
    # Use dict defaults (as the sibling cypher_* functions do): a string
    # default made a missing module raise AttributeError on ''.get(...).
    protocol = study.get('protocolSection', dict())
    identification_module = protocol.get('identificationModule', dict())
    conditions_module = protocol.get('conditionsModule', dict())

    nctid = identification_module.get('nctId', '')
    conditions = conditions_module.get('conditions', list())

    if not conditions:
        return

    for condition in conditions:
        # NOTE(review): the condition text is interpolated unescaped; a
        # double quote inside it would break the statement.
        query = """
        MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\"
        MERGE (y:Condition {{Condition: \"{cond}\"}})
        MERGE (x)-[:investigates_condition]->(y)
        RETURN ID(y) AS cond_id
        """.format(
            nctid=nctid,
            cond=condition,
        )
        cond_id = db.run(query).data()[0]['cond_id']

        # Exact match with GARD node. The relationship is created with
        # MERGE, so one query per matching GARD id is sufficient.
        condition_normalized = gard_text_normalize(condition)
        for gardid, terms in gard_names_dict.items():
            if condition_normalized in terms:
                print('MATCH::', condition_normalized)
                query = """
                MATCH (x:GARD) WHERE x.GardId = \"{gardid}\"
                MATCH (y:Condition) WHERE ID(y) = {cond_id}
                MERGE (y)-[:mapped_to_gard]->(x)
                """.format(
                    gardid=gardid,
                    cond_id=cond_id
                )
                yield query
def cypher_StudyDesign(study):
    """Build the Cypher statement that attaches a StudyDesign node to a trial.

    Pulls design details from ``designModule.designInfo`` (including its
    ``maskingInfo``), the expanded-access flag from
    ``statusModule.expandedAccessInfo`` and the detailed description from
    ``descriptionModule``.

    Args:
        study (dict): One study record; only ``protocolSection`` is read.

    Returns:
        str | None: The Cypher query text, or ``None`` when the study has no
        design info, masking info or expanded-access info at all.
    """
    # Dict defaults throughout: the previous '' default for statusModule
    # raised AttributeError on ''.get(...) whenever that module was absent.
    protocol = study.get('protocolSection', dict())
    identification_module = protocol.get('identificationModule', dict())
    design_module = protocol.get('designModule', dict())
    desc_module = protocol.get('descriptionModule', dict())
    status_module = protocol.get('statusModule', dict())

    nctid = identification_module.get('nctId', '')

    designInfo = design_module.get('designInfo', dict())
    maskingInfo = designInfo.get('maskingInfo', dict())
    expandedAccessInfo = status_module.get('expandedAccessInfo', dict())

    if designInfo == dict() and maskingInfo == dict() and expandedAccessInfo == dict():
        return None

    data_extract = dict()
    data_extract['DesignObservationalModel'] = designInfo.get('observationalModel', '')
    data_extract['DesignTimePerspective'] = designInfo.get('timePerspective', '')
    data_extract['DesignAllocation'] = designInfo.get('allocation', '')
    data_extract['DesignInterventionModel'] = designInfo.get('interventionModel', '')
    data_extract['DesignInterventionModelDescription'] = designInfo.get('interventionModelDescription', '')
    data_extract['DesignPrimaryPurpose'] = designInfo.get('primaryPurpose', '')
    data_extract['DesignMasking'] = maskingInfo.get('masking', '')
    data_extract['HasExpandedAccess'] = expandedAccessInfo.get('hasExpandedAccess', '')
    data_extract['DetailedDescription'] = desc_module.get('detailedDescription', '')

    # NOTE(review): placeholders are unquoted, so clean_data_extract
    # presumably returns Cypher-ready literals -- confirm.
    data_extract = clean_data_extract(data_extract)

    # DesignInterventionModel used to be listed twice in the property map
    # (with the same value); it is now written once.
    query = """
    MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\"
    MERGE (y:StudyDesign {{
    DesignObservationalModel: {observe},
    DesignInterventionModel: {int_model},
    DesignInterventionModelDescription: {int_desc},
    DesignTimePerspective: {persp},
    DesignAllocation: {alloc},
    DesignPrimaryPurpose: {purp},
    DesignMasking: {mask},
    DetailedDescription: {det_desc},
    HasExpandedAccess: {access}
    }})
    MERGE (x)-[:has_study_design]->(y)
    """.format(
        nctid=nctid,
        observe=data_extract['DesignObservationalModel'],
        int_model=data_extract['DesignInterventionModel'],
        int_desc=data_extract['DesignInterventionModelDescription'],
        persp=data_extract['DesignTimePerspective'],
        alloc=data_extract['DesignAllocation'],
        purp=data_extract['DesignPrimaryPurpose'],
        mask=data_extract['DesignMasking'],
        det_desc=data_extract['DetailedDescription'],
        access=data_extract['HasExpandedAccess']
    )

    return query
eligibility_module.get('sex', '') + data_extract['StdAge'] = eligibility_module.get('stdAges', '') + data_extract['MinimumAge'] = eligibility_module.get('minimumAge', '') + data_extract['MaximumAge'] = eligibility_module.get('maximumAge', '') + + data_extract['EnrollmentCount'] = design_module.get('enrollmentInfo', dict()).get('count', '') + data_extract['EnrollmentType'] = design_module.get('enrollmentInfo', dict()).get('type', '') + + if eligibility_module == dict() and design_module == dict(): + return None + + data_extract = clean_data_extract(data_extract) + + query = """ + MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\" + MERGE (y:Participant {{ + EligibilityCriteria: {criteria}, + HealthyVolunteers: {healthy}, + Gender: {gender}, + StdAge: {age}, + MinimumAge: {minage}, + MaximumAge: {maxage}, + EnrollmentCount: {cnt}, + EnrollmentType: {enrolltype} }}) + MERGE (x)-[:has_participant_info]->(y) + """.format( + nctid=nctid, + criteria=data_extract['EligibilityCriteria'], + healthy=data_extract['HealthyVolunteers'], + gender=data_extract['Gender'], + minage=data_extract['MinimumAge'], + maxage=data_extract['MaximumAge'], + age=data_extract['StdAge'], + cnt=data_extract['EnrollmentCount'], + enrolltype=data_extract['EnrollmentType'], + ) + + return query + + +def cypher_Intervention(study, nlp): + identification_module = study.get('protocolSection', dict()).get('identificationModule', dict()) + intervention_module = study.get('protocolSection', dict()).get('armsInterventionsModule', dict()) + + nctid = identification_module.get('nctId', '') + interventions = intervention_module.get('interventions', list()) + + if interventions == list(): + return None + + for intervention in interventions: + data_extract = dict() + + data_extract['InterventionName'] = intervention.get('name','') + intervention_name = data_extract['InterventionName'] + data_extract['InterventionType'] = intervention.get('type','') + intervention_type = data_extract['InterventionType'] + 
def cypher_Location(db, study):
    """Create Location nodes for a trial and yield location-contact queries.

    For each entry in ``contactsLocationsModule.locations`` the generator
    immediately runs (through ``db.run``) a MERGE that creates the
    ``Location`` node and links it to the trial. For every contact listed on
    that location it then yields a query that MERGEs a ``Contact`` node and
    links it to both the trial and the location; executing the yielded
    queries is left to the caller.

    Args:
        db: Database handle exposing ``run(query).data()``.
        study (dict): One study record; only ``protocolSection`` is read.

    Yields:
        str: Cypher query creating one location-scoped Contact.
    """
    protocol = study.get('protocolSection', dict())
    identification_module = protocol.get('identificationModule', dict())
    loc_module = protocol.get('contactsLocationsModule', dict())

    nctid = identification_module.get('nctId', '')
    locations = loc_module.get('locations', list())

    if not locations:
        return

    for loc in locations:
        data_extract = dict()

        data_extract['LocationFacility'] = loc.get('facility', '')
        data_extract['LocationStatus'] = loc.get('status', '')
        data_extract['LocationCity'] = loc.get('city', '')
        data_extract['LocationZip'] = loc.get('zip', '')
        data_extract['LocationCountry'] = loc.get('country', '')
        data_extract['LocationState'] = loc.get('state', '')

        # NOTE(review): placeholders are unquoted, so clean_data_extract
        # presumably returns Cypher-ready literals -- confirm.
        data_extract = clean_data_extract(data_extract)

        query = """
        MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\"
        MERGE (y:Location {{
        LocationCity: {city},
        LocationCountry: {country},
        LocationFacility: {facility},
        LocationState: {state},
        LocationStatus: {status},
        LocationZip: {zipcode}
        }})
        MERGE (x)-[:in_locations]->(y)
        RETURN ID(y) AS loc_id, ID(x) as ct_id
        """.format(
            nctid=nctid,
            city=data_extract['LocationCity'],
            country=data_extract['LocationCountry'],
            facility=data_extract['LocationFacility'],
            state=data_extract['LocationState'],
            status=data_extract['LocationStatus'],
            zipcode=data_extract['LocationZip']
        )

        # Execute the MERGE once and read both ids from the same row; the
        # statement was previously sent to the database twice.
        row = db.run(query).data()[0]
        loc_id = row['loc_id']
        ct_id = row['ct_id']

        for contact in loc.get('contacts', list()):
            data_extract = dict()
            data_extract['ContactName'] = contact.get('name', '')
            data_extract['ContactRole'] = contact.get('role', '')
            data_extract['ContactPhone'] = contact.get('phone', '')
            data_extract['ContactPhoneExt'] = contact.get('phoneExt', '')
            data_extract['ContactEmail'] = contact.get('email', '')

            data_extract = clean_data_extract(data_extract)

            query = """
            MATCH (z:ClinicalTrial) WHERE ID(z) = {ct_id}
            MATCH (x:Location) WHERE ID(x) = {loc_id}
            MERGE (y:Contact {{
            ContactName: {name},
            ContactRole: {role},
            ContactPhone: {phone},
            ContactPhoneExt: {phoneExt},
            ContactEmail: {email},
            ContactScope: \"Location\"
            }})
            MERGE (z)-[:has_contact]->(y)
            MERGE (y)-[:contact_for_location]->(x)
            """.format(
                loc_id=loc_id,
                ct_id=ct_id,
                name=data_extract['ContactName'],
                role=data_extract['ContactRole'],
                phone=data_extract['ContactPhone'],
                phoneExt=data_extract['ContactPhoneExt'],
                email=data_extract['ContactEmail']
            )

            yield query
def gard_text_normalize(text):
    """Return *text* lower-cased with punctuation collapsed to single spaces.

    Every run of non-word characters (anything other than letters, digits
    and underscore) becomes one space, and leading/trailing whitespace is
    removed, so differently punctuated spellings of a disease name compare
    equal.

    Args:
        text (str): Raw disease or condition name.

    Returns:
        str: The normalized name.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    # Each run of non-word characters already collapses to a single space,
    # so no separate pass for repeated spaces is needed.
    return re.sub(r'\W+', ' ', text.lower()).strip()
info + yield cypher_Participant(study) + # Extract and populate Intervention info + for query in cypher_Intervention(study, nlp): yield query + # Extract and populate Reference info + for query in cypher_Reference(study): yield query + +########################################################### +#START + +print(f"[CT] Database Selected: {sysvars.ct_db}\nContinuing with script in 5 seconds...") +sleep(5) + + +# Connect to the Neo4j database +db = AlertCypher(sysvars.ct_db) +gard_db = AlertCypher(sysvars.gard_db) + +# Setup NLP for RxNORM Mapping +nlp = spacy.load('en_ner_bc5cdr_md') +pattern = [{'ENT_TYPE':'CHEMICAL'}] +matcher = Matcher(nlp.vocab) +matcher.add('DRUG',[pattern]) + +# Get last updated date and current date +today = date.today().strftime('%m/%d/%y') +lastupdate_str = db.getConf('UPDATE_PROGRESS','rdas.ctkg_update') +lastupdate = datetime.strptime(lastupdate_str, "%m/%d/%y") +lastupdate = lastupdate.strftime('%m/%d/%Y') + + +in_progress = db.getConf('UPDATE_PROGRESS', 'clinical_in_progress') +print(f'in_progress:: {in_progress}') +if in_progress == 'True': + clinical_disease_progress = db.getConf('UPDATE_PROGRESS', 'clinical_disease_progress') + if not clinical_disease_progress == '': + clinical_disease_progress = int(clinical_disease_progress) + else: + clinical_disease_progress = 0 + + clinical_rxnorm_progress = db.getConf('UPDATE_PROGRESS', 'clinical_rxnorm_progress') + if not clinical_rxnorm_progress == '': + clinical_rxnorm_progress = int(clinical_rxnorm_progress) + else: + clinical_required_update_progress = 0 + clinical_current_step = db.getConf('UPDATE_PROGRESS', 'clinical_current_step') +else: + clinical_disease_progress = 0 + clinical_rxnorm_progress = 0 + clinical_current_step = '' + db.setConf('UPDATE_PROGRESS', 'clinical_in_progress', 'True') + cypher_GARD_populate() + + +if clinical_current_step == '': + gard_names_dict = get_GARD_names_syns(gard_db) + gard_response = gard_db.run('MATCH (x:GARD) RETURN x.GardId as gid, x.GardName as 
gname, x.Synonyms as syns').data() + for idx,response in enumerate(gard_response): + if idx < clinical_disease_progress: + continue + + name = response['gname'] + gid = response['gid'] + syns = response['syns'] + + gard_node = {'GardId':gid, 'GardName':name, 'Synonyms':syns} + gard_query = cypher_GARD(gard_node) + db.run(gard_query) + + names_no_filter = [name] + syns + syns = [syn for syn in syns if not is_acronym(syn)] + names = [name] + syns + + nctids = get_nctids(names, lastupdate) + print(str(idx) + f' -------- {name} -------- {gid} --- {len(nctids)} Trials') + + if len(nctids) > 0: + for full_study in get_full_studies(nctids): + if full_study: + + api_nctid = full_study.get('protocolSection',dict()).get('identificationModule',dict()).get('nctId',None) + check_for_trial = db.run(f'MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{api_nctid}\" RETURN ID(x) AS trial_id').data() + + if len(check_for_trial) > 0: + # Initiates Node Update + node_update = True + print('UPDATE TRUE::', api_nctid) + else: + # Initiates Node Creation + node_update = False + print('CREATE TRUE::', api_nctid) + + for query in generate_queries(db, nlp, full_study, gard_node, gard_names_dict, today, update=node_update): + if query: db.run(query) + print('created') + else: + print('Error in add for finding full trial data for ' + full_study['nctId']) + + db.setConf('UPDATE_PROGRESS', 'clinical_disease_progress', str(idx)) + diff --git a/RDAS_CTKG_REMAKE/acronym_test.csv b/RDAS_CTKG_REMAKE/acronym_test.csv new file mode 100644 index 0000000..ca6537e --- /dev/null +++ b/RDAS_CTKG_REMAKE/acronym_test.csv @@ -0,0 +1,52 @@ +0,1,2,3,4,5 +GARD,ORIG_TERMS,ACRONYM DETECT,ACRONYM_LEN,FILTERED_TRIALS,ACRO_ONLY_TRIALS +GARD:0000001,"['GRACILE syndrome', 'Fellman disease', 'Growth restriction-aminoaciduria-cholestasis-iron overload-lactic acidosis-early death syndrome']",[],[],[[]],[[]] +GARD:0000003,['Ablepharon macrostomia syndrome'],[],[],[[]],[[]] +GARD:0000005,"['Abetalipoproteinemia', 'Bassen-Kornzweig 
disease', 'Homozygous familial hypobetalipoproteinemia']",[],[],"[['NCT01463735', 'NCT02435940', 'NCT00007228', 'NCT00004574', 'NCT05208879']]",[[]] +GARD:0000006,['Acromesomelic dysplasia'],[],[],[[]],[[]] +GARD:0000007,['Acromicric dysplasia'],[],[],[[]],[[]] +GARD:0000011,['Alternating hemiplegia of childhood'],['AHC'],[3],"[['NCT03857607', 'NCT06248645', 'NCT00682513', 'NCT06007521', 'NCT00931164', 'NCT04020848', 'NCT02408354', 'NCT06153186', 'NCT04944927', 'NCT04513002']]","[['NCT04020848', 'NCT03674164', 'NCT02408354', 'NCT04701242', 'NCT04944927', 'NCT00485186', 'NCT03331952', 'NCT06019988', 'NCT06248645', 'NCT03857555', 'NCT00931164', 'NCT06128655', 'NCT03601793', 'NCT06073301', 'NCT06490757', 'NCT03857607', 'NCT03206775', 'NCT02409537', 'NCT00682513', 'NCT03885401', 'NCT01289652', 'NCT05042934', 'NCT03900689', 'NCT05661318', 'NCT06007521', 'NCT02977026']]" +GARD:0000012,"['Hypersensitivity pneumonitis', 'Extrinsic allergic alveolitisniridia-cerebellar ataxia-intellectual disability syndrome', 'Gillespie syndrome']",[],[],[['NCT05390801']],[[]] +GARD:0000016,"['Ocular motor apraxia, Cogan type', 'Oculomotor apraxia; Cogan type']",[],[],[[]],[[]] +GARD:0000017,['Arachnoid cyst'],[],[],"[['NCT02482181', 'NCT05639491', 'NCT03656016', 'NCT01391702', 'NCT05726201', 'NCT04158284', 'NCT00011245', 'NCT04569201', 'NCT04046523', 'NCT04249921', 'NCT06352931', 'NCT03485612']]",[[]] +GARD:0000019,"['Dihydropyrimidine dehydrogenase deficiency', 'Familial pyrimidinemia']",[],[],"[['NCT02324452', 'NCT04541381', 'NCT06475352', 'NCT00130936', 'NCT04918264', 'NCT06092346', 'NCT00126867', 'NCT01224730', 'NCT04269369', 'NCT04194957', 'NCT00131599', 'NCT06245356', 'NCT00953537', 'NCT01547923']]",[[]] +GARD:0000022,"['Björnstad syndrome', 'Deafness-pili torti-hypogonadism syndrome', 'Hearing loss-pili torti-hypogonadism syndrome']",[],[],[[]],[[]] +GARD:0000023,['Blepharophimosis-ptosis-epicanthus inversus syndrome'],['BPES'],[4],[[]],[['NCT05358002']] +GARD:0000026,['Cat-eye 
syndromeat-scratch disease', 'Bartonellosis due to Bartonella henselae infection']",[],[],"[['NCT03132116', 'NCT01469702']]",[[]] +GARD:0000028,"['Catel-Manzke syndrome', 'Hyperphalangy-clinodactyly of index finger with Pierre Robin syndrome', 'Index finger anomaly-Pierre Robin syndrome', 'Micrognathia digital syndrome', 'Palatodigital syndrome; Catel-Manzke type', 'Pierre Robin sequence-hyperphalangy-clinodactyly syndrome', 'Pierre Robin syndrome-hyperphalangy-clinodactyly syndrome']",[],[],[[]],[[]] +GARD:0000029,"['CHARGE syndrome', 'CHARGE association', 'Coloboma-heart defects-atresia choanae-retardation of growth and development-genitourinary problems-ear abnormalities syndrome', 'Hall-Hittner syndrome']",[],[],"[['NCT03186144', 'NCT04463316', 'NCT06475651', 'NCT01314534', 'NCT05764980']]",[[]] +GARD:0000031,"['Serpiginous choroiditis', 'Geographic helicoid peripapillary choroidopathy']",[],[],"[['NCT00407121', 'NCT00645697']]",[[]] +GARD:0000035,"['Tetrasomy 18p', 'Isochromosome 18p']",[],[],[[]],[[]] +GARD:0000037,"['Partial deletion of the short arm of chromosome 3', 'Partial deletion of chromosome 3p', 'Partial monosomy of chromosome 3p', 'Partial monosomy of the short arm of chromosome 3']",[],[],[[]],[[]] +GARD:0000039,['WT limb-blood syndrome'],[],[],[[]],[[]] +GARD:0000042,"['Tetrasomy 9p', 'Isochromosome 9p']",[],[],[[]],[[]] +GARD:0000043,"['Mosaic trisomy 9', 'Mosaic trisomy chromosome 9', 'Trisomy 9 mosaicism']",[],[],[[]],[[]] +GARD:0000044,"['Haim-Munk syndrome', 'Keratosis palmoplantaris-periodontopathia-onychogryposis syndrome', 'Palmoplantar hyperkeratosis-periodontopathia-onychogryposis syndrome', 'Palmoplantar keratoderma-periodontopathia-onychogryposis syndrome']",[],[],[[]],[[]] +GARD:0000045,"['Congenital varicella syndrome', 'Antenatal varicella virus infection', 'Mother-to-child transmission of varicella syndrome']",[],[],[['NCT05923970']],[[]] +GARD:0000047,"['Crigler-Najjar syndrome type 1', 'Bilirubin uridinediphosphate 
glucuronosyltransferase deficiency type 1', 'Bilirubin-UGT deficiency type 1']",[],[],"[['NCT05687474', 'NCT02051049', 'NCT00154960', 'NCT02302690', 'NCT04216797', 'NCT00461799', 'NCT03466463', 'NCT03343756', 'NCT00515307', 'NCT01345578', 'NCT01765283', 'NCT03223194', 'NCT03078881', 'NCT06518005', 'NCT02356978', 'NCT00564707']]",[[]] +GARD:0000048,"['Isolated cytochrome C oxidase deficiency', 'Isolated COX deficiency', 'Isolated mitochondrial respiratory chain complex IV deficiency']",[],[],[[]],[[]] +GARD:0000049,"['De Barsy syndrome', 'Cutis laxa-corneal clouding-intellectual disability syndrome', 'Progeroid syndrome; De Barsy type']",[],[],[[]],[[]] +GARD:0000054,['Duodenal atresia'],[],[],"[['NCT03256669', 'NCT06115226', 'NCT04907266', 'NCT06394453', 'NCT05142839', 'NCT02562157', 'NCT03054987', 'NCT04114279', 'NCT03056261', 'NCT03463668']]",[[]] +GARD:0000059,"['Spinocerebellar ataxia type 34', 'Erythrokeratodermia with ataxia', 'Spinocerebellar ataxia and erythrokeratodermia']",['SCA34'],[5],[['NCT01793168']],"[['NCT04591483', 'NCT01793168']]" +GARD:0000060,"['Iridocorneal endothelial syndrome', 'ICE syndrome']",[],[],"[['NCT04025801', 'NCT02020044', 'NCT00800111', 'NCT03270761', 'NCT00001161']]",[[]] +GARD:0000061,"['Femoral-facial syndrome', 'Femoral hypoplasia-unusual facies syndromeilippi syndrome', 'Type 1 syndactyly-microcephaly-intellectual disability syndrome']",[],[],[[]],[[]] +GARD:0000064,"['Fountain syndrome', 'Deafness-skeletal dysplasia-coarse face with full lips syndrome', 'Deafness-skeletal dysplasia-lip granuloma syndrome', 'Hearing loss-skeletal dysplasia-coarse face with full lips syndrome', 'Hearing loss-skeletal dysplasia-lip granuloma syndrome']",[],[],[[]],[[]] +GARD:0000065,"['Galloway-Mowat syndrome', 'Galloway syndrome', 'Microcephaly-hiatus hernia-nephrotic syndrome', 'Nephrosis-neuronal dysmigration syndrome']",[],[],[[]],[[]] +GARD:0000066,"['Gorlin-Chaudhry-Moss syndrome', 'Craniofacial dysostosis-genital; dental; cardiac 
anomalies syndrome', 'Cranofacial dysostosis-hypertrichosis-hypoplasia of labia majora syndrome', 'Dental and eye anomalies-patent ductus arteriosus-normal intelligence syndrome', 'GCM syndrome']",[],[],[[]],[[]] +GARD:0000068,"['Hypoglossia-hypodactyly syndrome', 'Aglossia-adactylia syndrome', 'Hanhart syndrome', 'Jussieu syndrome']",[],[],[[]],[[]] +GARD:0000069,['Hantavirus pulmonary syndrome'],[],[],"[['NCT05415904', 'NCT04323904', 'NCT04020536', 'NCT03682107', 'NCT01502345', 'NCT00868946', 'NCT03718130', 'NCT02116205', 'NCT02455375', 'NCT00001123', 'NCT00623168', 'NCT00533767', 'NCT04834713', 'NCT04333459', 'NCT04375098', 'NCT00128180', 'NCT04338672', 'NCT06009042']]",[[]] +GARD:0000070,"['Kasabach-Merritt syndrome', 'Hemangioma-thrombocytopenia syndrome']",[],[],"[['NCT04775173', 'NCT04448873', 'NCT04056962', 'NCT02110069', 'NCT00576888', 'NCT04598204', 'NCT05324384', 'NCT03188068', 'NCT05351216', 'NCT04077515', 'NCT04409691']]",[[]] +GARD:0000073,"['X-linked hyper-IgM syndrome', 'Hyper-IgM syndrome due to CD40 ligand deficiency', 'Hyper-IgM syndrome due to CD40L deficiency', 'Hyper-IgM syndrome type 1']","['HIGM1', 'XHIGM']","[5, 5]","[['NCT01652092', 'NCT00266513', 'NCT00006054', 'NCT00001244', 'NCT00004341', 'NCT01998633', 'NCT00634569', 'NCT06092346', 'NCT01963143', 'NCT03965260', 'NCT03383380', 'NCT00006319', 'NCT01289847', 'NCT00730314', 'NCT01884311', 'NCT00001145']]","[['NCT00982358', 'NCT01774838', 'NCT00053391', 'NCT00730314', 'NCT00513149', 'NCT06092346', 'NCT00302809', 'NCT01604031', 'NCT00020540', 'NCT00006054', 'NCT03759262', 'NCT06169007', 'NCT01998633', 'NCT03329950', 'NCT00056134', 'NCT00006319', 'NCT00078520', 'NCT00840931', 'NCT00579306', 'NCT00322959', 'NCT00466050', 'NCT00320164', 'NCT02789670', 'NCT01257880', 'NCT00634569', 'NCT03110445', 'NCT03333486', 'NCT00004341', 'NCT03965260', 'NCT05502887', 'NCT01162824', 'NCT01652092', 'NCT00008749', 'NCT05733598', 'NCT02097199', 'NCT01422317', 'NCT00058786', 'NCT05059431', 'NCT00328887', 
'NCT02320357', 'NCT00001145', 'NCT01812200', 'NCT05076760', 'NCT06305286', 'NCT00504322', 'NCT02140996', 'NCT00266513', 'NCT01289847', 'NCT00005970', 'NCT02123004', 'NCT01963143', 'NCT00063856', 'NCT03383380', 'NCT00058799', 'NCT00001789', 'NCT03480568', 'NCT03288623', 'NCT04630132', 'NCT04329377', 'NCT00019591', 'NCT01276678', 'NCT04859218', 'NCT01112137', 'NCT01741324', 'NCT02974192', 'NCT00224354', 'NCT00458679', 'NCT05743270', 'NCT04322149', 'NCT04080453', 'NCT01541319', 'NCT01884311', 'NCT00441844', 'NCT04735978', 'NCT00001244', 'NCT04703647', 'NCT01530698', 'NCT02533596', 'NCT06312852', 'NCT04420013', 'NCT00006113']]" +GARD:0000076,"['Hypohidrotic ectodermal dysplasia', 'Anhidrotic ectodermal dysplasia']",['HED'],[3],"[['NCT01308333', 'NCT01992289', 'NCT01386775', 'NCT02099552', 'NCT04938622', 'NCT01293565', 'NCT04980638', 'NCT01398813', 'NCT04741412', 'NCT01629940', 'NCT01398397', 'NCT01871714', 'NCT03912792', 'NCT04223167', 'NCT05378932', 'NCT01629927', 'NCT01135888', 'NCT01109290', 'NCT01775462', 'NCT01342133', 'NCT01108770', 'NCT01564225']]","[['NCT01246037', 'NCT03344159', 'NCT01992289', 'NCT02669563', 'NCT01847300', 'NCT02184013', 'NCT06147206', 'NCT04549454', 'NCT01386775', 'NCT05263271', 'NCT03996460', 'NCT02099552', 'NCT05802472', 'NCT01293565', 'NCT04980638', 'NCT03339986', 'NCT01398813', 'NCT06019481', 'NCT04027426', 'NCT02899910', 'NCT04741412', 'NCT01629940', 'NCT02372188', 'NCT04416711', 'NCT01871714', 'NCT01398397', 'NCT04579068', 'NCT05453539', 'NCT05378932', 'NCT03769883', 'NCT03337438', 'NCT02216669', 'NCT01629927', 'NCT04027608', 'NCT00756366', 'NCT02116140', 'NCT05372042', 'NCT01851603', 'NCT01135888', 'NCT04783519', 'NCT04774770', 'NCT04089137', 'NCT00586183', 'NCT01648790', 'NCT01109290', 'NCT02932241', 'NCT03191682', 'NCT02481206', 'NCT01775462', 'NCT02404480', 'NCT06094933', 'NCT04535193', 'NCT06436586', 'NCT01108770', 'NCT03076489', 'NCT01564225', 'NCT01197352']]" +GARD:0000079,"['Metaphyseal chondrodysplasia, Jansen 
type']",[],[],[['NCT01793168']],[[]] +GARD:0000080,['Johanson-Blizzard syndrome'],['JBS'],[3],[[]],"[['NCT02902731', 'NCT04671823', 'NCT01114399', 'NCT04463316']]" +GARD:0000081,"['Intellectual developmental disorder, x-linked, syndromic, turner type', 'mental retardation and macrocephaly syndrome', 'mental retardation; x-linked; with growth retardation; deafness; and microgenitalism', 'juberg-marsidi syndrome', 'mental retardation; x-linked; syndromic; brooks-wisniewski-brown type', 'Mental retardation; x-linked; syndromic; turner type', 'brooks-wisniewski-brown syndrome']",[],[],[['NCT01238250']],[[]] +GARD:0000082,"['KBG syndrome', 'Short stature-facial and skeletal anomalies-intellectual disability-macrodontia syndrome']",[],[],[['NCT06465641']],[[]] +GARD:0000083,['Autosomal dominant Kenny-Caffey syndrome'],[],[],[[]],[[]] +GARD:0000084,"['Lipodystrophy, congenital generalized, type 1', 'lipodystrophy; berardinelli-seip congenital; type 1', 'Berardinelli-seip congenital lipodystrophy; type 1', 'brunzell syndrome; agpat2-related']",[],[],[[]],[[]] +GARD:0000085,['Thanatophoric dysplasiahudley-McCullough syndrome'],[],[],[[]],[[]] +GARD:0000087,"['Microphthalmia, Lenz type', 'Lenz microphthalmia']",[],[],[['NCT00011843']],[[]] diff --git a/RDAS_CTKG_REMAKE/acronym_test_expansion_concept.csv b/RDAS_CTKG_REMAKE/acronym_test_expansion_concept.csv new file mode 100644 index 0000000..adf8d14 --- /dev/null +++ b/RDAS_CTKG_REMAKE/acronym_test_expansion_concept.csv @@ -0,0 +1,51 @@ +GARD,ORIG_TERMS,ACRONYM DETECT,ACRONYM_LEN,FILTERED_TRIALS,ACRO_ONLY_TRIALS +GARD:0000001,"['GRACILE syndrome', 'Fellman disease', 'Growth restriction-aminoaciduria-cholestasis-iron overload-lactic acidosis-early death syndrome']",[],[],[[]],[[]] +GARD:0000003,['Ablepharon macrostomia syndrome'],[],[],[[]],[[]] +GARD:0000005,"['Abetalipoproteinemia', 'Bassen-Kornzweig disease', 'Homozygous familial hypobetalipoproteinemia']",[],[],"[['NCT01463735', 'NCT02435940', 'NCT00007228', 
'NCT00004574', 'NCT05208879']]",[[]] +GARD:0000006,['Acromesomelic dysplasia'],[],[],[[]],[[]] +GARD:0000007,['Acromicric dysplasia'],[],[],[[]],[[]] +GARD:0000011,['Alternating hemiplegia of childhood'],['AHC'],[3],"[['NCT03857607', 'NCT06248645', 'NCT00682513', 'NCT06007521', 'NCT00931164', 'NCT04020848', 'NCT02408354', 'NCT06153186', 'NCT04944927', 'NCT04513002']]","[['NCT04020848', 'NCT03674164', 'NCT02408354', 'NCT04701242', 'NCT04944927', 'NCT00485186', 'NCT03331952', 'NCT06019988', 'NCT06248645', 'NCT03857555', 'NCT00931164', 'NCT06128655', 'NCT03601793', 'NCT06073301', 'NCT06490757', 'NCT03857607', 'NCT03206775', 'NCT02409537', 'NCT00682513', 'NCT03885401', 'NCT01289652', 'NCT05042934', 'NCT03900689', 'NCT05661318', 'NCT06007521', 'NCT02977026']]" +GARD:0000012,"['Hypersensitivity pneumonitis', 'Extrinsic allergic alveolitisniridia-cerebellar ataxia-intellectual disability syndrome', 'Gillespie syndrome']",[],[],[['NCT05390801']],[[]] +GARD:0000016,"['Ocular motor apraxia, Cogan type', 'Oculomotor apraxia; Cogan type']",[],[],[[]],[[]] +GARD:0000017,['Arachnoid cyst'],[],[],"[['NCT02482181', 'NCT05639491', 'NCT03656016', 'NCT01391702', 'NCT05726201', 'NCT04158284', 'NCT00011245', 'NCT04569201', 'NCT04046523', 'NCT04249921', 'NCT06352931', 'NCT03485612']]",[[]] +GARD:0000019,"['Dihydropyrimidine dehydrogenase deficiency', 'Familial pyrimidinemia']",[],[],"[['NCT02324452', 'NCT04541381', 'NCT06475352', 'NCT00130936', 'NCT04918264', 'NCT06092346', 'NCT00126867', 'NCT01224730', 'NCT04269369', 'NCT04194957', 'NCT00131599', 'NCT06245356', 'NCT00953537', 'NCT01547923']]",[[]] +GARD:0000022,"['Björnstad syndrome', 'Deafness-pili torti-hypogonadism syndrome', 'Hearing loss-pili torti-hypogonadism syndrome']",[],[],[[]],[[]] +GARD:0000023,['Blepharophimosis-ptosis-epicanthus inversus syndrome'],['BPES'],[4],[[]],[['NCT05358002']] +GARD:0000026,['Cat-eye syndromeat-scratch disease', 'Bartonellosis due to Bartonella henselae infection']",[],[],"[['NCT03132116', 
'NCT01469702']]",[[]] +GARD:0000028,"['Catel-Manzke syndrome', 'Hyperphalangy-clinodactyly of index finger with Pierre Robin syndrome', 'Index finger anomaly-Pierre Robin syndrome', 'Micrognathia digital syndrome', 'Palatodigital syndrome; Catel-Manzke type', 'Pierre Robin sequence-hyperphalangy-clinodactyly syndrome', 'Pierre Robin syndrome-hyperphalangy-clinodactyly syndrome']",[],[],[[]],[[]] +GARD:0000029,"['CHARGE syndrome', 'CHARGE association', 'Coloboma-heart defects-atresia choanae-retardation of growth and development-genitourinary problems-ear abnormalities syndrome', 'Hall-Hittner syndrome']",[],[],"[['NCT03186144', 'NCT04463316', 'NCT06475651', 'NCT01314534', 'NCT05764980']]",[[]] +GARD:0000031,"['Serpiginous choroiditis', 'Geographic helicoid peripapillary choroidopathy']",[],[],"[['NCT00407121', 'NCT00645697']]",[[]] +GARD:0000035,"['Tetrasomy 18p', 'Isochromosome 18p']",[],[],[[]],[[]] +GARD:0000037,"['Partial deletion of the short arm of chromosome 3', 'Partial deletion of chromosome 3p', 'Partial monosomy of chromosome 3p', 'Partial monosomy of the short arm of chromosome 3']",[],[],[[]],[[]] +GARD:0000039,['WT limb-blood syndrome'],[],[],[[]],[[]] +GARD:0000042,"['Tetrasomy 9p', 'Isochromosome 9p']",[],[],[[]],[[]] +GARD:0000043,"['Mosaic trisomy 9', 'Mosaic trisomy chromosome 9', 'Trisomy 9 mosaicism']",[],[],[[]],[[]] +GARD:0000044,"['Haim-Munk syndrome', 'Keratosis palmoplantaris-periodontopathia-onychogryposis syndrome', 'Palmoplantar hyperkeratosis-periodontopathia-onychogryposis syndrome', 'Palmoplantar keratoderma-periodontopathia-onychogryposis syndrome']",[],[],[[]],[[]] +GARD:0000045,"['Congenital varicella syndrome', 'Antenatal varicella virus infection', 'Mother-to-child transmission of varicella syndrome']",[],[],[['NCT05923970']],[[]] +GARD:0000047,"['Crigler-Najjar syndrome type 1', 'Bilirubin uridinediphosphate glucuronosyltransferase deficiency type 1', 'Bilirubin-UGT deficiency type 1']",[],[],"[['NCT05687474', 'NCT02051049', 
'NCT00154960', 'NCT02302690', 'NCT04216797', 'NCT00461799', 'NCT03466463', 'NCT03343756', 'NCT00515307', 'NCT01345578', 'NCT01765283', 'NCT03223194', 'NCT03078881', 'NCT06518005', 'NCT02356978', 'NCT00564707']]",[[]] +GARD:0000048,"['Isolated cytochrome C oxidase deficiency', 'Isolated COX deficiency', 'Isolated mitochondrial respiratory chain complex IV deficiency']",[],[],[[]],[[]] +GARD:0000049,"['De Barsy syndrome', 'Cutis laxa-corneal clouding-intellectual disability syndrome', 'Progeroid syndrome; De Barsy type']",[],[],[[]],[[]] +GARD:0000054,['Duodenal atresia'],[],[],"[['NCT03256669', 'NCT06115226', 'NCT04907266', 'NCT06394453', 'NCT05142839', 'NCT02562157', 'NCT03054987', 'NCT04114279', 'NCT03056261', 'NCT03463668']]",[[]] +GARD:0000059,"['Spinocerebellar ataxia type 34', 'Erythrokeratodermia with ataxia', 'Spinocerebellar ataxia and erythrokeratodermia']",['SCA34'],[5],[['NCT01793168']],"[['NCT04591483', 'NCT01793168']]" +GARD:0000060,"['Iridocorneal endothelial syndrome', 'ICE syndrome']",[],[],"[['NCT04025801', 'NCT02020044', 'NCT00800111', 'NCT03270761', 'NCT00001161']]",[[]] +GARD:0000061,"['Femoral-facial syndrome', 'Femoral hypoplasia-unusual facies syndromeilippi syndrome', 'Type 1 syndactyly-microcephaly-intellectual disability syndrome']",[],[],[[]],[[]] +GARD:0000064,"['Fountain syndrome', 'Deafness-skeletal dysplasia-coarse face with full lips syndrome', 'Deafness-skeletal dysplasia-lip granuloma syndrome', 'Hearing loss-skeletal dysplasia-coarse face with full lips syndrome', 'Hearing loss-skeletal dysplasia-lip granuloma syndrome']",[],[],[[]],[[]] +GARD:0000065,"['Galloway-Mowat syndrome', 'Galloway syndrome', 'Microcephaly-hiatus hernia-nephrotic syndrome', 'Nephrosis-neuronal dysmigration syndrome']",[],[],[[]],[[]] +GARD:0000066,"['Gorlin-Chaudhry-Moss syndrome', 'Craniofacial dysostosis-genital; dental; cardiac anomalies syndrome', 'Cranofacial dysostosis-hypertrichosis-hypoplasia of labia majora syndrome', 'Dental and eye 
anomalies-patent ductus arteriosus-normal intelligence syndrome', 'GCM syndrome']",[],[],[[]],[[]] +GARD:0000068,"['Hypoglossia-hypodactyly syndrome', 'Aglossia-adactylia syndrome', 'Hanhart syndrome', 'Jussieu syndrome']",[],[],[[]],[[]] +GARD:0000069,['Hantavirus pulmonary syndrome'],[],[],"[['NCT05415904', 'NCT04323904', 'NCT04020536', 'NCT03682107', 'NCT01502345', 'NCT00868946', 'NCT03718130', 'NCT02116205', 'NCT02455375', 'NCT00001123', 'NCT00623168', 'NCT00533767', 'NCT04834713', 'NCT04333459', 'NCT04375098', 'NCT00128180', 'NCT04338672', 'NCT06009042']]",[[]] +GARD:0000070,"['Kasabach-Merritt syndrome', 'Hemangioma-thrombocytopenia syndrome']",[],[],"[['NCT04775173', 'NCT04448873', 'NCT04056962', 'NCT02110069', 'NCT00576888', 'NCT04598204', 'NCT05324384', 'NCT03188068', 'NCT05351216', 'NCT04077515', 'NCT04409691']]",[[]] +GARD:0000073,"['X-linked hyper-IgM syndrome', 'Hyper-IgM syndrome due to CD40 ligand deficiency', 'Hyper-IgM syndrome due to CD40L deficiency', 'Hyper-IgM syndrome type 1']","['HIGM1', 'XHIGM']","[5, 5]","[['NCT01652092', 'NCT00266513', 'NCT00006054', 'NCT00001244', 'NCT00004341', 'NCT01998633', 'NCT00634569', 'NCT06092346', 'NCT01963143', 'NCT03965260', 'NCT03383380', 'NCT00006319', 'NCT01289847', 'NCT00730314', 'NCT01884311', 'NCT00001145']]","[['NCT00982358', 'NCT01774838', 'NCT00053391', 'NCT00730314', 'NCT00513149', 'NCT06092346', 'NCT00302809', 'NCT01604031', 'NCT00020540', 'NCT00006054', 'NCT03759262', 'NCT06169007', 'NCT01998633', 'NCT03329950', 'NCT00056134', 'NCT00006319', 'NCT00078520', 'NCT00840931', 'NCT00579306', 'NCT00322959', 'NCT00466050', 'NCT00320164', 'NCT02789670', 'NCT01257880', 'NCT00634569', 'NCT03110445', 'NCT03333486', 'NCT00004341', 'NCT03965260', 'NCT05502887', 'NCT01162824', 'NCT01652092', 'NCT00008749', 'NCT05733598', 'NCT02097199', 'NCT01422317', 'NCT00058786', 'NCT05059431', 'NCT00328887', 'NCT02320357', 'NCT00001145', 'NCT01812200', 'NCT05076760', 'NCT06305286', 'NCT00504322', 'NCT02140996', 'NCT00266513', 
'NCT01289847', 'NCT00005970', 'NCT02123004', 'NCT01963143', 'NCT00063856', 'NCT03383380', 'NCT00058799', 'NCT00001789', 'NCT03480568', 'NCT03288623', 'NCT04630132', 'NCT04329377', 'NCT00019591', 'NCT01276678', 'NCT04859218', 'NCT01112137', 'NCT01741324', 'NCT02974192', 'NCT00224354', 'NCT00458679', 'NCT05743270', 'NCT04322149', 'NCT04080453', 'NCT01541319', 'NCT01884311', 'NCT00441844', 'NCT04735978', 'NCT00001244', 'NCT04703647', 'NCT01530698', 'NCT02533596', 'NCT06312852', 'NCT04420013', 'NCT00006113']]" +GARD:0000076,"['Hypohidrotic ectodermal dysplasia', 'Anhidrotic ectodermal dysplasia']",['HED'],[3],"[['NCT01308333', 'NCT01992289', 'NCT01386775', 'NCT02099552', 'NCT04938622', 'NCT01293565', 'NCT04980638', 'NCT01398813', 'NCT04741412', 'NCT01629940', 'NCT01398397', 'NCT01871714', 'NCT03912792', 'NCT04223167', 'NCT05378932', 'NCT01629927', 'NCT01135888', 'NCT01109290', 'NCT01775462', 'NCT01342133', 'NCT01108770', 'NCT01564225']]","[['NCT01246037', 'NCT03344159', 'NCT01992289', 'NCT02669563', 'NCT01847300', 'NCT02184013', 'NCT06147206', 'NCT04549454', 'NCT01386775', 'NCT05263271', 'NCT03996460', 'NCT02099552', 'NCT05802472', 'NCT01293565', 'NCT04980638', 'NCT03339986', 'NCT01398813', 'NCT06019481', 'NCT04027426', 'NCT02899910', 'NCT04741412', 'NCT01629940', 'NCT02372188', 'NCT04416711', 'NCT01871714', 'NCT01398397', 'NCT04579068', 'NCT05453539', 'NCT05378932', 'NCT03769883', 'NCT03337438', 'NCT02216669', 'NCT01629927', 'NCT04027608', 'NCT00756366', 'NCT02116140', 'NCT05372042', 'NCT01851603', 'NCT01135888', 'NCT04783519', 'NCT04774770', 'NCT04089137', 'NCT00586183', 'NCT01648790', 'NCT01109290', 'NCT02932241', 'NCT03191682', 'NCT02481206', 'NCT01775462', 'NCT02404480', 'NCT06094933', 'NCT04535193', 'NCT06436586', 'NCT01108770', 'NCT03076489', 'NCT01564225', 'NCT01197352']]" +GARD:0000079,"['Metaphyseal chondrodysplasia, Jansen type']",[],[],[['NCT01793168']],[[]] +GARD:0000080,['Johanson-Blizzard syndrome'],['JBS'],[3],[[]],"[['NCT02902731', 'NCT04671823', 
'NCT01114399', 'NCT04463316']]" +GARD:0000081,"['Intellectual developmental disorder, x-linked, syndromic, turner type', 'mental retardation and macrocephaly syndrome', 'mental retardation; x-linked; with growth retardation; deafness; and microgenitalism', 'juberg-marsidi syndrome', 'mental retardation; x-linked; syndromic; brooks-wisniewski-brown type', 'Mental retardation; x-linked; syndromic; turner type', 'brooks-wisniewski-brown syndrome']",[],[],[['NCT01238250']],[[]] +GARD:0000082,"['KBG syndrome', 'Short stature-facial and skeletal anomalies-intellectual disability-macrodontia syndrome']",[],[],[['NCT06465641']],[[]] +GARD:0000083,['Autosomal dominant Kenny-Caffey syndrome'],[],[],[[]],[[]] +GARD:0000084,"['Lipodystrophy, congenital generalized, type 1', 'lipodystrophy; berardinelli-seip congenital; type 1', 'Berardinelli-seip congenital lipodystrophy; type 1', 'brunzell syndrome; agpat2-related']",[],[],[[]],[[]] +GARD:0000085,['Thanatophoric dysplasiahudley-McCullough syndrome'],[],[],[[]],[[]] +GARD:0000087,"['Microphthalmia, Lenz type', 'Lenz microphthalmia']",[],[],[['NCT00011843']],[[]] diff --git a/RDAS_CTKG_REMAKE/acronym_test_expansion_concept_new_trials.csv b/RDAS_CTKG_REMAKE/acronym_test_expansion_concept_new_trials.csv new file mode 100644 index 0000000..1535811 --- /dev/null +++ b/RDAS_CTKG_REMAKE/acronym_test_expansion_concept_new_trials.csv @@ -0,0 +1,51 @@ +GARD,ORIG_TERMS,ACRONYM DETECT,ACRONYM_LEN,FILTERED_TRIALS,ACRO_ONLY_TRIALS,isLarger,NEW_TRIALS +GARD:0000001,"['GRACILE syndrome', 'Fellman disease', 'Growth restriction-aminoaciduria-cholestasis-iron overload-lactic acidosis-early death syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000003,['Ablepharon macrostomia syndrome'],[],[],[[]],[[]],same size,[] +GARD:0000005,"['Abetalipoproteinemia', 'Bassen-Kornzweig disease', 'Homozygous familial hypobetalipoproteinemia']",[],[],"[['NCT01463735', 'NCT02435940', 'NCT00007228', 'NCT00004574', 'NCT05208879']]",[[]],concept,"['NCT01463735', 
'NCT00007228']" +GARD:0000006,['Acromesomelic dysplasia'],[],[],[[]],[[]],same size,[] +GARD:0000007,['Acromicric dysplasia'],[],[],[[]],[[]],same size,[] +GARD:0000011,['Alternating hemiplegia of childhood'],['AHC'],[3],"[['NCT03857607', 'NCT06248645', 'NCT00682513', 'NCT06007521', 'NCT00931164', 'NCT04020848', 'NCT02408354', 'NCT06153186', 'NCT04944927', 'NCT04513002']]","[['NCT04020848', 'NCT03674164', 'NCT02408354', 'NCT04701242', 'NCT04944927', 'NCT00485186', 'NCT03331952', 'NCT06019988', 'NCT06248645', 'NCT03857555', 'NCT00931164', 'NCT06128655', 'NCT03601793', 'NCT06073301', 'NCT06490757', 'NCT03857607', 'NCT03206775', 'NCT02409537', 'NCT00682513', 'NCT03885401', 'NCT01289652', 'NCT05042934', 'NCT03900689', 'NCT05661318', 'NCT06007521', 'NCT02977026']]",same size,[] +GARD:0000012,"['Hypersensitivity pneumonitis', 'Extrinsic allergic alveolitisconcept,"['NCT03678519', 'NCT05842681', 'NCT02631603']" +GARD:0000013,"['Aniridia-cerebellar ataxia-intellectual disability syndrome', 'Gillespie syndrome']",[],[],[['NCT05390801']],[[]],same size,[] +GARD:0000016,"['Ocular motor apraxia, Cogan type', 'Oculomotor apraxia; Cogan type']",[],[],[[]],[[]],same size,[] +GARD:0000017,['Arachnoid cyst'],[],[],"[['NCT02482181', 'NCT05639491', 'NCT03656016', 'NCT01391702', 'NCT05726201', 'NCT04158284', 'NCT00011245', 'NCT04569201', 'NCT04046523', 'NCT04249921', 'NCT06352931', 'NCT03485612']]",[[]],concept,"['NCT05639491', 'NCT04158284', 'NCT05726201', 'NCT06352931', 'NCT02482181', 'NCT00011245']" +GARD:0000019,"['Dihydropyrimidine dehydrogenase deficiency', 'Familial pyrimidinemia']",[],[],"[['NCT02324452', 'NCT04541381', 'NCT06475352', 'NCT00130936', 'NCT04918264', 'NCT06092346', 'NCT00126867', 'NCT01224730', 'NCT04269369', 'NCT04194957', 'NCT00131599', 'NCT06245356', 'NCT00953537', 'NCT01547923']]",[[]],concept,"['NCT04918264', 'NCT01547923', 'NCT01224730', 'NCT04269369', 'NCT02324452', 'NCT00126867', 'NCT00131599', 'NCT04194957', 'NCT00953537', 'NCT06475352', 'NCT00130936']" 
+GARD:0000022,"['Björnstad syndrome', 'Deafness-pili torti-hypogonadism syndrome', 'Hearing loss-pili torti-hypogonadism syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000023,['Blepharophimosis-ptosis-epicanthus inversus syndrome'],['BPES'],[4],[[]],[['NCT05358002']],same size,[] +GARD:0000026,['Cat-eye syndromesame size,[] +GARD:0000027,"['Cat-scratch disease', 'Bartonellosis due to Bartonella henselae infection']",[],[],"[['NCT03132116', 'NCT01469702']]",[[]],same size,[] +GARD:0000028,"['Catel-Manzke syndrome', 'Hyperphalangy-clinodactyly of index finger with Pierre Robin syndrome', 'Index finger anomaly-Pierre Robin syndrome', 'Micrognathia digital syndrome', 'Palatodigital syndrome; Catel-Manzke type', 'Pierre Robin sequence-hyperphalangy-clinodactyly syndrome', 'Pierre Robin syndrome-hyperphalangy-clinodactyly syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000029,"['CHARGE syndrome', 'CHARGE association', 'Coloboma-heart defects-atresia choanae-retardation of growth and development-genitourinary problems-ear abnormalities syndrome', 'Hall-Hittner syndrome']",[],[],"[['NCT03186144', 'NCT04463316', 'NCT06475651', 'NCT01314534', 'NCT05764980']]",[[]],same size,[] +GARD:0000031,"['Serpiginous choroiditis', 'Geographic helicoid peripapillary choroidopathy']",[],[],"[['NCT00407121', 'NCT00645697']]",[[]],same size,[] +GARD:0000035,"['Tetrasomy 18p', 'Isochromosome 18p']",[],[],[[]],[[]],same size,[] +GARD:0000037,"['Partial deletion of the short arm of chromosome 3', 'Partial deletion of chromosome 3p', 'Partial monosomy of chromosome 3p', 'Partial monosomy of the short arm of chromosome 3']",[],[],[[]],[[]],same size,[] +GARD:0000039,['WT limb-blood syndrome'],[],[],[[]],[[]],same size,[] +GARD:0000042,"['Tetrasomy 9p', 'Isochromosome 9p']",[],[],[[]],[[]],same size,[] +GARD:0000043,"['Mosaic trisomy 9', 'Mosaic trisomy chromosome 9', 'Trisomy 9 mosaicism']",[],[],[[]],[[]],same size,[] +GARD:0000044,"['Haim-Munk syndrome', 'Keratosis 
palmoplantaris-periodontopathia-onychogryposis syndrome', 'Palmoplantar hyperkeratosis-periodontopathia-onychogryposis syndrome', 'Palmoplantar keratoderma-periodontopathia-onychogryposis syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000045,"['Congenital varicella syndrome', 'Antenatal varicella virus infection', 'Mother-to-child transmission of varicella syndrome']",[],[],[['NCT05923970']],[[]],same size,[] +GARD:0000047,"['Crigler-Najjar syndrome type 1', 'Bilirubin uridinediphosphate glucuronosyltransferase deficiency type 1', 'Bilirubin-UGT deficiency type 1']",[],[],"[['NCT05687474', 'NCT02051049', 'NCT00154960', 'NCT02302690', 'NCT04216797', 'NCT00461799', 'NCT03466463', 'NCT03343756', 'NCT00515307', 'NCT01345578', 'NCT01765283', 'NCT03223194', 'NCT03078881', 'NCT06518005', 'NCT02356978', 'NCT00564707']]",[[]],concept,"['NCT05687474', 'NCT00515307', 'NCT03466463', 'NCT02051049', 'NCT00154960', 'NCT00461799', 'NCT03343756', 'NCT03078881', 'NCT04216797', 'NCT01345578', 'NCT02302690', 'NCT01765283', 'NCT03223194', 'NCT06518005', 'NCT02356978', 'NCT00564707']" +GARD:0000048,"['Isolated cytochrome C oxidase deficiency', 'Isolated COX deficiency', 'Isolated mitochondrial respiratory chain complex IV deficiency']",[],[],[[]],[[]],same size,[] +GARD:0000049,"['De Barsy syndrome', 'Cutis laxa-corneal clouding-intellectual disability syndrome', 'Progeroid syndrome; De Barsy type']",[],[],[[]],[[]],same size,[] +GARD:0000054,['Duodenal atresia'],[],[],"[['NCT03256669', 'NCT06115226', 'NCT04907266', 'NCT06394453', 'NCT05142839', 'NCT02562157', 'NCT03054987', 'NCT04114279', 'NCT03056261', 'NCT03463668']]",[[]],concept,"['NCT02562157', 'NCT03463668', 'NCT03054987', 'NCT04907266', 'NCT05142839']" +GARD:0000059,"['Spinocerebellar ataxia type 34', 'Erythrokeratodermia with ataxia', 'Spinocerebellar ataxia and erythrokeratodermia']",['SCA34'],[5],[['NCT01793168']],"[['NCT04591483', 'NCT01793168']]",same size,[] +GARD:0000060,"['Iridocorneal endothelial syndrome', 'ICE 
syndrome']",[],[],"[['NCT04025801', 'NCT02020044', 'NCT00800111', 'NCT03270761', 'NCT00001161']]",[[]],same size,[] +GARD:0000061,"['Femoral-facial syndrome', 'Femoral hypoplasia-unusual facies syndromesame size,[] +GARD:0000062,"['Filippi syndrome', 'Type 1 syndactyly-microcephaly-intellectual disability syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000064,"['Fountain syndrome', 'Deafness-skeletal dysplasia-coarse face with full lips syndrome', 'Deafness-skeletal dysplasia-lip granuloma syndrome', 'Hearing loss-skeletal dysplasia-coarse face with full lips syndrome', 'Hearing loss-skeletal dysplasia-lip granuloma syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000065,"['Galloway-Mowat syndrome', 'Galloway syndrome', 'Microcephaly-hiatus hernia-nephrotic syndrome', 'Nephrosis-neuronal dysmigration syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000066,"['Gorlin-Chaudhry-Moss syndrome', 'Craniofacial dysostosis-genital; dental; cardiac anomalies syndrome', 'Cranofacial dysostosis-hypertrichosis-hypoplasia of labia majora syndrome', 'Dental and eye anomalies-patent ductus arteriosus-normal intelligence syndrome', 'GCM syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000068,"['Hypoglossia-hypodactyly syndrome', 'Aglossia-adactylia syndrome', 'Hanhart syndrome', 'Jussieu syndrome']",[],[],[[]],[[]],same size,[] +GARD:0000069,['Hantavirus pulmonary syndrome'],[],[],"[['NCT05415904', 'NCT04323904', 'NCT04020536', 'NCT03682107', 'NCT01502345', 'NCT00868946', 'NCT03718130', 'NCT02116205', 'NCT02455375', 'NCT00001123', 'NCT00623168', 'NCT00533767', 'NCT04834713', 'NCT04333459', 'NCT04375098', 'NCT00128180', 'NCT04338672', 'NCT06009042']]",[[]],concept,"['NCT01502345', 'NCT00623168', 'NCT04375098', 'NCT06009042', 'NCT00868946', 'NCT03718130', 'NCT04333459', 'NCT04338672', 'NCT04020536', 'NCT04834713', 'NCT02455375', 'NCT02116205']" +GARD:0000070,"['Kasabach-Merritt syndrome', 'Hemangioma-thrombocytopenia syndrome']",[],[],"[['NCT04775173', 'NCT04448873', 'NCT04056962', 
'NCT02110069', 'NCT00576888', 'NCT04598204', 'NCT05324384', 'NCT03188068', 'NCT05351216', 'NCT04077515', 'NCT04409691']]",[[]],concept,"['NCT00576888', 'NCT04598204']" +GARD:0000073,"['X-linked hyper-IgM syndrome', 'Hyper-IgM syndrome due to CD40 ligand deficiency', 'Hyper-IgM syndrome due to CD40L deficiency', 'Hyper-IgM syndrome type 1']","['HIGM1', 'XHIGM']","[5, 5]","[['NCT01652092', 'NCT00266513', 'NCT00006054', 'NCT00001244', 'NCT00004341', 'NCT01998633', 'NCT00634569', 'NCT06092346', 'NCT01963143', 'NCT03965260', 'NCT03383380', 'NCT00006319', 'NCT01289847', 'NCT00730314', 'NCT01884311', 'NCT00001145']]","[['NCT00982358', 'NCT01774838', 'NCT00053391', 'NCT00730314', 'NCT00513149', 'NCT06092346', 'NCT00302809', 'NCT01604031', 'NCT00020540', 'NCT00006054', 'NCT03759262', 'NCT06169007', 'NCT01998633', 'NCT03329950', 'NCT00056134', 'NCT00006319', 'NCT00078520', 'NCT00840931', 'NCT00579306', 'NCT00322959', 'NCT00466050', 'NCT00320164', 'NCT02789670', 'NCT01257880', 'NCT00634569', 'NCT03110445', 'NCT03333486', 'NCT00004341', 'NCT03965260', 'NCT05502887', 'NCT01162824', 'NCT01652092', 'NCT00008749', 'NCT05733598', 'NCT02097199', 'NCT01422317', 'NCT00058786', 'NCT05059431', 'NCT00328887', 'NCT02320357', 'NCT00001145', 'NCT01812200', 'NCT05076760', 'NCT06305286', 'NCT00504322', 'NCT02140996', 'NCT00266513', 'NCT01289847', 'NCT00005970', 'NCT02123004', 'NCT01963143', 'NCT00063856', 'NCT03383380', 'NCT00058799', 'NCT00001789', 'NCT03480568', 'NCT03288623', 'NCT04630132', 'NCT04329377', 'NCT00019591', 'NCT01276678', 'NCT04859218', 'NCT01112137', 'NCT01741324', 'NCT02974192', 'NCT00224354', 'NCT00458679', 'NCT05743270', 'NCT04322149', 'NCT04080453', 'NCT01541319', 'NCT01884311', 'NCT00441844', 'NCT04735978', 'NCT00001244', 'NCT04703647', 'NCT01530698', 'NCT02533596', 'NCT06312852', 'NCT04420013', 'NCT00006113']]",concept,"['NCT01289847', 'NCT01963143', 'NCT06092346', 'NCT01998633', 'NCT00634569', 'NCT01884311', 'NCT00001244', 'NCT00730314', 'NCT01652092', 'NCT03383380', 
'NCT03965260']" +GARD:0000076,"['Hypohidrotic ectodermal dysplasia', 'Anhidrotic ectodermal dysplasia']",['HED'],[3],"[['NCT01308333', 'NCT01992289', 'NCT01386775', 'NCT02099552', 'NCT04938622', 'NCT01293565', 'NCT04980638', 'NCT01398813', 'NCT04741412', 'NCT01629940', 'NCT01398397', 'NCT01871714', 'NCT03912792', 'NCT04223167', 'NCT05378932', 'NCT01629927', 'NCT01135888', 'NCT01109290', 'NCT01775462', 'NCT01342133', 'NCT01108770', 'NCT01564225']]","[['NCT01246037', 'NCT03344159', 'NCT01992289', 'NCT02669563', 'NCT01847300', 'NCT02184013', 'NCT06147206', 'NCT04549454', 'NCT01386775', 'NCT05263271', 'NCT03996460', 'NCT02099552', 'NCT05802472', 'NCT01293565', 'NCT04980638', 'NCT03339986', 'NCT01398813', 'NCT06019481', 'NCT04027426', 'NCT02899910', 'NCT04741412', 'NCT01629940', 'NCT02372188', 'NCT04416711', 'NCT01871714', 'NCT01398397', 'NCT04579068', 'NCT05453539', 'NCT05378932', 'NCT03769883', 'NCT03337438', 'NCT02216669', 'NCT01629927', 'NCT04027608', 'NCT00756366', 'NCT02116140', 'NCT05372042', 'NCT01851603', 'NCT01135888', 'NCT04783519', 'NCT04774770', 'NCT04089137', 'NCT00586183', 'NCT01648790', 'NCT01109290', 'NCT02932241', 'NCT03191682', 'NCT02481206', 'NCT01775462', 'NCT02404480', 'NCT06094933', 'NCT04535193', 'NCT06436586', 'NCT01108770', 'NCT03076489', 'NCT01564225', 'NCT01197352']]",concept,"['NCT04938622', 'NCT04223167']" +GARD:0000079,"['Metaphyseal chondrodysplasia, Jansen type']",[],[],[['NCT01793168']],[[]],concept,['NCT01793168'] +GARD:0000080,['Johanson-Blizzard syndrome'],['JBS'],[3],[[]],"[['NCT02902731', 'NCT04671823', 'NCT01114399', 'NCT04463316']]",same size,[] +GARD:0000081,"['Intellectual developmental disorder, x-linked, syndromic, turner type', 'mental retardation and macrocephaly syndrome', 'mental retardation; x-linked; with growth retardation; deafness; and microgenitalism', 'juberg-marsidi syndrome', 'mental retardation; x-linked; syndromic; brooks-wisniewski-brown type', 'Mental retardation; x-linked; syndromic; turner type', 
'brooks-wisniewski-brown syndrome']",[],[],[['NCT01238250']],[[]],concept,['NCT01238250'] +GARD:0000082,"['KBG syndrome', 'Short stature-facial and skeletal anomalies-intellectual disability-macrodontia syndrome']",[],[],[['NCT06465641']],[[]],same size,[] +GARD:0000083,['Autosomal dominant Kenny-Caffey syndrome'],[],[],[[]],[[]],same size,[] +GARD:0000084,"['Lipodystrophy, congenital generalized, type 1', 'lipodystrophy; berardinelli-seip congenital; type 1', 'Berardinelli-seip congenital lipodystrophy; type 1', 'brunzell syndrome; agpat2-related']",[],[],[[]],[[]],same size,[] +GARD:0000085,['Thanatophoric dysplasiaconcept,"['NCT05437913', 'NCT02204397', 'NCT02660008', 'NCT05426226']" +GARD:0000086,['Chudley-McCullough syndrome'],[],[],[[]],[[]],same size,[] +GARD:0000087,"['Microphthalmia, Lenz type', 'Lenz microphthalmia']",[],[],[['NCT00011843']],[[]],concept,['NCT00011843'] diff --git a/RDAS_CTKG_REMAKE/acronym_test_expansion_none.csv b/RDAS_CTKG_REMAKE/acronym_test_expansion_none.csv new file mode 100644 index 0000000..1607ac8 --- /dev/null +++ b/RDAS_CTKG_REMAKE/acronym_test_expansion_none.csv @@ -0,0 +1,51 @@ +GARD,ORIG_TERMS,ACRONYM DETECT,ACRONYM_LEN,FILTERED_TRIALS,ACRO_ONLY_TRIALS +GARD:0000001,"['GRACILE syndrome', 'Fellman disease', 'Growth restriction-aminoaciduria-cholestasis-iron overload-lactic acidosis-early death syndrome']",[],[],[[]],[[]] +GARD:0000003,['Ablepharon macrostomia syndrome'],[],[],[[]],[[]] +GARD:0000005,"['Abetalipoproteinemia', 'Bassen-Kornzweig disease', 'Homozygous familial hypobetalipoproteinemia']",[],[],"[['NCT00004574', 'NCT02435940', 'NCT05208879']]",[[]] +GARD:0000006,['Acromesomelic dysplasia'],[],[],[[]],[[]] +GARD:0000007,['Acromicric dysplasia'],[],[],[[]],[[]] +GARD:0000011,['Alternating hemiplegia of childhood'],['AHC'],[3],"[['NCT03857607', 'NCT06248645']]","[['NCT05661318', 'NCT00682513', 'NCT06019988', 'NCT04020848', 'NCT06073301', 'NCT06248645', 'NCT03900689', 'NCT02408354', 'NCT06490757', 'NCT04701242', 
'NCT03206775', 'NCT03601793', 'NCT01289652', 'NCT02409537', 'NCT04944927', 'NCT06007521', 'NCT03885401', 'NCT03331952', 'NCT03857607', 'NCT00931164', 'NCT06128655', 'NCT03857555', 'NCT02977026']]" +GARD:0000012,"['Hypersensitivity pneumonitis', 'Extrinsic allergic alveolitis']",[],[],"[['NCT04844359', 'NCT05458635', 'NCT03873649', 'NCT04561479', 'NCT05626387', 'NCT01687946', 'NCT05988437', 'NCT02523833']]",[[]] +GARD:0000013,"['Aniridia-cerebellar ataxia-intellectual disability syndrome', 'Gillespie syndrome']",[],[],[['NCT05390801']],[[]] +GARD:0000016,"['Ocular motor apraxia, Cogan type', 'Oculomotor apraxia; Cogan type']",[],[],[[]],[[]] +GARD:0000017,['Arachnoid cyst'],[],[],[[]],[[]] +GARD:0000019,"['Dihydropyrimidine dehydrogenase deficiency', 'Familial pyrimidinemia']",[],[],[[]],[[]] +GARD:0000022,"['Björnstad syndrome', 'Deafness-pili torti-hypogonadism syndrome', 'Hearing loss-pili torti-hypogonadism syndrome']",[],[],[[]],[[]] +GARD:0000023,['Blepharophimosis-ptosis-epicanthus inversus syndrome'],['BPES'],[4],[[]],[[]] +GARD:0000026,['Cat-eye syndrome'],['CES'],[3],[['NCT03174028']],"[['NCT01953757', 'NCT04739696', 'NCT01997008', 'NCT03000985', 'NCT03811808', 'NCT02419638', 'NCT05450588', 'NCT01036490', 'NCT05384041', 'NCT05576090', 'NCT01277432', 'NCT00031720', 'NCT00094133', 'NCT01233245', 'NCT04332731', 'NCT04254198', 'NCT00636779', 'NCT02468349', 'NCT01244087', 'NCT01523561', 'NCT00823147', 'NCT04828200', 'NCT02419014', 'NCT00928720', 'NCT01795131', 'NCT02327104', 'NCT00013845', 'NCT00147693', 'NCT04126564', 'NCT05627986', 'NCT00746668', 'NCT00833898', 'NCT04679792', 'NCT00012831', 'NCT06036472', 'NCT05832710', 'NCT04224961', 'NCT03298308', 'NCT00831935', 'NCT03182556', 'NCT04836039', 'NCT02037568', 'NCT03983252', 'NCT05950009', 'NCT02436785', 'NCT02548065', 'NCT03031236', 'NCT02762734', 'NCT02890615', 'NCT00569309', 'NCT04776681', 'NCT04585685', 'NCT01963767', 'NCT02944136', 'NCT03526523', 'NCT04144257', 'NCT02411227', 'NCT04297657', 'NCT04770181', 
'NCT06034496', 'NCT03792399', 'NCT01843608', 'NCT03222479', 'NCT06326151', 'NCT01909011', 'NCT04751864', 'NCT00268827', 'NCT05223530', 'NCT04069182', 'NCT06342232', 'NCT06242080', 'NCT00005498', 'NCT04103086', 'NCT01325532', 'NCT01690962', 'NCT04423952', 'NCT03411863', 'NCT02842229', 'NCT06416878', 'NCT00928772', 'NCT02518178', 'NCT02962843', 'NCT02510898', 'NCT04963907', 'NCT05792475', 'NCT02030522', 'NCT02898233', 'NCT05310162', 'NCT01461798', 'NCT00495768', 'NCT03904771', 'NCT02754973', 'NCT05083221', 'NCT01235455', 'NCT04256811', 'NCT01072032', 'NCT05466773', 'NCT05309031', 'NCT06233344', 'NCT00653497', 'NCT04438018', 'NCT01167725', 'NCT03881956', 'NCT05698862', 'NCT02215551', 'NCT04001322', 'NCT04102696', 'NCT02174003', 'NCT01287975', 'NCT05394883', 'NCT03414424', 'NCT04016259', 'NCT01993277', 'NCT04808258', 'NCT03737890', 'NCT04942171', 'NCT01231711', 'NCT05990595', 'NCT04086615', 'NCT05973461', 'NCT01949961', 'NCT03514355', 'NCT03825471', 'NCT03689543', 'NCT04587531', 'NCT05923723', 'NCT01265797', 'NCT02833129', 'NCT00994279', 'NCT00606801', 'NCT04776668', 'NCT06203717', 'NCT00499668', 'NCT02171936', 'NCT03033836', 'NCT02846740', 'NCT02087631', 'NCT04589364', 'NCT03927872', 'NCT02453347', 'NCT03777267', 'NCT06066658', 'NCT02606045', 'NCT03898453', 'NCT02806167', 'NCT02682914', 'NCT00866411', 'NCT03198156', 'NCT02732561', 'NCT00587769', 'NCT00172549', 'NCT01734954', 'NCT03094013', 'NCT05885893', 'NCT04393480', 'NCT04682197', 'NCT02549495', 'NCT05359380', 'NCT00729118', 'NCT03301376', 'NCT00138437', 'NCT04191538', 'NCT03320109', 'NCT06468020', 'NCT05950477', 'NCT02521480', 'NCT03369418', 'NCT03325374', 'NCT00223756', 'NCT01533415', 'NCT02690896', 'NCT01055665', 'NCT00178373', 'NCT01935310', 'NCT01232907', 'NCT06022640', 'NCT04180683', 'NCT04115033', 'NCT06493695', 'NCT05653934', 'NCT01488942', 'NCT01583933', 'NCT01271803', 'NCT00248872', 'NCT00248924', 'NCT05801497', 'NCT02844452', 'NCT00279526', 'NCT00729625', 'NCT05387434', 'NCT05092854', 'NCT04533165', 
'NCT03784001', 'NCT02163967', 'NCT04700696', 'NCT02067806', 'NCT04602507', 'NCT00230945', 'NCT02203734', 'NCT03298178', 'NCT02778139', 'NCT03450135', 'NCT00403455', 'NCT03876652', 'NCT03978286', 'NCT03230890', 'NCT00958698', 'NCT02979379', 'NCT02057081', 'NCT02159885', 'NCT05209438', 'NCT02391766', 'NCT01282658', 'NCT05887713', 'NCT04598386', 'NCT00539357', 'NCT02992587', 'NCT00665782', 'NCT00886730', 'NCT03344562', 'NCT04006327', 'NCT04626037', 'NCT03060122', 'NCT03076632', 'NCT00902330', 'NCT02901080', 'NCT02813447', 'NCT05785884', 'NCT01839864', 'NCT05881122', 'NCT04160910', 'NCT03995914', 'NCT03299803', 'NCT01973283', 'NCT01783652', 'NCT04502992', 'NCT00554489', 'NCT03057184', 'NCT04145726', 'NCT00536055', 'NCT03757494', 'NCT03705988', 'NCT04465136', 'NCT02730988', 'NCT03322995', 'NCT05460845', 'NCT01422174', 'NCT01607749', 'NCT00339859', 'NCT03809312', 'NCT04910282', 'NCT03389568', 'NCT06497959', 'NCT03805152', 'NCT05304052', 'NCT03367936', 'NCT01860677', 'NCT00723008', 'NCT05063851', 'NCT01803984', 'NCT06124014', 'NCT01825915', 'NCT06281288', 'NCT05791981', 'NCT03332043', 'NCT01222481', 'NCT02515916', 'NCT03222752', 'NCT03210155', 'NCT03593070', 'NCT04567797', 'NCT05005624', 'NCT02303145', 'NCT04627480', 'NCT03150095', 'NCT05126511', 'NCT05811429', 'NCT06270979', 'NCT00385528', 'NCT03264248', 'NCT03927885', 'NCT03811132', 'NCT02979782', 'NCT04115332']]" +GARD:0000027,"['Cat-scratch disease', 'Bartonellosis due to Bartonella henselae infection']",[],[],[[]],[[]] +GARD:0000028,"['Catel-Manzke syndrome', 'Hyperphalangy-clinodactyly of index finger with Pierre Robin syndrome', 'Index finger anomaly-Pierre Robin syndrome', 'Micrognathia digital syndrome', 'Palatodigital syndrome; Catel-Manzke type', 'Pierre Robin sequence-hyperphalangy-clinodactyly syndrome', 'Pierre Robin syndrome-hyperphalangy-clinodactyly syndrome']",[],[],[[]],[[]] +GARD:0000029,"['CHARGE syndrome', 'CHARGE association', 'Coloboma-heart defects-atresia choanae-retardation of growth and 
development-genitourinary problems-ear abnormalities syndrome', 'Hall-Hittner syndrome']",[],[],"[['NCT03186144', 'NCT01314534', 'NCT06475651', 'NCT05764980']]",[[]] +GARD:0000031,"['Serpiginous choroiditis', 'Geographic helicoid peripapillary choroidopathy']",[],[],[[]],[[]] +GARD:0000035,"['Tetrasomy 18p', 'Isochromosome 18p']",[],[],[[]],[[]] +GARD:0000037,"['Partial deletion of the short arm of chromosome 3', 'Partial deletion of chromosome 3p', 'Partial monosomy of chromosome 3p', 'Partial monosomy of the short arm of chromosome 3']",[],[],[[]],[[]] +GARD:0000039,['WT limb-blood syndrome'],[],[],[[]],[[]] +GARD:0000042,"['Tetrasomy 9p', 'Isochromosome 9p']",[],[],[[]],[[]] +GARD:0000043,"['Mosaic trisomy 9', 'Mosaic trisomy chromosome 9', 'Trisomy 9 mosaicism']",[],[],[[]],[[]] +GARD:0000044,"['Haim-Munk syndrome', 'Keratosis palmoplantaris-periodontopathia-onychogryposis syndrome', 'Palmoplantar hyperkeratosis-periodontopathia-onychogryposis syndrome', 'Palmoplantar keratoderma-periodontopathia-onychogryposis syndrome']",[],[],[[]],[[]] +GARD:0000045,"['Congenital varicella syndrome', 'Antenatal varicella virus infection', 'Mother-to-child transmission of varicella syndrome']",[],[],[[]],[[]] +GARD:0000047,"['Crigler-Najjar syndrome type 1', 'Bilirubin uridinediphosphate glucuronosyltransferase deficiency type 1', 'Bilirubin-UGT deficiency type 1']",[],[],[[]],[[]] +GARD:0000048,"['Isolated cytochrome C oxidase deficiency', 'Isolated COX deficiency', 'Isolated mitochondrial respiratory chain complex IV deficiency']",[],[],[[]],[[]] +GARD:0000049,"['De Barsy syndrome', 'Cutis laxa-corneal clouding-intellectual disability syndrome', 'Progeroid syndrome; De Barsy type']",[],[],[[]],[[]] +GARD:0000054,['Duodenal atresia'],[],[],[[]],[[]] +GARD:0000059,"['Spinocerebellar ataxia type 34', 'Erythrokeratodermia with ataxia', 'Spinocerebellar ataxia and erythrokeratodermia']",['SCA34'],[5],[[]],[[]] +GARD:0000060,"['Iridocorneal endothelial syndrome', 'ICE 
syndrome']",[],[],"[['NCT04025801', 'NCT03270761']]",[[]] +GARD:0000061,"['Femoral-facial syndrome', 'Femoral hypoplasia-unusual facies syndromeilippi syndrome', 'Type 1 syndactyly-microcephaly-intellectual disability syndrome']",[],[],[[]],[[]] +GARD:0000064,"['Fountain syndrome', 'Deafness-skeletal dysplasia-coarse face with full lips syndrome', 'Deafness-skeletal dysplasia-lip granuloma syndrome', 'Hearing loss-skeletal dysplasia-coarse face with full lips syndrome', 'Hearing loss-skeletal dysplasia-lip granuloma syndrome']",[],[],[[]],[[]] +GARD:0000065,"['Galloway-Mowat syndrome', 'Galloway syndrome', 'Microcephaly-hiatus hernia-nephrotic syndrome', 'Nephrosis-neuronal dysmigration syndrome']",[],[],[[]],[[]] +GARD:0000066,"['Gorlin-Chaudhry-Moss syndrome', 'Craniofacial dysostosis-genital; dental; cardiac anomalies syndrome', 'Cranofacial dysostosis-hypertrichosis-hypoplasia of labia majora syndrome', 'Dental and eye anomalies-patent ductus arteriosus-normal intelligence syndrome', 'GCM syndrome']",[],[],[[]],[[]] +GARD:0000068,"['Hypoglossia-hypodactyly syndrome', 'Aglossia-adactylia syndrome', 'Hanhart syndrome', 'Jussieu syndrome']",[],[],[[]],[[]] +GARD:0000069,['Hantavirus pulmonary syndrome'],[],[],[[]],[[]] +GARD:0000070,"['Kasabach-Merritt syndrome', 'Hemangioma-thrombocytopenia syndrome']",[],[],[[]],[[]] +GARD:0000073,"['X-linked hyper-IgM syndrome', 'Hyper-IgM syndrome due to CD40 ligand deficiency', 'Hyper-IgM syndrome due to CD40L deficiency', 'Hyper-IgM syndrome type 1']","['HIGM1', 'XHIGM']","[5, 5]","[['NCT00006054', 'NCT00001145', 'NCT00004341']]",[['NCT01998633']] +GARD:0000076,"['Hypohidrotic ectodermal dysplasia', 'Anhidrotic ectodermal dysplasia']",['HED'],[3],"[['NCT01629927', 'NCT02099552', 'NCT01109290', 'NCT05378932', 'NCT01992289']]","[['NCT01629940', 'NCT01386775', 'NCT03339986', 'NCT04783519', 'NCT03996460', 'NCT03191682', 'NCT02932241', 'NCT04741412', 'NCT01246037', 'NCT05378932', 'NCT04579068', 'NCT05263271', 'NCT01398813', 
'NCT01871714', 'NCT06094933', 'NCT02899910', 'NCT04416711', 'NCT04089137', 'NCT02216669', 'NCT05802472', 'NCT01847300', 'NCT01851603', 'NCT02481206', 'NCT00586183', 'NCT01629927', 'NCT02184013', 'NCT02099552', 'NCT01293565', 'NCT05453539', 'NCT03769883', 'NCT04549454', 'NCT03337438', 'NCT01197352', 'NCT02669563', 'NCT01398397', 'NCT06147206', 'NCT04027426', 'NCT04027608', 'NCT04535193', 'NCT03344159', 'NCT01135888', 'NCT01108770', 'NCT02404480', 'NCT01109290', 'NCT00756366', 'NCT02372188', 'NCT02116140', 'NCT01992289', 'NCT05372042', 'NCT04774770', 'NCT06436586']]" +GARD:0000079,"['Metaphyseal chondrodysplasia, Jansen type']",[],[],[[]],[[]] +GARD:0000080,['Johanson-Blizzard syndrome'],['JBS'],[3],[[]],"[['NCT04671823', 'NCT01114399', 'NCT02902731']]" +GARD:0000081,"['Intellectual developmental disorder, x-linked, syndromic, turner type', 'mental retardation and macrocephaly syndrome', 'mental retardation; x-linked; with growth retardation; deafness; and microgenitalism', 'juberg-marsidi syndrome', 'mental retardation; x-linked; syndromic; brooks-wisniewski-brown type', 'Mental retardation; x-linked; syndromic; turner type', 'brooks-wisniewski-brown syndrome']",[],[],[[]],[[]] +GARD:0000082,"['KBG syndrome', 'Short stature-facial and skeletal anomalies-intellectual disability-macrodontia syndrome']",[],[],[['NCT06465641']],[[]] +GARD:0000083,['Autosomal dominant Kenny-Caffey syndrome'],[],[],[[]],[[]] +GARD:0000084,"['Lipodystrophy, congenital generalized, type 1', 'lipodystrophy; berardinelli-seip congenital; type 1', 'Berardinelli-seip congenital lipodystrophy; type 1', 'brunzell syndrome; agpat2-related']",[],[],[[]],[[]] +GARD:0000085,['Thanatophoric dysplasiahudley-McCullough syndrome'],[],[],[[]],[[]] +GARD:0000087,"['Microphthalmia, Lenz type', 'Lenz microphthalmia']",[],[],[[]],[[]] diff --git a/RDAS_CTKG_REMAKE/acronym_test_expansion_term.csv b/RDAS_CTKG_REMAKE/acronym_test_expansion_term.csv new file mode 100644 index 0000000..dddcad9 --- /dev/null +++ 
b/RDAS_CTKG_REMAKE/acronym_test_expansion_term.csv @@ -0,0 +1,51 @@ +GARD,ORIG_TERMS,ACRONYM DETECT,ACRONYM_LEN,FILTERED_TRIALS,ACRO_ONLY_TRIALS +GARD:0000001,"['GRACILE syndrome', 'Fellman disease', 'Growth restriction-aminoaciduria-cholestasis-iron overload-lactic acidosis-early death syndrome']",[],[],[[]],[[]] +GARD:0000003,['Ablepharon macrostomia syndrome'],[],[],[[]],[[]] +GARD:0000005,"['Abetalipoproteinemia', 'Bassen-Kornzweig disease', 'Homozygous familial hypobetalipoproteinemia']",[],[],"[['NCT02435940', 'NCT00004574', 'NCT05208879']]",[[]] +GARD:0000006,['Acromesomelic dysplasia'],[],[],[[]],[[]] +GARD:0000007,['Acromicric dysplasia'],[],[],[[]],[[]] +GARD:0000011,['Alternating hemiplegia of childhood'],['AHC'],[3],"[['NCT06153186', 'NCT06248645', 'NCT04513002', 'NCT02408354', 'NCT00682513', 'NCT04020848', 'NCT00931164', 'NCT04944927', 'NCT06007521', 'NCT03857607']]","[['NCT02977026', 'NCT05661318', 'NCT06007521', 'NCT04944927', 'NCT06128655', 'NCT06073301', 'NCT00931164', 'NCT06490757', 'NCT01289652', 'NCT06248645', 'NCT04020848', 'NCT03885401', 'NCT04701242', 'NCT03601793', 'NCT02409537', 'NCT03206775', 'NCT03857607', 'NCT06019988', 'NCT03674164', 'NCT03857555', 'NCT02408354', 'NCT00682513', 'NCT03331952', 'NCT03900689']]" +GARD:0000012,"['Hypersensitivity pneumonitis', 'Extrinsic allergic alveolitis']",[],[],"[['NCT05445817', 'NCT05727852', 'NCT02958917', 'NCT03800017', 'NCT04273867', 'NCT04402177', 'NCT02596347', 'NCT04844359', 'NCT03030807', 'NCT02705144', 'NCT05455437', 'NCT01624753', 'NCT05458635', 'NCT05365802', 'NCT05723796', 'NCT04896138', 'NCT04982809', 'NCT02523833', 'NCT03056404', 'NCT05635032', 'NCT02496182', 'NCT02883920', 'NCT04016181', 'NCT03747627', 'NCT04879082', 'NCT01487850', 'NCT03670576', 'NCT03836417', 'NCT06125288', 'NCT03873649', 'NCT04677426', 'NCT04675619', 'NCT05392881', 'NCT01687946', 'NCT06038630', 'NCT05450276', 'NCT05704218', 'NCT04961944', 'NCT05776537', 'NCT05626387', 'NCT05988437', 'NCT03300583', 'NCT06134947', 
'NCT04561479', 'NCT01961362', 'NCT05549635', 'NCT01237145', 'NCT02571582', 'NCT01935726', 'NCT05449431']]",[[]] +GARD:0000013,"['Aniridia-cerebellar ataxia-intellectual disability syndrome', 'Gillespie syndrome']",[],[],[['NCT05390801']],[[]] +GARD:0000016,"['Ocular motor apraxia, Cogan type', 'Oculomotor apraxia; Cogan type']",[],[],[[]],[[]] +GARD:0000017,['Arachnoid cyst'],[],[],"[['NCT04569201', 'NCT01391702', 'NCT04046523', 'NCT03656016', 'NCT03485612', 'NCT04249921']]",[[]] +GARD:0000019,"['Dihydropyrimidine dehydrogenase deficiency', 'Familial pyrimidinemia']",[],[],"[['NCT06245356', 'NCT04541381', 'NCT06092346']]",[[]] +GARD:0000022,"['Björnstad syndrome', 'Deafness-pili torti-hypogonadism syndrome', 'Hearing loss-pili torti-hypogonadism syndrome']",[],[],[[]],[[]] +GARD:0000023,['Blepharophimosis-ptosis-epicanthus inversus syndrome'],['BPES'],[4],[[]],[[]] +GARD:0000026,['Cat-eye syndromeat-scratch disease', 'Bartonellosis due to Bartonella henselae infection']",[],[],"[['NCT01469702', 'NCT03132116']]",[[]] +GARD:0000028,"['Catel-Manzke syndrome', 'Hyperphalangy-clinodactyly of index finger with Pierre Robin syndrome', 'Index finger anomaly-Pierre Robin syndrome', 'Micrognathia digital syndrome', 'Palatodigital syndrome; Catel-Manzke type', 'Pierre Robin sequence-hyperphalangy-clinodactyly syndrome', 'Pierre Robin syndrome-hyperphalangy-clinodactyly syndrome']",[],[],[[]],[[]] +GARD:0000029,"['CHARGE syndrome', 'CHARGE association', 'Coloboma-heart defects-atresia choanae-retardation of growth and development-genitourinary problems-ear abnormalities syndrome', 'Hall-Hittner syndrome']",[],[],"[['NCT04463316', 'NCT05764980', 'NCT06475651', 'NCT03186144', 'NCT01314534']]",[[]] +GARD:0000031,"['Serpiginous choroiditis', 'Geographic helicoid peripapillary choroidopathy']",[],[],"[['NCT00407121', 'NCT00645697']]",[[]] +GARD:0000035,"['Tetrasomy 18p', 'Isochromosome 18p']",[],[],[[]],[[]] +GARD:0000037,"['Partial deletion of the short arm of chromosome 3', 
'Partial deletion of chromosome 3p', 'Partial monosomy of chromosome 3p', 'Partial monosomy of the short arm of chromosome 3']",[],[],[[]],[[]] +GARD:0000039,['WT limb-blood syndrome'],[],[],[[]],[[]] +GARD:0000042,"['Tetrasomy 9p', 'Isochromosome 9p']",[],[],[[]],[[]] +GARD:0000043,"['Mosaic trisomy 9', 'Mosaic trisomy chromosome 9', 'Trisomy 9 mosaicism']",[],[],[[]],[[]] +GARD:0000044,"['Haim-Munk syndrome', 'Keratosis palmoplantaris-periodontopathia-onychogryposis syndrome', 'Palmoplantar hyperkeratosis-periodontopathia-onychogryposis syndrome', 'Palmoplantar keratoderma-periodontopathia-onychogryposis syndrome']",[],[],[[]],[[]] +GARD:0000045,"['Congenital varicella syndrome', 'Antenatal varicella virus infection', 'Mother-to-child transmission of varicella syndrome']",[],[],[['NCT05923970']],[[]] +GARD:0000047,"['Crigler-Najjar syndrome type 1', 'Bilirubin uridinediphosphate glucuronosyltransferase deficiency type 1', 'Bilirubin-UGT deficiency type 1']",[],[],[[]],[[]] +GARD:0000048,"['Isolated cytochrome C oxidase deficiency', 'Isolated COX deficiency', 'Isolated mitochondrial respiratory chain complex IV deficiency']",[],[],[[]],[[]] +GARD:0000049,"['De Barsy syndrome', 'Cutis laxa-corneal clouding-intellectual disability syndrome', 'Progeroid syndrome; De Barsy type']",[],[],[[]],[[]] +GARD:0000054,['Duodenal atresia'],[],[],"[['NCT03056261', 'NCT04114279', 'NCT06115226', 'NCT06394453', 'NCT03256669']]",[[]] +GARD:0000059,"['Spinocerebellar ataxia type 34', 'Erythrokeratodermia with ataxia', 'Spinocerebellar ataxia and erythrokeratodermia']",['SCA34'],[5],[['NCT01793168']],[[]] +GARD:0000060,"['Iridocorneal endothelial syndrome', 'ICE syndrome']",[],[],"[['NCT04025801', 'NCT03270761', 'NCT02020044', 'NCT00800111', 'NCT00001161']]",[[]] +GARD:0000061,"['Femoral-facial syndrome', 'Femoral hypoplasia-unusual facies syndromeilippi syndrome', 'Type 1 syndactyly-microcephaly-intellectual disability syndrome']",[],[],[[]],[[]] +GARD:0000064,"['Fountain syndrome', 
'Deafness-skeletal dysplasia-coarse face with full lips syndrome', 'Deafness-skeletal dysplasia-lip granuloma syndrome', 'Hearing loss-skeletal dysplasia-coarse face with full lips syndrome', 'Hearing loss-skeletal dysplasia-lip granuloma syndrome']",[],[],[[]],[[]] +GARD:0000065,"['Galloway-Mowat syndrome', 'Galloway syndrome', 'Microcephaly-hiatus hernia-nephrotic syndrome', 'Nephrosis-neuronal dysmigration syndrome']",[],[],[[]],[[]] +GARD:0000066,"['Gorlin-Chaudhry-Moss syndrome', 'Craniofacial dysostosis-genital; dental; cardiac anomalies syndrome', 'Cranofacial dysostosis-hypertrichosis-hypoplasia of labia majora syndrome', 'Dental and eye anomalies-patent ductus arteriosus-normal intelligence syndrome', 'GCM syndrome']",[],[],[[]],[[]] +GARD:0000068,"['Hypoglossia-hypodactyly syndrome', 'Aglossia-adactylia syndrome', 'Hanhart syndrome', 'Jussieu syndrome']",[],[],[[]],[[]] +GARD:0000069,['Hantavirus pulmonary syndrome'],[],[],"[['NCT00533767', 'NCT00128180', 'NCT04323904', 'NCT05415904', 'NCT00001123', 'NCT03682107']]",[[]] +GARD:0000070,"['Kasabach-Merritt syndrome', 'Hemangioma-thrombocytopenia syndrome']",[],[],"[['NCT04448873', 'NCT02110069', 'NCT04077515', 'NCT03188068', 'NCT04409691', 'NCT04056962', 'NCT05351216', 'NCT05324384', 'NCT04775173']]",[[]] +GARD:0000073,"['X-linked hyper-IgM syndrome', 'Hyper-IgM syndrome due to CD40 ligand deficiency', 'Hyper-IgM syndrome due to CD40L deficiency', 'Hyper-IgM syndrome type 1']","['HIGM1', 'XHIGM']","[5, 5]","[['NCT00006319', 'NCT00001145', 'NCT00006054', 'NCT00266513', 'NCT00004341']]",[['NCT01998633']] +GARD:0000076,"['Hypohidrotic ectodermal dysplasia', 'Anhidrotic ectodermal dysplasia']",['HED'],[3],"[['NCT01992289', 'NCT01108770', 'NCT01564225', 'NCT04980638', 'NCT03912792', 'NCT04741412', 'NCT01293565', 'NCT01398813', 'NCT01629940', 'NCT05378932', 'NCT01109290', 'NCT02099552', 'NCT01342133', 'NCT01775462', 'NCT01135888', 'NCT01398397', 'NCT01871714', 'NCT01386775', 'NCT01629927', 
'NCT01308333']]","[['NCT02899910', 'NCT03076489', 'NCT01992289', 'NCT01108770', 'NCT00756366', 'NCT03339986', 'NCT04027608', 'NCT02669563', 'NCT00586183', 'NCT04741412', 'NCT01293565', 'NCT03996460', 'NCT01851603', 'NCT01398813', 'NCT01847300', 'NCT04416711', 'NCT03769883', 'NCT04549454', 'NCT02404480', 'NCT01197352', 'NCT04535193', 'NCT01629940', 'NCT05802472', 'NCT04089137', 'NCT02216669', 'NCT01109290', 'NCT02184013', 'NCT02099552', 'NCT05378932', 'NCT02932241', 'NCT03344159', 'NCT05453539', 'NCT01135888', 'NCT05263271', 'NCT02116140', 'NCT01398397', 'NCT03337438', 'NCT01871714', 'NCT02481206', 'NCT06147206', 'NCT01386775', 'NCT04783519', 'NCT02372188', 'NCT01629927', 'NCT06094933', 'NCT04579068', 'NCT06436586', 'NCT04027426', 'NCT05372042', 'NCT04774770', 'NCT03191682', 'NCT01246037']]" +GARD:0000079,"['Metaphyseal chondrodysplasia, Jansen type']",[],[],[[]],[[]] +GARD:0000080,['Johanson-Blizzard syndrome'],['JBS'],[3],[[]],"[['NCT04671823', 'NCT01114399', 'NCT02902731']]" +GARD:0000081,"['Intellectual developmental disorder, x-linked, syndromic, turner type', 'mental retardation and macrocephaly syndrome', 'mental retardation; x-linked; with growth retardation; deafness; and microgenitalism', 'juberg-marsidi syndrome', 'mental retardation; x-linked; syndromic; brooks-wisniewski-brown type', 'Mental retardation; x-linked; syndromic; turner type', 'brooks-wisniewski-brown syndrome']",[],[],[[]],[[]] +GARD:0000082,"['KBG syndrome', 'Short stature-facial and skeletal anomalies-intellectual disability-macrodontia syndrome']",[],[],[['NCT06465641']],[[]] +GARD:0000083,['Autosomal dominant Kenny-Caffey syndrome'],[],[],[[]],[[]] +GARD:0000084,"['Lipodystrophy, congenital generalized, type 1', 'lipodystrophy; berardinelli-seip congenital; type 1', 'Berardinelli-seip congenital lipodystrophy; type 1', 'brunzell syndrome; agpat2-related']",[],[],[[]],[[]] +GARD:0000085,['Thanatophoric dysplasiahudley-McCullough syndrome'],[],[],[[]],[[]] +GARD:0000087,"['Microphthalmia, 
Lenz type', 'Lenz microphthalmia']",[],[],[[]],[[]] diff --git a/RDAS_CTKG_REMAKE/expansion_compare.py b/RDAS_CTKG_REMAKE/expansion_compare.py new file mode 100644 index 0000000..79a82f3 --- /dev/null +++ b/RDAS_CTKG_REMAKE/expansion_compare.py @@ -0,0 +1,51 @@ +import pandas as pd +import ast + +term_df = pd.read_csv('/home/leadmandj/RDAS/RDAS_CTKG_REMAKE/acronym_test_expansion_term.csv', index_col=False) +concept_df = pd.read_csv('/home/leadmandj/RDAS/RDAS_CTKG_REMAKE/acronym_test_expansion_concept.csv', index_col=False) +none_df = pd.read_csv('/home/leadmandj/RDAS/RDAS_CTKG_REMAKE/acronym_test_expansion_none.csv', index_col=False) + +def compare (df1,df2): + r,c = df1.shape + for idx in range(r): + greaterVal = None + compare_lst = None + row1 = df1.iloc[idx] + row2 = df2.iloc[idx] + + gard1 = row1['GARD'] + gard2 = row2['GARD'] + + terms1 = row1['ORIG_TERMS'] + terms2 = row2['ORIG_TERMS'] + + trials1 = ast.literal_eval(row1['FILTERED_TRIALS'])[0] + trials2 = ast.literal_eval(row2['FILTERED_TRIALS'])[0] + + if len(trials1) > len(trials2): + greaterVal = 'term' + compare_lst = list(set(trials1) - set(trials2)) + elif len(trials2) > len(trials1): + greaterVal = 'concept' + compare_lst = list(set(trials2) - set(trials1)) + else: + greaterVal = 'same size' + compare_lst = list(set(trials2) - set(trials1)) + + print(gard1, idx) + print(terms1) + print(compare_lst) + print(greaterVal) + + df2.at[idx,'isLarger'] = greaterVal + df2.at[idx,'NEW_TRIALS'] = str(compare_lst) + + print('|||||||||||||||||') + + print('----------------') + df2.to_csv('/home/leadmandj/RDAS/RDAS_CTKG_REMAKE/acronym_test_expansion_concept_new_trials.csv', index=False) + + +compare(term_df, concept_df) +print('----------------------------') +#compare(term_df, none_df) \ No newline at end of file diff --git a/RDAS_GARD/methods.py b/RDAS_GARD/methods.py index 3fe5280..c80dce5 100755 --- a/RDAS_GARD/methods.py +++ b/RDAS_GARD/methods.py @@ -1,13 +1,14 @@ import os +import sys +workspace = 
os.path.dirname(os.path.abspath(__file__)) +sys.path.append(workspace) +sys.path.append('/home/leadmandj/RDAS') import json -from skr_web_api import Submission, METAMAP_INTERACTIVE_URL +#from skr_web_api import Submission, METAMAP_INTERACTIVE_URL from unidecode import unidecode from AlertCypher import AlertCypher import re import requests -import sys -workspace = os.path.dirname(os.path.abspath(__file__)) -sys.path.append(workspace) import sysvars from datetime import datetime, date from http import client @@ -118,7 +119,7 @@ def create_disease_node(db, data, xrefs): # Include xrefs into GARD node instead "syns":data[6], "orpha":results['Orphanet'] if 'Orphanet' in results else None, "icd10":results['ICD-10'] if 'ICD-10' in results else None, - "umls":results['UMLS'] if 'UMLS' in results else None, + "umls":list(set(results['UMLS'])) if 'UMLS' in results else None, "omim":results['OMIM'] if 'OMIM' in results else None, "snomed":results['SNOMED-CT'] if 'SNOMED-CT' in results else None, "diseaseontology":results['DiseaseOntology'] if 'DiseaseOntology' in results else None, @@ -315,7 +316,7 @@ def get_remaining_umls(db, umls_update=True): INSTANCE.form['SingLinePMID'] = True print('GATHERING GARD UMLS DATA') - db.run('MATCH (x:GARD) WHERE x.UMLS IS NOT NULL SET x.UMLS_Source = "DATALAKE"') + db.run('MATCH (x:GARD) WHERE x.UMLS IS NOT NULL SET x.UMLS_Source = "GARD"') res = db.run('MATCH (x:GARD) WHERE x.UMLS IS NULL SET x.UMLS_Source = "METAMAP" RETURN x.GardId AS gard_id, x.GardName as gard_name').data() gard_strs = [f"{i['gard_id'].replace('GARD:','')}|{normalize(i['gard_name'])}\n" for i in res if i['gard_name']] diff --git a/RDAS_GARD/run_node_counts.py b/RDAS_GARD/run_node_counts.py new file mode 100644 index 0000000..29e99bb --- /dev/null +++ b/RDAS_GARD/run_node_counts.py @@ -0,0 +1,2 @@ +import methods as rdas +rdas.get_node_counts() \ No newline at end of file diff --git a/RDAS_GFKG/methods.py b/RDAS_GFKG/methods.py index 21063f2..472774d 100755 --- 
a/RDAS_GFKG/methods.py +++ b/RDAS_GFKG/methods.py @@ -764,7 +764,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_): else: return normalize_combined_dictionary(title_,title_,name,{},{},{},1,1,'title') if Public_health_relevance_statement and isinstance(Public_health_relevance_statement, str): - A, B, C,D = check_sen(Public_health_relevance_statement, nlp) + A, B, C,D = check_sen(Public_health_relevance_statement) name1 = get_gard_abstract_stem_exact(A) name2 = get_gard_abstract_stem_exact(B) name3 = get_gard_abstract_stem_exact(C) @@ -773,7 +773,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_): if name and (name !={}): return name if abstract_ and isinstance(abstract_, str): - A, B, C , D = check_sen(abstract_, nlp) + A, B, C , D = check_sen(abstract_) name1 = get_gard_abstract_stem_exact(A) name2 = get_gard_abstract_stem_exact(B) name3 = get_gard_abstract_stem_exact(C) @@ -781,7 +781,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_): name=normalize_combined_dictionary(abstract_,title_,name1,name2,name3,name4,0,0.7,'abstract') if name and (name !={}): return name -def GardNameExtractor(project_title,phr_text,abstract_text, nlp): +def GardNameExtractor(project_title,phr_text,abstract_text): #Abstract1['Gard_name']=Abstract1.apply(lambda x: gard_id(x['project_title'],x['phr_text'],x['abstract_text']), axis=1) gard_ids = grad_id(project_title,phr_text,abstract_text) if gard_ids: diff --git a/RDAS_GFKG/prep_neo4j_data.py b/RDAS_GFKG/prep_neo4j_data.py index 025f13d..ea3ab27 100755 --- a/RDAS_GFKG/prep_neo4j_data.py +++ b/RDAS_GFKG/prep_neo4j_data.py @@ -74,9 +74,10 @@ def aggregate_disease_data(): # Rename GARD-Project mapping results columns to match the names listed in the GARD data normmap_df = pd.read_csv(data_neo4j('normmap_results.csv'),index_col=False,usecols=['ID','GARD_id','CONF_SCORE','SEM_SIM']) normmap_df = normmap_df.rename(columns={'ID':'APPLICATION_ID'}) - + # Split tuple normmap result into 
2 seperate columns normmap_df[['GARD_NAME', 'GARD_ID']] = normmap_df['GARD_id'].str.extract(r'\(\'(.*?)\', \'(.*?)\'\)') + # drop the original column normmap_df.drop('GARD_id', axis=1, inplace=True) @@ -127,6 +128,11 @@ def batch_normmap(df, thr, year): gard_ids = rdas.GardNameExtractor(title, phr, abstract) if gard_ids: for gard,add_data in gard_ids.items(): + # Cleans up instances where gard name has an apostrophe in the name and has 2 sets of double quotes around it rather than one set of single quotes + pat = re.sub(r"\"\"|(\")(\'')",'\'', gard).replace('\'', '').replace('(', '').replace(')', '') + quoted_text = "'{}', '{}'".format(pat.split(",")[0].strip(), pat.split(",")[1].strip()) + gard = "({})".format(quoted_text) + if add_data == 1: add_data = [1,1] with lock: @@ -152,25 +158,32 @@ def run_normmap(): print(abs_file, ' -merged- ',prj_file) tmp = pd.read_csv(('{filename}'.format(filename=abs_file)),index_col=False, encoding = "ISO-8859-1") tmp2 = pd.read_csv(('{filename}'.format(filename=prj_file)),index_col=False, usecols=['APPLICATION_ID','PHR', 'PROJECT_TITLE'], encoding = "ISO-8859-1", low_memory=False) - merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID']) + merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'], how='outer', indicator='EXISTS_IN_ABSTRACT_FILE') + #merged_df.fillna('', inplace=True) merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int) merged_df.to_csv(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv'), index=False) - + norm_files = glob.glob(data_raw('normmap') + '/*.csv') norm_files = sorted(norm_files) for norm_file in norm_files: year = re.findall(r'\d+', norm_file)[0] - if os.path.exists(data_neo4j(f'normmap/normmap_results_{year}.csv')): + + if os.path.exists(data_neo4j(f'normmap/normmap_results_{year}.csv')): #COMMENTED OUT FOR TESTING print(f'{year} Gard-Project mapping file already exists... 
bypassing') continue # Create CSV files headers - with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "w") as f: + with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "w") as f: #COMMENTED OUT FOR TESTING f.writelines(['ID|GARD_id|CONF_SCORE|SEM_SIM\n']) df = pd.read_csv(norm_file, index_col=False, low_memory=False) - chunk_size = int(len(df)/5) thread_list = list() + + #df = df[df['EXISTS_IN_ABSTRACT_FILE']=='right_only'] #TEST + #df = df[['APPLICATION_ID', 'ABSTRACT_TEXT', 'PHR', 'PROJECT_TITLE']] #TEST + + chunk_size = int(len(df)/5) + list_df = [df[i:i+chunk_size] for i in range(0,len(df),chunk_size)] # Create threads to process results @@ -185,10 +198,14 @@ def run_normmap(): combine_normmap_results() print('GARD to Project connections made') + + def get_RD_project_ids(): # Get GARD to Project mappings - run_normmap() + #run_normmap() aggregate_disease_data() + exit() #TEST + apps = pd.read_csv(data_neo4j("normmap_results.csv"), usecols=["ID"]) # Drop duplicate results and sort by Application ID @@ -275,7 +292,8 @@ def find_RD_apps(input_file, rd_ids): print('Finished ', output_file) def clean_pi (pi_info): - pi_info = pi_info[:len(pi_info)-1] + pi_info = pi_info.replace(";","") + if pi_info == "\", \"": pi_info = "\"\"" return pi_info def cleanup_project_IC_NAME_totalcost(): @@ -307,7 +325,6 @@ def cleanup_project_IC_NAME_totalcost(): # Results are listed as a string seperated by semi-colons, this removes the last semi-colon in the string because it causes issues when converting to a list app['PI_IDS'] = app['PI_IDS'].astype(str) app['PI_NAMEs'] = app['PI_NAMEs'].astype(str) - app['PI_IDS'] = app['PI_IDS'].apply(clean_pi) app['PI_NAMEs'] = app['PI_NAMEs'].apply(clean_pi) @@ -608,13 +625,14 @@ def annotate_grant_abstracts(): # Annotate text with four scispaCy models - for model in MODELS: + for model in MODELS[2:]: print(f'*** Annotate with {model} model ***') nlp = load_model(model) for file in input_files: year = file[-8:-4] - + if 
int(year) < 2006 and model == 'en_ner_bc5cdr_md': + continue try: text = pd.read_csv(file, encoding=ENCODING, dtype={'APPLICATION_ID':int, 'ABSTRACT_TEXT':str}) @@ -770,7 +788,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd: ############################################## # Run preprocessing stages one after another.# ############################################## - """ + print('Running get_disease_data') get_disease_data() print("Running get_RD_project_ids") @@ -779,10 +797,8 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd: merge_project_funding() print("Running select_RD_projects") select_RD_projects() - print("Running cleanup_project_IC_NAME_totalcost") cleanup_project_IC_NAME_totalcost() - print("Running find_RD_core_projects") find_RD_core_projects() print("Running select_RD_patents") @@ -807,6 +823,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd: and v in [pygit2.GIT_STATUS_WT_MODIFIED, pygit2.GIT_STATUS_WT_NEW]} ''' + print("Running annotation_preprocess_grant") annotation_preprocess_grant() @@ -818,6 +835,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd: clean_annotation_source() print("Running map_semantic_types") map_semantic_types() + print("Running fix_escaped_endings") fix_escaped_endings() @@ -831,7 +849,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd: p = Popen(['scp', '-r', '-i', f'~/.ssh/id_rsa', f'{sysvars.gnt_files_path}/processed/', f'{sysvars.current_user}@{target_url}:{sysvars.gnt_files_path}'], encoding='utf8') p.wait() print('Transfer done...') - """ + # Gets the names of every processed file added for the rest of the code to add to the neo4j fta = {} for subdir in FilesToAdd.__dict__['__annotations__'].keys(): diff --git a/RDAS_GFKG/steps.py b/RDAS_GFKG/steps.py index 7f5e824..837d4db 100755 --- a/RDAS_GFKG/steps.py +++ b/RDAS_GFKG/steps.py @@ -310,13 +310,10 @@ """, "query": """ - WITH split(data.PI_IDS, ';') 
as ids, - split(data.PI_NAMEs, ';') as names, data - UNWIND [x in range(0, coalesce(size(ids) - 1, -1)) | - [trim(split(ids[x], '(')[0]), trim(split(names[x], '(')[0])] - ] as pi_data + WITH [data.PI_IDS] as ids, [data.PI_NAMEs] as names, data + UNWIND [x in range(0, coalesce(size(ids) - 1, -1)) | [trim(split(ids[x], '(')[0]), trim(split(names[x], '(')[0])]] as pi_data MERGE (p:PrincipalInvestigator { - pi_id: pi_data[0], + pi_id: coalesce(pi_data[0], ""), pi_name: coalesce(pi_data[1], ""), org_state: coalesce(data.ORG_STATE, ""), org_name: coalesce(data.ORG_NAME, "")}) diff --git a/RDAS_GFKG/update_grant.py b/RDAS_GFKG/update_grant.py index b85bf96..64f2ed3 100755 --- a/RDAS_GFKG/update_grant.py +++ b/RDAS_GFKG/update_grant.py @@ -58,6 +58,6 @@ def main(db: AlertCypher, restart_raw=False, restart_processed=False): fta = prep_data(f"{sysvars.gnt_files_path}raw", f"{sysvars.gnt_files_path}processed") # run database upgrade steps on only new/modified files - for step in steps[10:]: + for step in steps: print("\n\n" + step["description"] + "...") - step_to_fn(**step)(db, fta) + step_to_fn(**step)(db, fta) \ No newline at end of file diff --git a/RDAS_PAKG/NaturalHistory4GARD b/RDAS_PAKG/NaturalHistory4GARD index 1fef421..bdca102 160000 --- a/RDAS_PAKG/NaturalHistory4GARD +++ b/RDAS_PAKG/NaturalHistory4GARD @@ -1 +1 @@ -Subproject commit 1fef421c7c86fa6f6c8bc5db581e65a5613290dd +Subproject commit bdca1024c249bf44ae4fd247a7b605bb7b5780ce diff --git a/RDAS_PAKG/epi4GARD b/RDAS_PAKG/epi4GARD index 5d6e097..a9cada5 160000 --- a/RDAS_PAKG/epi4GARD +++ b/RDAS_PAKG/epi4GARD @@ -1 +1 @@ -Subproject commit 5d6e0975cd08aea9328c845514f36638d80e510f +Subproject commit a9cada51493284982dbd5bffdfd109e1e57b63eb diff --git a/RDAS_PAKG/methods.py b/RDAS_PAKG/methods.py index a4b1b05..fbd3292 100644 --- a/RDAS_PAKG/methods.py +++ b/RDAS_PAKG/methods.py @@ -668,7 +668,7 @@ def fetch_abstracts(pubmedIDs): -def fetch_pubtator_annotations(pubmedId): +def 
fetch_pubtator_annotations(pubmedIDs,retry=0): """ Fetch annotations from PubTator for a given PubMed ID. @@ -685,29 +685,44 @@ def fetch_pubtator_annotations(pubmedId): >> print(annotations) {'documents': [{'infons': {}, 'passages': [...], 'annotations': [...], ...}]} """ + # Splits pubmedIDs into batches of < 100 due to API limit + batches = [pubmedIDs[i * 99:(i + 1) * 99] for i in range((len(pubmedIDs) + 99 - 1) // 99 )] + + for batch_num, batch in enumerate(batches): + try: + print('BATCH NUM::', str(batch_num)) - try: + str_batch = ",".join(batch) # Construct the PubTator API URL for the given PubMed ID - pubtatorUrl = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids=" + pubmedId - - # Make a GET request to fetch PubTator annotations - r = requests.get(pubtatorUrl) - - # Check if the response is sucessful and not empty - if (not r or r is None or r ==''): - logging.error(f'Can not find PubTator for: {pubmedId}') - return None - else: - return r.json() + pubtatorUrl = "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids=" + str_batch + + # Make a GET request to fetch PubTator annotations + r = requests.get(pubtatorUrl) + time.sleep(0.34) #limits to 3 queries a second aka API limit + + # Check if the response is sucessful and not empty + if (not r or r is None or r ==''): + print(f'fetch_pubtator_annotations: api response empty or not successful') + retry += 1 + print('RETRY QUERY:', retry) + if retry < 6: + time.sleep(1) #wait 1 second + fetch_pubtator_annotations(pubmedIDs,retry=retry) + else: + yield None + + else: + yield r.json() - except TimeoutError as e: - #Retry after a short delay if a timeout error occurs - time.sleep(1) - fetch_pubtator_annotations(pubmedId) + except TimeoutError as e: + #Retry after a short delay if a timeout error occurs + print(e) + continue - except ValueError as e: - # Return None if theres an issue parsing the response as JSON - return None + except 
ValueError as e: + # Return None if theres an issue parsing the response as JSON + print(e) + continue @@ -1049,13 +1064,16 @@ def create_keywords(tx, abstractDataRel, article_node): MERGE (k:Keyword {keyword:$keyword}) MERGE (k)- [r:KEYWORD_FOR] -> (a) ''' - - for keyword in abstractDataRel: - if keyword: - tx.run(create_keyword_query, args={ - "article_id":article_node, - "keyword": keyword - }) + # Some articles have all the keywords in one field, therefore we must convert the text to a list if needed + for keyword_field in abstractDataRel: + if keyword_field: + #keyword_field_list = [x.strip() for x in keyword_field.split(', ')] + for keyword in keyword_field: + keyword = keyword.lower() + tx.run(create_keyword_query, args={ + "article_id":article_node, + "keyword": keyword + }) @@ -1349,13 +1367,13 @@ def create_chemicals(tx, abstractDataRel, article_node): create_chemicals_query = ''' MATCH (a:Article) WHERE id(a) = $article_id - MERGE (u:Substance {name:$name, registryNumber:$registryNumber}) - [r:SUBSTANCE_ANNOTATED_BY_PUBMED] -> (a) + MERGE (u:Substance {name:$name, registryNumber:$registryNumber}) MERGE (u)-[r:SUBSTANCE_ANNOTATED_BY_PUBMED]->(a) ''' for chemical in abstractDataRel: tx.run(create_chemicals_query, args={ "article_id":article_node, - "name": chemical['name'] if 'name' in chemical else '', + "name": chemical['name'].lower() if 'name' in chemical else '', "registryNumber": chemical['registryNumber'] if 'registryNumber' in chemical else '', }) @@ -1384,21 +1402,6 @@ def create_annotations(tx, pubtatorData, article_node, today): """ if pubtatorData: - create_annotations_query = ''' - MATCH(a:Article) WHERE id(a) = $article_id - MERGE (pa:PubtatorAnnotation { - text = $text - }) - ON MATCH - SET LastUpdatedRDAS = $rdasupdated - ON CREATE - SET infons_identifier:$infons_identifier - SET DateCreatedRDAS = $rdascreated - SET LastUpdatedRDAS = $rdasupdated - SET infons_type = $infons_type - MERGE (pa)- [r:ANNOTATION_FOR { type: $type }] -> (a) - ''' 
- for passage in pubtatorData['passages']: type = passage['infons']['type'] if 'type' in passage['infons'] else '' @@ -1420,10 +1423,72 @@ def create_annotations(tx, pubtatorData, article_node, today): temp = temp.split(",") except: pass - parameters['text'] = temp + parameters['text'] = [x.lower() for x in temp] #lowercases all elements in list + + # Check for other connected pubtator annotation relationships and identify the type sources ('title', 'abstract', or 'title and abstract') + # Ex. List of Values; if value is 'title and abstract' then it will be ['title','abstract'] + check = tx.run('MATCH (pa:PubtatorAnnotation {{ text: {text}, infons_type: \'{infons_type}\', infons_identifier: \'{infons_identifier}\' }})-[r:ANNOTATION_FOR]->(a:Article) WHERE ID(a) = {article_id} RETURN DISTINCT r.type as rel_type, ID(r) as rel_id' + .format(text=parameters['text'], + article_id=parameters['article_id'], + infons_type=parameters['infons_type'], + infons_identifier=parameters['infons_identifier'], + type=parameters['type'])).data() + + if len(check) > 0: + existing_type = check[0]['rel_type'] # is a list + incoming_type = parameters['type'] + existing_id = check[0]['rel_id'] + + if existing_type == ['Abstract'] and incoming_type == 'abstract': + continue + if existing_type == ['Title'] and incoming_type == 'title': + continue + if existing_type == ['Title', 'Abstract'] and incoming_type == 'title and abstract': + continue + if existing_type == ['Title', 'Abstract'] and incoming_type == 'title': + continue + if existing_type == ['Title', 'Abstract'] and incoming_type == 'abstract': + continue + + if existing_type == ['Title'] and incoming_type == 'abstract': + parameters['type'] = ['Title', 'Abstract'] + elif existing_type == ['Abstract'] and incoming_type == 'title': + parameters['type'] = ['Title', 'Abstract'] + + tx.run('MATCH ()-[r:ANNOTATION_FOR]->() WHERE ID(r) = {existing_id} SET r.type = {new_type}'.format(existing_id=existing_id, new_type=parameters['type'])) + 
continue + + else: + type_temp = parameters['type'] + if type_temp == 'title and abstract': + parameters['type'] = ['Title', 'Abstract'] + elif type_temp == 'title': + parameters['type'] = ['Title'] + elif type_temp == 'abstract': + parameters['type'] = ['Abstract'] + + + # Develop Neo4j Query to Populate Annotations (New Node Only) + create_annotations_query = ''' + MATCH (a:Article) WHERE id(a) = {article_id} + MERGE (pa:PubtatorAnnotation {{ text: {text}, infons_type: \'{infons_type}\', infons_identifier: \'{infons_identifier}\' }}) + ON CREATE + SET pa.infons_identifier = \'{infons_identifier}\' + SET pa.DateCreatedRDAS = \'{rdascreated}\' + SET pa.LastUpdatedRDAS = \'{rdasupdated}\' + SET pa.text = {text} + SET pa.infons_type = \'{infons_type}\' + MERGE (pa)-[r:ANNOTATION_FOR {{ type: {type} }} ]-> (a) + '''.format(text=parameters['text'], + article_id=parameters['article_id'], + infons_type=parameters['infons_type'], + rdasupdated=parameters['rdasupdated'], + rdascreated=parameters['rdascreated'], + infons_identifier=parameters['infons_identifier'], + type=parameters['type']) # Execute the Cypher query to create PubtatorAnnotation nodes and associate them with the Article node - txout = tx.run(create_annotations_query, args=parameters) + txout = tx.run(create_annotations_query) @@ -1877,37 +1942,59 @@ def gather_pubtator(db, today): Returns: None """ + in_progress = db.getConf('UPDATE_PROGRESS', 'pubmed_in_progress') + if in_progress == 'True': + current_step = db.getConf('UPDATE_PROGRESS', 'pubmed_pubtator_article_progress') + if not current_step == '': + current_step = int(current_step) + else: + current_step = 0 + else: + current_step = 0 # Retrieve articles without Pubtator annotations - res = db.run('MATCH (x:Article) WHERE NOT (x)--(:PubtatorAnnotation) AND x.pubmed_id IS NOT NULL AND x.hasPubtatorAnnotation IS NULL RETURN x.pubmed_id AS pmid, ID(x) AS id').data() + #res = db.run('MATCH (x:Article) WHERE NOT (x)--(:PubtatorAnnotation) AND x.pubmed_id IS 
NOT NULL AND x.hasPubtatorAnnotation IS NULL RETURN x.pubmed_id AS pmid, ID(x) AS id').data() + #print(len(res)) + + res = db.run('MATCH (x:Article) WHERE x.pubmed_id IS NOT NULL AND x.hasPubtatorAnnotation IS NULL RETURN x.pubmed_id AS pmid, ID(x) AS id').data() print(len(res)) # Set OMIM only articles to hasPubtatorAnnotation = False since they dont have pubmed_id's db.run('MATCH (x:Article) WHERE x.pubmed_id IS NULL AND x.hasPubtatorAnnotation IS NULL SET x.hasPubtatorAnnotation = FALSE') - + # Iterate over the articles and fetch Pubtator annotations - for idx,r in enumerate(res): - print(idx) + res = res[current_step:] + pmids = [r['pmid'] for r in res] + pmid_to_id = {r['pmid']:r['id'] for r in res} - pmid = r['pmid'] - ID = r['id'] - try: - # Fetch Pubtator annotations for the article - print(ID) - annos = fetch_pubtator_annotations(pmid) - - if annos: - # Create PubtatorAnnotation nodes in the database - create_annotations(db, annos, ID, today) - db.run(f'MATCH (a:Article) WHERE ID(a) = {ID} SET a.hasPubtatorAnnotation = TRUE') - print('annotations created') #TEST - else: - db.run(f'MATCH (a:Article) WHERE ID(a) = {ID} SET a.hasPubtatorAnnotation = FALSE') - print('annotation not created') #TEST + try: + # Fetch Pubtator annotations for the article + for batch in fetch_pubtator_annotations(pmids): + if not batch: + continue + + annos = batch['PubTator3'] + + for anno in annos: + cur_pmid = str(anno['pmid']) + article_id = pmid_to_id[cur_pmid] + + if anno: + # Create PubtatorAnnotation nodes in the database + print('ARTICLE_ID::', article_id, 'CURRENT_STEP::', current_step) + + create_annotations(db, anno, article_id, today) + db.run(f'MATCH (a:Article) WHERE ID(a) = {article_id} SET a.hasPubtatorAnnotation = TRUE') + else: + db.run(f'MATCH (a:Article) WHERE ID(a) = {article_id} SET a.hasPubtatorAnnotation = FALSE') + current_step += 1 + db.setConf('UPDATE_PROGRESS', 'pubmed_pubtator_article_progress', str(current_step)) + - except Exception as e: - 
logging.warning(f'\nException creating annotations for article {pmid}: {e}') + except Exception as e: + #logging.warning(f'\nException creating annotations for article {pmid}: {e}') + print('error in gather_pubtator') @@ -1956,7 +2043,7 @@ def gather_epi(db, today): def download_genereview_articles(): if not os.path.exists(f'{sysvars.base_path}pubmed/src/genereviews_pmid.txt'): - command = f'curl -L -X GET https://ftp.ncbi.nih.gov/pub/GeneReviews/GRtitle_shortname_NBKid.txt -o {sysvars.base_path}pubmed/src/genereviews_pmid.txt' + command = f'curl -L -X GET https://ftp.ncbi.nih.gov/pub/GeneReviews/GRtitle_shortname_NBKid.txt -o {sysvars.pm_files_path}genereviews_pmid.txt' os.system(command) @@ -1969,7 +2056,7 @@ def generate_missing_genereviews(response, review_list, df): missing = [int(i) for i in missing] df = df[df['PMID'].isin(missing)] - df.to_csv(f'{sysvars.base_path}pubmed/src/genereviews_pmid_missing.csv') + df.to_csv(f'{sysvars.pm_files_path}genereviews_pmid_missing.csv') @@ -1977,7 +2064,7 @@ def generate_missing_genereviews(response, review_list, df): def label_genereview(db): download_genereview_articles() - df = pd.read_csv(f'{sysvars.base_path}pubmed/src/genereviews_pmid.txt', encoding='ISO-8859-1', sep='\t') + df = pd.read_csv(f'{sysvars.pm_files_path}genereviews_pmid.txt', encoding='ISO-8859-1', sep='\t') review_list = df['PMID'].tolist() review_list = [str(i) for i in review_list] @@ -2075,7 +2162,11 @@ def retrieve_articles(db, last_update, updating_to, today): # End of the pipeline, resets the config in_progress values db.setConf('UPDATE_PROGRESS', 'pubmed_current_step', '') + db.setConf('UPDATE_PROGRESS', 'pubmed_disease_article_progress', '') + db.setConf('UPDATE_PROGRESS', 'pubmed_omim_article_progress', '') + db.setConf('UPDATE_PROGRESS', 'pubmed_pubtator_article_progress', '') db.setConf('UPDATE_PROGRESS', 'pubmed_in_progress', 'False') + else: print('Update in progress... 
bypassing save_gene') @@ -2144,6 +2235,7 @@ def update_missing_abstracts(db, today): response = db.run(query).data() length = len(response) + print(length) # Iterate over articles with missing abstracts for idx,res in enumerate(response): @@ -2158,6 +2250,7 @@ def update_missing_abstracts(db, today): # Fetch abstract from PubMed article = fetch_abstracts([pmid]) + time.sleep(0.34) try: article = article[0]['resultList']['result'][0] @@ -2175,7 +2268,7 @@ def update_missing_abstracts(db, today): abstractDataRel = {'abstractText': new_abstract,'title': title} create_epidemiology(db, abstractDataRel, article_node, today) - except IndexError as e: + except Exception as e: continue print(str(idx) + '/' + str(length)) diff --git a/RDAS_PAKG/update.py b/RDAS_PAKG/update.py index a0290bd..a7df451 100644 --- a/RDAS_PAKG/update.py +++ b/RDAS_PAKG/update.py @@ -25,5 +25,5 @@ def main (update_from=False, update_to=False): RDAS_PAKG.init.main(update_from=update_from, update_to=update_to) -#main() #TEST +main() #TEST #get_node_counts() diff --git a/config.ini b/config.ini index c440a1b..c8af13f 100644 --- a/config.ini +++ b/config.ini @@ -1,5 +1,5 @@ [DATABASE] -clinical_update = 03/20/24 +clinical_update = 01/01/75 pubmed_update = 03/20/24 grant_update = 03/20/24 gard_update = 03/20/24 @@ -8,20 +8,22 @@ pm_interval = 7 gnt_interval = 365 [UPDATE_PROGRESS] -clinical_in_progress = False +clinical_in_progress = True +clinical_disease_progress = 57 clinical_add_progress = clinical_update_progress = clinical_required_update_progress = clinical_rxnorm_progress = clinical_current_step = pubmed_in_progress = True -pubmed_disease_article_progress = 12003 -pubmed_omim_article_progress = 12003 -pubmed_current_step = save_epi +pubmed_disease_article_progress = +pubmed_omim_article_progress = +pubmed_pubtator_article_progress = +pubmed_current_step = grant_in_progress = False ct_update = 02/06/24 pm_update = 02/01/24 -rdas.ctkg_update = 01/01/24 +rdas.ctkg_update = 01/01/75 rdas.gfkg_update 
= 01/01/23 rdas.pakg_update = 01/01/24 rdas.gard_update = 01/01/24 diff --git a/sysvars.py b/sysvars.py index 71f07e3..3c4cdc1 100644 --- a/sysvars.py +++ b/sysvars.py @@ -29,11 +29,11 @@ # if you are not using minghui's test dataset, make db_prefix=""; now you only need to change the neo4j database names here: -db_prefix="test." -ct_db_name="rdas.ctkg" +db_prefix="" +ct_db_name="new.rdas.ctkg" gf_db_name='rdas.gfkg' pa_db_name="rdas.pakg" -gard_db_name='rdas.gard' +gard_db_name='test.rdas.gard' ct_db = db_prefix+ct_db_name pm_db = db_prefix+pa_db_name