Skip to content

Commit

Permalink
Devon_dev (#73)
Browse files Browse the repository at this point in the history
* Grant Pipeline-New GARD Mapping Process

* Change get_node_counts

* Removal of leftover merge text

* More merge text removal

* gfkg and pakg bug fixes

* New CTKG pipeline

---------

Co-authored-by: Devon Joseph Leadman <[email protected]>
  • Loading branch information
devonleadman and Devon Joseph Leadman authored Aug 28, 2024
1 parent e1da955 commit 146ecff
Show file tree
Hide file tree
Showing 23 changed files with 1,856 additions and 193 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,8 @@ RDAS_CTKG/eligibility_extraction/
RDAS_CTKG/metamap_cond_out.json
RDAS_CTKG/metamap_cond.txt
RDAS_GFKG/convert_csv_fields.py
fix_normmap_file_sep.py
project_check.py
project_check_missing.csv
project_check_new.csv
project_check_old.csv
135 changes: 58 additions & 77 deletions RDAS_CTKG/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,74 +91,46 @@ def webscrape_ctgov_diseases():



def call_get_nctids(query, pageToken=None, timeout=60):
    """Send one API request and return the decoded JSON body.

    Args:
        query (str): Fully built request URL.
        pageToken (str, optional): Continuation token. When truthy it is
            appended to the URL as ``&pageToken=<token>``.
        timeout (float, optional): Seconds to wait for the server before
            giving up. Without a timeout ``requests.get`` can block forever
            on a stalled connection.

    Returns:
        The parsed JSON response, or ``None`` when the request fails, times
        out, or the body cannot be decoded as JSON.
    """
    try:
        if pageToken:
            query += f'&pageToken={pageToken}'
        response = requests.get(query, timeout=timeout)
        response_txt = response.json()
    except Exception as e:
        # Deliberately best-effort: callers rely on None to signal failure.
        # Report what failed so a dropped page can be diagnosed afterwards.
        print('Unable to Process Query')
        print(f'{type(e).__name__}: {e} [{query}]')
        response_txt = None
    return response_txt

def get_nctids(name_list):
"""
Retrieves ClinicalTrials.gov Identifiers (NCTIDs) for a list of rare disease names.
Args:
name_list (list): List of rare disease names.
Returns:
list: List of ClinicalTrials.gov Identifiers (NCTIDs) associated with the provided rare disease names.
Example:
disease_names = ["Disease1", "Disease2", ...]
nct_ids = get_nctids(disease_names)
print(nct_ids)
# Output: ["NCT123", "NCT456", ...]
"""

# Initialize a list to store all retrieved NCTIDs
def get_nctids(names,lastupdate):
# Date format: 05/01/1975
all_trials = list()

# Iterate through each rare disease name
for name in name_list:
# Replace double quotes to prevent issues with the URL
for name in names:
trials = list()
name = name.replace('"','\"')

# Construct the initial API query to get the total number of trials
initial_query = 'https://clinicaltrials.gov/api/query/study_fields?expr=AREA[ConditionBrowseBranchAbbrev] Rare AND \"' + name + '\"&fields=NCTId&'
query_end1 = 'min_rnk=1&max_rnk=1000&fmt=csv'

try:
# Make the API request to get the total number of trials
response = requests.get(initial_query + query_end1).text.splitlines()
total_trials = int(response[4][16:-1])
except Exception as e:
# Retry in case of an error
print('ERROR in retrieving NCTIDS, retrying...')
print(response)
response = requests.get(initial_query + query_end1).text.splitlines()
total_trials = int(response[4][16:-1])

initial_query = f'https://clinicaltrials.gov/api/v2/studies?query.cond=(EXPANSION[Concept]{name} OR AREA[DetailedDescription]EXPANSION[Concept]{name} OR AREA[BriefSummary]EXPANSION[Concept]{name}) AND AREA[LastUpdatePostDate]RANGE[{lastupdate},MAX]&fields=NCTId&pageSize=1000&countTotal=true'
try:
# Add trials to a temporary list
trials = list()
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Break into extra queries of 1000 trials if necessary
for rank in range(1, total_trials//1000 + 1):
# Get next 1000 trials
query_end2 = 'min_rnk=' + str(rank*1000+1) + '&max_rnk=' + str((rank+1)*1000) + '&fmt=csv'
response = requests.get(initial_query + query_end2).text.splitlines()

# Add trials to the temporary list
for trial in response[11:]:
trials.append(trial.split(',')[1][1:-1])

# Add the trials from the temporary list to the overall list
all_trials += trials

pageToken = None
while True:
response_txt = call_get_nctids(initial_query, pageToken=pageToken)
if response_txt:
trials_list = response_txt['studies']

for trial in trials_list:
nctid = trial['protocolSection']['identificationModule']['nctId']
trials.append(nctid)
all_trials += trials
if not 'nextPageToken' in response_txt:
break
else:
pageToken = response_txt['nextPageToken']
else:
break

except Exception as e:
print(e)
print(initial_query + query_end2)
print(trial)

# Return the list of all retrieved NCTIDs
return all_trials

return [list(set(all_trials))]



Expand Down Expand Up @@ -247,17 +219,14 @@ def extract_fields(nctid):
print(trial_fields)
# Output: {"field1": "value1", "field2": {"nested_field": "nested_value"}}
"""

# Construct the API query to retrieve full study information
full_trial_query = 'https://clinicaltrials.gov/api/query/full_studies?expr=' + nctid + '&min_rnk=1&max_rnk=1&fmt=json'
sleep(0.5)

try:
# Make the API request and parse the JSON response
full_trial_response = requests.get(full_trial_query).json()

query = f'https://clinicaltrials.gov/api/v2/studies/{nctid}'
response = requests.get(query)
response_txt = response.json()
sleep(0.34)
# Use the parse_trial_fields function to flatten the nested structure
full_trial = parse_trial_fields(full_trial_response)
full_trial = parse_trial_fields(response_txt)
except ValueError:
# Return None if there is an issue with the JSON response
return None
Expand All @@ -279,6 +248,17 @@ def get_lastupdated_postdate (ID):
# Return None if there is an issue with the JSON response
return None

def check_neo4j_trial_updates(db, nctids):
    """Partition NCT IDs by whether a matching ClinicalTrial node is found.

    One lookup query is issued per ID.

    Args:
        db: Database handle exposing ``run(query)``; the returned object
            must provide ``.data()`` yielding the matched rows.
        nctids: Iterable of NCT ID strings to look up.

    Returns:
        list: A two-element list ``[found, not_found]``. ``found`` holds the
        IDs whose lookup returned at least one row and ``not_found`` holds
        the rest; input order is preserved in both.
    """
    # NOTE(review): IDs that already exist in the graph are returned FIRST.
    # Confirm callers expect (existing, missing) and not the reverse.
    found, not_found = [], []

    for nctid in nctids:
        rows = db.run(f'MATCH (x:ClinicalTrial) WHERE x.NCTId = \"{nctid}\" RETURN x.NCTId').data()
        bucket = found if len(rows) > 0 else not_found
        bucket.append(nctid)

    return [found, not_found]


def cypher_generate(db,now,NCTID,data,node_type,update=None,return_single=None):
Expand Down Expand Up @@ -675,7 +655,7 @@ def mask_name(nlp, name):



def is_acronym(word):
def is_acronym(words):
"""
Checks if a word is an acronym.
Expand All @@ -691,13 +671,14 @@ def is_acronym(word):
"""

# Check if the word contains spaces
if len(word.split(' ')) > 1:
return False
# Check if the word follows the pattern of an acronym
elif bool(re.match(r'\w*[A-Z]\w*', word[:len(word)-1])) and (word[len(word)-1].isupper() or word[len(word)-1].isnumeric()):
return True
else:
return False
if len(words.split()) > 1: return False

for word in words.split():
# Check if the word follows the pattern of an acronym
if bool(re.match(r'\w*[A-Z]\w*', word[:len(word)-1])) and (word[len(word)-1].isupper() or word[len(word)-1].isnumeric()): # aGG2
print('ACRONYM REMOVED::', words)
return True
return False



Expand Down Expand Up @@ -1237,7 +1218,7 @@ def rxnorm_map(db, rxnorm_progress):
matcher.add('DRUG',[pattern])

# Retrieve drug interventions from the database that do NOT already have a Drug node attached
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "Drug" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data()
results = db.run('MATCH (x:Intervention) WHERE x.InterventionType = "DRUG" AND NOT EXISTS((x)--(:Drug)) RETURN x.InterventionName, ID(x)').data()
length = len(results)

# Iterate over drug interventions and map RxNorm data
Expand Down
120 changes: 119 additions & 1 deletion RDAS_CTKG/src/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
##################################################################
#new

node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference']
"""node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference']
abbreviations = {
'ClinicalTrial': 'ct',
Expand Down Expand Up @@ -181,4 +181,122 @@
# Propeties that are in list form
fields_as_properties = {
'ClinicalTrial': ['Phase']
}"""

# Names of the node types in the clinical-trial graph model.
# Every name except 'ClinicalTrial' also appears as a key in `relationships`
# and `rel_directions` below.
node_names = ['ClinicalTrial', 'IndividualPatientData', 'Organization', 'Investigator', 'Condition', 'StudyDesign', 'PrimaryOutcome', 'Participant', 'ExpandedAccess', 'Intervention', 'Location', 'PatientRegistry', 'Reference']

# Short alias for each node type.
# NOTE(review): presumably used as the variable name for that node in
# generated Cypher -- confirm against the query builder.
abbreviations = {
'ClinicalTrial': 'ct',
'IndividualPatientData': 'ind',
'Organization': 'org',
'PrimaryOutcome': 'pout',
'Investigator': 'inv',
#'Sponsor': 'spo',
#'Collaborator': 'col',
'Condition': 'con',
'StudyDesign': 'stu',
'Participant': 'par',
'ExpandedAccess': 'exp',
'Intervention': 'int',
'Location': 'loc',
'PatientRegistry': 'pat',
'Reference': 'ref'
}

# Relationship name associated with each non-ClinicalTrial node type.
# NOTE(review): presumably the relationship type linking that node to its
# ClinicalTrial node -- confirm against the query builder.
relationships = {
'IndividualPatientData': 'has_individual_patient_data',
#'Sponsor': 'sponsored_by',
#'Collaborator': 'collaborated_with',
'Organization': 'conducted_by',
'PrimaryOutcome': 'has_outcome',
'Investigator': 'investigated_by',
'Condition': 'investigates_condition',
'StudyDesign': 'has_study_design',
'Participant': 'has_participant_info',
'ExpandedAccess': 'expanded_access_info',
'Intervention': 'has_intervention',
'Location': 'in_locations',
'PatientRegistry': 'patient_registry_info',
'Reference': 'is_about'
}

# Pair of arrow fragments for each node type's relationship.
# Every entry is ['-','->'] except 'Reference', which is ['<-','-'].
# NOTE(review): presumably the left/right halves of a Cypher relationship
# pattern, so 'Reference' points the opposite way -- confirm.
rel_directions = {
'IndividualPatientData': ['-','->'],
#'Sponsor': ['-','->'],
#'Collaborator': ['-','->'],
'Organization': ['-','->'],
'PrimaryOutcome': ['-','->'],
'Investigator': ['-','->'],
'Condition': ['-','->'],
'StudyDesign': ['-','->'],
'Participant': ['-','->'],
'ExpandedAccess': ['-','->'],
'Intervention': ['-','->'],
'Location': ['-','->'],
'PatientRegistry': ['-','->'],
'Reference': ['<-','-']

}

# Field names grouped by the node type they belong to.
# Casing is mixed: some 'ClinicalTrial' entries are lower camel case
# ('acronym', 'briefSummary', 'briefTitle', 'nctId'), the rest are PascalCase.
# NOTE(review): 'MaxiumumAge' under 'Participant' looks like a misspelling of
# 'MaximumAge'. It is a runtime string, so confirm what the trial parser and
# the existing graph use before renaming it.
fields = {
'ClinicalTrial': ['acronym','briefSummary','briefTitle','CompletionDate','CompletionDateType','LastKnownStatus',
'LastUpdatePostDate','LastUpdatePostDateType','LastUpdateSubmitDate','nctId','NCTIdAlias','OfficialTitle',
'OverallStatus','Phase','PrimaryCompletionDate','PrimaryCompletionDateType','ResultsFirstPostDate',
'ResultsFirstPostDateType','ResultsFirstPostedQCCommentsDate','ResultsFirstPostedQCCommentsDateType',
'StartDate','StartDateType','StudyFirstPostDate','StudyFirstPostDateType','StudyType'],

'IndividualPatientData': ['AvailIPDComment','AvailIPDId','AvailIPDType','AvailIPDURL','IPDSharing',
'IPDSharingAccessCriteria','IPDSharingDescription','IPDSharingInfoType','IPDSharingTimeFrame','IPDSharingURL'],

#'Sponsor': ['LeadSponsorName','LeadSponsorClass'],

#'Collaborator': ['CollaboratorName','CollaboratorClass'],

'Organization': ['OrgName', 'OrgClass', 'OrgType'],

'Investigator': ['OfficialName', 'ContactEmail', 'OfficialAffiliation', 'ContactPhone', 'OfficialRole'],

'Condition': ['Condition'], #'ConditionAncestorId', 'ConditionAncestorTerm', 'ConditionBrowseBranchAbbrev', 'ConditionBrowseBranchName', 'ConditionBrowseLeafAsFound', 'ConditionBrowseLeafId', 'ConditionBrowseLeafName', 'ConditionBrowseLeafRelevance', 'ConditionMeshId', 'ConditionMeshTerm'

'StudyDesign': ['DesignAllocation','DesignInterventionModel','DesignInterventionModelDescription','DesignMasking',
'DesignMaskingDescription','DesignObservationalModel','DesignPrimaryPurpose','DesignTimePerspective',
'DetailedDescription','SamplingMethod'],

'PrimaryOutcome': ['PrimaryOutcomeDescription', 'PrimaryOutcomeMeasure', 'PrimaryOutcomeTimeFrame'],

'Participant': ['EligibilityCriteria', 'EnrollmentCount', 'EnrollmentType', 'Gender', 'GenderBased', 'GenderDescription',
'HealthyVolunteers', 'MaxiumumAge', 'MinimumAge', 'StdAge', 'StudyPopulation'],

'ExpandedAccess': ['ExpAccTypeIndividual','ExpAccTypeIntermediate','ExpAccTypeTreatment','ExpandedAccessNCTId',
'ExpandedAccessStatusForNCTId','HasExpandedAccess'],

'Intervention': ['InterventionName','InterventionType','InterventionDescription', 'InterventionOtherName', 'IsFDARegulatedDevice', 'IsFDARegulatedDrug'], # 'InterventionBrowseLeafId', 'InterventionBrowseLeafName', 'InterventionBrowseLeafRelevance', 'InterventionMeshId', 'InterventionMeshTerm', 'InterventionOtherName'

'Location': ['LocationCity','LocationCountry','LocationFacility','LocationState',
'LocationStatus','LocationZip'],

'PatientRegistry': ['PatientRegistry'],

'Reference': ['Citation','ReferencePMID','ReferenceType']

}

# Nodes that need additional processing to create additional nodes
process_nodes = ['Intervention', 'Condition', 'ClinicalTrial', 'Organization', 'PrimaryOutcome']

# Types of nodes that contain more than one entry
# 'Unassigned' is the only key whose value is a list rather than a string.
# NOTE(review): 'Collaborator' is still listed here although it is commented
# out of the tables above -- confirm whether it is still needed.
lists_of_nodes = {
'Collaborator': 'Collaborator',
'Condition': 'Condition',
'Intervention': 'Intervention',
'Location': 'Location',
'Reference': 'Reference',
'PrimaryOutcome':'PrimaryOutcome',

'Unassigned': ['SecondaryIdInfo','ArmGroup','SecondaryOutcome','OtherOutcome','StdAge','OverallOfficial','IPDSharingInfoType', 'ConditionMesh', 'ConditionAncestor','ConditionBrowseLeaf','ConditionBrowseBranch','InterventionMesh','InterventionAncestor','InterventionBrowseLeaf','InterventionBrowseBranch']
}

# Properties that are in list form
fields_as_properties = {
'ClinicalTrial': ['Phase']
}
Loading

0 comments on commit 146ecff

Please sign in to comment.