Commit bb7b8ae: gfkg and pakg bug fixes

Devon Joseph Leadman committed Jul 22, 2024
1 parent 3512473 commit bb7b8ae

Showing 9 changed files with 182 additions and 88 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -113,3 +113,8 @@ RDAS_CTKG/eligibility_extraction/
RDAS_CTKG/metamap_cond_out.json
RDAS_CTKG/metamap_cond.txt
RDAS_GFKG/convert_csv_fields.py
fix_normmap_file_sep.py
project_check.py
project_check_missing.csv
project_check_new.csv
project_check_old.csv
4 changes: 2 additions & 2 deletions RDAS_GARD/methods.py
Expand Up @@ -118,7 +118,7 @@ def create_disease_node(db, data, xrefs): # Include xrefs into GARD node instead
"syns":data[6],
"orpha":results['Orphanet'] if 'Orphanet' in results else None,
"icd10":results['ICD-10'] if 'ICD-10' in results else None,
"umls":results['UMLS'] if 'UMLS' in results else None,
"umls":list(set(results['UMLS'])) if 'UMLS' in results else None,
"omim":results['OMIM'] if 'OMIM' in results else None,
"snomed":results['SNOMED-CT'] if 'SNOMED-CT' in results else None,
"diseaseontology":results['DiseaseOntology'] if 'DiseaseOntology' in results else None,
@@ -315,7 +315,7 @@ def get_remaining_umls(db, umls_update=True):
INSTANCE.form['SingLinePMID'] = True

print('GATHERING GARD UMLS DATA')
db.run('MATCH (x:GARD) WHERE x.UMLS IS NOT NULL SET x.UMLS_Source = "DATALAKE"')
db.run('MATCH (x:GARD) WHERE x.UMLS IS NOT NULL SET x.UMLS_Source = "GARD"')
res = db.run('MATCH (x:GARD) WHERE x.UMLS IS NULL SET x.UMLS_Source = "METAMAP" RETURN x.GardId AS gard_id, x.GardName as gard_name').data()

gard_strs = [f"{i['gard_id'].replace('GARD:','')}|{normalize(i['gard_name'])}\n" for i in res if i['gard_name']]
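The provenance label for GARD nodes that already carry UMLS codes changes from "DATALAKE" to "GARD"; nodes without codes are still tagged "METAMAP" and written out in the pipe-separated single-line format MetaMap expects. A sketch of that format, with a made-up record and a stand-in for the pipeline's normalize helper:

```python
# Stand-in for the normalize() helper used in the real pipeline.
def normalize(name: str) -> str:
    return name.lower().strip()

# Hypothetical row shaped like the Cypher RETURN above.
res = [{'gard_id': 'GARD:0000001', 'gard_name': 'Example Disease'}]

# One "id|name" line per disease, matching the gard_strs expression.
gard_strs = [
    f"{i['gard_id'].replace('GARD:', '')}|{normalize(i['gard_name'])}\n"
    for i in res if i['gard_name']
]
print(gard_strs)  # ['0000001|example disease\n']
```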
6 changes: 3 additions & 3 deletions RDAS_GFKG/methods.py
@@ -764,7 +764,7 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
else: return normalize_combined_dictionary(title_,title_,name,{},{},{},1,1,'title')

if Public_health_relevance_statement and isinstance(Public_health_relevance_statement, str):
A, B, C,D = check_sen(Public_health_relevance_statement, nlp)
A, B, C,D = check_sen(Public_health_relevance_statement)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
@@ -773,15 +773,15 @@ def grad_id(title_, Public_health_relevance_statement, abstract_):
if name and (name !={}): return name

if abstract_ and isinstance(abstract_, str):
A, B, C , D = check_sen(abstract_, nlp)
A, B, C , D = check_sen(abstract_)
name1 = get_gard_abstract_stem_exact(A)
name2 = get_gard_abstract_stem_exact(B)
name3 = get_gard_abstract_stem_exact(C)
name4 = get_gard_abstract_stem_exact(D)
name=normalize_combined_dictionary(abstract_,title_,name1,name2,name3,name4,0,0.7,'abstract')
if name and (name !={}): return name

def GardNameExtractor(project_title,phr_text,abstract_text, nlp):
def GardNameExtractor(project_title,phr_text,abstract_text):
#Abstract1['Gard_name']=Abstract1.apply(lambda x: gard_id(x['project_title'],x['phr_text'],x['abstract_text']), axis=1)
gard_ids = grad_id(project_title,phr_text,abstract_text)
if gard_ids:
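All three edits in this file drop the nlp parameter, so check_sen and GardNameExtractor no longer need a spaCy model threaded through by every caller. A minimal reproduction of the failure mode the old call style would now hit, using a stub check_sen with the new one-argument signature:

```python
# Stub with the new one-argument signature; the real check_sen splits
# the text four ways for stem matching.
def check_sen(text):
    parts = (text.split('.') + ['', '', '', ''])[:4]
    return parts[0], parts[1], parts[2], parts[3]

try:
    check_sen('some abstract text', object())  # old call style, with nlp
except TypeError as err:
    print(err)  # check_sen() takes 1 positional argument but 2 were given

A, B, C, D = check_sen('some abstract text')   # fixed call style
```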
33 changes: 22 additions & 11 deletions RDAS_GFKG/prep_neo4j_data.py
@@ -152,25 +152,32 @@ def run_normmap():
print(abs_file, ' -merged- ',prj_file)
tmp = pd.read_csv(('{filename}'.format(filename=abs_file)),index_col=False, encoding = "ISO-8859-1")
tmp2 = pd.read_csv(('{filename}'.format(filename=prj_file)),index_col=False, usecols=['APPLICATION_ID','PHR', 'PROJECT_TITLE'], encoding = "ISO-8859-1", low_memory=False)
merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'])
merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'], how='outer', indicator='EXISTS_IN_ABSTRACT_FILE')
#merged_df.fillna('', inplace=True)
merged_df['APPLICATION_ID'] = merged_df['APPLICATION_ID'].astype(int)
merged_df.to_csv(data_raw(f'normmap/RePORTER_NORMMAP_{year}.csv'), index=False)

norm_files = glob.glob(data_raw('normmap') + '/*.csv')
norm_files = sorted(norm_files)
for norm_file in norm_files:
year = re.findall(r'\d+', norm_file)[0]
if os.path.exists(data_neo4j(f'normmap/normmap_results_{year}.csv')):

if os.path.exists(data_neo4j(f'normmap/normmap_results_{year}.csv')): #COMMENTED OUT FOR TESTING
print(f'{year} Gard-Project mapping file already exists... bypassing')
continue

# Create CSV files headers
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "w") as f:
with open(data_neo4j(f'normmap/normmap_results_{year}.csv'), "w") as f: #COMMENTED OUT FOR TESTING
f.writelines(['ID|GARD_id|CONF_SCORE|SEM_SIM\n'])

df = pd.read_csv(norm_file, index_col=False, low_memory=False)
chunk_size = int(len(df)/5)
thread_list = list()

#df = df[df['EXISTS_IN_ABSTRACT_FILE']=='right_only'] #TEST
#df = df[['APPLICATION_ID', 'ABSTRACT_TEXT', 'PHR', 'PROJECT_TITLE']] #TEST

chunk_size = int(len(df)/5)

list_df = [df[i:i+chunk_size] for i in range(0,len(df),chunk_size)]

# Create threads to process results
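The merge change is the core fix in this hunk: the old inner join silently dropped any application present in only one of the abstract and project files, while how='outer' keeps both sides and indicator='EXISTS_IN_ABSTRACT_FILE' adds a column marking each row as 'both', 'left_only', or 'right_only' (the commented TEST filter selects 'right_only', i.e. projects with no abstract row). The added astype(int) re-asserts the key's integer dtype after the merge. A toy sketch:

```python
import pandas as pd

tmp = pd.DataFrame({'APPLICATION_ID': [1, 2], 'ABSTRACT_TEXT': ['a', 'b']})
tmp2 = pd.DataFrame({'APPLICATION_ID': [2, 3],
                     'PHR': ['p2', 'p3'],
                     'PROJECT_TITLE': ['t2', 't3']})

merged_df = pd.merge(tmp, tmp2, on=['APPLICATION_ID'],
                     how='outer', indicator='EXISTS_IN_ABSTRACT_FILE')
print(merged_df['EXISTS_IN_ABSTRACT_FILE'].tolist())
# ['left_only', 'both', 'right_only']; an inner join would have kept
# only APPLICATION_ID 2.
```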
@@ -185,10 +192,13 @@ def run_normmap():
combine_normmap_results()
print('GARD to Project connections made')



def get_RD_project_ids():
# Get GARD to Project mappings
run_normmap()
aggregate_disease_data()

apps = pd.read_csv(data_neo4j("normmap_results.csv"), usecols=["ID"])

# Drop duplicate results and sort by Application ID
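get_RD_project_ids now also calls aggregate_disease_data() after the mapping pass before reading the results back. The per-year result files written above are pipe-delimited, so reading one back might look like this sketch (filename and separator assumed from the 'ID|GARD_id|CONF_SCORE|SEM_SIM' header written earlier):

```python
import pandas as pd

# Assumed per-year filename; the pipe-separated header written above
# implies sep='|'.
df = pd.read_csv('normmap_results_2024.csv', sep='|')

# Mirrors the comment above: drop duplicates, sort by Application ID.
apps = df[['ID']].drop_duplicates().sort_values('ID')
```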
@@ -275,7 +285,7 @@ def find_RD_apps(input_file, rd_ids):
print('Finished ', output_file)

def clean_pi (pi_info):
pi_info = pi_info[:len(pi_info)-1]
pi_info = pi_info.replace(";","")
return pi_info

def cleanup_project_IC_NAME_totalcost():
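The old clean_pi blindly removed the last character of the field, which only works when the string happens to end with a separator; the new version strips every ';' instead (note the related steps.py change below, which stops splitting the PI fields on ';'). A quick before/after with a made-up PI field:

```python
def clean_pi_old(pi_info):
    return pi_info[:len(pi_info) - 1]

def clean_pi_new(pi_info):
    return pi_info.replace(';', '')

print(clean_pi_old('SMITH, JANE;'))  # SMITH, JANE   (works by luck)
print(clean_pi_old('SMITH, JANE'))   # SMITH, JAN    (eats a real character)
print(clean_pi_new('SMITH, JANE;'))  # SMITH, JANE
print(clean_pi_new('SMITH, JANE'))   # SMITH, JANE
```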
@@ -608,13 +618,14 @@ def annotate_grant_abstracts():


# Annotate text with four scispaCy models
for model in MODELS:
for model in MODELS[2:]:
print(f'*** Annotate with {model} model ***')

nlp = load_model(model)
for file in input_files:
year = file[-8:-4]

if int(year) < 2006 and model == 'en_ner_bc5cdr_md':
continue
try:
text = pd.read_csv(file, encoding=ENCODING, dtype={'APPLICATION_ID':int, 'ABSTRACT_TEXT':str})

@@ -779,10 +790,8 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
merge_project_funding()
print("Running select_RD_projects")
select_RD_projects()
print("Running cleanup_project_IC_NAME_totalcost")
cleanup_project_IC_NAME_totalcost()
print("Running find_RD_core_projects")
find_RD_core_projects()
print("Running select_RD_patents")
@@ -797,7 +806,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
cleanup_pub_country()
print("Running select_RD_abstracts")
select_RD_abstracts()
"""
# The below stages are extremely slow, so we will only run them for
# years that have changed data.

@@ -807,6 +816,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
and v in [pygit2.GIT_STATUS_WT_MODIFIED, pygit2.GIT_STATUS_WT_NEW]}
'''

"""
print("Running annotation_preprocess_grant")
annotation_preprocess_grant()
@@ -818,6 +828,7 @@ def prep_data(data_raw_path: str, data_neo4j_path: str) -> FilesToAdd:
clean_annotation_source()
print("Running map_semantic_types")
map_semantic_types()
print("Running fix_escaped_endings")
fix_escaped_endings()
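The shuffled triple-quote markers in prep_data act as a block-comment toggle: a bare triple-quoted string is just an unused expression, so everything between the opening and closing quotes is parsed but never executed, and moving the opening marker changes which pipeline stages run. A reduced sketch of the pattern:

```python
def prep_data_sketch():
    print('Running select_RD_abstracts')   # executes

    """
    print('disabled stage')                # inside the string literal,
    print('another disabled stage')        # never executed
    """

    print('Running fix_escaped_endings')   # executes again

prep_data_sketch()
```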
9 changes: 3 additions & 6 deletions RDAS_GFKG/steps.py
@@ -310,13 +310,10 @@
""",
"query":
"""
WITH split(data.PI_IDS, ';') as ids,
split(data.PI_NAMEs, ';') as names, data
UNWIND [x in range(0, coalesce(size(ids) - 1, -1)) |
[trim(split(ids[x], '(')[0]), trim(split(names[x], '(')[0])]
] as pi_data
WITH [data.PI_IDS] as ids, [data.PI_NAMEs] as names, data
UNWIND [x in range(0, coalesce(size(ids) - 1, -1)) | [trim(split(ids[x], '(')[0]), trim(split(names[x], '(')[0])]] as pi_data
MERGE (p:PrincipalInvestigator {
pi_id: pi_data[0],
pi_id: coalesce(pi_data[0], ""),
pi_name: coalesce(pi_data[1], ""),
org_state: coalesce(data.ORG_STATE, ""),
org_name: coalesce(data.ORG_NAME, "")})
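The rewritten query stops splitting PI_IDS and PI_NAMEs on ';': wrapping each raw field in a one-element list yields a single PrincipalInvestigator per project row, cut at the first '(', and coalesce(pi_data[0], "") keeps MERGE from failing on a null id. A Python sketch of what the old and new list expressions do to a made-up row:

```python
data = {'PI_IDS': '1234567 (contact); 7654321;',
        'PI_NAMEs': 'DOE, JOHN (contact); SMITH, JANE;'}

def pi_pairs(ids, names):
    # Mirrors [trim(split(ids[x],'(')[0]), trim(split(names[x],'(')[0])]
    return [(ids[x].split('(')[0].strip(), names[x].split('(')[0].strip())
            for x in range(len(ids))]

# Old: split(field, ';') -> one pair per segment, plus an empty pair
# from the trailing separator.
print(pi_pairs(data['PI_IDS'].split(';'), data['PI_NAMEs'].split(';')))
# [('1234567', 'DOE, JOHN'), ('7654321', 'SMITH, JANE'), ('', '')]

# New: [field] -> exactly one pair per row.
print(pi_pairs([data['PI_IDS']], [data['PI_NAMEs']]))
# [('1234567', 'DOE, JOHN')]
```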
4 changes: 2 additions & 2 deletions RDAS_GFKG/update_grant.py
@@ -58,6 +58,6 @@ def main(db: AlertCypher, restart_raw=False, restart_processed=False):
fta = prep_data(f"{sysvars.gnt_files_path}raw", f"{sysvars.gnt_files_path}processed")

# run database upgrade steps on only new/modified files
for step in steps[10:]:
for step in steps[11:]:
print("\n\n" + step["description"] + "...")
step_to_fn(**step)(db, fta)
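steps is the ordered list of database-update stages defined in RDAS_GFKG/steps.py, and the slice start sets where the run resumes; the bump from 10 to 11, presumably needed because a stage earlier in the list was added or reordered, keeps the resume point aligned. A toy illustration with hypothetical stage descriptions:

```python
# Hypothetical stage list; the real steps live in RDAS_GFKG/steps.py.
steps = [{'description': f'stage {i}'} for i in range(13)]

# steps[10:] would re-run one stage too many once a new stage shifts
# the list; steps[11:] resumes at the intended point.
for step in steps[11:]:
    print(step['description'])
```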